## Loading Data

In [49]:
import pandas as pd
# import os
# os.getcwd()

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [50]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


## Preprocessing Data

In [51]:
#embarked
train = train.dropna(subset=["Embarked"])
#dropna removes all of the null embarked values, since there are only a few missing

#cabin
train = train.drop("Cabin", axis=1)
#there are a lot of missing values for cabin, so we are completely removing this column

#age
mean = train["Age"].mean()
train["Age"] = train["Age"].fillna(mean)
#we need the age metric, so we fill the null values with the averages


#these three metrics are irrelevant to the prediction problem at hand
train = train.drop("PassengerId", axis=1)
train = train.drop("Name", axis=1)
train = train.drop("Ticket", axis=1)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Fare      889 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 62.5+ KB


## Converting Categorical Variables to Numerical Variables

In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in ["Sex", "Embarked"]: #for the two columns with categorical variables
    le.fit(train[col]) #set certain values to certain numbers
    train[col] = le.transform(train[col]) #change all of the categorical variables to numerical

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [54]:
#We also need to scale variables down, so each value is within the same range (preferably 0 and 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train)
train = scaler.transform(train)

In [59]:
from sklearn.model_selection import train_test_split
input_data = train[:, 1:8]
labels = train[:, 0]
#selecting the input metrics and the survived/not distinction

train_input, test_input, train_labels, test_labels = train_test_split(input_data, labels, test_size=0.2)

In [61]:
import numpy as np
with open("./data/train.py", 'wb') as f:
    np.save(f, train_input)
    np.save(f, train_labels)

with open('data/test.npy', 'wb') as f:
    np.save(f, test_input)
    np.save(f, test_labels) 

In [62]:
#creating a classifier that guesses as a BASELINE
import random
random.seed(a=None, version=2)

def classify(passenger):
    return(random.randint(0, 1))

In [63]:
def run(f_classify, x):
    return list(map(f_classify, x))

#It uses Python’s map function to call the classifier with each item in x and return
#an array of the results.

In [65]:
result = run(classify, train_input)

In [67]:
def evaluate(predictions, actual):
    total = 0
    for i in range(len(predictions)):
        if(predictions[i]==actual[i]):
            total += 1
    accuracy = total/len(predictions)
    return total, accuracy

print(evaluate(run(classify, train_input), train_labels))

(360, 0.5063291139240507)


In [76]:
def predict_death(data):
    return 0

In [77]:
from sklearn.metrics import confusion_matrix

predictions = run(predict_death, train_input)
confusion_matrix(train_labels, predictions)

#It returns a two-dimensional array. The first row shows the true negatives
#(TN) and the false positives (FP). And, the second row shows the false negatives
#(FN) and the true positives (TP).

array([[434,   0],
       [277,   0]])

In [78]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(train_labels, predictions)
print(precision)
#The precision is the “accuracy of the positive predictions.” It only looks at the positive predictions.

rs = recall_score(train_labels, predictions)
print(rs)
#The recall is the “accuracy of the actual positives.” It only looks at the actual positives.

0.0
0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
def specificity(matrix):
    return matrix[0][0]/(matrix[0][0]+matrix[0][1]) if (matrix[0][0]+matrix[0][1] > 0) else 0
#The specificity is the “accuracy of the actual negatives.” It only looks at actual negatives (deaths).

def npv(matrix):
    return matrix[0][0]/(matrix[0][0]+matrix[1][0]) if (matrix[0][0]+matrix[1][0] > 0) else 0
#And the “negative predictive value” (NPV) is the “accuracy of the negative predictions.

cm = confusion_matrix(train_labels, predictions)

In [81]:
print(specificity(cm), npv(cm))

1.0 0.6104078762306611


In [82]:
random_predictions = run(classify, train_input)
random_cm = confusion_matrix(train_labels, random_predictions)

In [83]:
print('The precision score of the random classifier is {:.2f}'.format(precision_score(train_labels, random_predictions)))
print('The recall score of the random classifier is {:.2f}'.format(recall_score(train_labels, random_predictions)))
print('The specificity score of the random classifier is {:.2f}'.format(specificity(random_cm)))
print('The npv score of the random classifier is {:.2f}'.format(npv(random_cm)))

The precision score of the random classifier is 0.37
The recall score of the random classifier is 0.47
The specificity score of the random classifier is 0.49
The npv score of the random classifier is 0.59


## Reusable function to evaluate classifiers

In [84]:
def classifier_report(name, run, classify, input, labels):
    cr_predictions = run(classify, input)
    cr_cm = confusion_matrix(labels, cr_predictions)

    cr_precision = precision_score(labels, cr_predictions)
    cr_recall = recall_score(labels, cr_predictions)
    cr_specificity = specificity(cr_cm)
    cr_npv = npv(cr_cm)
    cr_level = 0.25*(cr_precision + cr_recall + cr_specificity + cr_npv) #average of the four values of the confusion matrix

    print('The precision score of the {} classifier is {:.2f}'.format(name, cr_precision))
    print('The recall score of the {} classifier is {:.2f}'.format(name, cr_recall))
    print('The specificity score of the {} classifier is {:.2f}'.format(name, cr_specificity))
    print('The npv score of the {} classifier is {:.2f}'.format(name, cr_npv))
    print('The information level is: {:.2f}'.format(cr_level))

In [85]:
classifier_report(
    "Random PQC",
    run,
    classify,
    train_input,
    train_labels)

The precision score of the Random PQC classifier is 0.38
The recall score of the Random PQC classifier is 0.48
The specificity score of the Random PQC classifier is 0.50
The npv score of the Random PQC classifier is 0.60
The information level is: 0.49
