In [61]:
import numpy as np
import pandas as pd

import sklearn as sk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plot

import math

In [62]:
trainIN = pd.read_csv("train.csv")
testIN = pd.read_csv("test.csv")

X = trainIN.iloc[:, 2:]
y = trainIN.iloc[:, 1]

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [63]:
#print(X_train)

In [64]:
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data(trainIN)

Unnamed: 0,Total,Percent
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


# Using the Titanic data set.  Replacing missing Age data with the mean of all ages.  This approach could result in systemic bias by age, which might lead to conflict with the ACM code of ethics, as there would be discrimination in the model.  

In [65]:
column_means = X_train["Age"].mean()
print(column_means)
X_train = X_train.fillna(column_means)
column_means = X_test["Age"].mean()
X_test = X_test.fillna(column_means)
print(column_means)

29.233373253493014
30.794600938967136


In [66]:
print(len(X_train))

623


# Drop unneeded columns and turn sex into binary classification

In [67]:
X_train = X_train.drop(["Name", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)
X_test = X_test.drop(["Name", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)

X_train["Sex"].replace(['female','male'], [0, 1], inplace=True)
X_test["Sex"].replace(['female','male'], [0, 1], inplace=True)

In [68]:
print(X_train)
print(y_train)

     Pclass  Sex        Age  SibSp  Parch
472       2    0  33.000000      1      2
597       3    1  49.000000      0      0
843       3    1  34.500000      0      0
112       3    1  22.000000      0      0
869       3    1   4.000000      1      1
838       3    1  32.000000      0      0
575       3    1  19.000000      0      0
377       1    1  27.000000      0      2
664       3    1  20.000000      1      0
84        2    0  17.000000      0      0
426       2    0  28.000000      1      0
692       3    1  29.233373      0      0
90        3    1  29.000000      0      0
672       2    1  70.000000      0      0
445       1    1   4.000000      0      2
620       3    1  27.000000      1      0
144       2    1  18.000000      0      0
12        3    1  20.000000      0      0
752       3    1  33.000000      0      0
564       3    0  29.233373      0      0
203       3    1  45.500000      0      0
386       3    1   1.000000      5      2
55        1    1  29.233373      0

# SVM

In [69]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [70]:
svm = SVC(kernel='linear', random_state=1, gamma='auto', C=1)
svm.fit(X_train, y_train)
print("Train accuracy: ", svm.score(X_train, y_train))
print("Test accuracy: ", svm.score(X_test, y_test))

Train accuracy:  0.7800963081861958
Test accuracy:  0.8022388059701493


In [71]:
svm_predictions = svm.predict(X_test)
TP = 0
TN = 0
FP = 0
FN = 0
correct = 0
incorrect = 0
total = 0

for y in y_test:
    if (y == svm_predictions[total]):
        correct += 1
        if (y == 1):
            TP += 1
        else:
            TN += 1
    else:
        incorrect += 1
        if (y == 1):
            FP += 1
        else:
            FN += 1
    total += 1
accuracy = correct/total

print("Accuracy: ", accuracy)
print("Confusion matrix: ")
print("S = Survived, D = Died")
print("Predicted:\n D     S    Actual:")
print(TN, " ", FP, "    D\n", FN, " ", TP, "    S")

Accuracy:  0.8022388059701493
Confusion matrix: 
S = Survived, D = Died
Predicted:
 D     S    Actual:
143   31     D
 22   72     S


# DNN

In [72]:
dnn = MLPClassifier(hidden_layer_sizes = 100, solver = 'adam', tol = 0.0001, learning_rate_init = 0.001, alpha = 0.0001, shuffle=False, max_iter = 2000, random_state = 1).fit(X_train, y_train)

In [73]:
print("Train accuracy: ", dnn.score(X_train, y_train))
print("Test accuracy: ", dnn.score(X_test, y_test))

Train accuracy:  0.8346709470304976
Test accuracy:  0.832089552238806


In [74]:
#number of layers in the model
dnn.n_layers_

3

In [75]:
dnn_predictions = dnn.predict(X_test)
TP = 0
TN = 0
FP = 0
FN = 0
correct = 0
incorrect = 0
total = 0

for y in y_test:
    if (y == dnn_predictions[total]):
        correct += 1
        if (y == 1):
            TP += 1
        else:
            TN += 1
    else:
        incorrect += 1
        if (y == 1):
            FP += 1
        else:
            FN += 1
    total += 1
accuracy = correct/total

print("Accuracy: ", accuracy)
print("Confusion matrix: ")
print("S = Survived, D = Died")
print("Predicted:\n D     S    Actual:")
print(TN, " ", FP, "    D\n", FN, " ", TP, "    S")

Accuracy:  0.832089552238806
Confusion matrix: 
S = Survived, D = Died
Predicted:
 D     S    Actual:
153   33     D
 12   70     S


# Predicting the competition test set (not part of the project, I'm just checking to see what results I get)

In [76]:
nums = testIN["PassengerId"]
testIN = testIN.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)
testIN["Sex"].replace(['female','male'], [0, 1], inplace=True)
column_means = testIN["Age"].mean()
testIN = testIN.fillna(column_means)
predictions = svm.predict(testIN)
predictions2 = dnn.predict(testIN)

In [77]:
print(len(nums))
print(len(predictions))

418
418


In [78]:
out = []
for x in range(418):
    out.append([nums[x], predictions[x]])
csvOUT = pd.DataFrame(out)
csvOUT.columns = ["PassengerId", "Survived"]
csvOUT.to_csv("SVMpredictions.csv", index=False)

In [79]:
out2 = []
for x in range(418):
    out2.append([nums[x], predictions[x]])
csvOUT2 = pd.DataFrame(out)
csvOUT2.columns = ["PassengerId", "Survived"]
csvOUT2.to_csv("DNNpredictions.csv", index=False)