In [402]:
import numpy as np
import pandas as pd

import sklearn as sk

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plot

import math

In [403]:
trainIN = pd.read_csv("train.csv")
testIN = pd.read_csv("test.csv")

X = trainIN.iloc[:, 2:]
y = trainIN.iloc[:, 1]

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [404]:
print(X_train)

     Pclass                                            Name     Sex   Age  \
472       2         West, Mrs. Edwy Arthur (Ada Mary Worth)  female  33.0   
597       3                             Johnson, Mr. Alfred    male  49.0   
843       3                      Lemberopolous, Mr. Peter L    male  34.5   
112       3                          Barton, Mr. David John    male  22.0   
869       3                 Johnson, Master. Harold Theodor    male   4.0   
838       3                                 Chip, Mr. Chang    male  32.0   
575       3                            Patchett, Mr. George    male  19.0   
377       1                       Widener, Mr. Harry Elkins    male  27.0   
664       3                     Lindqvist, Mr. Eino William    male  20.0   
84        2                             Ilett, Miss. Bertha  female  17.0   
426       2     Clarke, Mrs. Charles V (Ada Maria Winfield)  female  28.0   
692       3                                    Lam, Mr. Ali    male   NaN   

In [405]:
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data(trainIN)

Unnamed: 0,Total,Percent
Cabin,687,77.104377
Age,177,19.86532
Embarked,2,0.224467
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


# Using the Titanic data set.  Replacing missing Age data with the mean of all ages.  This approach could result in systemic bias by age, which might lead to conflict with the ACM code of ethics, as there would be discrimination in the model.  -----------use titles to refine age replacement

In [406]:
Miss = -1
Master = -1
Mrs = -1
Mr = -1

In [409]:
row = 0
count = 0
for x in X_train.iloc[:, 1]:
    if "Miss." in x:
        c = X_train.iloc[row, 3]
        if not math.isnan(c):
            Miss+=c
            count+=1
    row+=1

In [410]:
print(Miss/count)

41.81958762886598


In [None]:
column_means = X_train["Age"].mean()
print(column_means)
X_train = X_train.fillna(column_means)
column_means = X_test["Age"].mean()
X_test = X_test.fillna(column_means)
print(column_means)

# Drop unneeded columns

In [352]:
X_train = X_train.drop(["Name", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)
X_test = X_test.drop(["Name", "Ticket", "Cabin", "Embarked", "Fare"], axis=1)

X_train["Sex"].replace(['female','male'], [0, 1], inplace=True)
X_test["Sex"].replace(['female','male'], [0, 1], inplace=True)

In [353]:
print(X_train)
print(y_train)

     Pclass  Sex        Age  SibSp  Parch
472       2    0  33.000000      1      2
597       3    1  49.000000      0      0
843       3    1  34.500000      0      0
112       3    1  22.000000      0      0
869       3    1   4.000000      1      1
838       3    1  32.000000      0      0
575       3    1  19.000000      0      0
377       1    1  27.000000      0      2
664       3    1  20.000000      1      0
84        2    0  17.000000      0      0
426       2    0  28.000000      1      0
692       3    1  29.233373      0      0
90        3    1  29.000000      0      0
672       2    1  70.000000      0      0
445       1    1   4.000000      0      2
620       3    1  27.000000      1      0
144       2    1  18.000000      0      0
12        3    1  20.000000      0      0
752       3    1  33.000000      0      0
564       3    0  29.233373      0      0
203       3    1  45.500000      0      0
386       3    1   1.000000      5      2
55        1    1  29.233373      0

# SVM

In [354]:
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [355]:
svm = SVC(kernel='linear', random_state=1, gamma='auto', C=1)
svm.fit(X_train, y_train)
print("Train accuracy: ", svm.score(X_train, y_train))
print("Test accuracy: ", svm.score(X_test, y_test))

Train accuracy:  0.7800963081861958
Test accuracy:  0.8022388059701493


In [356]:
testIN

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.2250,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.1500,,S


# DNN

In [328]:
dnn = MLPClassifier(hidden_layer_sizes = 100, solver = 'adam', tol = 0.0001, learning_rate_init = 0.001, alpha = 0.0001, shuffle=False, max_iter = 2000, random_state = 1).fit(X_train, y_train)

In [329]:
dnn.score(X_train, y_train)

0.8298555377207063

In [330]:
dnn.score(X_test, y_test)

0.8134328358208955

In [331]:
#number of layers in the model
dnn.n_layers_

3

# Predicting the competition test set (not part of the project, I'm just checking to see what results I get)

In [262]:
nums = testIN["PassengerId"]
testIN = testIN.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked"], axis=1)
testIN["Sex"].replace(['female','male'], [0, 1], inplace=True)
column_means = testIN["Age"].mean()
testIN = testIN.fillna(column_means)
predictions = svm.predict(testIN)

In [263]:
print(len(nums))
print(len(predictions))

418
418


In [264]:
out = []
for x in range(418):
    out.append([nums[x], predictions[x]])
csvOUT = pd.DataFrame(out)
csvOUT.columns = ["PassengerId", "Survived"]
csvOUT.to_csv("SVMpredictions.csv", index=False)