In [3]:
# Importing Necessary Libaries
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [4]:
# Reading data in
x = pd.read_csv('train.csv')
xtest = pd.read_csv('test.csv')

In [5]:
x.set_index('PassengerId', inplace=True)
xtest.set_index('PassengerId', inplace=True)

In [7]:
x

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [9]:
# Splitting data into tables of x and y variables

y = x['Survived']

In [10]:
y

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [11]:
# Dropping variables that cannot be easily used to quantify whether passenger survives or not
x = x.drop(['Name'], axis=1)
x = x.drop(['Survived'], axis=1)
x = x.drop(['Ticket'], axis=1)
xtest = xtest.drop(['Name'], axis=1)
xtest = xtest.drop(['Ticket'], axis=1)
xtest

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,male,34.5,0,0,7.8292,,Q
893,3,female,47.0,1,0,7.0000,,S
894,2,male,62.0,0,0,9.6875,,Q
895,3,male,27.0,0,0,8.6625,,S
896,3,female,22.0,1,1,12.2875,,S
...,...,...,...,...,...,...,...,...
1305,3,male,,0,0,8.0500,,S
1306,1,female,39.0,0,0,108.9000,C105,C
1307,3,male,38.5,0,0,7.2500,,S
1308,3,male,,0,0,8.0500,,S


In [12]:
# one hot encoding
# changing categorical variables to binary
x['Female'] = np.where(x['Sex'] == 'female', 1, 0)
x['Male'] = np.where(x['Sex'] == 'male', 1, 0)
x = x.drop(['Sex'], axis=1)

# ensuring that test data will be the same as the training data
xtest['Female'] = np.where(xtest['Sex'] == 'female', 1, 0)
xtest['Male'] = np.where(xtest['Sex'] == 'male', 1, 0)
xtest = xtest.drop(['Sex'], axis=1)

In [13]:
# Ticket Class - categorical variable
x['Pclass1'] = np.where(x['Pclass'] == 1, 1, 0)
x['Pclass2'] = np.where(x['Pclass'] == 2, 1, 0)
x['Pclass3'] = np.where(x['Pclass'] == 3, 1, 0)
x = x.drop(['Pclass'], axis=1)

xtest['Pclass1'] = np.where(xtest['Pclass'] == 1, 1, 0)
xtest['Pclass2'] = np.where(xtest['Pclass'] == 2, 1, 0)
xtest['Pclass3'] = np.where(xtest['Pclass'] == 3, 1, 0)
xtest = xtest.drop(['Pclass'], axis=1)

In [14]:
# Port of Embarkation
# C = Cherbourg, Q = Queenstown, S = Southampton
x['Cherbourg'] = np.where(x['Embarked'] == 'C', 1, 0)
x['Queenstown'] = np.where(x['Embarked'] == 'Q', 1, 0)
x['Southampton'] = np.where(x['Embarked'] == 'S', 1, 0)
x = x.drop(['Embarked'], axis=1)

xtest['Cherbourg'] = np.where(xtest['Embarked'] == 'C', 1, 0)
xtest['Queenstown'] = np.where(xtest['Embarked'] == 'Q', 1, 0)
xtest['Southampton'] = np.where(xtest['Embarked'] == 'S', 1, 0)
xtest = xtest.drop(['Embarked'], axis=1)

In [15]:
# dropping cabin since this variable cannot be easily quanitied and there is too much missing data
x = x.drop(['Cabin'], axis=1)
xtest = xtest.drop(['Cabin'], axis=1)

In [16]:
x

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Female,Male,Pclass1,Pclass2,Pclass3,Cherbourg,Queenstown,Southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,22.0,1,0,7.2500,0,1,0,0,1,0,0,1
2,38.0,1,0,71.2833,1,0,1,0,0,1,0,0
3,26.0,0,0,7.9250,1,0,0,0,1,0,0,1
4,35.0,1,0,53.1000,1,0,1,0,0,0,0,1
5,35.0,0,0,8.0500,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
887,27.0,0,0,13.0000,0,1,0,1,0,0,0,1
888,19.0,0,0,30.0000,1,0,1,0,0,0,0,1
889,,1,2,23.4500,1,0,0,0,1,0,0,1
890,26.0,0,0,30.0000,0,1,1,0,0,1,0,0


In [17]:
xtest

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Female,Male,Pclass1,Pclass2,Pclass3,Cherbourg,Queenstown,Southampton
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
892,34.5,0,0,7.8292,0,1,0,0,1,0,1,0
893,47.0,1,0,7.0000,1,0,0,0,1,0,0,1
894,62.0,0,0,9.6875,0,1,0,1,0,0,1,0
895,27.0,0,0,8.6625,0,1,0,0,1,0,0,1
896,22.0,1,1,12.2875,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,0,0,8.0500,0,1,0,0,1,0,0,1
1306,39.0,0,0,108.9000,1,0,1,0,0,1,0,0
1307,38.5,0,0,7.2500,0,1,0,0,1,0,0,1
1308,,0,0,8.0500,0,1,0,0,1,0,0,1


In [18]:
# Checking correlation of variables
# seeing which variables are left in dataset
print(x.columns)
#print(ytrain.columns)

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Female', 'Male', 'Pclass1', 'Pclass2',
       'Pclass3', 'Cherbourg', 'Queenstown', 'Southampton'],
      dtype='object')


In [20]:
# Checking correlation of ticket class to survival
Pclass1_matrix = np.corrcoef(x['Pclass1'], y)
print("Pclass1_matrix\n", Pclass1_matrix) #NOTE: decent correlation to survival rate
Pclass2_matrix = np.corrcoef(x['Pclass2'], y)
print("\nPclass2_matrix\n", Pclass2_matrix) #NOTE: little correlation to survival rate
Pclass3_matrix = np.corrcoef(x['Pclass3'], y)
print("\nPclass3_matrix\n", Pclass3_matrix) #NOTE: negative correlation to survival rate

Pclass1_matrix
 [[1.         0.28590377]
 [0.28590377 1.        ]]

Pclass2_matrix
 [[1.         0.09334857]
 [0.09334857 1.        ]]

Pclass3_matrix
 [[ 1.         -0.32230836]
 [-0.32230836  1.        ]]


In [21]:
# Checking correlation of embarkment to survival
Cherbourg_matrix = np.corrcoef(x['Cherbourg'], y)
print("Cherbourg_matrix\n", Cherbourg_matrix) #NOTE: strong correlation to survival rate
Queenstown_matrix = np.corrcoef(x['Queenstown'], y)
print("\nQueenstown_matrix\n", Queenstown_matrix) #NOTE: little correlation to survival rate
Southampton_matrix = np.corrcoef(x['Southampton'], y)
print("\nSouthampton_matrix\n", Southampton_matrix) #NOTE: negative correlation to survival rate

Cherbourg_matrix
 [[1.         0.16824043]
 [0.16824043 1.        ]]

Queenstown_matrix
 [[1.         0.00365038]
 [0.00365038 1.        ]]

Southampton_matrix
 [[ 1.         -0.15566027]
 [-0.15566027  1.        ]]


In [22]:
# Checking correlation of sex to survival
Female_matrix = np.corrcoef(x['Female'], y)
print("Female_matrix\n", Female_matrix) #NOTE: strong correlation to survival rate
Male_matrix = np.corrcoef(x['Male'], y)
print("\nMale_matrix\n", Male_matrix) #NOTE: negative correlation to surival rate - shouldn't have both variables in model also due to multicollinearity

Female_matrix
 [[1.         0.54335138]
 [0.54335138 1.        ]]

Male_matrix
 [[ 1.         -0.54335138]
 [-0.54335138  1.        ]]


In [23]:
# Checking correlation of continuous variables to survival
Age_matrix = np.corrcoef(x['Age'], y) #maybe consider putting this to 0 or dropping age in general or dropping NaNs
print("Age_matrix\n", Age_matrix)
SibSp_matrix = np.corrcoef(x['SibSp'], y)
print("\nSibsp_matrix\n", SibSp_matrix) #NOTE: negative correlation found between have siblings/spouses aboard
Parch_matrix = np.corrcoef(x['Parch'], y)
print("\nParch_matrix\n", Parch_matrix) #NOTE: some correlation found

Age_matrix
 [[nan nan]
 [nan  1.]]

Sibsp_matrix
 [[ 1.        -0.0353225]
 [-0.0353225  1.       ]]

Parch_matrix
 [[1.         0.08162941]
 [0.08162941 1.        ]]


In [24]:
print("Number of Missing Age Values:", x['Age'].isna().sum())
# temporary possible solutions
print("Mean:", x['Age'].mean())
print("Mode:", x['Age'].mode())
print("Median:", x["Age"].median())


# setting up possible solutions
median_age = x['Age'].median()
mean_age = 30 # setting mean to be a whole value
mode_age = 24
x['Median_age'] = x['Age'].fillna(median_age)
x['Mean_age'] = x['Age'].fillna(mean_age)
x['Mode_age'] = x['Age'].fillna(mode_age)

Number of Missing Age Values: 177
Mean: 29.69911764705882
Mode: 0    24.0
Name: Age, dtype: float64
Median: 28.0


In [25]:
# Checking correlation of potential solutions to NaN
MedianAge_matrix = np.corrcoef(x['Median_age'], y)
print("MedianAge_matrix\n", MedianAge_matrix)
MeanAge_matrix = np.corrcoef(x['Mean_age'], y) #has the most negative significance to survival
print("\nMeanAge_matrix\n", MeanAge_matrix)
ModeAge_matrix = np.corrcoef(x['Mode_age'], y) #has the least negative significance to survival
print("\nModeAge_matrix\n", ModeAge_matrix)

MedianAge_matrix
 [[ 1.         -0.06491042]
 [-0.06491042  1.        ]]

MeanAge_matrix
 [[ 1.         -0.07065723]
 [-0.07065723  1.        ]]

ModeAge_matrix
 [[ 1.         -0.05287222]
 [-0.05287222  1.        ]]


In [26]:
# dropping unecessary values
x = x.drop(['Age'], axis=1)
x = x.drop(['Median_age'], axis=1)
x = x.drop(['Mode_age'], axis=1)
x = x.drop(['Male'], axis=1)

In [27]:
x

Unnamed: 0_level_0,SibSp,Parch,Fare,Female,Pclass1,Pclass2,Pclass3,Cherbourg,Queenstown,Southampton,Mean_age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0,7.2500,0,0,0,1,0,0,1,22.0
2,1,0,71.2833,1,1,0,0,1,0,0,38.0
3,0,0,7.9250,1,0,0,1,0,0,1,26.0
4,1,0,53.1000,1,1,0,0,0,0,1,35.0
5,0,0,8.0500,0,0,0,1,0,0,1,35.0
...,...,...,...,...,...,...,...,...,...,...,...
887,0,0,13.0000,0,0,1,0,0,0,1,27.0
888,0,0,30.0000,1,1,0,0,0,0,1,19.0
889,1,2,23.4500,1,0,0,1,0,0,1,30.0
890,0,0,30.0000,0,1,0,0,1,0,0,26.0


In [28]:
# adding new and deleting new changes to test set to match training
xtest['Mean_age'] = xtest['Age'].fillna(mean_age)
xtest = xtest.drop(['Age'], axis=1)
xtest = xtest.drop(['Male'], axis=1)

In [30]:
xtest

Unnamed: 0_level_0,SibSp,Parch,Fare,Female,Pclass1,Pclass2,Pclass3,Cherbourg,Queenstown,Southampton,Mean_age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,0,0,7.8292,0,0,0,1,0,1,0,34.5
893,1,0,7.0000,1,0,0,1,0,0,1,47.0
894,0,0,9.6875,0,0,1,0,0,1,0,62.0
895,0,0,8.6625,0,0,0,1,0,0,1,27.0
896,1,1,12.2875,1,0,0,1,0,0,1,22.0
...,...,...,...,...,...,...,...,...,...,...,...
1305,0,0,8.0500,0,0,0,1,0,0,1,30.0
1306,0,0,108.9000,1,1,0,0,1,0,0,39.0
1307,0,0,7.2500,0,0,0,1,0,0,1,38.5
1308,0,0,8.0500,0,0,0,1,0,0,1,30.0


In [35]:
# returning row where NaN can be found in testing set
for i in range(len(xtest)):
    if pd.isna(xtest['Fare'].iloc[i]):
        print(i)

152


In [36]:
# printing NaN
print(xtest.iloc[152])

SibSp           0.0
Parch           0.0
Fare            NaN
Female          0.0
Pclass1         0.0
Pclass2         0.0
Pclass3         1.0
Cherbourg       0.0
Queenstown      0.0
Southampton     1.0
Mean_age       60.5
Name: 1044, dtype: float64


In [37]:
# copying test set to test possible ways to do solution before making a final commit
xtestBlank = xtest.copy()
xtestBlank = xtestBlank[xtestBlank['Pclass3'] == 1] #since ticket classes are typically correlated with fare price

In [38]:
# because ticket classes are correlated with fare price the mean was grabbed and used to fill in the blank fare
meanFare = xtestBlank['Fare'].mean()
print(meanFare)
xtestBlank['Fare'] = xtestBlank['Fare'].fillna(meanFare)

12.459677880184334


In [40]:
xtestBlank

Unnamed: 0_level_0,SibSp,Parch,Fare,Female,Pclass1,Pclass2,Pclass3,Cherbourg,Queenstown,Southampton,Mean_age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,0,0,7.8292,0,0,0,1,0,1,0,34.5
893,1,0,7.0000,1,0,0,1,0,0,1,47.0
895,0,0,8.6625,0,0,0,1,0,0,1,27.0
896,1,1,12.2875,1,0,0,1,0,0,1,22.0
897,0,0,9.2250,0,0,0,1,0,0,1,14.0
...,...,...,...,...,...,...,...,...,...,...,...
1304,0,0,7.7750,1,0,0,1,0,0,1,28.0
1305,0,0,8.0500,0,0,0,1,0,0,1,30.0
1307,0,0,7.2500,0,0,0,1,0,0,1,38.5
1308,0,0,8.0500,0,0,0,1,0,0,1,30.0


In [41]:
print(xtestBlank['Fare'].isna().sum())

0


In [42]:
# solution works so commiting change to test
xtest['Fare'] = xtest['Fare'].fillna(meanFare)

In [43]:
# split data into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=2)

In [44]:
# create model
model = RandomForestClassifier(n_estimators = 100, random_state=2)  

In [45]:
# fit data to model
model.fit(x_train, y_train)

In [48]:
# Cross-Validating data
cv = StratifiedKFold(n_splits=5) # ensures data is distributed evenly
val_score = cross_val_score(model, x_val, y_val, cv=cv, scoring='accuracy') # estimates how well model will perform on the test data
print(f'Validation Accuracy: {val_score.mean()}')

Validation Accuracy: 0.7817460317460319


In [49]:
# officially predicting ytest
ypred = model.predict(xtest)

In [52]:
ypred

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [54]:
# writing new df for submission
submission = pd.DataFrame({'Survived': ypred}, index=xtest.index)

In [55]:
submission

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,1
896,1
...,...
1305,0
1306,1
1307,0
1308,0


In [56]:
# writing to csv to submit
submission.to_csv('submission.csv', index=True)