In [140]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report

import time

In [141]:
# Seed NumPy's global generator so stochastic steps repeat across runs, and
# lift pandas' row-display cap so frames are shown without row truncation.
np.random.seed(42)
pd.options.display.max_rows = None

In [142]:
# Read the train and test CSV files; the paths are relative to the
# notebook's working directory.
df = pd.read_csv("Data/train.csv")
df_test = pd.read_csv("Data/test.csv")

In [143]:
# Fill the remaining gaps with fixed fallback values. The result is assigned
# back to the column instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form acts on a temporary object under
# pandas Copy-on-Write and would leave the frame unchanged.
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Fare'] = df_test['Fare'].fillna(26.55)

# Impute missing ages with the median age of the passenger's Pclass, computed
# separately inside each frame. groupby/transform yields one median per row,
# so this works for any index (the earlier row loop used index labels as
# positions, which is only valid for a default RangeIndex).
for frame in (df, df_test):
    class_median_age = frame.groupby('Pclass')['Age'].transform('median')
    frame['Age'] = frame['Age'].fillna(class_median_age)

# Drop the columns that are not used as features.
df.drop(columns=['PassengerId', 'Cabin'], inplace=True)
df_test.drop(columns=['PassengerId', 'Cabin'], inplace=True)

In [144]:
# Per-column count of missing values in each frame, printed side by side to
# confirm the cleaning step left no gaps.
train_null_counts = df.isna().sum()
test_null_counts = df_test.isna().sum()
print(train_null_counts, test_null_counts)

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64 Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64


In [145]:
# Family = parents/children + siblings/spouses + the passenger themself.
for frame in (df, df_test):
    frame['Family'] = frame['Parch'] + frame['SibSp'] + 1

In [146]:
def _title_of(name):
    """Return the last word before the first '.' in a passenger name."""
    return name.split('.')[0].split()[-1]


Passangers = ['Mrs', 'Mr', 'Miss', 'Ms', 'Mme', 'Mlle', 'Master']
Royals = ['Lady', 'Sir', 'Don', 'Jonkheer', 'Countess', 'Dona']
Crews = ['Major', 'Col', 'Rev', 'Capt', 'Dr']


def _status_of(title):
    """Map a title onto its status group; unlisted titles become 'Missing'."""
    if title in Passangers:
        return 'Passanger'
    if title in Royals:
        return 'Royale'
    if title in Crews:
        return 'Crew'
    return 'Missing'


# Derive Status from the name's title, then discard the columns that are no
# longer needed (the intermediate title is never stored on the frame).
for frame in (df, df_test):
    frame['Status'] = frame['Name'].apply(_title_of).apply(_status_of)
    frame.drop(columns=['Name', 'Fare'], inplace=True)

In [154]:
# Preview of the training frame. NOTE(review): the saved output already shows
# the 'Child' and 'Family Size' columns, which are created by the two cells
# that follow, so this cell was executed after them (out of order).
df

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Ticket,Embarked,Status,Child,Family Size
0,0,3,male,1,0,A/5 21171,S,Passanger,0,"(1, 4]"
1,1,1,female,1,0,PC 17599,C,Passanger,0,"(1, 4]"
2,1,3,female,0,0,STON/O2. 3101282,S,Passanger,0,"(0, 1]"
3,1,1,female,1,0,113803,S,Passanger,0,"(1, 4]"
4,0,3,male,0,0,373450,S,Passanger,0,"(0, 1]"
5,0,3,male,0,0,330877,Q,Passanger,0,"(0, 1]"
6,0,1,male,0,0,17463,S,Passanger,0,"(0, 1]"
7,0,3,male,3,1,349909,S,Passanger,1,"(4, 8]"
8,1,3,female,0,2,347742,S,Passanger,0,"(1, 4]"
9,1,2,female,1,0,237736,C,Passanger,1,"(1, 4]"


In [148]:
# Flag passengers younger than 18 as children (1) and everyone else as 0,
# then drop the raw age column.
for frame in (df, df_test):
    frame['Child'] = (frame['Age'] < 18).astype('int64')
    frame.drop(columns='Age', inplace=True)

In [153]:
# Bucket the family head-count into the intervals (0, 1], (1, 4], (4, 8] and
# (8, 15], then drop the raw count.
family_bin_edges = [0, 1, 4, 8, 15]
for frame in (df, df_test):
    frame['Family Size'] = pd.cut(frame['Family'], bins=family_bin_edges)
    frame.drop(columns='Family', inplace=True)

In [118]:
# Column dtypes and non-null counts for the training frame.
# NOTE(review): the saved output still lists 'Family' and has no
# 'Family Size', so it predates the binning cell above — re-run to refresh.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Survived  891 non-null    int64 
 1   Pclass    891 non-null    int64 
 2   Sex       891 non-null    object
 3   SibSp     891 non-null    int64 
 4   Parch     891 non-null    int64 
 5   Ticket    891 non-null    object
 6   Embarked  891 non-null    object
 7   Family    891 non-null    int64 
 8   Status    891 non-null    object
 9   Child     891 non-null    int64 
dtypes: int64(6), object(4)
memory usage: 69.7+ KB


In [155]:
# Store the discrete features as pandas categoricals in both frames.
categorical_columns = ('Pclass', 'Sex', 'Embarked', 'Status')
for frame in (df, df_test):
    for column in categorical_columns:
        frame[column] = frame[column].astype('category')

In [156]:
# The ticket identifier is not used as a feature; remove it from both frames.
for frame in (df, df_test):
    frame.drop(columns='Ticket', inplace=True)

In [160]:
# Transposed view, one row per column of the frame.
# NOTE(review): the saved output shows one-hot columns (Pclass_1, Sex_female,
# ...) that are produced by the get_dummies cells below, so this cell was
# executed after them (out of order).
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,881,882,883,884,885,886,887,888,889,890
Survived,0,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,1,0
SibSp,1,1,0,1,0,0,0,3,0,1,...,0,0,0,0,0,0,0,1,0,0
Parch,0,0,0,0,0,0,0,1,2,0,...,0,0,0,0,5,0,0,2,0,0
Pclass_1,False,True,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
Pclass_2,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,True,False,False,False,False
Pclass_3,True,False,True,False,True,True,False,True,True,False,...,True,True,False,True,True,False,False,True,False,True
Sex_female,False,True,True,True,False,False,False,False,True,True,...,False,True,False,False,True,False,True,True,False,False
Sex_male,True,False,False,False,True,True,True,True,False,False,...,True,False,True,True,False,True,False,False,True,True
"Family Size_(0, 1]",False,False,True,False,True,True,True,False,False,False,...,True,True,True,True,False,True,True,False,True,True
"Family Size_(1, 4]",True,True,False,True,False,False,False,False,True,True,...,False,False,False,False,False,False,False,True,False,False


In [158]:
# One-hot encode the same set of discrete columns in the train and test frames.
dummy_columns = ['Pclass', 'Sex', 'Family Size', 'Embarked', 'Status', 'Child']
one_hot_encoded_df = pd.get_dummies(df, columns=dummy_columns)
one_hot_encoded_df_test = pd.get_dummies(df_test, columns=dummy_columns)

In [159]:
# From here on, df / df_test refer to the one-hot encoded frames.
df, df_test = one_hot_encoded_df, one_hot_encoded_df_test

In [262]:
def _dummies_for_present(frame, columns):
    """One-hot encode only those of `columns` that `frame` still contains.

    Returns `frame` unchanged when none of the columns are present, instead
    of letting pd.get_dummies raise KeyError for missing column names.
    """
    present = [column for column in columns if column in frame.columns]
    if not present:
        return frame
    return pd.get_dummies(frame, columns=present)


# When the notebook is run top to bottom, df / df_test are already one-hot
# encoded at this point and no longer contain 'Pclass' or 'Sex'; encoding
# them unconditionally would raise KeyError. Encode only what is still there.
one_hot_encoded_df = _dummies_for_present(df, ['Pclass', 'Sex'])
one_hot_encoded_df_test = _dummies_for_present(df_test, ['Pclass', 'Sex'])

In [161]:
# Feature matrix: every column except the target. Target: Survived as a
# NumPy array.
X = df.drop(columns='Survived')
y = df['Survived'].to_numpy()

In [162]:
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling; the fixed random_state keeps the fold
# assignment the same across runs. Passed as `cv` to cross_val_score below.
kfold_splitter = KFold(n_splits=5, random_state = 42, shuffle=True)

In [163]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state=42)

In [172]:
# Random forest fitted on the training split and scored on the hold-out
# split; cross_val_score is additionally run with the same estimator
# settings over the full X / y using the shared k-fold splitter.
rf_settings = dict(
    n_estimators=350,
    random_state=42,
    min_samples_split=6,
    min_samples_leaf=4,
    max_depth=None,
)
modelRF = RandomForestClassifier(**rf_settings)
modelRF.fit(X_train, y_train)

v_acc = cross_val_score(modelRF, X, y, cv=kfold_splitter, scoring="accuracy")
holdout_accuracy = modelRF.score(X_valid, y_valid)

# Per-fold accuracies, their mean, and the hold-out accuracy.
print(v_acc, np.mean(v_acc), holdout_accuracy)

[0.81564246 0.80337079 0.88202247 0.78089888 0.83707865] 0.8238026489234826 0.8100558659217877


In [165]:
from sklearn import svm

# Support-vector classifier with default settings: fit on the training
# split, then report accuracy on the hold-out split.
clf = svm.SVC().fit(X_train, y_train)
clf.score(X_valid, y_valid)

0.8044692737430168

In [171]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (k = 21): fit on the training split, then
# report accuracy on the hold-out split.
clf = KNeighborsClassifier(n_neighbors=21).fit(X_train, y_train)
clf.score(X_valid, y_valid)

0.8100558659217877

In [173]:
# Final model: a random forest refitted on all labelled rows. (svm.SVC and
# KNeighborsClassifier were tried as alternatives in the cells above.)
# NOTE(review): min_samples_leaf is 3 here, while the cross-validated modelRF
# uses 4 — confirm which setting is intended for the submission.
finalModel = RandomForestClassifier(
    n_estimators=350,
    random_state=42,
    min_samples_split=6,
    min_samples_leaf=3,
    max_depth=None,
)
finalModel.fit(X, y)

# Train and test were one-hot encoded separately, so a category that is
# absent from one of them yields a different column set or order. Align the
# test frame to the training feature columns: same order, extra columns
# dropped, dummy columns missing from the test set filled with 0.
test_features = df_test.reindex(columns=X.columns, fill_value=0)
pred = finalModel.predict(test_features)

In [174]:
# PassengerId was dropped from df_test during cleaning, so re-read the raw
# test file to pair each prediction with its passenger id.
write = pd.read_csv("Data/test.csv")
passenger_ids = write['PassengerId']
result = pd.DataFrame({"PassengerId": passenger_ids, "Survived": pred})

In [175]:
# Write the predictions to result.csv in the working directory, without the
# DataFrame index column.
result.to_csv("result.csv", index=False)

In [290]:
# Transposed view of the encoded training frame.
# NOTE(review): the saved output lists a 'Fare Value_Cheap' column that no
# cell visible here creates (Fare is dropped earlier), so the output looks
# stale — re-run to refresh.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,881,882,883,884,885,886,887,888,889,890
Survived,0,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,1,0
Pclass_1,False,True,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
Pclass_2,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,True,False,False,False,False
Pclass_3,True,False,True,False,True,True,False,True,True,False,...,True,True,False,True,True,False,False,True,False,True
Sex_female,False,True,True,True,False,False,False,False,True,True,...,False,True,False,False,True,False,True,True,False,False
Sex_male,True,False,False,False,True,True,True,True,False,False,...,True,False,True,True,False,True,False,False,True,True
Embarked_C,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
Embarked_Q,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
Embarked_S,True,False,True,True,True,False,True,True,True,False,...,True,True,True,True,False,True,True,True,False,False
Fare Value_Cheap,True,False,True,False,True,True,False,True,True,True,...,True,True,True,True,True,True,False,True,False,True
