In [305]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report

import time

In [306]:
np.random.seed(42)
pd.set_option('display.max_rows', None)

In [307]:
df = pd.read_csv("Data/train.csv")
df_test = pd.read_csv("Data/test.csv")

In [308]:
df['Embarked'].fillna('S', inplace=True)
df_test['Fare'].fillna(26.55, inplace=True)

for i,r in df.iterrows():
    if pd.isnull(r['Age']):
        df.loc[df.index[i],'Age'] = df[df['Pclass'] == r['Pclass']]['Age'].median()

for i,r in df_test.iterrows():
    if pd.isnull(r['Age']):
        df_test.loc[df_test.index[i],'Age'] = df_test[df_test['Pclass'] == r['Pclass']]['Age'].median()

df['Cabin Letter']=df['Cabin'].apply(lambda x: x[0] if not pd.isna(x) else 'N')
df_test['Cabin Letter']=df_test['Cabin'].apply(lambda x: x[0] if not pd.isna(x) else 'N')

df.drop(columns='PassengerId', inplace=True)
df_test.drop(columns='PassengerId', inplace=True)

df.drop(columns='Cabin Letter', inplace=True)
df_test.drop(columns='Cabin Letter', inplace=True)

df.drop(columns='Cabin', inplace=True)
df_test.drop(columns='Cabin', inplace=True)

In [309]:
print(df.isnull().sum(),df_test.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64 Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64


In [310]:
df['Family']=df['Parch']+df['SibSp']+1
df_test['Family']=df_test['Parch']+df_test['SibSp']+1

df.drop(columns='Parch', inplace=True)
df_test.drop(columns='Parch', inplace=True)

df.drop(columns='SibSp', inplace=True)
df_test.drop(columns='SibSp', inplace=True)

In [311]:
df['Fare Per P'] = df['Fare']/df['Family']
df_test['Fare Per P'] = df_test['Fare']/df_test['Family']

df['Fare Value']= df['Fare Per P'].apply(lambda x: 'Cheap' if x < 20  else ('Mid' if x >=20 and x < 70 else ('Expensive' if x >= 70 and x <200 else 'Deluxe')) )
df_test['Fare Value']= df_test['Fare Per P'].apply(lambda x: 'Cheap' if x < 20  else ('Mid' if x >=20 and x < 70 else ('Expensive' if x >= 70 and x <200 else 'Deluxe')) )

df.drop(columns='Fare Per P', inplace=True)
df_test.drop(columns='Fare Per P', inplace=True)

df.drop(columns='Family', inplace=True)
df_test.drop(columns='Family', inplace=True)

df.drop(columns='Fare', inplace=True)
df_test.drop(columns='Fare', inplace=True)


In [312]:
df['Title'] = df['Name'].apply(lambda x: x.split('.')[0].split()[-1])
df_test['Title'] = df_test['Name'].apply(lambda x: x.split('.')[0].split()[-1])

Passangers =  ['Mrs', 'Mr', 'Miss', 'Ms', 'Mme', 'Mlle', 'Master']
Royals = ['Lady', 'Sir', 'Don', 'Jonkheer', 'Countess', 'Dona']
Crews = ['Major', 'Col', 'Rev', 'Capt', 'Dr']

df['Status']= df['Title'].apply(lambda x: 'Passanger' if x in Passangers  else ('Royale' if x in Royals else ('Crew' if x in Crews else 'Missing')))
df_test['Status']= df_test['Title'].apply(lambda x: 'Passanger' if x in Passangers  else ('Royale' if x in Royals else ('Crew' if x in Crews else 'Missing')))

df.drop(columns='Title', inplace=True)
df_test.drop(columns='Title', inplace=True)

df.drop(columns='Name', inplace=True)
df_test.drop(columns='Name', inplace=True)


In [256]:
df['Age Int']=df['Age'].apply(lambda x: 'Child' if x < 18 else ('Young' if x < 28 else ('Mature' if x < 55 else 'Old')))
df_test['Age Int']=df_test['Age'].apply(lambda x: 'Child' if x < 18 else ('Young' if x < 28 else ('Mature' if x < 55 else 'Old')))

df.drop(columns='Age', inplace=True)
df_test.drop(columns='Age', inplace=True)


In [313]:
dizi = np.arange(0, 101, 10)


df['age_category'] = pd.cut(df['Age'], bins=dizi)
df_test['age_category'] = pd.cut(df_test['Age'], bins=dizi)

df.drop(columns='Age', inplace=True)
df_test.drop(columns='Age', inplace=True)

In [314]:
df.drop(columns='Ticket', inplace=True)
df_test.drop(columns='Ticket', inplace=True)


In [258]:
df.drop(columns='Embarked', inplace=True)
df_test.drop(columns='Embarked', inplace=True)

df.drop(columns='Fare Value', inplace=True)
df_test.drop(columns='Fare Value', inplace=True)

df.drop(columns='Status', inplace=True)
df_test.drop(columns='Status', inplace=True)

df.drop(columns='Age Int', inplace=True)
df_test.drop(columns='Age Int', inplace=True)

In [315]:
df

Unnamed: 0,Survived,Pclass,Sex,Embarked,Fare Value,Status,age_category
0,0,3,male,S,Cheap,Passanger,"(20, 30]"
1,1,1,female,C,Mid,Passanger,"(30, 40]"
2,1,3,female,S,Cheap,Passanger,"(20, 30]"
3,1,1,female,S,Mid,Passanger,"(30, 40]"
4,0,3,male,S,Cheap,Passanger,"(30, 40]"
5,0,3,male,Q,Cheap,Passanger,"(20, 30]"
6,0,1,male,S,Mid,Passanger,"(50, 60]"
7,0,3,male,S,Cheap,Passanger,"(0, 10]"
8,1,3,female,S,Cheap,Passanger,"(20, 30]"
9,1,2,female,C,Cheap,Passanger,"(10, 20]"


In [316]:
one_hot_encoded_df = pd.get_dummies(df, columns = ['Pclass', 'Sex','Embarked','Fare Value', 'Status', 'age_category'])
one_hot_encoded_df_test = pd.get_dummies(df_test, columns = ['Pclass', 'Sex','Embarked','Fare Value', 'Status', 'age_category'])


In [262]:
one_hot_encoded_df = pd.get_dummies(df, columns = ['Pclass', 'Sex'])
one_hot_encoded_df_test = pd.get_dummies(df_test, columns = ['Pclass', 'Sex'])


In [317]:
df= one_hot_encoded_df
df_test = one_hot_encoded_df_test

df.drop(columns='Cabin Letter_N', inplace=True)
df_test.drop(columns='Cabin Letter_N', inplace=True)

df.drop(columns='Cabin Letter_T', inplace=True)
df_test.drop(columns='Cabin Letter_T', inplace=True)

KeyError: "['Cabin Letter_N'] not found in axis"

In [318]:
X= df.drop('Survived', axis=1)
y= df.Survived.values

In [319]:
from sklearn.model_selection import KFold

kfold_splitter = KFold(n_splits=5, random_state = 42, shuffle=True)

In [320]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state=42)

In [321]:
modelRF= RandomForestClassifier(n_estimators=550, random_state=42,min_samples_split=6,min_samples_leaf=4,max_depth=None)
modelRF.fit(X_train, y_train)

v_acc = cross_val_score(modelRF,
                         X,
                         y,
                         cv=kfold_splitter, # 5-fold cross-validation
                         scoring="accuracy")

print(v_acc,np.mean(v_acc),modelRF.score(X_valid, y_valid))

[0.79888268 0.79775281 0.85393258 0.79213483 0.84269663] 0.817079907099366 0.7988826815642458


In [322]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)
clf.score(X_valid, y_valid)

0.7988826815642458

In [336]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=26)
clf.fit(X_train, y_train)
clf.score(X_valid, y_valid)

0.7932960893854749

In [337]:
#finalModel=RandomForestClassifier(n_estimators=350, random_state=42,min_samples_split=3,min_samples_leaf=4,max_depth=None)
#finalModel=svm.SVC()
finalModel= KNeighborsClassifier(n_neighbors=26)
finalModel.fit(X,y)
pred = finalModel.predict(df_test)

In [338]:
write= pd.read_csv("Data/test.csv")
result=pd.DataFrame({"PassengerId":write['PassengerId'],"Survived":pred})

In [339]:
result.to_csv("result.csv", index=False)

In [290]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,881,882,883,884,885,886,887,888,889,890
Survived,0,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,1,0
Pclass_1,False,True,False,True,False,False,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
Pclass_2,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,True,False,False,False,False
Pclass_3,True,False,True,False,True,True,False,True,True,False,...,True,True,False,True,True,False,False,True,False,True
Sex_female,False,True,True,True,False,False,False,False,True,True,...,False,True,False,False,True,False,True,True,False,False
Sex_male,True,False,False,False,True,True,True,True,False,False,...,True,False,True,True,False,True,False,False,True,True
Embarked_C,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
Embarked_Q,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
Embarked_S,True,False,True,True,True,False,True,True,True,False,...,True,True,True,True,False,True,True,True,False,False
Fare Value_Cheap,True,False,True,False,True,True,False,True,True,True,...,True,True,True,True,True,True,False,True,False,True
