In [46]:
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [47]:
x_path = './Data/train.csv'
X_train = pd.read_csv(x_path)

In [48]:
x_path = './Data/test.csv'
X_test = pd.read_csv(x_path)

In [49]:
Y_train = X_train['Survived']

Replace NA in Cabins column

In [50]:
X_train["Cabin"] = X_train["Cabin"].fillna('NA')
X_test["Cabin"] = X_test["Cabin"].fillna('NA')

Calculate number of letters in Cabin 

In [51]:
X_train['#Cabins'] = X_train['Cabin'].apply(lambda x: len(x))
X_test['#Cabins'] = X_test['Cabin'].apply(lambda x: len(x))

Get Titles

In [52]:
X_train['title'] = X_train.Name.apply(lambda x: x.split('.')[0].split(',')[1].strip())
X_train["title"] = X_train["title"].fillna("NA")
X_test['title'] = X_test.Name.apply(lambda x: x.split('.')[0].split(',')[1].strip())
X_test["title"] = X_test["title"].fillna("NA")

Fill Age with average by title

In [53]:
Age_Age = X_train.groupby("title")["Age"].mean().reset_index()

In [54]:
fill_dict = Age_Age.set_index('title')['Age'].to_dict()
X_train['Age'] = X_train['Age'].fillna(X_train['title'].map(fill_dict))

In [55]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,#Cabins,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,3,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,4,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2,Mr


In [56]:
X_train[X_train['PassengerId']==6]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,#Cabins,title
5,6,0,3,"Moran, Mr. James",male,32.36809,0,0,330877,8.4583,,Q,2,Mr


Convert titles to numbers

In [57]:
Titles = {'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3, 'Don':4, 'Rev':5, 'Dr':6, 'Mme':7, 'Ms':8,
       'Major':9, 'Lady':10, 'Sir':11, 'Mlle':12, 'Col':13, 'Capt':14, 'the Countess':15,
       'Jonkheer':16, 'Dona':17}
X_train['title'] = X_train['title'].apply(lambda x: Titles[x])
X_test['title'] = X_test['title'].apply(lambda x: Titles[x])

convert Age into bins

In [58]:
bins = [0, 18, 40, 50, 70,100]
names = ['0', '1', '2', '3','4']
X_train['AgeRange'] = pd.cut(X_train['Age'], bins, labels=names, include_lowest = True)
X_test['AgeRange'] = pd.cut(X_test['Age'], bins, labels=names, include_lowest = True)

In [59]:
X_train[X_train['PassengerId']==6]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,#Cabins,title,AgeRange
5,6,0,3,"Moran, Mr. James",male,32.36809,0,0,330877,8.4583,,Q,2,0,1


Calculate survival priority

In [60]:
def sscore (cols):
    Sex=cols[0]
    AgeRange=cols[1]
    Parch=cols[2]
    if AgeRange=='0' and Parch == "0":
        return 1000
    elif AgeRange=='1' and Sex=="male": # 18-39
        return 1
    elif AgeRange=='2' and Sex=='male': #40-50
        return 1
    elif AgeRange=='0' and Sex=='male': #0-17
        return 100
    elif AgeRange=='3' and Sex=='male': #50-69
        return 10
    elif AgeRange=='4' and Sex=='male': #70 - 100
        return 10
    elif AgeRange=='1' and Sex=="female": #18-35
        return 10
    elif AgeRange=='2' and Sex=='female': #40-50
        return 100
    elif AgeRange=='0' and Sex=='female': #0-17
        return 100
    elif AgeRange=='3' and Sex=='female': #50-69
        return 10
    elif AgeRange=='4' and Sex=='female': #70 - 100
        return 10
    else:
        return 100 

In [61]:
X_train["Sscore"]=X_train[['Sex','AgeRange','Parch']].apply(sscore, axis=1)
X_test["Sscore"]=X_test[['Sex','AgeRange','Parch']].apply(sscore, axis=1)

Is unaccompanied minor?

In [62]:
def unaccompanied_minors (cols):
    Sex=cols[0]
    AgeRange=cols[1]
    Parch=cols[2]
    if AgeRange=='0' and Parch == "0":
        return 1
    else:
        return 0 

In [63]:
X_train["unaccompanied_minors"]=X_train[['Sex','AgeRange','Parch']].apply(unaccompanied_minors, axis=1)
X_test["unaccompanied_minors"]=X_test[['Sex','AgeRange','Parch']].apply(unaccompanied_minors, axis=1)

Calculate ticket length

In [64]:
X_train['Ticket2']=X_train.Ticket.apply(lambda x : len(x))
X_test['Ticket2']=X_test.Ticket.apply(lambda x : len(x))

In [65]:
X_train["Sex"] = X_train["Sex"].fillna("NA")
X_train["Embarked"] = X_train["Embarked"].fillna("NA")
X_train["Sex"] = X_train["Sex"].fillna("NA")
X_train["Embarked"] = X_train["Embarked"].fillna("NA")
X_train[['Pclass', 'SibSp', 'Fare']] = X_train[['Pclass', 'SibSp', 'Fare']].fillna(0)

In [66]:
X_test["Sex"] = X_test["Sex"].fillna("NA")
X_test["Embarked"] = X_test["Embarked"].fillna("NA")
X_test["Sex"] = X_test["Sex"].fillna("NA")
X_test["Embarked"] = X_test["Embarked"].fillna("NA")
X_test[['Pclass', 'Age', 'SibSp', 'Fare']] = X_test[['Pclass', 'Age', 'SibSp', 'Fare']].fillna(0)

Number of relatives

In [67]:
X_test["#Relatives"] = X_test['Parch'] + X_test['SibSp']
X_train["#Relatives"] = X_train['Parch'] + X_train['SibSp']

Gender and Embarked

In [70]:
genders = {'male': 0, 'female': 1, 'NA': 2}
embarks = {'C': 0, 'Q': 1, 'S': 2, 'NA': 3}
X_train['Sex'] = X_train['Sex'].apply(lambda x: genders[x])
X_train['Embarked'] = X_train['Embarked'].apply(lambda x: embarks[x])
X_test['Sex'] = X_test['Sex'].apply(lambda x: genders[x])
X_test['Embarked'] = X_test['Embarked'].apply(lambda x: embarks[x])

KeyError: 0

In [71]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,#Cabins,title,AgeRange,Sscore,unaccompanied_minors,Ticket2,#Relatives
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,2,2,0,1,1,0,9,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,0,3,1,1,10,0,8,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,2,2,2,1,10,0,16,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,2,4,1,1,10,0,6,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,2,2,0,1,1,0,6,0


# Classifiers

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis