In [1]:
import warnings
# Silence only DeprecationWarning noise. A blanket filterwarnings('ignore')
# would also hide warnings that point at real problems (removed APIs,
# mis-shaped inputs, convergence failures), so it is deliberately not used.
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [3]:
# libraries

import sys
print("Python version: {}".format(sys.version))
import time

import pandas as pd
print("Pandas version: {}".format(pd.__version__))
import numpy as np
print("Numpy version: {}".format(np.__version__))
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import rcParams
%matplotlib inline
print("Matplotlib version: {}".format(matplotlib.__version__))
import seaborn as sns
print("Seaborn version: {}".format(sns.__version__))
import scipy
print("Scipy version: {}".format(scipy.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))

# Modelling libraries
# All splitting / search utilities come from sklearn.model_selection.
# (train_test_split and StratifiedKFold used to be imported from
# sklearn.cross_validation, which was deprecated in 0.18 and removed in 0.20.)
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
#from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# print('-'*25)
# # check inside input directory for the files
# !ls -lrth input

Python version: 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]
Pandas version: 0.22.0
Numpy version: 1.14.0
Matplotlib version: 2.1.2
Seaborn version: 0.8.1
Scipy version: 1.0.0
scikit-learn version: 0.19.1


In [4]:
# Load the train and test CSVs from the current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train dataframe shape is: ", train_df.shape)
print("Test dataframe shape is: ", test_df.shape)

# Stack train on top of test so that imputation and feature engineering are
# applied to both at once. Rows 0..len(train_df)-1 are the training rows and
# the test rows follow; a column present in only one frame is NaN-filled for
# the other. pd.concat is used because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
full_df = pd.concat([train_df, test_df], ignore_index=True)
print("Full dataframe shape is: ", full_df.shape)

Train dataframe shape is:  (891, 12)
Test dataframe shape is:  (418, 11)
Full dataframe shape is:  (1309, 12)


In [5]:
# --- Impute missing values ---------------------------------------------------
# Medians are computed over the combined (train + test) frame.
full_df['Age'] = full_df['Age'].fillna(full_df['Age'].median())
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())

# --- Feature engineering -----------------------------------------------------
# Encode sex as 0 (female) / 1 (male).
# NOTE: not idempotent - Series.map turns values missing from the dict into
# NaN, so re-running this cell on already-encoded data wipes the column.
# Reload the data before re-running.
full_df['Sex'] = full_df['Sex'].map({'female': 0, 'male': 1})

# Names are formatted as "Surname, Title. Given names"; split on the comma once
# and reuse the pieces (vectorised .str operations instead of row-wise apply).
name_parts = full_df['Name'].str.split(',')
title = name_parts.str[-1].str.split().str[0]

# Mark passengers as Minor if their title is 'Master.' or their age is < 16.
full_df['Minor'] = ((title == 'Master.') | (full_df['Age'] < 16)).astype('int64')

# Family size: parents/children + siblings/spouses + the passenger themself.
full_df['FamilySize'] = full_df['Parch'] + full_df['SibSp'] + 1

# Surname is everything before the first comma in the name.
full_df['Surname'] = name_parts.str[0]

# Ticket class: the ticket string with its last character replaced by 'X'.
# The last digit is purposely omitted so that tickets differing only in that
# digit (assumed to belong to family members booked together) compare equal.
full_df['TicketClass'] = full_df['Ticket'].str[:-1] + 'X'

# Grouping key for the family-survival features: currently the surname alone.
full_df['WCG_Id'] = full_df['Surname']

full_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Minor,FamilySize,Surname,TicketClass,WCG_Id
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,0,2,Braund,A/5 2117X,Braund
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,0,2,Cumings,PC 1759X,Cumings
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,0,1,Heikkinen,STON/O2. 310128X,Heikkinen
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,0,2,Futrelle,11380X,Futrelle
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,0,1,Allen,37345X,Allen


In [33]:
# Basis for the familyOneSurvived / familyAllDied features:
# mean 'Survived' per (Surname, Name) over the first 891 rows, so that every
# passenger sharing a surname can be looked up together via the first level.
labelled_rows = full_df.iloc[:891]
survival_by_name = labelled_rows.groupby(['Surname', 'Name'])['Survived'].mean()
frame = survival_by_name.to_frame()
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Surname,Name,Unnamed: 2_level_1
Abbing,"Abbing, Mr. Anthony",0.0
Abbott,"Abbott, Mr. Rossmore Edward",0.0
Abbott,"Abbott, Mrs. Stanton (Rosa Hunt)",1.0
Abelson,"Abelson, Mr. Samuel",0.0
Abelson,"Abelson, Mrs. Samuel (Hannah Wizosky)",1.0
Adahl,"Adahl, Mr. Mauritz Nils Martin",0.0
Adams,"Adams, Mr. John",0.0
Ahlin,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",0.0
Aks,"Aks, Mrs. Sam (Leah Rosen)",1.0
Albimona,"Albimona, Mr. Nassef Cassem",1.0


In [34]:
# Family-survival flags, keyed on surname.
#
#   WCG_AllDied  = 1 when the passenger has at least one *other* labelled
#                  passenger with the same surname and none of them survived.
#   WCG_OneLived = 1 when at least one such other passenger survived.
#
# For training rows the passenger's own outcome is excluded from the group
# totals; for the remaining (test) rows the whole training group is used, and
# surnames never seen in training leave both flags at 0.
n_train = len(train_df)  # training rows come first in full_df

frame = full_df[:n_train].groupby(['Surname', 'Name'])['Survived'].mean().to_frame()

# Collapse to one row per surname: number of survivors and number of members.
per_surname = frame.groupby(level='Surname')['Survived'].agg(['sum', 'count'])
# Keep the whole-number truncation the totals have always had.
per_surname['sum'] = per_surname['sum'].astype(int)

# Broadcast the group totals back onto every passenger (0 for unseen surnames).
survivors_in_group = full_df['Surname'].map(per_surname['sum']).fillna(0)
members_in_group = full_df['Surname'].map(per_surname['count']).fillna(0)

# Remove each training passenger's own contribution from their group.
is_train = np.arange(len(full_df)) < n_train
own_outcome = np.where(is_train, full_df['Survived'], 0)
other_survivors = survivors_in_group - own_outcome
other_members = members_in_group - is_train.astype(int)

has_relatives = other_members > 0
full_df['WCG_AllDied'] = (has_relatives & (other_survivors < 1)).astype('int64')
full_df['WCG_OneLived'] = (has_relatives & (other_survivors > 0)).astype('int64')

full_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Minor,FamilySize,Surname,TicketClass,WCG_Id,WCG_AllDied,WCG_OneLived,FareAdj
0,0.275,,S,7.25,"Braund, Mr. Owen Harris",0,1,1.0,1,1,0.0,A/5 21171,0,0.181818,Braund,A/5 2117X,Braund,1,0,0.014151
1,0.475,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,0.333333,0,1,1.0,PC 17599,0,0.181818,Cumings,PC 1759X,Cumings,0,0,0.139136
2,0.325,,S,7.925,"Heikkinen, Miss. Laina",0,3,1.0,0,0,1.0,STON/O2. 3101282,0,0.090909,Heikkinen,STON/O2. 310128X,Heikkinen,0,0,0.015469
3,0.4375,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,0.333333,0,1,1.0,113803,0,0.181818,Futrelle,11380X,Futrelle,1,0,0.103644
4,0.4375,,S,8.05,"Allen, Mr. William Henry",0,5,1.0,1,0,0.0,373450,0,0.090909,Allen,37345X,Allen,0,1,0.015713


In [35]:
# Scale features by their own maximum so the largest value becomes 1.
# The maxima are computed instead of hard-coded (they were 80 for Age, 11 for
# FamilySize, 3 for Pclass and 512.3292 for Fare on this data). This also makes
# the cell safe to re-run: once a column is scaled its max is 1, so a second
# pass divides by 1 and leaves the values unchanged.
for col in ('Age', 'FamilySize', 'Pclass'):
    full_df[col] = full_df[col] / full_df[col].max()

# Fare is kept as-is; the scaled copy goes into a separate column.
full_df['FareAdj'] = full_df['Fare'] / full_df['Fare'].max()

In [37]:
# Model inputs: one shared column list so train and test cannot drift apart.
feature_cols = ['Sex', 'Minor', 'FamilySize', 'Pclass', 'WCG_OneLived', 'WCG_AllDied']
features = full_df[feature_cols]

# The first 891 rows are the labelled training rows; the rest are the test rows.
X_train = features[:891]
X_test = features[891:]
# Target kept as a single-column DataFrame (shape (891, 1)).
y_train = full_df[['Survived']][:891]

print("\nfull_df: ", full_df.shape, "\nX_train: ", X_train.shape, "\ny_train: ", y_train.shape, "\nX_test: ", X_test.shape)


full_df:  (1309, 20) 
X_train:  (891, 6) 
y_train:  (891, 1) 
X_test:  (418, 6)


In [38]:
# Repeated hold-out validation, used to choose the classifier and tune it.
# Each trial holds out a random 10% of the training rows, fits on the other 90%
# and scores on the held-out part; the mean accuracy over all trials is reported.
#
# Accuracies recorded by the author from earlier (unseeded) runs:
#   LogisticRegression          82.1333%
#   DecisionTreeClassifier      82.5888%
#   RandomForestClassifier      82.9777%
#   GradientBoostingClassifier  84.1888%
#   KNeighborsClassifier        81.9666%
trials = 100
total_acc = 0.0  # running total; a plain name so the builtin sum() stays usable

# Estimators expect a 1-D target; y_train is an (n, 1) frame.
y_labels = y_train.values.ravel()

for i in range(trials):
    # Seed both the split and the model with the trial number so the reported
    # average is reproducible from a fresh kernel.
    X_train2, X_valid2, y_train2, y_valid2 = train_test_split(
        X_train, y_labels, test_size=0.1, random_state=i)
    logr = GradientBoostingClassifier(random_state=i)
    logr.fit(X_train2, y_train2)
    total_acc += logr.score(X_valid2, y_valid2) * 100
print("Average validation accuracy of", trials, "trials = ", total_acc / trials)

Average validation accuracy of 100 trials =  84.18888888888887


In [39]:
# Fit the chosen classifier on every training row and predict the test rows.
# random_state is fixed so the submission is reproducible, and the target is
# flattened to 1-D because y_train is a single-column frame.
logr = GradientBoostingClassifier(random_state=0)
logr.fit(X_train, y_train.values.ravel())
y_pred = logr.predict(X_test)

In [40]:
# Assemble the submission: passenger ids of the rows after the first 891,
# paired with the integer-cast predictions, then write it out without the index.
test_rows = full_df.iloc[891:]
PassengerId = test_rows['PassengerId']
submission_columns = {
    'PassengerId': PassengerId,
    'Survived': y_pred.astype(int),
}
submission = pd.DataFrame(submission_columns)
print(submission.shape)
# Author's note: this submission scores 81.8%
submission.to_csv('BoostedTrees.csv', index=False)

(418, 2)


In [41]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,1
893,894,0
894,895,0
895,896,1
