In [1]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [4]:
# libraries

import sys
print("Python version: {}".format(sys.version))
import time

import pandas as pd
print("Pandas version: {}".format(pd.__version__))
import numpy as np
print("Numpy version: {}".format(np.__version__))
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import rcParams
%matplotlib inline
print("Matplotlib version: {}".format(matplotlib.__version__))
import seaborn as sns
print("Seaborn version: {}".format(sns.__version__))
import scipy
print("Scipy version: {}".format(scipy.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))

# Modelling libraries
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.cross_validation import train_test_split , StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
#from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# print('-'*25)
# # check inside input directory for the files
# !ls -lrth input

Python version: 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 10:22:32) [MSC v.1900 64 bit (AMD64)]
Pandas version: 0.22.0
Numpy version: 1.14.0
Matplotlib version: 2.1.2
Seaborn version: 0.8.1
Scipy version: 1.0.0
scikit-learn version: 0.19.1


In [5]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print("Train dataframe shape is: ", train_df.shape)
print("Test dataframe shape is: ", test_df.shape)

full_df = train_df.append(test_df, ignore_index=True)
print("Full dataframe shape is: ", full_df.shape)

Train dataframe shape is:  (891, 12)
Test dataframe shape is:  (418, 11)
Full dataframe shape is:  (1309, 12)


In [6]:
# imputing missing values
full_df['Age'] = full_df.Age.fillna(value = full_df.Age.median())
full_df['Fare'] = full_df.Fare.fillna(value = full_df.Fare.median())

# feature engineering
full_df['Sex'] = full_df['Sex'].map({'female': 0, 'male': 1})

# mark passengers as Minor if their name has 'Master.' in it Or if their age is < 16
full_df['Minor'] = full_df.apply(lambda x: 1 if (x['Name'].split(',')[-1].split()[0] == 'Master.') | (x['Age'] < 16) \
                             else 0, axis = 1)
# family size
full_df['FamilySize'] = full_df['Parch'] + full_df['SibSp'] + 1

# extracting surname
full_df['Surname'] = full_df.apply(lambda x: x['Name'].split(',')[0], axis = 1)

# extracting ticket class, and purposely ommiting the last digit in the ticket numbers and replacing it with 'X'
# as family members would be sitting right next to each other varying by a digit in the ticket no.
full_df['TicketClass'] = full_df.apply(lambda x: x['Ticket'][:-1]+'X', axis = 1)

# women-child-grouping
full_df['WCG_Id'] = full_df.apply(lambda x: x['Surname'] + '-' + str(x['Pclass']) + '-' + str(x['TicketClass']) + '-' + str(x['Fare']) + '-' + str(x['Embarked']), axis = 1)

full_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Minor,FamilySize,Surname,TicketClass,WCG_Id
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,0,2,Braund,A/5 2117X,Braund-3-A/5 2117X-7.25-S
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,0,2,Cumings,PC 1759X,Cumings-1-PC 1759X-71.2833-C
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,0,1,Heikkinen,STON/O2. 310128X,Heikkinen-3-STON/O2. 310128X-7.925-S
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,0,2,Futrelle,11380X,Futrelle-1-11380X-53.1-S
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,0,1,Allen,37345X,Allen-3-37345X-8.05-S


In [30]:
# familyOneSurvived and familyAllDied

# frame = full_df[:891].groupby(['WCG_Id','Name'])['Survived'].mean().to_frame()
# frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
WCG_Id,Name,Unnamed: 2_level_1
Abbing-3-C.A. 554X-7.55-S,"Abbing, Mr. Anthony",0.0
Abbott-3-C.A. 267X-20.25-S,"Abbott, Mr. Rossmore Edward",0.0
Abbott-3-C.A. 267X-20.25-S,"Abbott, Mrs. Stanton (Rosa Hunt)",1.0
Abelson-2-P/PP 338X-24.0-C,"Abelson, Mr. Samuel",0.0
Abelson-2-P/PP 338X-24.0-C,"Abelson, Mrs. Samuel (Hannah Wizosky)",1.0
Adahl-3-C 707X-7.25-S,"Adahl, Mr. Mauritz Nils Martin",0.0
Adams-3-34182X-8.05-S,"Adams, Mr. John",0.0
Ahlin-3-754X-9.475-S,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",0.0
Aks-3-39209X-9.35-S,"Aks, Mrs. Sam (Leah Rosen)",1.0
Albimona-3-269X-18.7875-C,"Albimona, Mr. Nassef Cassem",1.0


In [47]:
# frame.loc['Andersson-3-34708X-31.275-S'] #lets see for Andersson surnamed families

Unnamed: 0_level_0,Survived
Name,Unnamed: 1_level_1
"Andersson, Master. Sigvard Harald Elias",0.0
"Andersson, Miss. Ebba Iris Alfrida",0.0
"Andersson, Miss. Ellis Anna Maria",0.0
"Andersson, Miss. Ingeborg Constanzia",0.0
"Andersson, Miss. Sigrid Elisabeth",0.0
"Andersson, Mr. Anders Johan",0.0
"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",0.0


In [48]:
# frame.loc['Andersson-3-34708X-31.275-S'].sum() # total ppl survived in each of the WCG group

Survived    0.0
dtype: float64

In [49]:
# frame.loc['Andersson-3-34708X-31.275-S'].count() # total ppl in each of the WCG groups

Survived    7
dtype: int64

clearly there are three families with 'Andersson' as their surname of which 2 people are travelling solo and then there's a group of 7 Anderssons travelling together. 

Of these the ones travelling alone survived while the Anderssons travelling together had no survivors at all.

Lets see more such WCG groups where there's atleast one survivors and WCG groups where everyone dies...

In [50]:
# WCGOneLived and WCGAllDied

# len(set(full_df.WCG_Id.values)) # 1003 total groups
tot_grp = set(full_df[:891].WCG_Id.values)
frame = full_df[:891].groupby(['WCG_Id','Name'])['Survived'].mean().to_frame()
full_df['WCG_AllDied'] = 0
full_df['WCG_OneLived'] = 0

for group in tot_grp:
    s = int(frame.loc[group].sum()) #total ppl survived in that WCG group
    c = int(frame.loc[group].count()) #total ppl in that WCG group
    if s < 1:
        full_df.loc[full_df.WCG_Id == group, ['WCG_AllDied']] = 1
    
    if c >= 2 and s > 0:
        full_df.loc[full_df.WCG_Id == group, ['WCG_OneLived']] = 1

full_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Minor,FamilySize,Surname,TicketClass,WCG_Id,WCG_AllDied,WCG_OneLived
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,0,2,Braund,A/5 2117X,Braund-3-A/5 2117X-7.25-S,1,0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,0,2,Cumings,PC 1759X,Cumings-1-PC 1759X-71.2833-C,0,0
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,0,1,Heikkinen,STON/O2. 310128X,Heikkinen-3-STON/O2. 310128X-7.925-S,0,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,0,2,Futrelle,11380X,Futrelle-1-11380X-53.1-S,0,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,0,1,Allen,37345X,Allen-3-37345X-8.05-S,1,0


In [69]:
le = LabelEncoder()
full_df['WCG'] = le.fit_transform(full_df['WCG_Id'])
full_df['Surname_emb'] = le.fit_transform(full_df['Surname'])

# full_df.Age.max() # 80
full_df['Age'] = full_df['Age'] / 80
# full_df['FamilySize'].max() # 11
full_df['FamilySize'] = full_df['FamilySize'] / 11
# full_df.Fare.max() # 512.3292
full_df['FareAdj'] = full_df['Fare']/512.3292

In [90]:
# X_train = full_df[['Sex','Age','Minor','WCG_OneLived','WCG_AllDied']][:891]
# X_test = full_df[['Sex','Age','Minor','WCG_OneLived','WCG_AllDied']][891:]
X_train = full_df[['Sex','Surname_emb','WCG_OneLived','WCG_AllDied']][:891]
X_test = full_df[['Sex','Surname_emb','WCG_OneLived','WCG_AllDied']][891:]

y_train = full_df[['Survived']][:891]
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y, test_size=0.3, random_state=29)

print("\nfull_df: ", full_df.shape,"\nX_train: ", X_train.shape, "\ny_train: ", y_train.shape,
      "\nX_valid: ", X_valid.shape, "\ny_valid: ", y_valid.shape, "\nX_test: ", X_test.shape)


full_df:  (1309, 22) 
X_train:  (891, 4) 
y_train:  (891, 1) 
X_valid:  (268, 19) 
y_valid:  (268,) 
X_test:  (418, 4)


In [91]:
logr = LogisticRegression()
logr.fit(X_train, y_train)
y_pred = logr.predict(X_test)
logr_acc = logr.score(X_train, y_train) * 100
logr_acc

95.28619528619528

In [83]:
PassengerId = full_df[891:].PassengerId

submission = pd.DataFrame({'PassengerId': PassengerId, 'Survived': y_pred}, index=None)
print(submission.shape)
submission.to_csv('submission_WCGOneLived_WCGAllDied_LR.csv', index=False)

(418, 2)


In [65]:
submission.head()

Unnamed: 0,PassengerId,Survived
891,892,1.0
892,893,1.0
893,894,1.0
894,895,1.0
895,896,1.0
