In [None]:
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
# libraries

import sys
print("Python version: {}".format(sys.version))
import time

import pandas as pd
print("Pandas version: {}".format(pd.__version__))
import numpy as np
print("Numpy version: {}".format(np.__version__))
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import rcParams
%matplotlib inline
print("Matplotlib version: {}".format(matplotlib.__version__))
import seaborn as sns
print("Seaborn version: {}".format(sns.__version__))
import scipy
print("Scipy version: {}".format(scipy.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))

# Modelling libraries
# NOTE: `train_test_split` and `StratifiedKFold` used to be imported from
# `sklearn.cross_validation`, a module that was deprecated in scikit-learn 0.18
# and removed in 0.20. `sklearn.model_selection` (already used by this file)
# exposes both names, so everything is imported from there.
from sklearn.model_selection import (
    GridSearchCV,
    ShuffleSplit,
    StratifiedKFold,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
#from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

# print('-'*25)
# # check inside input directory for the files
# !ls -lrth input

In [None]:
# Read the two input CSVs (paths are relative to the notebook's working directory).
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

print("Train dataframe shape is: ", train_df.shape)
print("Test dataframe shape is: ", test_df.shape)

# Stack train on top of test under a fresh 0..n-1 index so that features are
# engineered once for both. `DataFrame.append` was deprecated in pandas 1.4 and
# removed in pandas 2.0; `pd.concat` is the supported equivalent.
full_df = pd.concat([train_df, test_df], ignore_index=True)
print("Full dataframe shape is: ", full_df.shape)

In [None]:
# imputing missing values: fill gaps in Age / Fare with the median of the combined frame
full_df['Age'] = full_df['Age'].fillna(full_df['Age'].median())
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())

# feature engineering
# NOTE: this mapping is not re-run safe -- executing the cell a second time maps
# the already-encoded 0/1 values to NaN, because they are not keys of the dict.
full_df['Sex'] = full_df['Sex'].map({'female': 0, 'male': 1})

# mark passengers as Minor if their title is 'Master.' or their (imputed) age is < 16.
# The title is taken as the first whitespace-separated token after the last comma
# in Name; computed column-wise instead of with a row-by-row apply.
name_title = full_df['Name'].str.split(',').str[-1].str.split().str[0]
full_df['Minor'] = ((name_title == 'Master.') | (full_df['Age'] < 16)).astype(int)

# family size: parents/children + siblings/spouses + the passenger themselves
full_df['FamilySize'] = full_df['Parch'] + full_df['SibSp'] + 1

# extracting surname: everything before the first comma in Name
full_df['Surname'] = full_df['Name'].str.split(',').str[0]

# extracting ticket class, purposely omitting the last character of the ticket and
# replacing it with 'X', on the assumption that family members hold tickets whose
# numbers differ only in the final digit.
full_df['TicketClass'] = full_df['Ticket'].str[:-1] + 'X'

# women-child-grouping (disabled alternative that builds a finer-grained group id)
#full_df['WCG_Id'] = full_df.apply(lambda x: x['Surname'] + '-' + str(x['Pclass']) + '-' + str(x['TicketClass']) + '-' + str(x['Fare']) + '-' + str(x['Embarked']), axis = 1)

# surname-grouping: the group id currently in use is simply the surname
full_df['WCG_Id'] = full_df['Surname']

full_df.head()

In [None]:
# familyOneSurvived and familyAllDied
# Preview of the lookup table behind those flags: mean Survived for every
# (WCG_Id, Name) pair among the first 891 rows (the same slice y_train uses).
labelled_rows = full_df.iloc[:891]
survival_by_member = labelled_rows.groupby(['WCG_Id', 'Name'])['Survived'].mean()
frame = survival_by_member.to_frame()
frame

In [None]:
#frame.loc['Andersson-3-34708X-31.275-S'] #lets see for Andersson surnamed families

In [None]:
#frame.loc['Andersson-3-34708X-31.275-S'].sum()

In [None]:
#frame.loc['Andersson-3-34708X-31.275-S'].count()

In [None]:
# Flag, for every passenger, what happened to the *other* passengers of the
# training slice that share their WCG_Id:
#   WCG_AllDied  = 1 -> at least one such passenger exists and none survived
#   WCG_OneLived = 1 -> at least one such passenger exists and one or more survived
# Both stay 0 when the training slice holds nobody else from the group.
N_TRAIN_ROWS = 891  # index labels below this belong to the training slice

frame = full_df[:N_TRAIN_ROWS].groupby(['WCG_Id', 'Name'])['Survived'].mean().to_frame()

# Aggregate once per group instead of re-slicing `frame` for every row.
group_stats = frame.groupby(level='WCG_Id')['Survived'].agg(['sum', 'count'])

# Groups that never occur in the training slice map to NaN; treating them as
# 0 survivors / 0 members leaves both flags at 0. This replaces the previous
# bare `except:`, which hid every kind of error and could reuse a stale `s`.
group_survivors = full_df['WCG_Id'].map(group_stats['sum']).fillna(0).astype(int)
group_members = full_df['WCG_Id'].map(group_stats['count']).fillna(0).astype(int)

# Training rows must not count themselves: subtract their own outcome and
# their own head-count. Rows outside the training slice subtract nothing.
in_train = full_df.index < N_TRAIN_ROWS
other_survivors = group_survivors - full_df['Survived'].where(in_train, 0)
other_members = group_members - in_train.astype(int)

has_others = other_members > 0
full_df['WCG_AllDied'] = (has_others & (other_survivors < 1)).astype(int)
full_df['WCG_OneLived'] = (has_others & (other_survivors > 0)).astype(int)

full_df.head()

In [None]:

#le = LabelEncoder()
#full_df['WCG'] = le.fit_transform(full_df['WCG_Id'])
#full_df['Surname_emb'] = le.fit_transform(full_df['Surname'])

# Rescale the numeric features in place by fixed divisors. The divisors are the
# column maxima the author noted from `.max()` (Age 80, FamilySize 11, Pclass 3).
# NOTE: re-running this cell divides the already-scaled columns again.
for column, column_max in (('Age', 80), ('FamilySize', 11), ('Pclass', 3)):
    full_df[column] = full_df[column].div(column_max)

# Fare itself is left untouched; its scaled copy (noted max 512.3292) goes into
# a new FareAdj column.
full_df['FareAdj'] = full_df['Fare'].div(512.3292)

In [None]:
#X_train = full_df[['Sex','Surname_emb','WCG_OneLived','WCG_AllDied']][:891]
#X_test = full_df[['Sex','Surname_emb','WCG_OneLived','WCG_AllDied']][891:]

# Columns handed to the classifiers.
feature_columns = ['Sex', 'Minor', 'FamilySize', 'Pclass', 'WCG_OneLived', 'WCG_AllDied']
features = full_df[feature_columns]

# The first 891 rows form the training matrix, the remainder the test matrix.
X_train = features.iloc[:891]
X_test = features.iloc[891:]
# Target kept as a one-column DataFrame.
y_train = full_df[['Survived']].iloc[:891]

shape_report = (
    "\nfull_df: ", full_df.shape,
    "\nX_train: ", X_train.shape,
    "\ny_train: ", y_train.shape,
    # "\nX_valid: ", X_valid.shape, "\ny_valid: ", y_valid.shape,
    "\nX_test: ", X_test.shape,
)
print(*shape_report)

In [None]:
# Validation accuracy to choose a classifier and tune parameters: average the
# hold-out accuracy over many random 90/10 splits of the training data.
trials = 100
# Accumulator deliberately NOT called `sum`: that name would shadow the builtin
# for every cell executed afterwards in the same kernel.
total_accuracy = 0
for i in range(trials):
    # Seeding with the trial number keeps every trial different while making
    # the whole experiment reproducible from one run to the next.
    X_train2, X_valid2, y_train2, y_valid2 = train_test_split(
        X_train, y_train, test_size=0.1, random_state=i)
    # Candidates tried so far; the accuracies were noted from earlier, unseeded runs.
    #logr = LogisticRegression() # Val_acc = 82.0%
    logr = DecisionTreeClassifier(random_state=i) # Val_acc = 83.0%
    #logr = RandomForestClassifier() # Val_acc = 82.5%
    #logr = GradientBoostingClassifier() # Val_acc = 84.0%
    #logr = KNeighborsClassifier() # Val_acc = 82.0%
    logr.fit(X_train2, y_train2)
    logr_acc = logr.score(X_valid2, y_valid2) * 100  # accuracy as a percentage
    total_accuracy = total_accuracy + logr_acc
print("Average validation accuracy of",trials,"trials = ",total_accuracy/trials)

In [None]:
# Final model: gradient-boosted trees fitted on the full training matrix and
# used to predict the test matrix.
# - random_state is fixed so the generated submission is reproducible.
# - y_train is a one-column DataFrame; it is flattened to the 1-D target the
#   estimator expects (avoids scikit-learn's DataConversionWarning).
logr = GradientBoostingClassifier(random_state=0)
logr.fit(X_train, y_train.values.ravel())
y_pred = logr.predict(X_test)

In [None]:
# Pair the PassengerId of every row after the first 891 with its predicted
# label (cast to int) and write the result to CSV without the index column.
PassengerId = full_df.iloc[891:]['PassengerId']
submission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Survived': y_pred.astype(int),
})
print(submission.shape)
# This submission scores 81.8%
submission.to_csv('BoostedTrees.csv', index=False)

In [None]:
# Show the first five rows of the submission as a quick sanity check.
submission.head(n=5)