### Resources 
https://www.kaggle.com/startupsci/titanic-data-science-solutions   Basic feature engineering and basic modeling. Initial fork.  
https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy  A look into hyperparameter tuning with sklearn  
https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever/notebook  Advanced feature engineering, including a group/family survival feature


https://www.analyticsvidhya.com/blog/2017/09/understaing-support-vector-machine-example-code/ Support Vector Machines  
https://medium.com/deep-math-machine-learning-ai/chapter-3-support-vector-machine-with-math-47d6193c82be Support Vector Machines with math

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random as rnd

In [None]:
# Load the train and test CSVs from the local download folder.
train = pd.read_csv(r"C:\Users\heret\Downloads\titanic\train.csv")
test = pd.read_csv(r"C:\Users\heret\Downloads\titanic\test.csv")

# Remember how many rows each split has so the stacked frame can be cut apart later.
ntrain, ntest = len(train), len(test)

# Stack both splits (train rows first) so feature engineering is applied uniformly;
# the index is renumbered 0..n-1.
all_data = pd.concat([train, test], sort=False, ignore_index=True)
all_data.shape

### Feature engineering

In [None]:
# Encode Sex as a number (female -> 1, male -> 0)
all_data['Sex'] = all_data['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

# Replace Age NaNs with the median age of the passengers sharing the same Sex and Pclass.
# guess_ages[sex, pclass - 1] holds the value imputed for each combination.
guess_ages = np.zeros((2,3))
for i in range(0, 2):
    for j in range(0, 3):
        in_group = (all_data['Sex'] == i) & (all_data['Pclass'] == j+1)
        age_guess = all_data.loc[in_group, 'Age'].median()  # median() skips NaNs

        # Round the group median to the nearest .5
        guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

        # Filling this group's NaNs cannot affect the other groups' medians,
        # so it is safe to impute inside the same loop.
        all_data.loc[in_group & all_data['Age'].isnull(), 'Age'] = guess_ages[i,j]

# Note: the integer cast truncates any fractional age (e.g. 28.5 -> 28)
all_data['Age'] = all_data['Age'].astype(int)


# Replacing Embarked NaNs with the mode
freq_port = all_data.Embarked.mode()[0]
all_data['Embarked'] = all_data['Embarked'].fillna(freq_port)
all_data['Embarked'] = all_data['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) #turning into a numeric variable

# Replacing Fare NaN with median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update
# all_data when pandas Copy-on-Write is enabled.
all_data['Fare'] = all_data['Fare'].fillna(all_data['Fare'].median())

In [None]:
# FamilySize counts the passenger plus siblings/spouses and parents/children aboard
all_data["FamilySize"] = 1 + all_data["SibSp"] + all_data["Parch"]

# IsAlone flags passengers whose family size is exactly one (1 = alone, 0 = not alone)
travels_alone = all_data["FamilySize"] == 1
all_data["IsAlone"] = travels_alone.astype("int64")
#print(all_data[["IsAlone","Survived"]].groupby(by="IsAlone", as_index=False).mean()) #there seems to be correlation within these groups

# "Last name" is the part of Name before the first comma; it is used to identify families
all_data['Last name'] = all_data['Name'].map(lambda full_name: full_name.partition(",")[0])

In [None]:
# Creating a new variable Family survival with info whether the family survived or not, this is inferred by grouping last name and fare duplicated values
# all credit to https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever/notebook
#
# Encoding of 'Family survival' for each passenger:
#   1   -> at least one OTHER member of the group has Survived == 1
#   0   -> otherwise, at least one other member has Survived == 0
#   0.5 -> default: no group, or no other member with a known outcome

grp_partial_age = 0      # diagnostic: groups where only some members have a missing Age
grp_partial_cabin = 0    # diagnostic: groups where only some members have a missing Cabin
family_age_diffs = []    # diagnostic: age spread among members with SibSp != 0, one per group
all_data['Family survival'] = 0.5

family_cols = ['Survived','Name', 'Last name', 'Fare', 'SibSp', 'Parch', 'Age', 'Cabin']
for grp, grp_df in all_data[family_cols].groupby(['Last name', 'Fare']):
    # A single passenger is not a family group
    if len(grp_df) == 1:
        continue

    grp_missing_age = int(grp_df['Age'].isnull().sum())
    is_partial_age = 0 < grp_missing_age < len(grp_df)
    grp_partial_age += is_partial_age

    sibsp_ages = grp_df.loc[grp_df['SibSp'] != 0, 'Age']
    # Collect in a list and build the frame once after the loop;
    # DataFrame.append was removed in pandas 2.0.
    family_age_diffs.append(sibsp_ages.max() - sibsp_ages.min())

    grp_missing_cabin = int(grp_df['Cabin'].isnull().sum())
    grp_partial_cabin += 0 < grp_missing_cabin < len(grp_df)

    # PassID is the row label in all_data (not the PassengerId column)
    for PassID in grp_df.index:
        ## Outcomes of the other family members; max/min skip NaN 'Survived' values
        others_survived = grp_df['Survived'].drop(PassID)
        smax = others_survived.max()
        smin = others_survived.min()

        ## If any family members survived, put this feature as 1
        if smax == 1.0:
            all_data.loc[PassID, 'Family survival'] = 1
        ## Otherwise if any family members perished, put this feature as 0
        elif smin == 0.0:
            all_data.loc[PassID, 'Family survival'] = 0

grp_age_diff_df = pd.DataFrame({'Age': family_age_diffs})


# Some ticket numbers and fares are the same suggesting they may be groups and not families, which leads to the assumption that they may have survived or died together.
# We will overload the 'Family survival' column instead of creating a separate feature.
grp_partial_age = 0      # diagnostic: ticket groups where only some members have a missing Age
grp_partial_cabin = 0    # diagnostic: ticket groups where only some members have a missing Cabin
ticket_age_diffs = []    # diagnostic: max - min Age within each ticket group
ticket_grpby = all_data.groupby('Ticket')
for _, grp_df in ticket_grpby:
    # Only tickets shared by more than one passenger form a group
    if len(grp_df) <= 1:
        continue

    grp_missing_age = int(grp_df['Age'].isnull().sum())
    grp_partial_age += 0 < grp_missing_age < len(grp_df)

    # Collect in a list and build the frame once after the loop;
    # DataFrame.append was removed in pandas 2.0.
    ticket_age_diffs.append(grp_df['Age'].max() - grp_df['Age'].min())

    grp_missing_cabin = int(grp_df['Cabin'].isnull().sum())
    grp_partial_cabin += 0 < grp_missing_cabin < len(grp_df)

    # PassID is the row label in all_data (not the PassengerId column)
    for PassID, family_survival in grp_df['Family survival'].items():
        # Never downgrade a passenger already marked 1; only 0 and 0.5 can be updated
        if family_survival == 0 or family_survival == 0.5:
            # Outcomes of the other ticket holders; max/min skip NaN 'Survived' values
            others_survived = grp_df['Survived'].drop(PassID)
            smax = others_survived.max()
            smin = others_survived.min()
            if smax == 1.0:
                all_data.loc[PassID, 'Family survival'] = 1
            elif smin == 0.0:
                all_data.loc[PassID, 'Family survival'] = 0

grp_age_diff_df = pd.DataFrame({'Age diff': ticket_age_diffs})

In [None]:
# Creating a new Title variable: the word that follows a space and ends with a period in Name.
# The pattern is a raw string so that "\." is not treated as an (invalid) string escape.
all_data["Title"] = all_data.Name.str.extract(r" ([A-Za-z]+)\.", expand=False)
print(all_data[["Title", "Survived"]].groupby(["Title"], as_index=False).count())

# Now grouping all minor categories into one
rare_titles = ["Lady", "Countess", "Capt", "Col", "Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"]
all_data["Title"] = all_data["Title"].replace(rare_titles, "Rare")

# Fold spelling variants into their common equivalents
all_data["Title"] = all_data["Title"].replace({"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"})

In [None]:
# Encode Title as a number; any title missing from the mapping becomes 0
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
all_data['Title'] = all_data['Title'].map(title_mapping).fillna(0)

In [None]:
# Binning the Fare and Age variables
from sklearn.preprocessing import LabelEncoder

# Each source column is cut into 5 quantile-based bins, then the bins are
# label-encoded into a numeric column (Fare first, then Age).
for source_col, binned_col in (("Fare", "FareBin_4"), ("Age", "Age_5")):
    quantile_bins = pd.qcut(all_data[source_col], 5)
    all_data[binned_col] = LabelEncoder().fit_transform(quantile_bins)

In [None]:
# Remove the raw columns that have been replaced by engineered features
# or are not used as model inputs
unused_columns = [
    "Name", "PassengerId", "Ticket", "Cabin", "Fare",
    "Age", "SibSp", "Parch", "Last name", "FamilySize",
]
all_data = all_data.drop(columns=unused_columns)

In [None]:
# Target vector comes from the original training frame
train_y = train["Survived"].values

# Feature matrix without the target; the first ntrain rows are the training
# split and the remaining rows are the test split
feature_frame = all_data.drop(columns=["Survived"])
train_X = feature_frame.iloc[:ntrain].values
test_X = feature_frame.iloc[ntrain:].values
print("train_y: ", train_y.shape,"    train_X: ", train_X.shape,"   test_X: ", test_X.shape)

### Modeling

In [None]:
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, KFold, ShuffleSplit, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import RFECV

In [None]:
# Baseline comparison: every candidate is used with its default settings
models = [
    LGBMClassifier(),
    LogisticRegression(),
    SVC(),
    LinearSVC(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    GaussianNB(),
    Perceptron(),
    SGDClassifier(),
    DecisionTreeClassifier(),
]

# Print each model's mean 5-fold cross-validated accuracy on the training data
for model in models:
    fold_scores = cross_val_score(model, train_X, train_y, cv=5, scoring="accuracy")
    print(type(model).__name__, fold_scores.mean().round(4))

#Logistic Regression CV 0.8406
#Support Vector Machines CV 0.844 / LB 0.80861 
#LightGBM CV 0.855 / LB 0.77990

In [None]:
# cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0)
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# } # 

# forest_rfe = RFECV(RandomForestClassifier(random_state = 0), step = 1, scoring = 'accuracy', cv = cv_split)
# forest_rfe.fit(train_X, train_y)

# grid_search = GridSearchCV(RandomForestClassifier(random_state = 0), param_grid=param_grid, scoring = 'accuracy', cv = cv_split, refit=True) #scoring = roc_auc
# grid_search.fit(train_X[:,forest_rfe.get_support()], train_y) # we split the train_X set with the variables obtained through rfe
# print(grid_search.best_params_)
# print(grid_search.best_score_.round(3)) 
# {'bootstrap': True, 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 100}
# accuracy = base model 0.8272  vs  tuned model 0.859  vs  Public LB 0.78468

# model = RandomForestClassifier(bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100)
# model.fit(train_X[:,forest_rfe.get_support()], train_y)
# predictions = model.predict(test_X[:,forest_rfe.get_support()])

In [None]:
# cv_split = ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0)
# param_grid = {"kernel": ["rbf"],
#               "C": [0.001, 0.01, 0.1, 1, 10],
#               "gamma": [0.001, 0.01, 0.1, 1]} #accuracy = 0.847 vs base model 0.843

# #svc_ref = RFECV(SVC(random_state = 0, kernel="rbf"), step = 1, scoring = 'accuracy', cv = cv_split) if kernel="linear" all features are deemed meaningful to this model
# #svc_ref.fit(train_X, train_y)
# #svc_ref.get_support()

# grid_search = GridSearchCV(SVC(random_state = 0), param_grid=param_grid, scoring = 'accuracy', cv = cv_split, refit=True) #scoring = roc_auc
# grid_search.fit(train_X, train_y)
# print(grid_search.best_params_)
# print(grid_search.best_score_.round(3)) #accuracy = 0.847, LB = 0.76555

In [None]:
# The grid-searched parameters score higher in local CV, but the default
# parameters score best on the leaderboard (LB score 0.80861), so the
# final model is an SVC with default settings.
model = SVC().fit(train_X, train_y)
predictions = model.predict(test_X)

In [None]:
# Build the submission file: one row per test passenger with the predicted outcome.
# The id column is named "PassengerId" (same spelling as in the input data) so the
# header matches the competition's expected submission format.
my_submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": predictions})
my_submission.to_csv("titanic.csv", index=False)
print(my_submission.head(10))