In [59]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

#### Note: the models below use the pre-processed files (`train_prepped.csv`, `test_prepped.csv`) instead of the raw data

In [60]:
# Pre-processed feature tables used for modelling (first CSV column is the index).
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train_prepped.csv', 'test_prepped.csv')
)

# Raw test set and the sample submission file, both indexed by their first column.
df_final_submit = pd.read_csv('test.csv', index_col=0)
df_sample_submit = pd.read_csv('gender_submission.csv', index_col=0)

In [61]:
df_final_submit.head(3)  # first rows of the raw test.csv frame (indexed by PassengerId)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [62]:
# First rows of the prepped training table: feature columns plus the Survived label.
df_train.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_Id,Age_Label_Id,TtlFamMbr,Title_Id,Survived
0,3,22,1,0,7.25,0,4,1,11,0
1,1,38,1,0,71.2833,1,5,1,12,1
2,3,26,0,0,7.925,1,4,0,8,1


In [63]:
# First rows of the prepped test table: same feature columns as training, no Survived column.
df_test.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_Id,Age_Label_Id,TtlFamMbr,Title_Id
0,3,34,0,0,7.8292,0,4,0,5
1,3,47,1,0,7.0,1,5,1,6
2,2,62,0,0,9.6875,0,6,0,5


In [64]:
# Predictor columns fed to every model.
Feature_Cols = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex_Id',
    'Age_Label_Id',
    'TtlFamMbr',
    'Title_Id',
]

# Label column(s); kept as a list so df[Target_Cols] yields a DataFrame.
Target_Cols = ['Survived']

In [65]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble
from sklearn import discriminant_analysis, gaussian_process
# from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

In [25]:
# Machine Learning Algorithm (MLA) selection and initialization.
# Every candidate classifier is instantiated with its default hyper-parameters,
# except SVC/NuSVC which are created with probability=True.
# The list order matters: the comparison loop numbers its result rows in this
# order, so reordering the list changes the row indices of the results table.
MLA = [
    # Ensemble methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian processes
    gaussian_process.GaussianProcessClassifier(),
    
    # Generalized linear models (GLM)
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    # Nearest neighbors
    neighbors.KNeighborsClassifier(),
    
    # Support vector machines
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    # Trees
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    # Discriminant analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

In [24]:
# Cross-validation splitter (an alternative to a single train_test_split):
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
# 10 random splits, each fitting on 60% of the rows and scoring on 30%; the
# remaining 10% is intentionally left out of every split.
cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=0)

# Columns of the table used to compare the algorithms.
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']

train_features = df_train[Feature_Cols]
# scikit-learn expects a 1-D label array; passing the one-column DataFrame
# triggers a DataConversionWarning (hidden here by the global warnings filter).
train_target = df_train[Target_Cols].values.ravel()

# Table of per-model predictions on the training rows, next to the true label.
# .copy() makes this an independent frame, so adding columns below neither
# raises SettingWithCopyWarning nor risks writing through to df_train.
MLA_predict = df_train[Target_Cols].copy()

# Collect one record per algorithm and build the comparison frame once at the
# end, instead of growing a DataFrame cell by cell (which also left the numeric
# columns with object dtype).
mla_rows = []
for alg in MLA:
    MLA_name = alg.__class__.__name__

    # Score the model with cross validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    # return_train_score=True is required for 'train_score' to be present in the
    # result on scikit-learn versions where it defaults to False.
    cv_results = model_selection.cross_validate(
        alg, train_features, train_target, cv=cv_split, return_train_score=True
    )
    test_scores = cv_results['test_score']

    mla_rows.append({
        'MLA Name': MLA_name,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': test_scores.mean(),
        # If the splits are an unbiased random sample, mean +/- 3 standard
        # deviations should cover about 99.7% of them: a worst-case spread.
        'MLA Test Accuracy 3*STD': test_scores.std() * 3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

    # Refit on the full training set and store the in-sample predictions.
    alg.fit(train_features, train_target)
    MLA_predict[MLA_name] = alg.predict(train_features)

# Rows are indexed 0..len(MLA)-1 in the order of the MLA list.
MLA_compare = pd.DataFrame(mla_rows, columns=MLA_columns)

In [26]:
# Rank the algorithms by mean cross-validated test accuracy, best first:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
# The name is rebound to the sorted frame rather than sorting with inplace=True,
# so the frame object built by the comparison cell is not mutated.
MLA_compare = MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False)
MLA_compare

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
3,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.910112,0.826493,0.0845501,0.0621306
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.969101,0.81791,0.058338,0.0132599
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.96573,0.813806,0.0516505,0.015558
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': None, '...",0.811423,0.806716,0.0529325,0.269784
19,LinearDiscriminantAnalysis,"{'n_components': None, 'priors': None, 'shrink...",0.811236,0.805597,0.0518925,0.00238626
8,RidgeClassifierCV,"{'alphas': (0.1, 1.0, 10.0), 'class_weight': N...",0.810861,0.805597,0.054484,0.00478203
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.849251,0.804478,0.053964,0.059942
12,GaussianNB,{'priors': None},0.806929,0.801866,0.0668744,0.00209432
2,ExtraTreesClassifier,"{'bootstrap': False, 'class_weight': None, 'cr...",0.985955,0.800373,0.0757977,0.0121671
17,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.985955,0.781716,0.0727612,0.0021939


### Submission to Kaggle for scoring
#### Based on the table above, the model with the highest mean cross-validated test accuracy is `GradientBoostingClassifier`

In [78]:
# Model chosen from the comparison table (highest mean test accuracy).
best_model_name = 'GradientBoostingClassifier'

# Pick the first estimator in MLA whose class name matches; fail loudly rather
# than silently skipping the submission when it is missing from the list.
alg = next((model for model in MLA if model.__class__.__name__ == best_model_name), None)
if alg is None:
    raise ValueError("No estimator named %r found in MLA" % best_model_name)

print(best_model_name)

# Refit on the whole training set, passing a 1-D label array as scikit-learn expects.
alg.fit(df_train[Feature_Cols], df_train[Target_Cols].values.ravel())

# Select the feature columns explicitly so prediction uses exactly the columns
# (and column order) the model was trained on, whatever else df_test contains.
pred = alg.predict(df_test[Feature_Cols])

# Predictions are attached positionally: this assumes df_test rows are in the
# same order as the rows of df_final_submit (test.csv) - TODO confirm.
df_final_submit['Survived'] = pred

# Write the submission: the frame's index plus the Survived column, with a header row.
df_final_submit['Survived'].to_csv('final_submission.csv', sep=',', header=True)
    

GradientBoostingClassifier
