
# Basic Overview
The objective is to build a xgboost model with sex,age and pclass as relevant predictors.

Comments/criticisms/appreciations are greatly accepted and appreciated. Do not be shy and send me an email at babinu@gmail.com !

Source of data : https://www.kaggle.com/c/titanic/data

In [32]:
import pandas as pd
import numpy as np

In [33]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test_data_processed_correct.csv")

In [34]:
test_data.Survived.unique()

array([0., 1.])

In [35]:
def getAlphabetClass(given_data):
    given_data['PclassAlph'] = given_data['Pclass']
    given_data['PclassAlph'].replace(to_replace=1, value='a', inplace=True)
    given_data['PclassAlph'].replace(to_replace=2, value='b', inplace=True)    
    given_data['PclassAlph'].replace(to_replace=3, value='c', inplace=True)        

In [36]:
# Gets classes as alphabets which makes it easier to be interpreted as  categorical predictors later on.
getAlphabetClass(train_data)
getAlphabetClass(test_data)

In [37]:
# Add a column called Predictions in training data, which will become handy while doing
# cross validation.
train_data['Predictions'] = -1

In [38]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PclassAlph,Predictions
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,c,-1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,a,-1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,c,-1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,a,-1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,c,-1


In [39]:
from sklearn.preprocessing import Imputer
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [40]:
def get_train_test_data(train_data,
                        X_columns=['Sex', 'PclassAlph', 'Age'], 
                        Y_columns=['Survived']):
    # Simple training and testing
    X = train_data[X_columns]
    Y = train_data[Y_columns]

    X_one_hot = pd.get_dummies(X)

    # Do imputation on relevant columns.
    X_train, X_test, Y_train, Y_test = train_test_split(X_one_hot, Y, random_state=0)

    return (X_train, X_test, Y_train, Y_test, X_one_hot, Y)


In [64]:
def create_pipeline_and_out_of_sample_score(X_train, X_test, Y_train, Y_test):
    # Rewrite everything as a pipeline
    my_pipeline = make_pipeline(Imputer(), XGBClassifier(seed=1))
    my_pipeline.fit(X_train, Y_train.values.ravel())
    predictions = my_pipeline.predict(X_test)
    out_of_sample_score = 1.0 - mean_absolute_error(predictions, Y_test)
    return (my_pipeline, out_of_sample_score, predictions)

In [42]:
def cross_validate(my_pipeline, X_one_hot, Y):
    cross_val_scores = cross_val_score(my_pipeline, X_one_hot, Y, scoring='neg_mean_absolute_error', cv=5)
    return 1.0  + cross_val_scores.mean()

In [63]:
def train_test_cross_validate(train_data, 
                              X_columns=['Sex', 'PclassAlph', 'Age'], 
                              Y_columns=['Survived']):
    (X_train, X_test, Y_train, Y_test, X_one_hot, Y) = get_train_test_data(train_data, X_columns, Y_columns)
    my_pipeline, out_of_sample_score, predictions_test = \
        create_pipeline_and_out_of_sample_score(X_train, X_test, Y_train, Y_test)

    print("Number of entries in training set is {0}".format(len(X_train)))
    predictions_train = my_pipeline.predict(X_train)
    num_correct_predictions_train = int((1.0 - mean_absolute_error(predictions_train, Y_train)) * len(Y_train))


    print("Number of correct predictions in training set is {0}".format(num_correct_predictions_train))    
    print("Number of entries in test set is {0}".format(len(X_test)))    
    print("Number of correct predictions in test set is {0}".format(int(out_of_sample_score * len(X_test))))

    cross_validation_score = cross_validate(make_pipeline(Imputer(), XGBClassifier(seed=1)), X_one_hot, Y.values.ravel())
    return (my_pipeline, out_of_sample_score, cross_validation_score)

In [44]:
# Make sure that unnecessary deprecation warnings are avoided.
# Thanks to https://stackoverflow.com/questions/49545947/sklearn-deprecationwarning-truth-value-of-an-array
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)


### Model 1: Sex as the only predictor

In [45]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex'], 
                              ['Survived'])
    
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 527
Number of entries in test set is 223
Number of correct predictions in test set is 174
Out of sample score is 0.7802690582959642
Cross val score is 0.7867150249291879


### Model 2: Sex and Pclass as predictors.

In [46]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 527
Number of entries in test set is 223
Number of correct predictions in test set is 174
Out of sample score is 0.7802690582959642
Cross val score is 0.7733072037001376


### Model 3: Sex, Pclass and Age as predictors

In [47]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph', 'Age'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 565
Number of entries in test set is 223
Number of correct predictions in test set is 184
Out of sample score is 0.8251121076233183
Cross val score is 0.8047557492817721


### Model 4 : Use Sex, Age, Pclass, Parch

In [48]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph', 'Age', 'Parch'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 570
Number of entries in test set is 223
Number of correct predictions in test set is 184
Out of sample score is 0.8251121076233183
Cross val score is 0.8148494904047717


### Model 5 : Use Sex, Age, Pclass, Parch, SipSp

In [49]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph', 'Age', 'Parch', 'SibSp'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 570
Number of entries in test set is 223
Number of correct predictions in test set is 183
Out of sample score is 0.820627802690583
Cross val score is 0.8283455452486305


### Model 6 : Use Sex, Age, Pclass, Parch, SipSp, FamilyNum

In [50]:
# Add another factor on familyNum
train_data['FamilyNum'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilyNum'] = test_data['SibSp'] + test_data['Parch'] + 1



In [51]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph', 'Age', 'Parch', 'SibSp', 'FamilyNum'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 568
Number of entries in test set is 223
Number of correct predictions in test set is 185
Out of sample score is 0.8295964125560538
Cross val score is 0.8271903516116643


### Model 7 : Use Sex, Age, Pclass, Parch, SipSp, FamilyNum,IsAlone


In [52]:
train_data['IsAlone'] = (train_data['FamilyNum'] == 1)
test_data['IsAlone'] = (test_data['FamilyNum'] == 1)

In [53]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph', 'Age', 'Parch', 'SibSp', 'FamilyNum', 'IsAlone'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 568
Number of entries in test set is 223
Number of correct predictions in test set is 185
Out of sample score is 0.8295964125560538
Cross val score is 0.8271903516116643


### Model 8 : Use Sex, Age, Pclass, Parch, SipSp, FamilyNum, IsAlone, FarePerPerson


In [54]:
train_data['FarePerPerson'] = train_data['Fare']/train_data['FamilyNum']
test_data['FarePerPerson'] = test_data['Fare']/test_data['FamilyNum']

In [55]:
(my_pipeline, out_of_sample_score, cross_validation_score) = \
    train_test_cross_validate(train_data,
                              ['Sex', 'PclassAlph', 'Age', 'Parch', 'SibSp', 'FamilyNum', 'IsAlone', 'FarePerPerson'], 
                              ['Survived'])
print("Out of sample score is {0}\nCross val score is {1}".format(out_of_sample_score, cross_validation_score))



Number of entries in training set is 668
Number of correct predictions in training set is 581
Number of entries in test set is 223
Number of correct predictions in test set is 191
Out of sample score is 0.8565022421524664
Cross val score is 0.8260917934581584


### Conclusion
With our emphasis on parsimonious models and cross validation score, we will go for model 5.

### Let us try to optimize the chosen model using early stoppage   

In [56]:
X_train, X_test, Y_train, Y_test,X_one_hot, Y = get_train_test_data(
     train_data, 
     X_columns=['Sex', 'PclassAlph', 'Age', 'Parch', 'SibSp'],
     Y_columns=['Survived'])


my_imputer = Imputer()

X_train = my_imputer.fit_transform(X_train)
X_test = my_imputer.transform(X_test)


In [57]:
def create_early_stopping_model(X_train, X_test, Y_train, Y_test, 
                                n_estimators=100, early_stopping_rounds=5, learning_rate=0.1):
    # Modify number of estimators and stopping rounds number.
    my_model = XGBClassifier(n_estimators=n_estimators, random_state=0, learning_rate=learning_rate)
    my_model.fit(X_train, Y_train.values.ravel(), early_stopping_rounds=early_stopping_rounds, 
                 eval_set=[(X_test, Y_test.values.ravel())], verbose=False)

    # Make predictions on the test set.
    my_predictions = my_model.predict(X_test)
    return(my_model, mean_absolute_error(my_predictions, Y_test))

In [58]:
(my_model, mean_error) = create_early_stopping_model(X_train, X_test, Y_train, Y_test, 100, 5)
print("Success rate with the current model is {0}".format(1.0 - mean_error))

Success rate with the current model is 0.820627802690583


In [59]:
(my_model, mean_error) = create_early_stopping_model(X_train, X_test, Y_train, Y_test, 
                                                     n_estimators=100, early_stopping_rounds=10, learning_rate=0.05)
print("Success rate with the current model is {0}".format(1.0 - mean_error))

Success rate with the current model is 0.820627802690583


In [60]:
(my_model, mean_error) = create_early_stopping_model(X_train, X_test, Y_train, Y_test, 
                                                     n_estimators=1000, early_stopping_rounds=20, learning_rate=0.02)
print("Success rate with the current model is {0}".format(1.0 - mean_error))
print(my_model.best_score, my_model.best_iteration, my_model.best_ntree_limit)

Success rate with the current model is 0.820627802690583
0.179372 18 19


In [61]:
my_model = XGBClassifier(seed=1)
my_model.fit(X_train, Y_train.values.ravel())
print("Success rate with the current model is {0}".format(
    1.0 - mean_absolute_error(my_model.predict(X_test), Y_test.values.ravel())))


Success rate with the current model is 0.820627802690583


We look to be getting diminishing returns from applying early stoppage method. That does not look to give us any advantage here.

NOTE: One may rightfully question the usage of encoding categorical variables for use in decision trees(https://roamanalytics.com/2016/10/28/are-categorical-variables-getting-lost-in-your-random-forests/). 

This should  not be issue for a binary predictor(as per page 310 in https://web.stanford.edu/~hastie/ElemStatLearn//printings/ESLII_print12.pdf) , but this is not the optimal way for other categorical variables. For the time being, we will continue in this, but it must be mentioned that this can create problems when we have categorical variables of high dimensionality.


### Conclusion.

Let us generate predictions on out of sample data and submit a file to kaggle using model 5.

In [69]:
#create_pipeline_and_out_of_sample_score(X_train, X_test, Y_train, Y_test)
X_columns = ['Sex', 'Age', 'Parch', 'SibSp']
Y_column = ['Survived']
X_train = train_data[X_columns]
X_train_one_hot = pd.get_dummies(X_train)
Y_train = train_data[Y_column]
X_test = test_data[X_columns]
X_test_one_hot = pd.get_dummies(X_test)
Y_test = test_data[Y_column]
(my_pipeline, out_of_sample_score, predictions_out_of_sample) = \
    create_pipeline_and_out_of_sample_score(X_train_one_hot, 
                                            X_test_one_hot,
                                            Y_train,
                                            Y_test)
print("Complete score on out of sample data is {0}".format(out_of_sample_score))

test_data['Predictions'] = predictions_out_of_sample
kaggle_data = test_data[['PassengerId', 'Predictions']].copy()
kaggle_data.rename(columns={'Predictions' : 'Survived'}, inplace=True)
kaggle_data.sort_values(by=['PassengerId']).to_csv('xgboost_kaggle_output.csv', index=False)

Complete score on out of sample data is 0.7727272727272727
