
# Basic Overview
The objective is to build a binary logistic regression model that predicts a passenger's chance of survival on the Titanic, given that person's relevant information.

Comments, criticism, and appreciation are all welcome. Do not be shy: send me an email at babinu@gmail.com!

Source of data : https://www.kaggle.com/c/titanic/data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Training data; every model below reads its 'Survived' column as the target.
train_data = pd.read_csv("../input/train.csv")
# Data used for the final out-of-sample evaluation. Presumably the Kaggle test set with a
# 'Survived' column added via dump_completed_survivorship_data_to_file -- TODO confirm.
test_data = pd.read_csv("../input/test_data_processed_correct.csv") 

In [3]:
# Set of functions to get complete data about survivors.

def get_pid_to_survived_map(name_to_pid, name_to_survived_orig):
    """Join two name-keyed maps into a PassengerId -> survived map.

    Parameters
    ----------
    name_to_pid : dict
        Maps a passenger name to a PassengerId.
    name_to_survived_orig : dict
        Maps a passenger name to a survival flag.

    Returns
    -------
    dict
        PassengerId -> survival flag for every name present in both inputs,
        plus the hard-coded entries below. A hard-coded entry overwrites any
        value obtained from the name join for the same PassengerId.

    Names found in ``name_to_pid`` but not in ``name_to_survived_orig`` are
    printed together with their PassengerId and otherwise skipped.
    """
    # Outcomes supplied by hand. Presumably these are passengers whose names
    # do not match between the two files -- TODO confirm against the data.
    manual_overrides = {
        911: 1,
        925: 0,
        927: 0,
        941: 1,
        944: 1,
        996: 1,
        1000: 0,
        1036: 0,
        1117: 1,
        1136: 0,
        1141: 0,
        1154: 1,
        1183: 1,
        1196: 1,
        1219: 0,
        1225: 1,
        1246: 1,
        1259: 0,
        1269: 0,
        1276: 0,
        1297: 1,
        1300: 1,
    }

    pid_to_survived = dict()
    for name, pid in name_to_pid.items():
        if name in name_to_survived_orig:
            pid_to_survived[pid] = name_to_survived_orig[name]
        else:
            # Surface unmatched names so they can be resolved by hand.
            print(name, pid)

    # Applied last so the manual values win over the name-based join.
    pid_to_survived.update(manual_overrides)

    return pid_to_survived

def get_rel_map_from_csv_file(given_file, column1, column2):
    """Read a CSV and map each `column1` value to the `column2` value on the same row.

    When a `column1` value appears on several rows, the value from the last
    of those rows is the one kept.
    """
    frame = pd.read_csv(given_file)
    keys = frame[column1]
    values = frame[column2]
    return {key: value for key, value in zip(keys, values)}

def get_complete_pid_to_survivorship_data(ext_file, kaggle_file):
    """Build a PassengerId -> survived map from an external file and a Kaggle file.

    `ext_file` is read for its 'name' and 'survived' columns, `kaggle_file`
    for its 'Name' and 'PassengerId' columns; the two are joined on name by
    get_pid_to_survived_map.
    """
    # Raw data obtained from the following source :
    # https://github.com/Geoyi/Cleaning-Titanic-Data/blob/master/titanic_original.csv
    survived_by_name = get_rel_map_from_csv_file(ext_file, 'name', 'survived')
    pid_by_name = get_rel_map_from_csv_file(kaggle_file, 'Name', 'PassengerId')
    return get_pid_to_survived_map(pid_by_name, survived_by_name)

# Example usage : dump_completed_survivorship_data_to_file("titanic_original.csv", "test.csv", "test2.csv")
def dump_completed_survivorship_data_to_file(ext_file, kaggle_file, new_output_file):
    """Write a copy of `kaggle_file` with a 'Survived' column added.

    The survival value for each row is looked up by 'PassengerId' in the map
    returned by get_complete_pid_to_survivorship_data; the result is written
    to `new_output_file` without the index.
    """
    survival_lookup = get_complete_pid_to_survivorship_data(ext_file, kaggle_file)
    kaggle_frame = pd.read_csv(kaggle_file)
    # dict.get returns None for ids that have no entry in the lookup.
    kaggle_frame['Survived'] = kaggle_frame['PassengerId'].apply(
        lambda passenger_id: survival_lookup.get(passenger_id))
    kaggle_frame.to_csv(new_output_file, index=False)



### Model 1 : Use Sex alone as a predictor

In [4]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer

In [5]:
def get_model_and_stats(X, Y, size_test_set=0.25, num_folds_cv=5):
    """Fit an imputation + logistic-regression pipeline and report its scores.

    The data is split once with ``train_test_split`` (``random_state=0``).
    The pipeline is fitted on the training portion, cross-validated on that
    same portion, and scored on both portions.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Target values, one per row of ``X``.
    size_test_set : float, default 0.25
        Passed to ``train_test_split`` as ``test_size``.
    num_folds_cv : int, default 5
        Passed to ``cross_val_score`` as ``cv``.

    Returns
    -------
    tuple
        ``(fitted_pipeline, train_score, mean_cv_score, test_score)``.

    Also prints the split sizes and, for each portion, ``score * size``
    (used here as the number of correct predictions).
    """
    from sklearn.model_selection import cross_val_score

    X_train, X_test, Y_train, Y_test = \
        train_test_split(X, Y, test_size=size_test_set, random_state=0)

    # Make pipeline here to take care of imputations.
    # NOTE(review): `sklearn.preprocessing.Imputer` appears to be deprecated in
    # newer scikit-learn releases in favour of `sklearn.impute.SimpleImputer`;
    # confirm against the installed version before upgrading.
    my_pipeline = make_pipeline(Imputer(), linear_model.LogisticRegression())

    starting_model = my_pipeline.fit(X_train, Y_train)

    print("Length of full training set is ", len(X))
    print("Length of used training set is ", len(X_train))
    print("Length of used test set is ", len(X_test))

    scores = cross_val_score(my_pipeline, X_train, Y_train, cv=num_folds_cv)

    # Score each portion once and reuse the value for printing and returning.
    train_score = starting_model.score(X_train, Y_train)
    test_score = starting_model.score(X_test, Y_test)

    print("Number of correct predictions on used training data   is ",
          train_score * len(X_train))
    print("Number of correct predictions on used test data   is ",
          test_score * len(X_test))

    return (starting_model,
            train_score,
            np.mean(scores),
            test_score)

In [6]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
def display_manual_model_metrics(test_v1, message):
    """Print fit score, hit count, row count and AUC for a frame of predictions.

    `test_v1` must provide 'Survived' and 'Predictions' columns; `message` is
    appended to the label of every printed line.
    """
    actual = test_v1['Survived']
    predicted = test_v1['Predictions']
    is_correct = (actual == predicted)
    n_rows = len(actual)

    print("Fit score obtained  :" + message, np.sum(is_correct / n_rows))
    print("Number of correct entries  :" + message, np.sum(is_correct))
    print("Total number of  entries  :" + message, n_rows)

    # Both columns are converted to int element by element before scoring.
    actual_as_int = actual.apply(lambda value: int(value))
    predicted_as_int = predicted.apply(lambda value: int(value))
    print("AUC obtained  :" + message,
          roc_auc_score(actual_as_int, predicted_as_int))

In [7]:
def get_data_X_Y_sex(given_data):
    """Build the feature matrix and target for the Sex-only model.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex' and 'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 1)`` holding 1.0
        where Sex is 'female' and 0.0 otherwise; ``Y`` is the 'Survived'
        column.
    """
    X = np.zeros(shape=(len(given_data), 1))
    # A direct comparison gives the same indicator as
    # pd.get_dummies(...)['female'] but does not raise KeyError when the
    # frame contains no female rows.
    X[:, 0] = given_data['Sex'] == 'female'

    Y = given_data['Survived']
    return (X, Y)

In [8]:
def generate_out_of_sample_predictions(get_data_X_Y, model, test_data, generate_kaggle_file):
    """Predict on `test_data`, print metrics, and optionally write a Kaggle file.

    Parameters
    ----------
    get_data_X_Y : callable
        Returns ``(X, Y)`` for a DataFrame; only ``X`` is used here.
    model : object
        Fitted estimator exposing ``predict``.
    test_data : pandas.DataFrame
        Receives a new 'Predictions' column (added in place). It must also
        contain 'Survived', which the metrics helper reads, and 'PassengerId'
        when a Kaggle file is requested.
    generate_kaggle_file : bool
        When true, writes 'PassengerId' and the predictions (under the column
        name 'Survived'), sorted by 'PassengerId', to
        "kaggle_out_logistic_regression.csv".
    """
    # Only the features are needed; the metrics helper reads the true labels
    # straight from test_data['Survived'].
    (X, _) = get_data_X_Y(test_data)
    test_data.loc[:,'Predictions'] =  model.predict(X).tolist()
    display_manual_model_metrics(test_data, '(logistic regression)')
    if generate_kaggle_file:
        # Build the submission from a separate frame so the caller's
        # 'Survived' column is not replaced by predictions. Overwriting it
        # would make a second run score the predictions against themselves.
        submission = test_data[['PassengerId', 'Predictions']].rename(
            columns={'Predictions': 'Survived'})
        submission.sort_values(by=['PassengerId']).to_csv(
            "kaggle_out_logistic_regression.csv", index=False)
        

In [9]:
def train_validate_test_given_data(train_data, get_data_X_Y, generate_out_of_sample=False, test_data=(),
                                   generate_kaggle_file=False):
    """Fit a model on `train_data`, print its scores, and optionally evaluate out of sample.

    `get_data_X_Y` turns a DataFrame into ``(X, Y)``. When
    `generate_out_of_sample` is true, the fitted model is passed on to
    generate_out_of_sample_predictions together with `test_data` and
    `generate_kaggle_file`.
    """
    features, target = get_data_X_Y(train_data)
    fitted_model, train_accuracy, cv_accuracy, holdout_accuracy = get_model_and_stats(features, target)

    # The placeholders are positional: {0} = CV, {1} = test set, {2} = training.
    summary_template = "Training Prediction score is {2} \nCross validated measure is {0} \nTest set measures is {1}"
    print(summary_template.format(cv_accuracy, holdout_accuracy, train_accuracy))

    if not generate_out_of_sample:
        return
    generate_out_of_sample_predictions(get_data_X_Y, fitted_model, test_data, generate_kaggle_file)

In [10]:
# Model 1: fit and score on the training data with Sex as the only predictor.
train_validate_test_given_data(train_data, get_data_X_Y_sex)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  527.0
Number of correct predictions on used test data   is  174.0
Training Prediction score is 0.7889221556886228 
Cross validated measure is 0.7889462462125463 
Test set measures is 0.7802690582959642


### Model 2 : Use Sex,  Pclass

In [11]:
def get_data_X_Y_sex_pclass(given_data):
    """Build the feature matrix and target for the Sex + Pclass model.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass' and 'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 3)`` with columns
        [Sex == 'female', Pclass == 1, Pclass == 2]; ``Y`` is the 'Survived'
        column.
    """
    pclass = given_data['Pclass']

    X = np.zeros(shape=(len(given_data), 3))
    # Direct comparisons give the same indicators as the matching
    # pd.get_dummies columns, but do not raise KeyError when a category is
    # absent from the frame.
    X[:, 0] = given_data['Sex'] == 'female'
    X[:, 1] = pclass == 1
    X[:, 2] = pclass == 2

    Y = given_data['Survived']
    return (X, Y)

In [12]:
# Model 2: fit and score on the training data with Sex and Pclass as predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  527.0
Number of correct predictions on used test data   is  174.0
Training Prediction score is 0.7889221556886228 
Cross validated measure is 0.7889462462125463 
Test set measures is 0.7802690582959642


### Model 3 : Use Sex, Age, Pclass (with imputation on Age column)

In [13]:
def get_data_X_Y_sex_pclass_age(given_data):
    """Build the feature matrix and target for the Sex + Pclass + Age model.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass', 'Age' and 'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 4)`` with columns
        [Sex == 'female', Pclass == 1, Pclass == 2, Age]; ``Y`` is the
        'Survived' column. Missing Age values are copied through unchanged.
    """
    pclass = given_data['Pclass']

    X = np.zeros(shape=(len(given_data), 4))
    # Direct comparisons give the same indicators as the matching
    # pd.get_dummies columns, but do not raise KeyError when a category is
    # absent from the frame.
    X[:, 0] = given_data['Sex'] == 'female'
    X[:, 1] = pclass == 1
    X[:, 2] = pclass == 2
    X[:, 3] = given_data['Age']

    Y = given_data['Survived']
    return (X, Y)

In [14]:
# Model 3: fit and score on the training data with Sex, Pclass and Age as predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass_age)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  531.0
Number of correct predictions on used test data   is  176.0
Training Prediction score is 0.7949101796407185 
Cross validated measure is 0.797946358433397 
Test set measures is 0.7892376681614349


### Model 4 : Use Sex, Age, Pclass, Parch

In [15]:
def get_data_X_Y_sex_pclass_age_parch(given_data):
    """Build the feature matrix and target for the Sex + Pclass + Age + Parch model.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass', 'Age', 'Parch' and
        'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 5)`` with columns
        [Sex == 'female', Pclass == 1, Pclass == 2, Age, Parch]; ``Y`` is the
        'Survived' column. Missing Age values are copied through unchanged.
    """
    pclass = given_data['Pclass']

    X = np.zeros(shape=(len(given_data), 5))
    # Direct comparisons give the same indicators as the matching
    # pd.get_dummies columns, but do not raise KeyError when a category is
    # absent from the frame.
    X[:, 0] = given_data['Sex'] == 'female'
    X[:, 1] = pclass == 1
    X[:, 2] = pclass == 2
    X[:, 3] = given_data['Age']
    X[:, 4] = given_data['Parch']

    Y = given_data['Survived']
    return (X, Y)

In [16]:
# Model 4: fit and score on the training data with Sex, Pclass, Age and Parch as predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass_age_parch)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  534.0
Number of correct predictions on used test data   is  177.0
Training Prediction score is 0.7994011976047904 
Cross validated measure is 0.7964313769498372 
Test set measures is 0.7937219730941704


### Model 5 : Use Sex, Age, Pclass, Parch, SibSp

In [17]:
def get_data_X_Y_sex_pclass_age_parch_sibsp(given_data):
    """Build the feature matrix and target for the Sex + Pclass + Age + Parch + SibSp model.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass', 'Age', 'Parch', 'SibSp' and
        'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 6)`` with columns
        [Sex == 'female', Pclass == 1, Pclass == 2, Age, Parch, SibSp];
        ``Y`` is the 'Survived' column. Missing Age values are copied through
        unchanged.
    """
    pclass = given_data['Pclass']

    X = np.zeros(shape=(len(given_data), 6))
    # Direct comparisons give the same indicators as the matching
    # pd.get_dummies columns, but do not raise KeyError when a category is
    # absent from the frame.
    X[:, 0] = given_data['Sex'] == 'female'
    X[:, 1] = pclass == 1
    X[:, 2] = pclass == 2
    X[:, 3] = given_data['Age']
    X[:, 4] = given_data['Parch']
    X[:, 5] = given_data['SibSp']

    Y = given_data['Survived']
    return (X, Y)

In [18]:
# Model 5: fit and score on the training data with Sex, Pclass, Age, Parch and SibSp as predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass_age_parch_sibsp)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  540.0
Number of correct predictions on used test data   is  177.0
Training Prediction score is 0.8083832335329342 
Cross validated measure is 0.8054314891706879 
Test set measures is 0.7937219730941704


### Model 6 : Use Sex, Age, Pclass, Parch, SibSp, number of family members

In [19]:
# FamilyNum = SibSp + Parch + 1 (the +1 presumably counts the passenger -- confirm).
train_data['FamilyNum'] = train_data['SibSp'].add(train_data['Parch']).add(1)
test_data['FamilyNum'] = test_data['SibSp'].add(test_data['Parch']).add(1)


In [20]:
def get_data_X_Y_sex_pclass_age_parch_sibsp_familynum(given_data):
    """Build the feature matrix and target for the model that adds FamilyNum.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass', 'Age', 'Parch', 'SibSp',
        'FamilyNum' and 'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 7)`` with columns
        [Sex == 'female', Pclass == 1, Pclass == 2, Age, Parch, SibSp,
        FamilyNum]; ``Y`` is the 'Survived' column. Missing Age values are
        copied through unchanged.
    """
    pclass = given_data['Pclass']

    X = np.zeros(shape=(len(given_data), 7))
    # Direct comparisons give the same indicators as the matching
    # pd.get_dummies columns, but do not raise KeyError when a category is
    # absent from the frame.
    X[:, 0] = given_data['Sex'] == 'female'
    X[:, 1] = pclass == 1
    X[:, 2] = pclass == 2
    X[:, 3] = given_data['Age']
    X[:, 4] = given_data['Parch']
    X[:, 5] = given_data['SibSp']
    X[:, 6] = given_data['FamilyNum']

    Y = given_data['Survived']
    return (X, Y)

In [21]:
# Model 6: fit and score on the training data, adding FamilyNum to the Model 5 predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass_age_parch_sibsp_familynum)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  540.0
Number of correct predictions on used test data   is  176.0
Training Prediction score is 0.8083832335329342 
Cross validated measure is 0.8054314891706879 
Test set measures is 0.7892376681614349


### Model 7 : Use Sex, Age, Pclass, Parch, SibSp, FamilyNum, IsAlone


In [22]:
# IsAlone is True exactly where FamilyNum equals 1.
train_data['IsAlone'] = train_data['FamilyNum'].eq(1)
test_data['IsAlone'] = test_data['FamilyNum'].eq(1)

In [23]:
def get_data_X_Y_sex_pclass_age_parch_sibsp_familynum_isalone(given_data):
    """Build the feature matrix and target for the model that adds IsAlone.

    Parameters
    ----------
    given_data : pandas.DataFrame
        Must contain the columns 'Sex', 'Pclass', 'Age', 'Parch', 'SibSp',
        'FamilyNum', 'IsAlone' and 'Survived'.

    Returns
    -------
    (X, Y) : tuple
        ``X`` is a float array of shape ``(len(given_data), 8)`` with columns
        [Sex == 'female', Pclass == 1, Pclass == 2, Age, Parch, SibSp,
        FamilyNum, IsAlone]; ``Y`` is the 'Survived' column. Missing Age
        values are copied through unchanged.
    """
    pclass = given_data['Pclass']

    X = np.zeros(shape=(len(given_data), 8))
    # Direct comparisons give the same indicators as the matching
    # pd.get_dummies columns, but do not raise KeyError when a category is
    # absent from the frame.
    X[:, 0] = given_data['Sex'] == 'female'
    X[:, 1] = pclass == 1
    X[:, 2] = pclass == 2
    X[:, 3] = given_data['Age']
    X[:, 4] = given_data['Parch']
    X[:, 5] = given_data['SibSp']
    X[:, 6] = given_data['FamilyNum']
    X[:, 7] = given_data['IsAlone']

    Y = given_data['Survived']
    return (X, Y)

In [24]:
# Model 7: fit and score on the training data, adding IsAlone to the Model 6 predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass_age_parch_sibsp_familynum_isalone)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  541.0
Number of correct predictions on used test data   is  176.0
Training Prediction score is 0.8098802395209581 
Cross validated measure is 0.8099315452811131 
Test set measures is 0.7892376681614349


### Model 8 : Use Sex, Age, Pclass, Parch, SibSp, FamilyNum, IsAlone, FarePerPerson


In [25]:
# FarePerPerson is Fare divided by FamilyNum, row by row.
train_data['FarePerPerson'] = train_data['Fare'].div(train_data['FamilyNum'])
test_data['FarePerPerson'] = test_data['Fare'].div(test_data['FamilyNum'])

In [26]:
 def get_data_X_Y_sex_pclass_age_parch_sibsp_familynum_isalone_fare(given_data):
     """Build the feature matrix and target for the model that adds FarePerPerson.

     Parameters
     ----------
     given_data : pandas.DataFrame
         Must contain the columns 'Sex', 'Pclass', 'Age', 'Parch', 'SibSp',
         'FamilyNum', 'IsAlone', 'FarePerPerson' and 'Survived'.

     Returns
     -------
     (X, Y) : tuple
         ``X`` is a float array of shape ``(len(given_data), 9)`` with columns
         [Sex == 'female', Pclass == 1, Pclass == 2, Age, Parch, SibSp,
         FamilyNum, IsAlone, FarePerPerson]; ``Y`` is the 'Survived' column.
         Missing Age values are copied through unchanged.
     """
     pclass = given_data['Pclass']

     X = np.zeros(shape=(len(given_data), 9))
     # Direct comparisons give the same indicators as the matching
     # pd.get_dummies columns, but do not raise KeyError when a category is
     # absent from the frame.
     X[:, 0] = given_data['Sex'] == 'female'
     X[:, 1] = pclass == 1
     X[:, 2] = pclass == 2
     X[:, 3] = given_data['Age']
     X[:, 4] = given_data['Parch']
     X[:, 5] = given_data['SibSp']
     X[:, 6] = given_data['FamilyNum']
     X[:, 7] = given_data['IsAlone']
     X[:, 8] = given_data['FarePerPerson']

     Y = given_data['Survived']
     return (X, Y)

In [27]:
# Model 8: fit and score on the training data, adding FarePerPerson to the Model 7 predictors.
train_validate_test_given_data(train_data, get_data_X_Y_sex_pclass_age_parch_sibsp_familynum_isalone_fare)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  540.0
Number of correct predictions on used test data   is  176.0
Training Prediction score is 0.8083832335329342 
Cross validated measure is 0.8084390079676803 
Test set measures is 0.7892376681614349


### Conclusion

It is slightly tricky here, but given our preference for simpler models, and since the later models add very little to the cross-validation score, we will go with Model 5. We will also see how it performs on the out-of-sample data and generate an output file for Kaggle.

In [28]:
# Chosen model (Model 5): refit, then evaluate on test_data and write the Kaggle file.
train_validate_test_given_data(
    train_data,
    get_data_X_Y_sex_pclass_age_parch_sibsp,
    generate_out_of_sample=True,
    test_data=test_data,
    generate_kaggle_file=True,
)

Length of full training set is  891
Length of used training set is  668
Length of used test set is  223
Number of correct predictions on used training data   is  540.0
Number of correct predictions on used test data   is  177.0
Training Prediction score is 0.8083832335329342 
Cross validated measure is 0.8054314891706879 
Test set measures is 0.7937219730941704
Fit score obtained  :(logistic regression) 0.763157894736842
Number of correct entries  :(logistic regression) 319
Total number of  entries  :(logistic regression) 418
AUC obtained  :(logistic regression) 0.7438169425511197
