In [91]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import fbeta_score, accuracy_score
from time import time
from sklearn.metrics import make_scorer

In [92]:
training_set = pd.read_csv('titanic_data/train.csv')
test_set = pd.read_csv('titanic_data/test.csv')
training_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [93]:
print(training_set.shape)
print(test_set.shape)

(891, 12)
(418, 11)


In [94]:
test_set.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


We should consider dropping PassengerId, Name, Ticket Number as these won't influence survival

In [95]:
training_set = training_set.drop(['PassengerId', 'Name', 'Ticket'], axis =1)
test_set = test_set.drop(['Name', 'Ticket'], axis=1)
print(training_set.shape)
print(test_set.shape)

(891, 9)
(418, 9)


In [96]:
training_set.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object

Cabin, Embarked and Sex are of type object, so we should convert the categorical variables into numericals
However, we should first try check their null counts

In [97]:
print("Null values in Sex:", training_set['Sex'].isnull().sum())
print("Null values in Cabin:", training_set['Cabin'].isnull().sum())
print("Null values in Embarked:", training_set['Embarked'].isnull().sum())

Null values in Sex: 0
Null values in Cabin: 687
Null values in Embarked: 2


We observe that 687 out of the 819 entries in Cabin are null, so we should ideally drop the column

In [98]:
training_set = training_set.drop(['Cabin'], axis =1)
test_set = test_set.drop(['Cabin'], axis=1)
print(training_set.shape)
print(test_set.shape)
print("Training Set")
print(training_set.head())
print("Test Set")
print(test_set.head())

(891, 8)
(418, 8)
Training Set
   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S
Test Set
   PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0          892       3    male  34.5      0      0   7.8292        Q
1          893       3  female  47.0      1      0   7.0000        S
2          894       2    male  62.0      0      0   9.6875        Q
3          895       3    male  27.0      0      0   8.6625        S
4          896       3  female  22.0      1      1  12.2875        S


The Sex and Embarked attributes should now be One - hot encoded
But Embarked Attribute has 2 null values, so ideally we should fill those up
We should also check if any null values exist in the test set

In [99]:
print("Test set null counters")
print("Null values in Sex:", test_set['Sex'].isnull().sum())
print("Null values in Embarked:", test_set['Embarked'].isnull().sum())

Test set null counters
Null values in Sex: 0
Null values in Embarked: 0


Cool, No null values in test set, so we should ideally check out how values for each category exist in the embarked column

In [100]:
training_set['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

Since S dominates, we would fill up the 2 rows with Null values with 'S'

In [101]:
training_set['Embarked'] = training_set['Embarked'].fillna('S')
print("Null values in Embarked:", training_set['Embarked'].isnull().sum())

Null values in Embarked: 0


Now it's time to hot encode Sex and Embarked

In [102]:
training_set = pd.get_dummies(training_set, columns=["Sex", "Embarked"], prefix=["Sex", "Embarked"])
training_set.head()

test_set = pd.get_dummies(test_set, columns=["Sex", "Embarked"], prefix=["Sex", "Embarked"])
test_set.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,0,1,0,1,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1
2,894,2,62.0,0,0,9.6875,0,1,0,1,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1
4,896,3,22.0,1,1,12.2875,1,0,0,0,1


Now we can safely remove 1 column corresponding to each attribute to address the curse of dimensionality
Deliberately removing the column with Q since it has the minimum count

In [103]:
print("Before")
print(training_set.head())
print(test_set.head())
print("After")
training_set = training_set.drop(['Sex_female', 'Embarked_Q'], axis =1)
test_set = test_set.drop(['Sex_female', 'Embarked_Q'], axis = 1)
print(training_set.head())
print(test_set.head())

Before
   Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \
0         0       3  22.0      1      0   7.2500           0         1   
1         1       1  38.0      1      0  71.2833           1         0   
2         1       3  26.0      0      0   7.9250           1         0   
3         1       1  35.0      1      0  53.1000           1         0   
4         0       3  35.0      0      0   8.0500           0         1   

   Embarked_C  Embarked_Q  Embarked_S  
0           0           0           1  
1           1           0           0  
2           0           0           1  
3           0           0           1  
4           0           0           1  
   PassengerId  Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  \
0          892       3  34.5      0      0   7.8292           0         1   
1          893       3  47.0      1      0   7.0000           1         0   
2          894       2  62.0      0      0   9.6875           0         1   


In [104]:
training_set.dtypes

Survived        int64
Pclass          int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Sex_male        uint8
Embarked_C      uint8
Embarked_S      uint8
dtype: object

Let's check Age and Fare's correlation with survival

In [105]:
training_set['AgeBand'] = pd.cut(training_set['Age'], 5)
training_set[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)

Unnamed: 0,AgeBand,Survived
0,"(0.34, 16.336]",0.55
1,"(16.336, 32.252]",0.369942
2,"(32.252, 48.168]",0.404255
3,"(48.168, 64.084]",0.434783
4,"(64.084, 80.0]",0.090909


We will create Ageband features since age is highly correlated to survival

In [106]:
training_set.loc[training_set['Age'] <= 16, 'Age'] = 0
training_set.loc[(training_set['Age'] > 16) & (training_set['Age'] <= 32), 'Age'] = 1
training_set.loc[(training_set['Age'] > 32) & (training_set['Age'] <= 48), 'Age'] = 2
training_set.loc[(training_set['Age'] > 48) & (training_set['Age'] <= 64), 'Age'] = 3
training_set.loc[ training_set['Age'] > 64, 'Age']
training_set.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_S,AgeBand
0,0,3,1.0,1,0,7.25,1,0,1,"(16.336, 32.252]"
1,1,1,2.0,1,0,71.2833,0,1,0,"(32.252, 48.168]"
2,1,3,1.0,0,0,7.925,0,0,1,"(16.336, 32.252]"
3,1,1,2.0,1,0,53.1,0,0,1,"(32.252, 48.168]"
4,0,3,2.0,0,0,8.05,1,0,1,"(32.252, 48.168]"


In [107]:
test_set.loc[test_set['Age'] <= 16, 'Age'] = 0
test_set.loc[(test_set['Age'] > 16) & (test_set['Age'] <= 32), 'Age'] = 1
test_set.loc[(test_set['Age'] > 32) & (test_set['Age'] <= 48), 'Age'] = 2
test_set.loc[(test_set['Age'] > 48) & (test_set['Age'] <= 64), 'Age'] = 3
test_set.loc[ test_set['Age'] > 64, 'Age']
test_set.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_S
0,892,3,2.0,0,0,7.8292,1,0,0
1,893,3,2.0,1,0,7.0,0,0,1
2,894,2,3.0,0,0,9.6875,1,0,0
3,895,3,1.0,0,0,8.6625,1,0,1
4,896,3,1.0,1,1,12.2875,0,0,1


In [108]:
training_set = training_set.drop(['AgeBand'], axis = 1)

We will similarly check for Fare but we will first replace it's null values as observed above with median

In [109]:
training_set['Fare'].fillna(training_set['Fare'].dropna().median(), inplace=True)
training_set.head()

test_set['Fare'].fillna(test_set['Fare'].dropna().median(), inplace=True)
test_set.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_C,Embarked_S
0,892,3,2.0,0,0,7.8292,1,0,0
1,893,3,2.0,1,0,7.0,0,0,1
2,894,2,3.0,0,0,9.6875,1,0,0
3,895,3,1.0,0,0,8.6625,1,0,1
4,896,3,1.0,1,1,12.2875,0,0,1


In [110]:
training_set['FareBand'] = pd.qcut(training_set['Fare'], 4)
training_set[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

Unnamed: 0,FareBand,Survived
0,"(-0.001, 7.91]",0.197309
1,"(7.91, 14.454]",0.303571
2,"(14.454, 31.0]",0.454955
3,"(31.0, 512.329]",0.581081


In [111]:
training_set.loc[ training_set['Fare'] <= 7.91, 'Fare'] = 0
training_set.loc[(training_set['Fare'] > 7.91) & (training_set['Fare'] <= 14.454), 'Fare'] = 1
training_set.loc[(training_set['Fare'] > 14.454) & (training_set['Fare'] <= 31), 'Fare']   = 2
training_set.loc[ training_set['Fare'] > 31, 'Fare'] = 3
training_set['Fare'] = training_set['Fare'].astype(int)

test_set.loc[ test_set['Fare'] <= 7.91, 'Fare'] = 0
test_set.loc[(test_set['Fare'] > 7.91) & (test_set['Fare'] <= 14.454), 'Fare'] = 1
test_set.loc[(test_set['Fare'] > 14.454) & (test_set['Fare'] <= 31), 'Fare']   = 2
test_set.loc[ test_set['Fare'] > 31, 'Fare'] = 3
test_set['Fare'] = test_set['Fare'].astype(int)

In [112]:
training_set = training_set.drop(['FareBand'], axis=1)

In [129]:
training_set['Age'].fillna(training_set['Age'].dropna().median(), inplace=True)
test_set['Age'].fillna(test_set['Age'].dropna().median(), inplace=True)

We will combine the Parch and SibSp attribute to create a Family Attribute

In [114]:
training_set['FamilySize'] = training_set['SibSp'] + training_set['Parch'] + 1
test_set['FamilySize'] = test_set['SibSp'] + test_set['Parch'] + 1
#Now I will drop the SibSp and Parch attributes
training_set = training_set.drop('SibSp', axis=1)
test_set = test_set.drop('SibSp', axis=1)
training_set = training_set.drop('Parch', axis=1)
test_set = test_set.drop('Parch', axis=1)

In [115]:
classVar = training_set['Survived']
data = training_set.drop('Survived', axis = 1)

In [116]:
X_train, X_test, y_train, y_test = train_test_split(data, classVar, test_size=0.2, random_state=42)
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 712 samples.
Testing set has 179 samples.


In [117]:
def train_predict(learner, X_train, y_train, X_test, y_test): 
    results = {}
    learner.fit(X_train, y_train)
    preds = learner.predict(X_test)
        
    #Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test, preds)
    
    #Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, preds, beta=0.5)
        
    # Return the results
    return results

We will use Naive Bayes', Linear SVM and Random Forest to check which performs best

In [118]:
clf_NB = GaussianNB()
clf_SV = LinearSVC(random_state=0)
clf_RF = RandomForestClassifier(random_state=0)

results = {}
for clf in [clf_NB, clf_SV, clf_RF]:
    clf_name = clf.__class__.__name__
    results[clf_name] = \
    train_predict(clf, X_train, y_train, X_test, y_test)

In [119]:
results

{'GaussianNB': {'acc_test': 0.77653631284916202,
  'f_test': 0.71596244131455411},
 'LinearSVC': {'acc_test': 0.77653631284916202, 'f_test': 0.73699421965317913},
 'RandomForestClassifier': {'acc_test': 0.8044692737430168,
  'f_test': 0.76502732240437155}}

Random Forest seems to be performing the best, hence we will try Grid Search on it

In [120]:
clfRBIdeal = RandomForestClassifier()

parameters = {'n_estimators': [10, 50, 100],
             'min_samples_split': [2,10,20,50],
             'max_features': ['sqrt', 'log2', 'auto']}

scorer = make_scorer(fbeta_score, beta=0.5)

grid_obj = GridSearchCV(clfRBIdeal, parameters, scoring=scorer)

grid_fit = grid_obj.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

predictions = (clfRBIdeal.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

Now, we will check the accuracies and also find the ideal set of hyper parameters

In [121]:
print("Unoptimized model")
print("Accuracy score on testing data: {:.10f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.10f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("Optimized Model")
print("Final accuracy score on the testing data: {:.10f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.10f}".format(fbeta_score(y_test, best_predictions, beta = 0.5)))

Unoptimized model
Accuracy score on testing data: 0.8100558659
F-score on testing data: 0.7702702703
Optimized Model
Final accuracy score on the testing data: 0.8044692737
Final F-score on the testing data: 0.7748538012


The test accuracy is surprisingly lower for the best model

In [122]:
clfRBIdeal

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [123]:
best_clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [124]:
test_set.head()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Sex_male,Embarked_C,Embarked_S,FamilySize
0,892,3,2.0,0,1,0,0,1
1,893,3,2.0,0,0,0,1,2
2,894,2,3.0,1,1,0,0,1
3,895,3,1.0,1,1,0,1,1
4,896,3,1.0,1,0,0,1,3


In [125]:
passengerId = test_set['PassengerId']
test_set.drop('PassengerId', axis = 1, inplace = True)

In [139]:
pred = best_clf.predict(test_set)

In [140]:
pred.shape

(418,)

In [141]:
submission = pd.DataFrame({"PassengerId": passengerId,"Survived": pred})
submission.to_csv('titanic.csv', index=False)