In [60]:
## Imports
import pandas as pd
import numpy as np

In [61]:
# Read data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")

In [62]:
# Combine training and test data
train_data['isTrain'] = 1
test_data['isTrain'] = 0
# Save outcomes and drop Survived from train
outcomes = train_data['Survived']; 
train_data = train_data.drop('Survived', axis=1)

# Finally join data
full_data = pd.concat([train_data, test_data])

In [63]:
# Look at the names, dimensions etc.
print full_data.columns
print full_data.shape

Index([u'PassengerId', u'Pclass', u'Name', u'Sex', u'Age', u'SibSp', u'Parch',
       u'Ticket', u'Fare', u'Cabin', u'Embarked', u'isTrain'],
      dtype='object')
(1309, 12)


In [64]:
# Missing values
print "Number of missing values in Age:", sum(pd.isnull(full_data['Age'])) 
print "Number of missing values in Pclass:", sum(pd.isnull(full_data['Pclass']))
print "Number of missing values in Sex:", sum(pd.isnull(full_data['Sex']))
print "Number of missing values in SibSp:", sum(pd.isnull(full_data['SibSp']))
print "Number of missing values in Parch:", sum(pd.isnull(full_data['Parch']))
print "Number of missing values in Fare:", sum(pd.isnull(full_data['Fare']))
print "Number of missing values in Cabin:", sum(pd.isnull(full_data['Cabin']))
print "Number of missing values in Embarked:", sum(pd.isnull(full_data['Embarked']))

Number of missing values in Age: 263
Number of missing values in Pclass: 0
Number of missing values in Sex: 0
Number of missing values in SibSp: 0
Number of missing values in Parch: 0
Number of missing values in Fare: 1
Number of missing values in Cabin: 1014
Number of missing values in Embarked: 2


In [65]:
# Impute missing values

# Na:Age --> med:Age
med_age = full_data['Age'].median()
full_data['Age'] = full_data['Age'].fillna(med_age)

# Na:Fare --> med:Fare
med_fare = full_data['Fare'].median()
full_data['Fare'] = full_data['Fare'].fillna(med_fare)

# Na:Embarked --> Most common value
groups = full_data.groupby('Embarked').size()
mostComm_Embarked = groups.idxmax()
full_data['Embarked'] = full_data['Embarked'].fillna(mostComm_Embarked)

In [66]:
# Too many passangers with missing cabin. Create a new feature has_cabin
full_data['has_cabin'] = pd.isnull(full_data['Cabin']).astype('int')
full_data = full_data.drop('Cabin', axis = 1)

In [67]:
# Check whether all na's are removed
pd.isnull(full_data).astype(int).sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
isTrain        0
has_cabin      0
dtype: int64

In [68]:
## Factor variables
Sex_factor = pd.get_dummies(full_data['Sex'], prefix='Sex', drop_first=True)
Embarked_factor = pd.get_dummies(full_data['Embarked'], prefix='Emb')

full_data = pd.concat([full_data, Sex_factor, Embarked_factor], axis = 1)

# Drop features that are not useful
full_data = full_data.drop(['Sex', 'Embarked', 'Name', 'Ticket'], axis=1)

In [69]:
# Standardize numeric features
for feat in ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']:
    mean, std = full_data[feat].mean(), full_data[feat].std()
    full_data.loc[:, feat] = (full_data[feat]-mean)/std

In [70]:
# Split back into train and test
train_data = full_data.loc[full_data['isTrain']==1]
test_data = full_data.loc[full_data['isTrain']==0] 

# Drop PassangerId, not needed for training
train_data = train_data.drop('PassengerId', axis = 1)

In [71]:
## Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

clf = LogisticRegression()
Crng = np.logspace(-2, 2, 40)
grid = GridSearchCV(clf, param_grid={'C': Crng}, scoring='roc_auc', cv = 10)
#scores = cross_val_score(classifier, train_data, outcomes , scoring = 'roc_auc', cv = 10)

## Fit the best model
grid.fit(train_data, outcomes)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-02,   1.26638e-02,   1.60372e-02,   2.03092e-02,
         2.57191e-02,   3.25702e-02,   4.12463e-02,   5.22335e-02,
         6.61474e-02,   8.37678e-02,   1.06082e-01,   1.34340e-01,
         1.70125e-01,   2.15443e-01,   2.72833e-01,   3.45511e-01,
         4.3754...,   3.07029e+01,   3.88816e+01,
         4.92388e+01,   6.23551e+01,   7.89652e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [74]:
print ("Best C:", "{:.6f}".format(grid.best_params_['C']), "Best AUC:", \
       "{:.6f}".format(grid.best_score_))

('Best C:', '0.554102', 'Best AUC:', '0.854875')


In [79]:
## Using the best model, prredict on the test set
pred_test = grid.best_estimator_.predict(test_data.drop('PassengerId', axis=1))
predictions = pd.DataFrame( { 'PassengerId': test_data['PassengerId'] , 'Survived': pred_test })

# Write to file
predictions.to_csv( 'titanic_pred.csv' , index = False )