In [48]:
import requests
import numpy as np
import pandas as pd
import sklearn.ensemble as sk
import sklearn
import statsmodels.api as sm
import matplotlib.pyplot as plt
import itertools
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.grid_search import GridSearchCV
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [76]:
# Load the training data from the working directory and preview its last rows.
train=pd.read_csv('train.csv')
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [77]:
# (rows, columns) of the raw training frame.
train.shape

(891, 12)

In [78]:
# Per-column dtypes of the raw training frame as inferred by read_csv.
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [81]:
# Integer code for each known title, keyed by the five characters that follow
# the first comma of a passenger name (e.g. "Braund, Mr. Owen" -> ' Mr. ').
title_dic={' Mr. ':0, ' Mrs.':1, ' Miss':2, ' Mast':3, ' Don.':4, ' Rev.':5, ' Dr. ':6,
       ' Mme.':7, ' Ms. ':8, ' Majo':9, ' Lady':10, ' Sir.':11, ' Mlle':12, ' Col.':13,
       ' Capt':14, ' the ':15, ' Jonk':16, ' Dona':17}

# Code assigned to names whose title is absent from title_dic or cannot be located.
UNKNOWN_TITLE = -1


def _title_code(name):
    """Return the title_dic code for one passenger name, or UNKNOWN_TITLE."""
    if pd.isnull(name):
        return UNKNOWN_TITLE
    comma = name.find(',')
    if comma < 0:
        return UNKNOWN_TITLE
    return title_dic.get(name[comma + 1:comma + 6], UNKNOWN_TITLE)


def add_title(train):
    """Add an integer ``Title`` column derived from the ``Name`` column.

    The title is identified by the five characters following the first comma
    of each name and translated through ``title_dic``.  Names that are
    missing, contain no comma, or carry a title that is not in ``title_dic``
    receive ``UNKNOWN_TITLE`` instead of raising.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``Name`` column.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, now carrying the ``Title`` column.
    """
    # A list (rather than a bare map()) assigns cleanly on Python 2 and 3 alike.
    train['Title'] = [_title_code(name) for name in train['Name']]
    return train

def filling_age(train):
    """Impute missing ages and encode the ``Sex`` and ``Cabin`` columns.

    Missing ``Age`` values are replaced by the mean age of the rows sharing
    the same ``Title`` code; ages that are still missing afterwards (a title
    whose ages are all unknown) fall back to the overall mean age.  Missing
    ``Sex`` and ``Cabin`` values become ``'Missing'``; ``Sex`` is then mapped
    to an integer code and ``Cabin`` is reduced to its first character.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``PassengerId``, ``Title``, ``Age``, ``Sex`` and
        ``Cabin`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by ``PassengerId`` with a fresh 0..n-1 index.
        Rows whose ``PassengerId`` is missing are dropped.  The input frame
        is left unmodified.

    Raises
    ------
    KeyError
        If ``Sex`` holds a label other than ``'male'``, ``'female'`` or a
        missing value.
    """
    sex_dic = {'male': 0, 'female': 1, 'Missing': 2}

    train = train.copy()

    # Per-row mean age of the row's title group.  Computing it with a grouped
    # transform avoids merging a lookup table into the frame, so no filler
    # rows are created and integer columns keep their dtype.
    title_mean_age = train.groupby('Title')['Age'].transform('mean')
    train['Age'] = train['Age'].fillna(title_mean_age)

    # Rows without an id are discarded, the rest ordered by id.  argsort is
    # used because the DataFrame sorting method name differs between pandas
    # releases.
    train = train.dropna(subset=['PassengerId'])
    by_id = np.argsort(train['PassengerId'].values, kind='mergesort')
    train = train.iloc[by_id].reset_index(drop=True)

    # Fallback for titles that had no known age at all.
    train['Age'] = train['Age'].fillna(train['Age'].mean())

    # Dictionary indexing (not .map) keeps the KeyError on unexpected labels.
    train['Sex'] = [sex_dic[label] for label in train['Sex'].fillna('Missing')]
    train['Cabin'] = train['Cabin'].fillna('Missing').str[0]

    return train

def dropping(train):
    """Return a copy of ``train`` without the ``Name``, ``Ticket`` and ``Cabin`` columns.

    Raises ``KeyError`` (from pandas) if any of the three columns is absent.
    """
    unused_columns = ['Name', 'Ticket', 'Cabin']
    return train.drop(unused_columns, axis=1)

def cleaning(train):
    """Run the whole preprocessing pipeline on a raw passenger frame.

    Steps, in order:

    1. ``add_title`` / ``filling_age`` / ``dropping`` (title code, age
       imputation, ``Sex`` encoding, removal of ``Name``/``Ticket``/``Cabin``).
    2. ``PassengerId`` is cast to ``int``.
    3. Missing ``Embarked`` values take the most frequent port, then the
       column is replaced by ``Port_*`` indicator columns.  ``Port_C``,
       ``Port_Q`` and ``Port_S`` are always present (all zeros when a port
       does not occur) so that different frames yield the same feature
       columns.
    4. Fares that are missing or not positive are replaced by the median of
       the positive fares in the same ``Pclass``.
    5. The helper ``Title`` column is dropped.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame as read from the CSV file.  ``add_title`` writes a
        ``Title`` column into this object.

    Returns
    -------
    pandas.DataFrame
        The cleaned, fully numeric frame.
    """
    train = add_title(train)
    train = filling_age(train)
    train = dropping(train)
    train['PassengerId'] = train['PassengerId'].astype(int)

    # --- Embarked -> indicator columns -------------------------------------
    embarked = train['Embarked']
    port_mode = embarked.mode()
    if len(port_mode):  # mode() is empty when every value is missing
        embarked = embarked.fillna(port_mode.iloc[0])
    dummies = pd.get_dummies(embarked, prefix='Port')
    # Guarantee the three known ports regardless of which ones occur here.
    for port_col in ('Port_C', 'Port_Q', 'Port_S'):
        if port_col not in dummies.columns:
            dummies[port_col] = 0
    dummies = dummies[sorted(dummies.columns)]
    train = pd.concat([train.drop('Embarked', axis=1), dummies], axis=1)

    # --- Fare imputation (vectorised) --------------------------------------
    has_fare = train['Fare'] > 0  # False for NaN as well as for 0 / negatives
    median_fare_by_class = train.loc[has_fare].groupby('Pclass')['Fare'].median()
    train['Fare'] = train['Fare'].where(
        has_fare, train['Pclass'].map(median_fare_by_class))

    train = train.drop('Title', axis=1)

    return train

In [82]:
# Replace the raw frame with its cleaned version.  Not idempotent: cleaning()
# reads and then drops `Name`, so re-running this cell on the cleaned frame fails.
train=cleaning(train)

In [83]:
# Preview the last rows of the cleaned training frame.
train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Port_C,Port_Q,Port_S
886,887,0,2,0,27.0,0,0,13.0,0,0,1
887,888,1,1,1,19.0,0,0,30.0,0,0,1
888,889,0,3,1,21.773973,1,2,23.45,0,0,1
889,890,1,1,0,26.0,0,0,30.0,1,0,0
890,891,0,3,0,32.0,0,0,7.75,0,1,0


In [84]:
# Keep a second reference (not a copy) to the cleaned DataFrame under a name
# that later cells do not reuse; the modelling cells below read from `_train`.
_train=train

# Display the cleaned frame.
train
---

In [85]:
# Feature matrix and target vector as NumPy arrays.  The first two columns of
# the cleaned frame (PassengerId, Survived) are not features.
# `.values` is used instead of `.as_matrix()`: it returns the same array and
# is available on old and new pandas releases alike.
X = train[train.columns[2:]].values
y = train['Survived'].values

In [86]:
# 6-fold stratified cross-validation of a default random forest; accuracy is
# pooled over all held-out folds.
cv = StratifiedKFold(y, n_folds=6)
tot_correct, tot_obs = 0, 0

# The fold index arrays get their own names so that the loop does not
# overwrite the `train` DataFrame (or a `test` frame) with integer indices.
for train_idx, test_idx in cv:
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    model = sk.RandomForestClassifier(random_state=321)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    tot_correct += (y_test == y_pred).sum()
    tot_obs += len(y_test)

# Single-argument print(...) behaves identically on Python 2 and 3.
print('accuracy: %f' % (tot_correct * 1.0 / tot_obs))

accuracy: 0.799102


In [87]:
# Fit a default random forest on all training rows and rank the features by
# importance, most important first.
x_cols = _train.columns[2:]
model = sk.RandomForestClassifier(random_state=321)
model.fit(X, y)

importances = pd.Series(model.feature_importances_, index=x_cols)
# Series.order() is not available in later pandas releases; an explicit
# descending argsort produces the same ranking on every version.
descending = np.argsort(-importances.values, kind='mergesort')
feature_rank = importances.iloc[descending]
feature_rank

Fare      0.270450
Sex       0.267859
Age       0.257774
Pclass    0.083892
SibSp     0.052189
Parch     0.039487
Port_C    0.012654
Port_S    0.008586
Port_Q    0.007109
dtype: float64

In [88]:
# Exhaustive search over random-forest hyper-parameters, scored with 12-fold
# cross-validation on the full training set.
param_grid = {
    "n_estimators": [40,50,60],
    "criterion": ["gini", "entropy"],
    'max_features': [0.4,0.5,0.6, "sqrt"],
    'max_depth': [8,9,10,None],
}

model = sk.RandomForestClassifier(random_state=321)
grid_search = GridSearchCV(model, param_grid, cv=12, verbose=0)
grid_search.fit(X, y)

# Single-argument print(...) behaves identically on Python 2 and 3.
print(grid_search.best_score_)
print(grid_search.best_params_)

0.840628507295
{'max_features': 0.6, 'n_estimators': 50, 'criterion': 'gini', 'max_depth': 8}


In [89]:
# Load the unlabeled test data and run it through the same cleaning pipeline
# as the training data.
test = pd.read_csv('test.csv')
test = cleaning(test)
# Single-argument print(...) behaves identically on Python 2 and 3.
print(test.shape)
test.tail()

(418, 10)


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Port_C,Port_Q,Port_S
413,1305,3,0,32.0,0,0,8.05,0,0,1
414,1306,1,1,39.0,0,0,108.9,1,0,0
415,1307,3,0,38.5,0,0,7.25,0,0,1
416,1308,3,0,32.0,0,0,8.05,0,0,1
417,1309,3,0,7.406471,1,1,22.3583,1,0,0


In [90]:
# Train the final model on all training rows and predict survival for the
# test passengers.
# NOTE: the hyper-parameters below are hard-coded; they are not read from
# grid_search.best_params_ and differ from the optimum printed by the
# grid-search cell in the recorded run.

X_train = _train[x_cols].values
y_train = _train['Survived'].values

model = sk.RandomForestClassifier(
    n_estimators=100, 
    criterion='entropy', 
    max_features=0.5, 
    max_depth=9,
    random_state=321,
)

model.fit(X_train, y_train)

# Same feature columns, in the same order, as used for training.
X_test = test[x_cols].values
y_pred = model.predict(X_test).astype(int)

test['Survived'] = y_pred

# Two-column submission file: PassengerId, Survived.
final_df = test[['PassengerId', 'Survived']]
final_df.to_csv('predicted.csv', index=False)
print('boom.')

boom.
