In [27]:
import pandas as pd
import numpy as np
import random as rnd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [28]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [29]:
import time                                                
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [30]:
def titanic(train, test):
    train_df = pd.read_csv(train)
    test_df = pd.read_csv(test)
    combine = [train_df, test_df]

    train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
    test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
    combine = [train_df, test_df]

    for dataset in combine:
        dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

    pd.crosstab(train_df['Title'], train_df['Sex'])

    for dataset in combine:
        dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    for dataset in combine:
        dataset['Title'] = dataset['Title'].map(title_mapping)
        dataset['Title'] = dataset['Title'].fillna(0)

    train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
    test_df = test_df.drop(['Name'], axis=1)
    combine = [train_df, test_df]

    for dataset in combine:
        dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

    guess_ages = np.zeros((2,3))
    guess_ages

    for dataset in combine:
        for i in range(0, 2):
            for j in range(0, 3):
                guess_df = dataset[(dataset['Sex'] == i) & \
                                      (dataset['Pclass'] == j+1)]['Age'].dropna()

                # age_mean = guess_df.mean()
                # age_std = guess_df.std()
                # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)

                age_guess = guess_df.median()

                # Convert random age float to nearest .5 age
                guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

        for i in range(0, 2):
            for j in range(0, 3):
                dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                        'Age'] = guess_ages[i,j]

        dataset['Age'] = dataset['Age'].astype(int)

    for dataset in combine:
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1



    for dataset in combine:
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

    combine = [train_df, test_df]

    freq_port = train_df.Embarked.dropna().mode()[0]

    for dataset in combine:
        dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

    test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

    test_df.head(10)
    return [train_df,test_df]

In [35]:
train_total,test_total = titanic('/Users/arunabhsingh/Desktop/greyatom/titanic_whynot/csv/train.csv','/Users/arunabhsingh/Desktop/greyatom/titanic_whynot/csv/test.csv')
test_total.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone
0,892,3,0,34,0,0,7.8292,Q,1,1,1
1,893,3,1,47,1,0,7.0,S,3,2,0
2,894,2,0,62,0,0,9.6875,Q,1,1,1
3,895,3,0,27,0,0,8.6625,S,1,1,1
4,896,3,1,22,1,1,12.2875,S,3,3,0


In [12]:
train_total = pd.get_dummies(train_total)
test_total = pd.get_dummies(test_total)


In [13]:
X=train_total.iloc[:,1:]
y=train_total['Survived']
X_test = test_total.iloc[:,1:]l

In [14]:
X.shape


(891, 12)

In [15]:
test_total.columns

Index([u'PassengerId',      u'Pclass',         u'Sex',         u'Age',
             u'SibSp',       u'Parch',        u'Fare',       u'Title',
        u'FamilySize',     u'IsAlone',  u'Embarked_C',  u'Embarked_Q',
        u'Embarked_S'],
      dtype='object')

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X, y,
                            test_size=0.2, random_state=23)

In [17]:
len(X_train)

712

In [18]:
len(y_train)

712

In [19]:
#oobscore is used for out of bag checking. use a test fold along with all the kfolds and provide accuracy metrics
clf = RandomForestClassifier(oob_score=True)


In [20]:
#create a gridsearch object
def grid_obj_creator(classifier, parameters, X, y):
    acc_scorer = make_scorer(accuracy_score)
    grid_obj = GridSearchCV(classifier, parameters, scoring=acc_scorer)
    grid_obj = grid_obj.fit(X, y)
    return grid_obj

In [21]:
#plot mean scores for a particular grid object
def hp_cv_scores(grid_obj):
    grid_obj.cv_results_
    mean_test_scores = grid_obj.cv_results_['mean_test_score']
    mean_train_scores = grid_obj.cv_results_['mean_train_score']
    plt.figure(figsize=(10,6))
    
    param_values =[str(x) for x in grid_obj.param_grid.values()[0]]
    x = np.arange(1, len(param_values)+1)
    plt.plot(x,mean_train_scores,c='r')
    plt.xticks(x,param_values)
    plt.plot(x,mean_test_scores,c='g')
    plt.xlabel(grid_obj.param_grid.keys()[0])
    plt.ylabel('mean scores')
    plt.show()

In [57]:
# random1 =RandomForestClassifier(oob_score=True)
# grid1 = grid_obj_creator(random1,{'n_estimators':[2,4,8,16,32,64,128,256]},X_train,y_train)
# hp_cv_scores(grid1)

In [56]:
# grid2 = grid_obj_creator(random1,{'max_depth':[2,4,5,6,8]},X_train,y_train)
# hp_cv_scores(grid2)

In [55]:
# grid3 = grid_obj_creator(random1,{'max_features':['log2','sqrt','auto']},X_train,y_train)
# hp_cv_scores(grid3)

In [54]:
# grid4 = grid_obj_creator(random1,{'min_samples_split':[2,3,5]},X_train,y_train)
# hp_cv_scores(grid4)

In [53]:
# grid6 = grid_obj_creator(random1,{'min_samples_split':[2,3,5]},X_train,y_train)
# hp_cv_scores(grid6)

In [52]:
# grid5 = grid_obj_creator(random1,{'criterion': ['entropy', 'gini']},X_train,y_train)
# hp_cv_scores(grid5)

In [22]:
parameters = {'n_estimators':[10,100,300,400],'max_depth':[2,5,7,9], 'min_samples_leaf':[1,3,5]}

In [24]:
# prediction_model = RandomForestClassifier(oob_score=True, verbose=True, n_jobs=-1)
acc_scorer = make_scorer(accuracy_score)

In [25]:
@timeit
def run_grid():   
    grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
    grid_obj = grid_obj.fit(X_train, y_train)
    return grid_obj.best_estimator_

In [26]:
best_pred_model = run_grid()

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


'run_grid'  151807.82 ms


In [48]:
best_pred_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [104]:
y_pred = best_pred_model.predict(X_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 256 out of 256 | elapsed:    0.3s finished


In [105]:
accuracy_score(y_pred,y_test)

0.83240223463687146

In [106]:
X_total_test = test_total.iloc[:,1:]
y_pred = best_pred_model.predict(X_total_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 256 out of 256 | elapsed:    0.3s finished


In [107]:
best_pred_model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=256, n_jobs=-1, oob_score=True, random_state=None,
            verbose=True, warm_start=False)

In [97]:
rf_prediction = pd.DataFrame(y_pred, index=test_total['PassengerId'],columns=['Survived'])
rf_prediction=rf_prediction.reset_index()

rf_prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [98]:
rf_prediction.to_csv('prediction.csv',index=False)