In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from binary_classifier_wrappers import KfoldBinaryClassifierWrapper
from metric_wrappers import RSquare, AUC, RMSE

In [2]:
titanic = pd.read_csv("./data/train.csv")

In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
titanic["Age"].fillna(titanic["Age"].median(), inplace=True)

In [5]:
titanic = titanic.drop(['PassengerId','Name','Ticket', 'Cabin', 'Embarked'], axis=1)

In [6]:
all_feature_names = titanic.columns.tolist()
all_feature_names.remove('Survived')
all_feature_names

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']

In [7]:
categorical_feature_names = ['Pclass', 'Sex']

In [8]:
label_name = 'Survived'

In [9]:
lr_model = LogisticRegression()
svn_model = SVC(probability = True)
rf_model = RandomForestClassifier()

In [10]:
model_dict = {'Logistic Regression': lr_model,
             'SVM': svn_model,
             'Random Forest': rf_model}

In [11]:
k_fold_binary = KfoldBinaryClassifierWrapper(titanic, label_name, 
                                             all_feature_names, categorical_feature_names, k = 5)

In [12]:
model_performance_table = pd.DataFrame(index = range(len(model_dict)), columns = ['Model', 'AUC', 'r^2', 'RMSE'])

In [13]:
for n, name in enumerate(model_dict.keys()):
    k_fold_binary.set_model(model_dict[name])
    pred_result = k_fold_binary.run()
    
    model_performance_table.ix[n,'Model'] = name
    model_performance_table.ix[n,'AUC'] = AUC.measure(pred_result.label, pred_result.pred_prob)
    model_performance_table.ix[n,'r^2'] = RSquare.measure(pred_result.label, pred_result.pred_prob)
    model_performance_table.ix[n,'RMSE'] = RMSE.measure(pred_result.label, pred_result.pred_prob)

In [14]:
model_performance_table = model_performance_table.sort_values(by='AUC', ascending=False).reset_index(drop=True)
model_performance_table

Unnamed: 0,Model,AUC,r^2,RMSE
0,Logistic Regression,0.840779,0.385932,0.381104
1,Random Forest,0.835837,0.361881,0.397521
2,SVM,0.795862,0.25899,0.418909
