In [407]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pandas import Series
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_mldata
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler

%matplotlib inline

import warnings
# warnings.filterwarnings('ignore') # To ingnore warnings entirely
warnings.filterwarnings(action="once") # To see warning only once
TITANIC_PATH = "/home/zhach/code/scikit_ml/datasets/titanic/"

In [336]:
class CustomBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lb = LabelBinarizer()
    def fit(self, X, y=None,**fit_params):
        return self.lb.fit(X)
    def transform(self, X):
        return self.lb.transform(X)

In [329]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [372]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        a = ["A" in s[0] for s in X]
        b = ["B" in s[0] for s in X]
        c = ["C" in s[0] for s in X]
        d = ["D" in s[0] for s in X]
        e = ["E" in s[0] for s in X]
        f = ["F" in s[0] for s in X]
        return np.c_[a, b, c, d, e, f]

In [373]:
tit_train = pd.read_csv(os.path.join(TITANIC_PATH, "train.csv"))
y_train = tit_train["Survived"]
X_train = tit_train.drop("Survived", axis=1)
tit_test = pd.read_csv(os.path.join(TITANIC_PATH, "test.csv"))
y_test = tit_test["Survived"]
X_test = tit_test.drop("Survived", axis=1)
print "Loaded Data into Program"

Loaded Data into Program


In [374]:
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 76.6+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [438]:
replacement_value = X_train['Age'].mean()
X_train.fillna({'Age': replacement_value,
                'Embarked': '',
                'Cabin': ''
               }, inplace=True)
X_test.fillna({'Age': replacement_value,
                'Embarked': '',
                'Cabin': ''
               }, inplace=True)

In [439]:
num_attribs = ["Age", "SibSp", "Parch", "Fare"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

cab_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Cabin"])),
    ('attribs_adder', CombinedAttributesAdder()),
])

sex_pipeline = Pipeline([
    ('selector', DataFrameSelector(['Sex'])),
    ('label_bin', CustomBinarizer()),
])

emb_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Embarked"])),
    ('label_bin', CustomBinarizer()),
])

pc_pipeline = Pipeline([
    ('selector', DataFrameSelector(["Pclass"])),
    ('label_bin', CustomBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ("cab_pipeline", cab_pipeline),
    ('sex_pipeline', sex_pipeline),
    ('emb_pipeline', emb_pipeline),
    ('pc_pipeline', pc_pipeline),
])

In [440]:
full_pipeline.fit(X_train)
X_train_prepared = full_pipeline.transform(X_train)
X_train_prepared = np.c_[X_train_prepared[:,:11], X_train_prepared[:,12:]]
print X_train_prepared[61]
print X_train.iloc[61]

[ 0.63878901 -0.4745452  -0.47367361  0.96235332  0.          1.          0.
  0.          0.          0.          0.          0.          0.          0.
  1.          0.          0.        ]
PassengerId                     62
Pclass                           1
Name           Icard, Miss. Amelie
Sex                         female
Age                             38
SibSp                            0
Parch                            0
Ticket                      113572
Fare                            80
Cabin                          B28
Embarked                          
Name: 61, dtype: object


In [399]:
def please_precision_recall_f1(clf, X, y, cv):
    y_pred = cross_val_predict(clf, X, y, cv=cv)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred, average="macro")
    return precision, recall, f1

In [443]:
for_clf = RandomForestClassifier()
sgd_clf = SGDClassifier()
knn_clf = KNeighborsClassifier()

print "Forest Accuracy:", cross_val_score(for_clf, X_train_prepared, y_train, cv=5, scoring="accuracy").mean()
print "SGD Accuracy", cross_val_score(sgd_clf, X_train_prepared, y_train, cv=5, scoring="accuracy").mean()
print "KNN Accuracy", cross_val_score(knn_clf, X_train_prepared, y_train, cv=5, scoring="accuracy").mean()
print "Forest PRF1:", please_precision_recall_f1(for_clf, X_train_prepared, y_train, 5)
print "SGD PRF1", please_precision_recall_f1(sgd_clf, X_train_prepared, y_train, 5)
print "KNN PRF1", please_precision_recall_f1(knn_clf, X_train_prepared, y_train, 5)

Forest Accuracy: 0.807040815494
SGD Accuracy 0.757545126635
KNN Accuracy 0.793582281215
Forest PRF1: (0.7682539682539683, 0.70760233918128657, 0.79145205479452052)
SGD PRF1 (0.69476744186046513, 0.69883040935672514, 0.75350599050882083)
KNN PRF1 (0.75320512820512819, 0.6871345029239766, 0.77776693343743908)


In [452]:
attribs = ["Age", "SibSp", "Parch", "Fare", 
           "Cabin A", "Cabin B", "Cabin C", "Cabin D", "Cabin E", "Cabin F", 
           "Male", 
           "Embarked from Southampton", "Embarked from Cherbourg", "Embarked from Queenstown",
          ]
X_test_prepared = full_pipeline.transform(X_test)
X_test_prepared = np.c_[X_test_prepared[:,:11], X_test_prepared[:,12:]]

for_clf.fit(X_train_prepared, y_train)
y_pred = for_clf.predict(X_test_prepared)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
print accuracy, precision, recall, f1
sorted(zip(for_clf.feature_importances_, attribs), reverse=True)

0.83014354067 0.787234042553 0.730263157895 0.813462058215


[(0.27647193769088296, 'Male'),
 (0.26211985207369026, 'Age'),
 (0.20487845611037009, 'Fare'),
 (0.044513145355716363, 'Parch'),
 (0.038162866249505205, 'SibSp'),
 (0.019903482247835148, 'Embarked from Southampton'),
 (0.017685115171905443, 'Cabin E'),
 (0.011163303853410505, 'Embarked from Queenstown'),
 (0.010492723188565384, 'Cabin B'),
 (0.0099369569529755546, 'Cabin D'),
 (0.0059505007837758628, 'Cabin F'),
 (0.0057655270975319854, 'Cabin C'),
 (0.004060507411502671, 'Embarked from Cherbourg'),
 (0.0035429694208949868, 'Cabin A')]