In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score
# With the goal above, I will import just what I need. 
# The model to use (I already imported it above, but will do it again here so each example is self-contained)
from sklearn.ensemble import RandomForestRegressor
# The error metric. In this case, we will use c-stat (aka ROC/AUC)
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
import math




In [2]:
%matplotlib inline

In [3]:

#reading the data from the disk into memory
df = pd.read_csv("train.csv")

In [4]:
#Just a reminder, here are all the column names
df.columns

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

In [5]:
#I'm going to create a new dataframe and put 6 variables I'm going to be using into it.
X = pd.DataFrame()
X['age'] = df['Age']
X['survived'] = df['Survived']
X['Pclass'] = df['Pclass']
X['Embarked'] = df['Embarked']
X['Fare'] = df['Fare']
X['Sex'] = df['Sex']

In [6]:
#I'm going to drop missing values.   That's probably NOT the best strategy, but it's usually good to start simple and 
#build complexity as you go.
X = X.dropna(axis=0)

In [7]:
#survived will be my dependent variable, y.   I'll assign it to y and remove it from X
y = X['survived']
X = X.drop(['survived'], axis=1)

In [8]:
# The following code is from: http://brettromero.com/wordpress/data-science-kaggle-walkthrough-data-transformation-feature-extraction 
# Home made One Hot Encoding function
def convert_to_binary(df, column_to_convert):
    categories = list(df[column_to_convert].drop_duplicates())

    for category in categories:
        cat_name = str(category).replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_").replace("-", "").lower()
        col_name = column_to_convert[:5] + '_' + cat_name[:10]
        df[col_name] = 0
        df.loc[(df[column_to_convert] == category), col_name] = 1

    return df

In [9]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Use the above One Hot Encoding for the columns shown below
print("One Hot Encoding categorical data...")
columns_to_convert = ['Pclass', 'Embarked','Sex']

for column in columns_to_convert:
    X = convert_to_binary(df=X, column_to_convert=column)
    X.drop(column, axis=1, inplace=True)

One Hot Encoding categorical data...


In [11]:
#I chose only Pclas_3, Embar_q and Sex_male columns (i.e. delet the following)
X = X.drop(['Pclas_1'], axis=1)

In [12]:
X = X.drop(['Pclas_2'],axis=1)

In [13]:
X = X.drop(['Sex_female'], axis=1)

In [14]:
X = X.drop(['Embar_c'], axis=1)

In [15]:
X = X.drop(['Embar_s'], axis=1)

In [16]:
X.head()

Unnamed: 0,age,Fare,Pclas_3,Embar_q,Sex_male
0,22.0,7.25,1,0,1
1,38.0,71.2833,0,0,0
2,26.0,7.925,1,0,0
3,35.0,53.1,0,0,0
4,35.0,8.05,1,0,1


In [17]:
#remember to scale our features, as with linear regression
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X= scaler.fit_transform(X)

In [18]:
#build test and training sets (10% for testing, It could be overfiting)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [19]:
#This function looks for male, Pclass: 3 and Emarked:Q in the test set and returns 1, survived, otherwise it returns 0
def base_rate_model(X):
    y = np.zeros(X.shape[0])
    return y

In [20]:
#how accurate is my base rate model?
y_base_rate = base_rate_model(X_test)
from sklearn.metrics import accuracy_score
print("Base rate accuracy is %2.2f" % accuracy_score(y_test, y_base_rate))

Base rate accuracy is 0.58


In [21]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l2', C=1)

In [22]:
model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
print("Logistic accuracy is %2.2f" % accuracy_score(y_test,model.predict(X_test)))

Logistic accuracy is 0.82


In [24]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [25]:

print("---Base Model---")
#base rate AUC
base_roc_auc = roc_auc_score(y_test, base_rate_model(X_test))
print("Base Rate AUC = %2.2f" % base_roc_auc)
print(classification_report(y_test,base_rate_model(X_test) ))
print("\n\n---Logistic Model---")
#logistic AUC
logit_roc_auc = roc_auc_score(y_test, model.predict(X_test))
print("Logistic AUC = %2.2f" % logit_roc_auc)
print(classification_report(y_test, model.predict(X_test) ))

---Base Model---
Base Rate AUC = 0.50
             precision    recall  f1-score   support

          0       0.58      1.00      0.74        42
          1       0.00      0.00      0.00        30

avg / total       0.34      0.58      0.43        72



---Logistic Model---
Logistic AUC = 0.81
             precision    recall  f1-score   support

          0       0.82      0.88      0.85        42
          1       0.81      0.73      0.77        30

avg / total       0.82      0.82      0.82        72



  'precision', 'predicted', average, warn_for)


In [26]:
## Random Forest classifier for the training data sub data (subset of kaggel's training data)
### Grid Search
n_estimators = [100,150]   # beyond 100 it takes forever
max_features = ['auto', 'sqrt','log2']
min_samples_split = [3,5] #beyond 5 it takes forever


rfc = RandomForestClassifier(n_jobs=1)
#Parameters of pipelines can be set using ‘__’ separated parameter names:
estimator = GridSearchCV(rfc,
                         dict(n_estimators=n_estimators,
                              max_features=max_features,
                              min_samples_split=min_samples_split
                              ), cv=None, n_jobs=-1)

In [27]:
estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [3, 5], 'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [100, 150]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [28]:
best_rfc = estimator.best_estimator_

In [29]:
accuracy = accuracy_score(y_test, best_rfc.predict(X_test))
print ("Accuracy: ", accuracy)

('Accuracy: ', 0.81944444444444442)


In [30]:
y_hat = best_rfc.predict(X_test)

In [31]:
y_hat

array([1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0], dtype=int64)

In [32]:
correct = 0
total = y_test.shape[0]
for pred_val, truth_val in zip(y_hat, y_test):
    if pred_val == truth_val:
        correct +=1

print (correct / float(total))

0.819444444444


In [33]:
from sklearn.metrics import classification_report
print (classification_report(y_test, best_rfc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.82      0.88      0.85        42
          1       0.81      0.73      0.77        30

avg / total       0.82      0.82      0.82        72



In [34]:
from sklearn.metrics import confusion_matrix
print (confusion_matrix(y_test, best_rfc.predict(X_test)))

[[37  5]
 [ 8 22]]


In [35]:
roc = roc_auc_score(y_test, best_rfc.predict_proba(X_test)[:,1])
print ("AUC Score: ", roc)

('AUC Score: ', 0.87698412698412698)
