In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn import cross_validation
import pandas as pd
import re
import math
%matplotlib inline



### Strategy
* PassengerId - Remove 
* Cabin - Remove as 887 missing values and of low importance
* Pclass - Numerical, keep it as it is
* SibSp - Numerical, keep it as it is
* Parch - Numerical, keep it as it is
* Fare - Numerical, keep it as it is
* Sex - Categorical, creat dummy variables
* Embarked - Categorical, creat dummy variables, handle missing values
* Ticket, Keep first letter
* Age - Numerical, predict missing values
* Name - Find salutation from the name

In [2]:
X = pd.read_csv("data/train.csv")
y = X.pop("Survived")

In [3]:
def create_dummies(X, fields):
    for field in fields:
        X[field].fillna("Missing", inplace=True)
        dummies = pd.get_dummies(X[field], prefix=field)
        X = pd.concat([X, dummies], axis=1)
        X.drop([field], axis=1, inplace=True)
    return X

In [4]:
def fillna_with_prediction(X):
    for field in X:
        # If this field is not numeric
        if X[field].dtype == "object":
            continue
        # If this field does not contain null
        if not X[field].isnull().any():
            continue
        X_full = X.dropna()
        y_full = X_full.pop(field)
        model = RandomForestRegressor(n_jobs=-1, n_estimators=1000, random_state=42)
        model.fit(X_full, y_full)
        X_null = X[X[field].isnull()]
        X_null.pop(field)
        y_null = model.predict(X_null)
        indexes = X[X[field].isnull()].index.tolist()
        y_null_series = pd.Series(y_null, index=indexes)
        X[field] = X[field].fillna(value=y_null_series)    
        #na_models[field] = model
    return X

In [5]:
def fix_tickets(X):
    # Fix ticket, get the first letter if available, else set X
    tickets = []
    for ticket in X.Ticket:
        ticket_str = re.sub('[^A-Z]', '', ticket)
        if ticket_str:
            tickets.append(ticket_str[0])
        else:
            tickets.append('X')
    X['Ticket'] = tickets
    return X

In [6]:
def fix_salutations(X):
    # Figure out salutations from name
    salutations =  []
    for index, row in X.iterrows():
        salutation = str(re.search(", (\w+).", row['Name']).group(1)).strip()
        salutations.append(salutation)
    X['Salutation'] = salutations
    return X

In [7]:
X = fix_tickets(X)
X = fix_salutations(X)
# Drop variables that have very little importance
X.drop(['Name', 'PassengerId', 'Cabin'], axis=1, inplace=True)
X = create_dummies(X, ['Sex', 'Ticket', 'Embarked', 'Salutation'])
X = fillna_with_prediction(X)

### Random Forest Model

In [8]:
model_rf = RandomForestRegressor(n_estimators=5000, 
                              oob_score=True, 
                              n_jobs=-1, 
                              random_state=42, 
                              max_features="auto", 
                              min_samples_leaf=7)
model_rf.fit(X, y)
roc = roc_auc_score(y, model_rf.oob_prediction_)
print ("AUC Score: ", roc)

AUC Score:  0.887951512053


### Multi-layer Perceptron Classifier

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

scaler = StandardScaler()  
scaler.fit(X_train)  
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  

model_mlp = model = MLPClassifier(hidden_layer_sizes=(1000, 5), max_iter=1000, random_state=42)
model_mlp.fit(X_test, y_test)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [10]:
auc = roc_auc_score(y_test, model_mlp.predict_proba(X_test)[:,1])
print ("AUC Score: ", auc)

AUC Score:  0.999099099099


### Calculate CV for both models

In [12]:
def print_cv(model, X, y, cv=10):
    scores = cross_validation.cross_val_score(model, X, y, cv=cv)
    mean_score = scores.mean()
    std_dev = scores.std()
    std_error = scores.std() / math.sqrt(scores.shape[0])
    ci =  2.262 * std_error
    lower_bound = mean_score - ci
    upper_bound = mean_score + ci

    print("Score is %f +/-  %f" % (mean_score, ci))
    print('95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound))    

In [15]:
print('CV Score for RandomForest Model')
print_cv(model_rf, X, y)

CV Score for RandomForest Model
Score is 0.452679 +/-  0.069591
95 percent probability that if this experiment were repeated over and over the average score would be between 0.383088 and 0.522270


In [16]:
print('CV Score for MLP Model')
print_cv(model_mlp, X, y)

CV Score for MLP Model
Score is 0.666648 +/-  0.036409
95 percent probability that if this experiment were repeated over and over the average score would be between 0.630239 and 0.703057
