In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


# one-hot encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#xgboost
import xgboost as xgb

#bayesian optimization
from bayes_opt import BayesianOptimization

#garbage collection
import gc




# Import Data

In [2]:
import os

train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')

# Exploratory Analysis

In [3]:
print("Training examples are",len(train_df))
print("Test data is",len(test_df))
print("The columns are",train_df.columns.values)

Training examples are 891
Test data is 418
The columns are ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [4]:
# preview the first 5 rows
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
#Data cleaning

combined_data = train_df.append(test_df)

combined_data.Age.fillna(value=combined_data.Age.mean(), inplace=True)
combined_data.Fare.fillna(value=combined_data.Fare.mean(), inplace=True)
combined_data.Embarked.fillna(value=(combined_data.Embarked.value_counts().idxmax()), inplace=True)
combined_data.Survived.fillna(value=-1, inplace=True) 

# drop columns that are not needed
combined_data.drop('Name', axis=1, inplace=True)
combined_data.drop('Cabin', axis=1, inplace=True)
combined_data.drop('Ticket', axis=1, inplace=True)
combined_data.drop('Embarked', axis=1, inplace=True)

In [6]:
# Write cleaned data out

train = combined_data[combined_data['Survived']!=-1]
# train.to_csv("./Data/train-clean.csv")

test = combined_data[combined_data['Survived']==-1]
test.drop('Survived', axis=1, inplace=True)
# test.to_csv("./Data/test-clean.csv")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [7]:
# One-hot encoding
train_encoded = pd.get_dummies(train, columns = ['Sex'])
test_encoded = pd.get_dummies(test, columns = ['Sex'])

# Rearrange columns
list_of_features = ['Age','Fare','Parch','Sex_female','Sex_male','SibSp']
list_of_columns = list_of_features + ['Survived']
train_encoded = train_encoded[list_of_columns]
test_encoded = test_encoded[list_of_features]

# Transform training and testing data into np arrays
train_x = train_encoded[list_of_features].values
test_x = test_encoded[list_of_features].values
train_y = train_encoded['Survived'].values

print(list_of_columns)

['Age', 'Fare', 'Parch', 'Sex_female', 'Sex_male', 'SibSp', 'Survived']


In [8]:
#collect some trash
gc.collect()

0

# Accuracy Evaluators

In [9]:
"""
This cell defines functions to compute the performance of any given model.
"""

def compute_f1(model, X, y,k_folds):
    """
    Given a model and the evaluation data, returns the F1 score.
    """
    return np.mean(cross_val_score(model, X, y, cv=k_folds, scoring='f1_weighted'))

def accuracy(model, X, y,k_folds):
    """
    Given a model and the evaluation data, returns the accuracy
    score evaluated using cross validation.
    """
    return np.mean(cross_val_score(model, X, y, cv=k_folds, scoring='accuracy'))

def print_score_model(model,train_x,train_y, k_folds):
    print("F1 score is",compute_f1(model,train_x,train_y,k_folds))
    print("Accuracy is",accuracy(model,train_x,train_y,k_folds))

# Different Model Definitions

In [10]:
def dtree(max_depth=None):
    # Decision tree classifier
    clf = DecisionTreeClassifier(max_depth=max_depth)
    return clf

def dtree_adaboost(n_estimators = 50):
    # Single layer decision trees with AdaBoost
    single_tree = dtree(max_depth=1)
    clf = AdaBoostClassifier(single_tree, algorithm='SAMME', n_estimators= n_estimators)
    return clf

def random_forest():
    #Random forest classifier
    clf = RandomForestClassifier(n_estimators=100)
    return clf

def SVM(kernel, degree=3, C=1.0, gamma='auto'):
    # Support vector machines
    clf = svm.SVC(kernel=kernel, degree=degree, C=C, gamma=gamma)
    return clf

def kNN(n_neighbor=3):
    # k nearest neighbours
    clf = KNeighborsClassifier(n_neighbor)
    return clf

def xgboost(max_depth=3, n_estimators=200, learning_rate=0.05):
    #gradient boosting for decision trees
    clf = xgb.XGBClassifier(max_depth = max_depth, 
                            n_estimators = n_estimators, 
                            learning_rate=learning_rate)
    return clf

# 0.0 Logistic Regression

In [11]:
logistic_regression_model = LogisticRegression()
print_score_model(logistic_regression_model,train_x,train_y,10)
# logistic_regression_model.fit(train_x,train_y)

F1 score is 0.786988243645
Accuracy is 0.790132221087


# 0.1 kNN

In [12]:
knn_model = kNN(3)
print_score_model(knn_model,train_x,train_y,10)
# knn_model.fit(train_x,train_y)

F1 score is 0.719810544843
Accuracy is 0.72410225854


# 0.2 Random Forest

In [13]:
randomforest_model = random_forest()
print_score_model(randomforest_model,train_x,train_y,10)

F1 score is 0.813744334154
Accuracy is 0.812680456248


# 0.3 Adaboost

In [16]:
adaboosted_model = dtree_adaboost(100)
print_score_model(adaboosted_model,train_x,train_y,10)

F1 score is 0.789856384786
Accuracy is 0.791356259221


# 1. XGBoost

In [17]:
xgboost_model = xgboost(max_depth=3, 
                        n_estimators=300, 
                        learning_rate=0.05).fit(train_x, train_y)
print_score_model(xgboost_model, train_x, train_y, 10)

F1 score is 0.825289484397
Accuracy is 0.827224775848


In [18]:
xgb_tuned = xgb.XGBClassifier(reg_alpha = 4.1764, 
                              colsample_bytree = 1, 
                              gamma=0,
                              max_depth = 9, 
                              min_child_weight=19.7516,
                              subsample =1.00,
                              n_estimators=200, 
                              learning_rate=0.05)


In [20]:
print_score_model(xgb_tuned, train_x, train_y, 10)

F1 score is 0.782991024969
Accuracy is 0.785612019067


In [21]:
#collect some trash
gc.collect()

125

# 3. SVM

In [None]:
# Linear SVM with parameters obtained from bayesian optimization in MATLAB
linear_svm_model = svm.SVC(C=60.397, kernel='linear')
print_score_model(linear_svm_model, train_x, train_y, 10)

In [None]:
linear_svm_model = svm.SVC(C=60.397, kernel='poly', degree=4)
print_score_model(linear_svm_model, train_x, train_y, 10)

# Bayesian Optimization on XGBoost

In [68]:
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)


    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
             seed=random_state,
             callbacks=[xgb.callback.early_stop(10)])

    return -cv_result['test-mae-mean'].values[-1]

num_rounds = 3000
random_state = 2016
num_iter = 25
init_points = 5
params = {
        'eta': 0.1,
        'silent': 1,
        'eval_metric': 'mae',
        'verbose_eval': True,
        'seed': random_state}
    
xgtrain = xgb.DMatrix(train_x, label=train_y)

xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20),
                                                'colsample_bytree': (0.1, 1),
                                                'max_depth': (0, 15),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha': (0, 10),
                                                })
xgbBO.maximize(init_points=init_points, n_iter=num_iter)

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[96]	train-mae:0.396608+0.00498	test-mae:0.398111+0.00874543

    1 | 00m01s | [35m  -0.39811[0m | [32m   7.8772[0m | [32m            0.9587[0m | [32m   5.3540[0m | [32m     7.4250[0m | [32m            2.9227[0m | [32m     0.5324[0m | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[90]	train-mae:0.362848+0.00436844	test-mae:0.368699+0.00774363

    2 | 00m00s | [35m  -0.36870[0m | [32m   0.9008[0m | [32m    

  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[151]	train-mae:0.473295+0.00274958	test-mae:0.473646+0.00377031

   20 | 00m14s |   -0.47365 |    9.5244 |             0.8257 |    0.1530 |      0.5590 |             1.6991 |      0.6825 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[246]	train-mae:0.2959+0.00555601	test-mae:0.308512+0.00729588

   21 | 00m18s |   -0.30851 |    9.9885 |             0.2724 |    0.1129 |     14.4539 |             1.0537 |      0.7565 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[102]	train-mae:0.129887+0.00514447	test-mae:0.252965+0.0149328

   22 | 00m19s |   -0.25296 |    0.0914 |             0.1420 |    0.0682 |      6.5575 |             1.9435 |      0.8628 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[68]	train-mae:0.177992+0.00363329	test-mae:0.261678+0.0149299

   23 | 00m19s |   -0.26168 |    0.0000 |             0.1000 |    0.0000 |      6.0112 |             5.2923 |      1.0000 | 


  " state: %s" % convergence_dict)


Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[78]	train-mae:0.140637+0.00458577	test-mae:0.254911+0.01333

   24 | 00m24s |   -0.25491 |    0.0000 |             0.1000 |    0.0000 |      6.0311 |             1.0000 |      1.0000 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[68]	train-mae:0.090042+0.00662761	test-mae:0.25577+0.0160027

   25 | 00m22s |   -0.25577 |    0.0000 |             0.1000 |    0.0000 |     10.5178 |             2.3652 |      1.0000 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 10 rounds.
Stopping. Best iteration:
[106]	train-mae:0.475908+0.00250691	test-mae:0.476234+0.00335549

   26 | 00m14s |   -0.47623 |   10.0000 |             1.00

# Submission for Kaggle 

In [73]:
#kNN

predictions = knn_model.predict(test_x)
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("knn.csv", 
                  index=False)

In [78]:
#logistic regression

predictions =logistic_regression_model.predict(test_x)
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("logistic.csv", 
                  index=False)

In [75]:
#Adaboost

predictions = adaboosted_model.predict(test_x)
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("ada.csv", 
                  index=False)

In [42]:
#XGBoost
predictions = xgb_tuned.predict(test_x)
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("non_one_hot_encoded_embarkedremoved_optimized_submission.csv", 
                  index=False)

In [48]:
#Random Forest
predictions = randomforest_model.predict(test_x)
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
submission.to_csv("random_forest.csv", 
                  index=False)