In [25]:
# Following section in book: Using the majority voting principle to make predictions

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import numpy as np
from sklearn.pipeline import Pipeline

def feature_eng(data):
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace = True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    eng_title(data)
    eng_age(data)

def eng_title(data):
    data['Title']=0
    data['Title']=data.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
    data['Title'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Dona','Jonkheer','Col',
                         'Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)

def eng_age(data):
    data.loc[(data.Age.isnull())&(data.Title=='Mr'),'Age']= data.Age[data.Title=="Mr"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Mrs'),'Age']= data.Age[data.Title=="Mrs"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Master'),'Age']= data.Age[data.Title=="Master"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Miss'),'Age']= data.Age[data.Title=="Miss"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Other'),'Age']= data.Age[data.Title=="Other"].mean()


train_data = pd.read_csv('train.csv')
feature_eng(train_data)

test_data = pd.read_csv('test.csv')
feature_eng(test_data)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Fare', 'Embarked']

X_train = train_data[features]
y_train = train_data['Survived']

X_test = test_data[features]

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, stratify=y_train, random_state=1)

# No need to scale this guy.
clf1 = xgb.XGBClassifier(max_depth=4, learning_rate=.01, n_estimators=300,random_state=1)

clf2 = LogisticRegression(penalty='l2', C=0.001,solver='lbfgs',random_state=1)
pipe2 = Pipeline([['sc', StandardScaler()], ['clf', clf2]])

clf3 = KNeighborsClassifier(n_neighbors=1,p=2,metric='minkowski')
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf2]])

clf_labels = ['XGBoost', 'Logistic regression', 'KNN']

print('10-fold cross validation:\n')
#TODO: understand python zip
for clf, label in zip([clf1, pipe2, pipe3], clf_labels):
    #TODO: try to with both accuracy and ROC/AUC scoring
    scores = cross_val_score(estimator=clf,X=X_train,y=y_train,cv=10,scoring='accuracy')
    print(f'ROC AUC: {scores.mean():.2f} '
    f'(+/- {scores.std():.2f}) [{label}]')



mv_hard_clf = VotingClassifier(estimators=list(zip(clf_labels, [clf1, pipe2, pipe3])), voting='hard')
mv_soft_clf = VotingClassifier(estimators=list(zip(clf_labels, [clf1, pipe2, pipe3])), voting='soft')

#use mv_clf.get_params() to see what parameters you can optimize
clf_labels += ['Voting-hard', 'Voting-soft']
all_clf = [clf1, pipe2, pipe3, mv_hard_clf, mv_soft_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train,y=y_train,cv=10, scoring='accuracy')
    print(f'Accuracy: {scores.mean():.2f} ' f'(+/- {scores.std():.2f}) [{label}]')

# #TODO: write to CSV

10-fold cross validation:

ROC AUC: 0.82 (+/- 0.03) [XGBoost]
ROC AUC: 0.68 (+/- 0.03) [Logistic regression]
ROC AUC: 0.68 (+/- 0.03) [KNN]
Accuracy: 0.82 (+/- 0.03) [XGBoost]
Accuracy: 0.68 (+/- 0.03) [Logistic regression]
Accuracy: 0.68 (+/- 0.03) [KNN]
Accuracy: 0.68 (+/- 0.03) [Voting-hard]
Accuracy: 0.83 (+/- 0.03) [Voting-soft]
