In [17]:
# Following section in book: Using the majority voting principle to make predictions

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, NuSVC
import xgboost as xgb
import numpy as np
from sklearn.pipeline import Pipeline

from sklearn.exceptions import ConvergenceWarning
ConvergenceWarning('ignore')

def feature_eng(data):
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace = True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Fare'].fillna(data['Fare'].mean(),inplace=True)
    eng_title(data)
    eng_age(data)

def eng_title(data):
    data['Title']=0
    data['Title']=data.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
    data['Title'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Dona','Jonkheer','Col',
                         'Rev','Capt','Sir','Don'],['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)

def eng_age(data):
    data.loc[(data.Age.isnull())&(data.Title=='Mr'),'Age']= data.Age[data.Title=="Mr"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Mrs'),'Age']= data.Age[data.Title=="Mrs"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Master'),'Age']= data.Age[data.Title=="Master"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Miss'),'Age']= data.Age[data.Title=="Miss"].mean()
    data.loc[(data.Age.isnull())&(data.Title=='Other'),'Age']= data.Age[data.Title=="Other"].mean()


train_data = pd.read_csv('train.csv')
feature_eng(train_data)

test_data = pd.read_csv('test.csv')
feature_eng(test_data)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Fare', 'Embarked']

X_train = train_data[features]
y_train = train_data['Survived']

X_test = test_data[features]

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# X_train, X_valid, y_train, y_valid = train_test_split(X_train_orig, y_train_orig, stratify=y_train_orig, random_state=1)

all_clf = []
scalers = []
hyper_param_grid = {}

def add_clf(clf, hyper_params = {}, scaler=StandardScaler):
    all_clf.append(clf)
    name = clf.__class__.__name__
    hyper_param_grid[name] = hyper_params
    scalers.append(scaler)

add_clf(xgb.XGBClassifier(max_depth=4, learning_rate=.01, n_estimators=300,random_state=1), {}, None)
add_clf(LogisticRegression(random_state=1, solver='lbfgs', max_iter=100, penalty='l2', C=.1)) 
add_clf(SGDClassifier(random_state=1, alpha=.01, eta0=.01, learning_rate='constant', loss='modified_huber', max_iter=1500, penalty='l1', tol=.001))
add_clf(KNeighborsClassifier(n_neighbors=11, weights='uniform', p=1, leaf_size=15))
# add_clf(LinearSVC(random_state=1, dual='auto', C=.1, max_iter=1000, penalty='l2', tol=1e-4))
add_clf(RandomForestClassifier(random_state=1, max_depth=10, min_samples_leaf=2, n_estimators=100, min_samples_split=6), {}, None)
# add_clf(GaussianNB())
add_clf(AdaBoostClassifier(algorithm='SAMME.R', learning_rate=.1, random_state=1, n_estimators=400), {}, False)
add_clf(NuSVC(random_state=1, kernel='poly', coef0=.5, nu=.4, probability=True))
add_clf(ExtraTreesClassifier(random_state=1, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=250), {}, None)
add_clf(LinearDiscriminantAnalysis())
add_clf(MLPClassifier(random_state=1, max_iter=1500))



clf_labels = [model.__class__.__name__ for model in all_clf]

tuned_clfs = []

# Tune each classifier...
for i, clf in enumerate(all_clf):
    scaler = scalers[i]
    name = clf.__class__.__name__ 
    hyper_params = hyper_param_grid[name]

    if scaler:
        clf = Pipeline([('sc', scaler()), (name, clf)])

    grid_search = GridSearchCV(clf, param_grid=hyper_params, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_} score={grid_search.best_score_}")
    tuned_clfs.append(grid_search.best_estimator_)

mv_hard_clf = VotingClassifier(estimators=list(zip(clf_labels, tuned_clfs)), voting='hard')
mv_hard_clf.fit(X_train, y_train)
accuracy = mv_hard_clf.score(X_train, y_train)
print(f"Accuracy of the hard voting classifier: {accuracy}")

predictions = mv_hard_clf.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_ensemble_hard.csv', index=False)

mv_soft_clf = VotingClassifier(estimators=list(zip(clf_labels, tuned_clfs)), voting='soft')
mv_soft_clf.fit(X_train, y_train)
accuracy = mv_soft_clf.score(X_train, y_train)
print(f"Accuracy of the soft voting classifier: {accuracy}")

predictions = mv_soft_clf.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_ensemble_soft.csv', index=False)


Best parameters for XGBClassifier: {} score=0.822666499278137
Best parameters for LogisticRegression: {} score=0.7878978093026175
Best parameters for SGDClassifier: {} score=0.803603038101814
Best parameters for KNeighborsClassifier: {} score=0.8181595631159375
Best parameters for RandomForestClassifier: {} score=0.8294269035214361
Best parameters for GaussianNB: {} score=0.7879103634423452
Best parameters for AdaBoostClassifier: {} score=0.8137342288619672
Best parameters for NuSVC: {} score=0.8282719226664993
Best parameters for ExtraTreesClassifier: {} score=0.8249262444291006
Best parameters for LinearDiscriminantAnalysis: {} score=0.7912497646098802
Best parameters for MLPClassifier: {} score=0.8024919967359236
Accuracy of the hard voting classifier: 0.8428731762065096
Accuracy of the soft voting classifier: 0.8428731762065096
