In [31]:
# Following section in book: Using the majority voting principle to make predictions

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, NuSVC
import xgboost as xgb
import numpy as np
from sklearn.pipeline import Pipeline

def feature_eng(data):
    """Impute and derive features on a passenger DataFrame, modifying it in place.

    Steps, in order:
      1. Missing ``Embarked`` values are filled with the most frequent value.
      2. ``FamilySize`` is added as ``SibSp + Parch + 1``.
      3. Missing ``Fare`` values are filled with the column mean.
      4. ``eng_title`` adds the ``Title`` column.
      5. ``eng_age`` fills missing ``Age`` values (it reads ``Title``, so it
         must run after ``eng_title``).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Embarked``, ``SibSp``, ``Parch`` and
        ``Fare``, plus whatever ``eng_title`` and ``eng_age`` read.

    Returns
    -------
    None
        ``data`` is updated in place.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # data[col]: the chained in-place form acts on an intermediate object and
    # does not update `data` when pandas Copy-on-Write is enabled.
    data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    eng_title(data)
    eng_age(data)

def eng_title(data):
    """Add a ``Title`` column derived from the salutation in ``Name``.

    The salutation is the first run of ASCII letters that is immediately
    followed by a period (e.g. ``"Mr"`` in ``"Braund, Mr. Owen Harris"``).
    Rare salutations are then folded into ``Miss``, ``Mr``, ``Mrs`` or
    ``Other``; salutations not listed in the mapping (e.g. ``Master``) are
    kept unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column ``Name``.

    Returns
    -------
    None
        ``data`` gains (or has overwritten) the ``Title`` column in place.
    """
    title_map = {
        'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
        'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
        'Lady': 'Mrs', 'Countess': 'Mrs', 'Dona': 'Mrs',
        'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    }
    # Raw string so that `\.` reaches the regex engine as an escaped period;
    # expand=False returns a Series rather than a one-column DataFrame.
    titles = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Assign the result back rather than replacing in place on data['Title'],
    # which would act on an intermediate object under Copy-on-Write.
    data['Title'] = titles.replace(title_map)

def eng_age(data):
    """Fill missing ``Age`` values with the mean age of the same ``Title`` group.

    Only the groups ``Mr``, ``Mrs``, ``Master``, ``Miss`` and ``Other`` are
    processed; a row whose ``Title`` is anything else keeps its missing age.
    The means are computed from the rows of ``data`` itself, and ``data`` is
    modified in place.
    """
    for title in ('Mr', 'Mrs', 'Master', 'Miss', 'Other'):
        in_group = data.Title == title
        group_mean = data.Age[in_group].mean()  # NaN values are skipped by mean()
        data.loc[data.Age.isnull() & in_group, 'Age'] = group_mean


# Load both splits and apply the same in-place feature engineering to each.
train_data = pd.read_csv('train.csv')
feature_eng(train_data)

test_data = pd.read_csv('test.csv')
feature_eng(test_data)

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Fare', 'Embarked']

# One-hot encode the non-numeric feature columns.
X_train = pd.get_dummies(train_data[features])
y_train = train_data['Survived']

# The two frames are encoded independently, so a category present in only one
# split would give them different columns. Reindex the test matrix onto the
# training columns: categories unseen in test become all-zero columns, and
# categories unseen in training are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X_train.columns, fill_value=0)

# Parallel registries: needs_scaling[i] is the scaling flag recorded for all_clf[i].
all_clf, needs_scaling = [], []


def add_clf(clf, scale=True):
    """Register an estimator together with its scaling flag.

    Parameters
    ----------
    clf : object
        Estimator instance appended to ``all_clf``.
    scale : bool, default True
        Flag appended to ``needs_scaling`` at the same position.
    """
    needs_scaling.append(scale)
    all_clf.append(clf)

# Base estimators for the ensemble as (estimator, scale) pairs, registered in
# this order. The second element is the flag handed to add_clf.
_base_estimators = [
    # Pre-optimized
    (xgb.XGBClassifier(max_depth=4, learning_rate=.01, n_estimators=300, random_state=1), False),
    (LogisticRegression(random_state=1), True),
    (KNeighborsClassifier(), True),
    (LinearSVC(random_state=1, dual='auto'), True),
    (RandomForestClassifier(random_state=1), False),
    (GaussianNB(), True),
    (AdaBoostClassifier(random_state=1), False),
    (NuSVC(random_state=1), True),
    (ExtraTreesClassifier(random_state=1), False),
]

for _estimator, _scale in _base_estimators:
    add_clf(_estimator, _scale)

# Class names double as the estimator labels.
clf_labels = [type(model).__name__ for model in all_clf]

# Search ranges generated with the help of
# https://www.kaggle.com/code/korfanakis/titanic-a-beginner-friendly-approach-to-top-3
#
# Keyed by estimator class name. Some entries prefix every parameter with
# "<ClassName>__" and others use bare parameter names.
hyper_param_grid = {
    # Already optimized
    'XGBClassifier': {},

    # Two sub-grids: lbfgs is only paired with l2, liblinear with l1 and l2.
    'LogisticRegression': [
        {
            'LogisticRegression__max_iter': [100, 200, 300],
            'LogisticRegression__penalty': ['l2'],
            'LogisticRegression__C': [0.001, 0.01, 0.1, 1],
            'LogisticRegression__solver': ['lbfgs'],
        },
        {
            'LogisticRegression__max_iter': [100, 200, 300],
            'LogisticRegression__penalty': ['l1', 'l2'],
            'LogisticRegression__C': [0.001, 0.01, 0.1, 1],
            'LogisticRegression__solver': ['liblinear'],
        },
    ],

    'KNeighborsClassifier': {
        'KNeighborsClassifier__n_neighbors': [3, 5, 7, 9, 11, 15],
        'KNeighborsClassifier__weights': ['uniform', 'distance'],
        'KNeighborsClassifier__leaf_size': [15, 30, 45, 90],
        'KNeighborsClassifier__p': [1, 2],
    },

    'LinearSVC': {
        'LinearSVC__C': [0.01, 0.1, 1],
        'LinearSVC__penalty': ['l1', 'l2'],
        'LinearSVC__tol': [1e-4, 1e-3, 1e-2],
        'LinearSVC__max_iter': [1000, 2500, 5000],
    },

    'RandomForestClassifier': {
        'n_estimators': [100, 250, 500],
        'max_depth': [5, 10, 15],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 4, 6],
    },

    # GaussianNB doesn't have hyperparameters to tune in this context
    'GaussianNB': {},

    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
    # confirm it is still accepted by the installed version.
    'AdaBoostClassifier': {
        'n_estimators': [10, 50, 100, 200, 400],
        'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
        'algorithm': ['SAMME', 'SAMME.R'],
    },

    'NuSVC': {
        'NuSVC__nu': [0.1, 0.4, 0.7],
        'NuSVC__kernel': ['linear', 'poly', 'rbf'],
        'NuSVC__coef0': [0.0, 0.5, 1],
    },

    'ExtraTreesClassifier': {
        'n_estimators': [100, 250, 500],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
}

tuned_clfs = []

# Grid-search every registered estimator with 5-fold CV and keep the refit
# best estimator, in registration order.
for clf, need_scaling in zip(all_clf, needs_scaling):
    name = type(clf).__name__

    # Flagged estimators are wrapped in a StandardScaler pipeline. The step is
    # named after the class, which is what the "<ClassName>__param" keys in
    # hyper_param_grid refer to.
    estimator = Pipeline([('sc', StandardScaler()), (name, clf)]) if need_scaling else clf

    grid_search = GridSearchCV(estimator, param_grid=hyper_param_grid[name], cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_} score={grid_search.best_score_}")
    tuned_clfs.append(grid_search.best_estimator_)


# Majority-vote (hard voting) ensemble over the tuned estimators.
mv_hard_clf = VotingClassifier(estimators=list(zip(clf_labels, tuned_clfs)), voting='hard')

# Estimate out-of-sample accuracy with 5-fold cross-validation. Scoring only
# on the rows the ensemble was fitted on overstates how well it generalizes.
# cross_val_score fits clones, so mv_hard_clf itself is still unfitted here.
# The hyperparameters were tuned on this same data, so this estimate can
# still be somewhat optimistic.
cv_scores = cross_val_score(mv_hard_clf, X_train, y_train, cv=5)
print(f"Cross-validated accuracy of the voting classifier: {cv_scores.mean()} (+/- {cv_scores.std()})")

# Final fit on all training rows; the training-set score is reported only as
# a sanity check and is labelled as such.
mv_hard_clf.fit(X_train, y_train)
accuracy = mv_hard_clf.score(X_train, y_train)
print(f"Training accuracy of the voting classifier: {accuracy}")

# Write the test-set predictions to a submission CSV.
predictions = mv_hard_clf.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_ensemble_hard.csv', index=False)

Best parameters for XGBClassifier: {} score=0.822666499278137
Best parameters for LogisticRegression: {'LogisticRegression__C': 0.1, 'LogisticRegression__max_iter': 100, 'LogisticRegression__penalty': 'l2', 'LogisticRegression__solver': 'lbfgs'} score=0.7878978093026175
Best parameters for KNeighborsClassifier: {'KNeighborsClassifier__leaf_size': 15, 'KNeighborsClassifier__n_neighbors': 11, 'KNeighborsClassifier__p': 1, 'KNeighborsClassifier__weights': 'uniform'} score=0.8181595631159375
Best parameters for LinearSVC: {'LinearSVC__C': 0.1, 'LinearSVC__max_iter': 1000, 'LinearSVC__penalty': 'l2', 'LinearSVC__tol': 0.0001} score=0.7923733601154981
Best parameters for RandomForestClassifier: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 100} score=0.8294269035214361
Best parameters for GaussianNB: {} score=0.7879103634423452
Best parameters for AdaBoostClassifier: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 400} score=0.813734228861967