In [None]:
import pandas as pd
print("pandas version: {}". format(pd.__version__))

# numpy: support for large, multi-dimensional arrays and matrices and high-level mathematical functions
import numpy as np
print("numpy version: {}". format(np.__version__))

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit, train_test_split, cross_val_score, learning_curve
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import precision_recall_curve, roc_curve, confusion_matrix, roc_auc_score
print("sklearn version: {}". format(sklearn.__version__))

import xgboost
from xgboost import XGBClassifier
print("xgboost version: {}". format(xgboost.__version__))

import lightgbm
from lightgbm import LGBMClassifier
print("lightgbm version: {}". format(lightgbm.__version__))

import optuna
print("optuna version: {}". format(optuna.__version__))

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
from mlflow.tracking import MlflowClient
print("mlflow version: {}". format(mlflow.__version__))

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import warnings
warnings.simplefilter('ignore')

# Use Ensemble models to improve the accuracy of the model

## Ensembling techniques used
- VotingClassifier (soft voting over the best tuned model of each algorithm)

In [None]:
def load_data(data_dir='../03_dataCleaningPreparation', test_size=0.3, random_state=42):
    """Load the prepared datasets and split off a stratified validation set.

    Parameters
    ----------
    data_dir : str, default '../03_dataCleaningPreparation'
        Directory containing ``df_train_prepared.pkl`` and ``df_test_prepared.pkl``.
    test_size : float, default 0.3
        Share of the training data that is held out for validation.
    random_state : int, default 42
        Seed of the train/validation split.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_validate, y_validate, x_test)``; ``x_test`` is the
        prepared test frame as loaded.
    """
    # NOTE: unpickling can execute arbitrary code; only load pickles that were
    # produced by this project's own preparation step.
    df_train = pd.read_pickle(f'{data_dir}/df_train_prepared.pkl')
    df_test = pd.read_pickle(f'{data_dir}/df_test_prepared.pkl')

    # separate the survival class (target) from the input features
    y_train = df_train['Survived']
    x_train = df_train.drop(['Survived'], axis=1)
    x_test = df_test

    # stratify on the target so both splits keep the same class ratio
    x_train, x_validate, y_train, y_validate = train_test_split(
        x_train, y_train, test_size=test_size, stratify=y_train, random_state=random_state
    )

    return x_train, y_train, x_validate, y_validate, x_test

# load the prepared data once; the resulting splits are module-level names used by the cells below
x_train, y_train, x_validate, y_validate, x_test = load_data()



def create_submission(best_model, x_test, name):
    """Predict the test set and write the predictions as a submission CSV.

    Parameters
    ----------
    best_model : fitted estimator
        Object exposing ``predict``; its output is cast to ``int``.
    x_test : array-like
        Features passed to ``best_model.predict``.
    name : str
        File name (without extension) of the CSV written to ``submissions/``.
    """
    import os  # local import so the function does not depend on the import cell

    # predict the test values with the trained classification model
    y_pred = best_model.predict(x_test).astype(int)

    # the sample submission serves as template; its 'Survived' column is overwritten
    df_submission = pd.read_csv("../01_rawdata/gender_submission.csv")
    df_submission['Survived'] = y_pred

    # to_csv does not create missing directories, so make sure the target exists
    out_path = 'submissions/%s.csv' % name
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_submission.to_csv(out_path, index=False)


In [None]:
# load all results from the experiment
df = mlflow.search_runs(experiment_names=["Titanic"])

# keep exactly one run per algorithm: the one with the highest cv_score.
# Runs without an algorithm or a score are dropped. Among tied runs only the
# first one (in the order returned by mlflow) is kept, because every algorithm
# name is later used as a unique estimator name in the voting classifier.
df_best = (
    df.dropna(subset=['params.algo', 'metrics.cv_score'])
      .sort_values('metrics.cv_score', ascending=False, kind='mergesort')  # stable sort keeps tie order
      .drop_duplicates(subset='params.algo', keep='first')
      .sort_index()  # restore the original run order
)

In [None]:
# recreate the best model for each algorithm
# (MLflow params are read back as strings, hence the pd.to_numeric conversions)
estimators = []
seen_algos = set()

for _, row in df_best.iterrows():
    algo = row["params.algo"]
    print(algo)

    # VotingClassifier requires unique estimator names; tied best runs of the
    # same algorithm would otherwise add it twice
    if algo in seen_algos:
        print("  skipped: an estimator for this algorithm was already created")
        continue

    if algo == 'LGBMClassifier':
        model = LGBMClassifier(
            n_estimators=pd.to_numeric(row['params.lgb_n_estimators']),
            learning_rate=pd.to_numeric(row['params.lgb_learning_rate']),
            max_depth=pd.to_numeric(row['params.lgb_max_depth']),
            num_leaves=pd.to_numeric(row['params.lgb_num_leaves']),
            min_data_in_leaf=pd.to_numeric(row['params.lgb_min_data_in_leaf']),
            subsample=pd.to_numeric(row['params.lgb_subsample']),
            feature_fraction=pd.to_numeric(row['params.lgb_feature_fraction']),
            reg_lambda=pd.to_numeric(row['params.lgb_reg_lambda']),
            reg_alpha=pd.to_numeric(row['params.lgb_reg_alpha']),
            # NOTE(review): num_boost_round is a LightGBM alias for the number of
            # boosting iterations, just like n_estimators above - confirm which of
            # the two is meant to win and that it matches the tuning run
            num_boost_round=100
        )

    elif algo == 'XGBClassifier':
        model = XGBClassifier(
            n_estimators=pd.to_numeric(row['params.xgb_n_estimators']),
            learning_rate=pd.to_numeric(row['params.xgb_learning_rate']),
            reg_lambda=pd.to_numeric(row['params.xgb_reg_lambda']),
            reg_alpha=pd.to_numeric(row['params.xgb_reg_alpha'])
        )

    elif algo == 'DecisionTreeClassifier':
        model = DecisionTreeClassifier(
            max_depth=pd.to_numeric(row['params.dt_max_depth']),
            criterion=row['params.dt_criterion'],
            max_leaf_nodes=pd.to_numeric(row['params.dt_max_leaf_nodes']),
        )

    elif algo == 'SVC':
        model = SVC(
            kernel=row['params.svm_kernel'],
            C=pd.to_numeric(row['params.svm_C']),
            degree=pd.to_numeric(row['params.svm_degree']),
            probability=True  # soft voting needs predict_proba
        )

    elif algo == 'LogisticRegression':
        model = LogisticRegression(
            C=pd.to_numeric(row['params.lr_C']),
            penalty=row['params.lr_penalty'],
            solver=row['params.lr_solver'],
        )

    else:
        # no recipe for this algorithm: skip it instead of re-appending the model
        # left over from the previous iteration under a wrong name
        print("  skipped: unsupported algorithm")
        continue

    seen_algos.add(algo)
    estimators.append((algo, model))


# combine the recreated estimators into a soft-voting ensemble
# (soft voting averages the class probabilities predicted by the estimators)
voting_clf = VotingClassifier(
    estimators=estimators,
    voting='soft',
    n_jobs=-1,
)

# fit the ensemble on the training split
voting_clf.fit(X=x_train, y=y_train)

In [None]:
# write the ensemble's test-set predictions to submissions/voting_clf.csv
create_submission(best_model=voting_clf, x_test=x_test, name="voting_clf")