In [1]:
import pandas as pd
import glob
import os
import mlflow

from sklearn.preprocessing import MinMaxScaler

In [8]:
# load all results from the experiment
df = mlflow.search_runs(experiment_names=["Titanic"])

# run_ids of the hand-picked runs whose submissions take part in the vote
best_run_ids = [
    "0f17ada2ca224d1ebd2ceea939c90e0b",
    "bb4cc975082b4558a5aa36fa5f10caa7",
    "e17cba6520e746beba20beed0f3b1fbc",
]

# Select rows and columns in a single .loc call and take an explicit copy:
# adding a column to the result of chained indexing (df[mask][cols]) triggers
# SettingWithCopyWarning and leaves it unclear whether a view or a copy is modified.
df_best = df.loc[
    df["run_id"].isin(best_run_ids),
    ["run_id", "metrics.best_cv_score", "metrics.kaggle_score"],
].copy()

# Rescale the Kaggle scores into [0.1, 0.9] so they can be used as voting weights.
# The scaler expects a 2-D (n_samples, 1) array; flatten the result back to 1-D
# before storing it as a column.
# NOTE(review): for the three runs selected here the weights come out as
# 0.1, 0.1 and 0.9, i.e. the top run alone outweighs the other two combined —
# confirm this is the intended weighting.
scaler = MinMaxScaler(feature_range=(0.1, 0.9))
kaggle_scores = df_best[["metrics.kaggle_score"]].to_numpy()
df_best["score_scaled"] = scaler.fit_transform(kaggle_scores).ravel()

df_best.head()

Unnamed: 0,run_id,metrics.best_cv_score,metrics.kaggle_score,score_scaled
664,bb4cc975082b4558a5aa36fa5f10caa7,0.826,0.78229,0.1
823,e17cba6520e746beba20beed0f3b1fbc,0.835,0.78229,0.1
946,0f17ada2ca224d1ebd2ceea939c90e0b,0.84,0.78708,0.9


In [9]:
# load the submissions of the best results
# NOTE(review): absolute, machine-specific path — consider a path relative to
# the notebook so the cell runs on other machines.
path = r'E:\Data Science Projects\Kaggle\titanic\04_MachineLearningModels\submissions'

# One single-column frame per run: indexed by PassengerId, with the "Survived"
# column renamed to the run_id so the columns stay distinguishable after concat.
# A comprehension with its own loop variable is used so the global `df`
# (the mlflow results loaded earlier) is no longer overwritten by this cell.
li = [
    pd.read_csv(
        os.path.join(path, run_id + ".csv"),
        index_col="PassengerId",
        header=0,
    ).rename(columns={"Survived": run_id})
    for run_id in df_best.run_id
]

# put the runs side by side (aligned on PassengerId) and order by passenger
df_voting = pd.concat(li, axis=1, ignore_index=False).sort_values(by="PassengerId")

df_voting.head()

Unnamed: 0_level_0,bb4cc975082b4558a5aa36fa5f10caa7,e17cba6520e746beba20beed0f3b1fbc,0f17ada2ca224d1ebd2ceea939c90e0b
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
892,0,0,0
893,1,0,0
894,0,0,0
895,0,0,0
896,1,1,1


In [10]:
# go from wide (one column per run) to long: one row per (PassengerId, run) pair
stacked_votes = df_voting.stack()

# after reset_index the former column labels sit in "level_1" and the
# predictions in the unnamed column 0 — give both a descriptive name
long_column_names = {"level_1": "run_id", 0: "survived"}
df_voting_stack = stacked_votes.reset_index().rename(columns=long_column_names)
df_voting_stack.head()

Unnamed: 0,PassengerId,run_id,survived
0,892,bb4cc975082b4558a5aa36fa5f10caa7,0
1,892,e17cba6520e746beba20beed0f3b1fbc,0
2,892,0f17ada2ca224d1ebd2ceea939c90e0b,0
3,893,bb4cc975082b4558a5aa36fa5f10caa7,1
4,893,e17cba6520e746beba20beed0f3b1fbc,0


In [11]:
# attach each run's score_scaled weight to its votes, matching rows on run_id
run_weights = df_best[["run_id", "score_scaled"]]
df_voting_stack = pd.merge(df_voting_stack, run_weights, on="run_id")
df_voting_stack.head()

Unnamed: 0,PassengerId,run_id,survived,score_scaled
0,892,bb4cc975082b4558a5aa36fa5f10caa7,0,0.1
1,893,bb4cc975082b4558a5aa36fa5f10caa7,1,0.1
2,894,bb4cc975082b4558a5aa36fa5f10caa7,0,0.1
3,895,bb4cc975082b4558a5aa36fa5f10caa7,0,0.1
4,896,bb4cc975082b4558a5aa36fa5f10caa7,1,0.1


In [12]:
# pivot to one row per PassengerId and one column per predicted outcome (0 / 1);
# each cell holds the summed score_scaled of the runs that voted for that outcome
# (an outcome nobody voted for shows up as NaN)
df_voting_stack = df_voting_stack.pivot_table(
    values="score_scaled",
    index="PassengerId",
    columns="survived",
    aggfunc="sum",
)
df_voting_stack.head()

survived,0,1
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
892,1.1,
893,1.0,0.1
894,1.1,
895,1.1,
896,,1.1


In [13]:
# use the idxmax function to get, for every passenger, the column label (0 or 1)
# whose summed weight is the greatest
df_submission = pd.read_csv("../01_RawData/gender_submission.csv")

# Series indexed by PassengerId holding the winning outcome of the weighted vote
weighted_vote = df_voting_stack.idxmax(axis=1)

# Look the vote up by PassengerId instead of relying on row position, so the
# result stays correct even if the template file is ordered differently from
# the pivot table. A PassengerId without a vote becomes NaN and makes
# astype(int) raise rather than silently writing a misaligned submission.
df_submission['Survived'] = df_submission["PassengerId"].map(weighted_vote).astype(int)

df_submission.to_csv('submissions/voting.csv', index=False)
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


The final result with the score_scaled weighting (0.78229) is lower than that of the standard voting classifier without any scaling of the runs (0.78468).