We can more efficiently explore this space by:
1. Obtaining the Out Of Fold (OOF) predictions for the models we want to combine
2. Selecting combinations of single models
3. Scoring the simple average of the results

Doing it this way allows us to try more combinations and use a local CV score, instead of the public leaderboard, to inform decisions on which model combinations work well together.

The random seed is used to determine the CV folds that generate the OOF predictions.  

In [None]:
# Seed passed to train_model (as `rs`) when the OOF predictions are regenerated.
RANDOM_SEED = 42

In [None]:
import itertools
from math import factorial
from sklearn.metrics import mean_absolute_error
from exp.train import train_model

import pandas as pd
import numpy as np
import os
import json
from exp.features import load_cv_results

In [None]:
# Assemble a single table of single-model CV results from every experiment file.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.


def _load_scores(filename, cv_settings=None):
    """Load one results file and tag every row with the file it came from.

    Args:
        filename: Results file handed to load_cv_results.
        cv_settings: Optional (n_folds, shuffle, rs) tuple. When given, the
            matching "cv_score_n_folds_..." column is renamed to "score" and
            the three settings are recorded in columns of the same names.

    Returns:
        The loaded DataFrame with the extra bookkeeping columns.
    """
    frame = load_cv_results(filename)
    frame['filename'] = filename
    if cv_settings is not None:
        n_folds, shuffle, rs = cv_settings
        score_column = "cv_score_n_folds_{}_shuffle_{}_rs_{}".format(n_folds, shuffle, rs)
        # index=str turns the index labels into strings, as the original cells did.
        frame = frame.rename(index=str, columns={score_column: "score"})
        frame['n_folds'] = n_folds
        frame['shuffle'] = shuffle
        frame['rs'] = rs
    return frame


# The first three experiments are all labelled with the same feature set.
score_df = pd.concat([_load_scores(name) for name in ("exp1.csv", "exp2.csv", "exp3.csv")])
score_df["feature_set"] = "standard_scaled"

# Later experiments store the score under a column named after the CV settings.
cv_experiments = [
    ("exp4_1.csv", (10, True, 41)),
    ("exp5.csv", (10, True, 41)),
    ("exp6.csv", (10, True, 41)),
    ("exp7.csv", (10, True, 41)),
    ("exp8_1.csv", (10, True, 41)),
    ("exp8_2.csv", (6, False, 41)),
    ("exp9.csv", (5, False, None)),
]
score_df = pd.concat(
    [score_df] + [_load_scores(name, settings) for name, settings in cv_experiments]
)

# Move the per-file row labels into an 'index' column and renumber the rows.
score_df = score_df.reset_index()

# model_id = "<alg>_<filename>_<row label within that file>"
score_df['model_id'] = (
    score_df['alg'] + '_' + score_df['filename'] + '_' + score_df['index'].astype(str)
)

score_df = score_df.sort_values(by="score", axis=0)

score_df.head()

# Obtain OOF Predictions

Obtain the Out Of Fold (OOF) predictions for the models we want to combine.  We can try only our top performing single models, or deliberately include poorer performing single models that introduce model heterogeneity.

In [None]:
# Keep only the models whose feature set name does not start with '2400'.
non_24k_mask = score_df['feature_set'].apply(lambda fs: not fs.startswith('2400'))
# Take an explicit copy so that later in-place operations on non_24k act on an
# independent frame rather than on a slice of score_df.
non_24k = score_df.loc[non_24k_mask].copy()

In [None]:
# Ascending sort on 'score', so the first rows shown are the lowest scores.
non_24k = non_24k.sort_values(by='score')
non_24k.head(15)

Pick the 15 best-scoring single models.

In [None]:
# Index labels of the first 15 rows of non_24k; these labels also address score_df.
OOF_list = non_24k.index[:15]
OOF_list

In [None]:
def get_train_params(model_exp):
    """Build the keyword arguments used to retrain one scored model.

    Args:
        model_exp: Row-like object providing 'params_json', 'feature_set'
            and 'alg' entries.

    Returns:
        dict with 'params' (decoded from the JSON string), 'fs', 'alg' and
        'rs' (always the module-level RANDOM_SEED).
    """
    return {
        'params': json.loads(model_exp['params_json']),
        'fs': model_exp['feature_set'],
        'alg': model_exp['alg'],
        'rs': RANDOM_SEED,
    }

In [None]:
# One train_model keyword-argument dict per selected model, in OOF_list order.
oof_params = [get_train_params(score_df.loc[model_idx]) for model_idx in OOF_list]

oof_params

## Get OOF Predictions

In [None]:
# Retrain every selected model and keep its OOF predictions, keyed by the
# model's index label. oof_params was built in OOF_list order, so the two
# sequences are walked in lockstep instead of tracking a manual counter.
oof_preds = {}
for model_idx, model_dict in zip(OOF_list, oof_params):
    # Only the third value returned by train_model (the OOF predictions) is used.
    _, _, oof = train_model(**model_dict)
    oof_preds[model_idx] = oof

In [None]:
# One column per model, named by the model's index label from OOF_list.
oof_preds_df = pd.DataFrame(oof_preds)

In [None]:
# Name the file after the seed actually used, so it cannot go stale when
# RANDOM_SEED changes (with RANDOM_SEED = 42 this is 'oof_preds_rs42.csv').
oof_preds_df.to_csv('oof_preds_rs{}.csv'.format(RANDOM_SEED))

## Selecting Combinations


Calculating all combinations does not scale.

$N$: The pool of models to select combinations from

$k$: The number of single models in an ensemble 

$$\frac{N!}{k!(N-k)!}$$

We will have to choose a smaller $N$ or a smaller $k$ in order to explore the space, or use an algorithm that is only guaranteed to find a local minimum.

In [None]:
def simple_average(ensemble_oof_preds):
    """Score the unweighted average of several models' OOF predictions.

    Args:
        ensemble_oof_preds: DataFrame with one column per model
            (rows are presumably one per training sample; confirm against
            train_model's OOF output).

    Returns:
        Mean absolute error between the module-level `ttf` targets and the
        row-wise mean of the given columns.
    """
    # Average across models explicitly. np.mean(frame.T) forwards axis=None to
    # DataFrame.mean, which on pandas >= 2.0 reduces over both axes and
    # returns a single scalar instead of one value per row.
    blended = ensemble_oof_preds.mean(axis=1)
    return mean_absolute_error(ttf, blended)

In [None]:
# Targets that simple_average scores the averaged OOF predictions against;
# the first CSV column is used as the index.
ttf = pd.read_csv('../kaggle_files/features/train/ttf.csv', index_col=0)

In [None]:
# Exhaustively score every ensemble of size k drawn from all OOF columns.
k_range = range(2, 15)

all_combinations = []
for k in k_range:
    print(k)
    # Each entry is [tuple of model columns, MAE of their simple average].
    all_combinations.extend(
        [combo, simple_average(oof_preds_df[list(combo)])]
        for combo in itertools.combinations(oof_preds_df.columns, k)
    )

all_combinations_df = pd.DataFrame(all_combinations, columns=['models', 'mean_cv'])

all_combinations_df.sort_values(by='mean_cv')

## Pick Ensemble

In [None]:
# Index labels (in score_df) of the single models chosen for the ensemble.
ensemble_ids = [1810, 825, 720, 1879, 1792, 811]
ensemble = score_df.loc[ensemble_ids]
ensemble.to_csv('ensemble2_models.csv')

ensemble

## Get Test Submissions

In [None]:
from exp.train import load_train_features, load_test_features, train_get_test_preds

In [None]:
# Retrain each ensemble member on its training features and predict the test set.
models = []   # trained models, in ensemble order
y_preds = {}  # model_id -> test-set predictions
for _, single_model in ensemble.iterrows():
    alg = single_model['alg']
    feature_set = single_model['feature_set']
    print(alg)

    X_tr, y_tr = load_train_features(set=feature_set)
    X_test = load_test_features(set=feature_set)

    model, y_pred = train_get_test_preds(
        X_tr, y_tr, X_test,
        params=json.loads(single_model['params_json']),
        alg=alg,
    )

    # Keep the trained model; previously `models` was created but never filled.
    models.append(model)
    y_preds[single_model['model_id']] = y_pred

In [None]:
# Load the sample submission file to use as the template for the output.
submission = pd.read_csv('../kaggle_files/submission/sample_submission.csv')

submission.head()

In [None]:
# Simple average across models: one column per model, mean taken along each row.
submission['time_to_failure'] = pd.DataFrame(y_preds).mean(axis=1)

In [None]:
# Write the averaged predictions, omitting the DataFrame index column.
submission.to_csv('ensemble2_preds.csv', index=False)