
# Basic Overview
Given all the models that we have built, the idea is to ensemble them intelligently so as to get a more powerful one.

Source of data : https://www.kaggle.com/c/titanic/data

In [4]:
import pandas as pd
import numpy as np
import os
import re

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

### Listing all relevant routines, that will be used for ensembling.

In [6]:
def populate_model_files_data(files):
    """Load per-model prediction CSV files into one wide DataFrame.

    Every CSV is expected to have a ``PassengerId`` and a ``Survived`` column.
    The predictions of the i-th file (1-based) are stored in the column
    ``Survived_or_not_model_<i>``. A single ``PassengerId`` column holds the
    passenger ids in the row order of the first file.

    Files whose rows are ordered differently from the first file are aligned
    on ``PassengerId`` before being added, so every row of the result refers
    to one passenger across all model columns.

    Parameters
    ----------
    files : sequence of str
        Paths of the prediction CSV files, in model order.

    Returns
    -------
    pandas.DataFrame
        One row per passenger; an empty frame when ``files`` is empty.

    Raises
    ------
    ValueError
        If a file does not contain exactly the same passengers as the first
        file (so its predictions cannot be aligned).
    """
    master_df = pd.DataFrame()
    reference_ids = None
    for model_number, csv_file in enumerate(files, start=1):
        data_df = pd.read_csv(csv_file)
        column_name = 'Survived_or_not_model_' + str(model_number)
        passenger_ids = data_df['PassengerId'].values
        survived_list = data_df['Survived'].values

        if reference_ids is None:
            # The first file fixes the row order for the whole frame.
            reference_ids = passenger_ids
        elif not np.array_equal(passenger_ids, reference_ids):
            # Same passengers in another order can be re-aligned; anything
            # else would silently pair predictions with the wrong passenger.
            same_passengers = (
                data_df['PassengerId'].is_unique
                and np.array_equal(np.sort(passenger_ids), np.sort(reference_ids))
            )
            if not same_passengers:
                raise ValueError(
                    "PassengerId values in {0} do not match those of {1}".format(
                        csv_file, files[0]))
            survived_list = (
                data_df.set_index('PassengerId')['Survived']
                .reindex(reference_ids)
                .values
            )

        master_df[column_name] = survived_list
        master_df['PassengerId'] = reference_ids
    return master_df

In [7]:
def display_corr_info(master_df, generate_corr_heat_map):
    """Show the pairwise correlation of the model prediction columns.

    Every column of ``master_df`` except ``PassengerId`` takes part in the
    correlation. The matrix is always rendered with ``display``; when
    ``generate_corr_heat_map`` is truthy it is additionally drawn as a
    seaborn heatmap on a 16x9 figure.
    """
    model_columns = []
    for column in master_df.columns:
        if column != 'PassengerId':
            model_columns.append(column)

    # Computed once and shared by the table and the optional heatmap.
    correlations = master_df[model_columns].corr()

    print("                              CORRELATION MATRIX OF MODEL OUTPUTS")
    display(correlations)

    if not generate_corr_heat_map:
        return
    _, heatmap_axes = plt.subplots(1, 1, figsize=(16, 9))
    sns.heatmap(correlations, ax=heatmap_axes)

In [8]:
def get_most_frequent_entry_3(a, b, c):
    """Majority vote over three 0/1 predictions.

    Returns 0 when the votes sum to at most 1, otherwise 1.
    """
    votes_for_one = a + b + c
    return 0 if votes_for_one <= 1 else 1

In [9]:
def get_most_frequent_entry_5(a, b, c, d, e):
    """Majority vote over five 0/1 predictions.

    Returns 0 when the votes sum to at most 2, otherwise 1.
    """
    votes_for_one = a + b + c + d + e
    if votes_for_one <= 2:
        return 0
    return 1

In [10]:
def get_most_frequent_entry_7(a, b, c, d, e, f, g):
    """Majority vote over seven 0/1 predictions.

    Returns 0 when the votes sum to at most 3, otherwise 1.
    """
    votes_for_one = a + b + c + d + e + f + g

    # Default to the "1" outcome and flip it when the 1-votes are a minority.
    winner = 1
    if votes_for_one <= 3:
        winner = 0
    return winner

In [11]:
def update_ensembled_cols(master_df):
    """Add the majority-vote column ``Survived_or_not_ensembled`` in place.

    Every column whose name starts with ``Survived_or_not_model_`` is treated
    as one model's 0/1 prediction. A row is labelled 1 when more than half of
    the models predict 1, otherwise 0; with an even number of models a tie is
    therefore labelled 0. For 3, 5 and 7 models this is the same rule as
    "sum <= 1 / 2 / 3 -> 0, else 1".

    Because the model columns are selected by name rather than by counting
    all columns, calling this again on a frame that already holds the
    ensembled column recomputes the same result.

    After adding the column the frame is sorted by ``PassengerId`` in place.

    Parameters
    ----------
    master_df : pandas.DataFrame
        Frame with a ``PassengerId`` column and one or more
        ``Survived_or_not_model_<i>`` columns. Modified in place.

    Raises
    ------
    ValueError
        If the frame has no ``Survived_or_not_model_*`` column to vote on.
    """
    model_columns = [col for col in master_df.columns
                     if str(col).startswith('Survived_or_not_model_')]
    if not model_columns:
        raise ValueError("master_df has no 'Survived_or_not_model_*' columns to ensemble")

    # skipna=False keeps a missing prediction as NaN in the row total, so the
    # comparison below treats it exactly like the scalar `a + b + c <= k` did.
    votes_for_one = master_df[model_columns].sum(axis=1, skipna=False)
    majority_threshold = len(model_columns) // 2

    master_df['Survived_or_not_ensembled'] = np.where(
        votes_for_one <= majority_threshold, 0, 1)
    master_df.sort_values(by=['PassengerId'], inplace=True)

In [12]:
def display_commonalities_stats(master_df, files):
    """Print how often each model agrees with the ensembled prediction.

    Prints the number of rows in ``master_df`` and then, for the i-th entry
    of ``files`` (1-based), the file name and the number of rows where
    ``Survived_or_not_model_<i>`` equals ``Survived_or_not_ensembled``.
    """
    print("                              COMMONALITY STATS\n")
    print("Number of entries to be predicted         : {0}".format(len(master_df)))

    for model_number, model_file in enumerate(files, start=1):
        print("\nRelevant model file                       : {0}".format(model_file))
        agrees_with_ensemble = (
            master_df['Survived_or_not_model_' + str(model_number)]
            == master_df['Survived_or_not_ensembled']
        )
        print("Number of entries common with final model : {0}".format(np.sum(agrees_with_ensemble)))
    

In [13]:
def dump_predictions_to_csv(master_df, csv_file):
    """Write the ensembled predictions to ``csv_file``.

    The output has two columns, ``PassengerId`` and ``Survived`` (the latter
    taken from ``Survived_or_not_ensembled``), and no index column.
    ``master_df`` itself is left untouched.
    """
    submission = master_df[['PassengerId', 'Survived_or_not_ensembled']].rename(
        columns={'Survived_or_not_ensembled' : 'Survived'})
    submission.to_csv(csv_file, index=False)

In [14]:
def generate_ensembled_predictions_and_verify_results(files, generate_corr_heat_map=False, 
                                                      generate_csv=False, csv_file='temp.csv'):
    """Run the full ensembling pipeline for the given prediction files.

    Steps, in order:
      1. load the per-model predictions (``populate_model_files_data``),
      2. show their correlation matrix, plus a heatmap when
         ``generate_corr_heat_map`` is set (``display_corr_info``),
      3. add the majority-vote column (``update_ensembled_cols``),
      4. print per-model agreement with the ensemble
         (``display_commonalities_stats``),
      5. when ``generate_csv`` is set, write the ensembled predictions to
         ``csv_file`` (``dump_predictions_to_csv``).

    Returns nothing; results are displayed and optionally written to disk.
    """
    predictions = populate_model_files_data(files)

    # Correlation between the individual models, before any ensembling.
    display_corr_info(predictions, generate_corr_heat_map)

    # Majority vote across models becomes the ensembled prediction.
    update_ensembled_cols(predictions)

    # Agreement of each individual model with the ensembled prediction.
    display_commonalities_stats(predictions, files)

    if not generate_csv:
        return
    dump_predictions_to_csv(predictions, csv_file)


### Listing models by their out of sample performance

Before we set out in our ensembling journey, let us list the output files corresponding to each model and their respective out of sample performances.

In [17]:
#kaggle_out_xgboost_sex_pclass_age_fare : 0.8305672733436958
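#kaggle_out_svc_names : 0.8010471204188482   (NOTE: the model name was blank here; presumably svc_names, the second of the "3 best models" ensembled first — confirm)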
#kaggle_out_svc_tickets : 0.7329842931937173
#kaggle_out_xgboost_cabin_null : 0.6925897239756053
#kaggle_out_xgboost_parch_sibsp_embarked_s : 0.6850822065638291

### Starting with the ensembler.

We start with the default ensembler with our 3 best models.

In [22]:
# Prediction files of the three models ensembled first.
files = [
    'kaggle_out_xgboost_sex_pclass_age_fare.csv',
    'kaggle_out_svc_names.csv',
    'kaggle_out_svc_tickets.csv',
]


In [24]:
# Inspect correlations and agreement only: no heatmap, no CSV written.
generate_ensembled_predictions_and_verify_results(
    files,
    generate_corr_heat_map=False,
    generate_csv=False,
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,Survived_or_not_model_1,Survived_or_not_model_2,Survived_or_not_model_3
Survived_or_not_model_1,1.0,0.875787,0.183321
Survived_or_not_model_2,0.875787,1.0,0.255624
Survived_or_not_model_3,0.183321,0.255624,1.0


                              COMMONALITY STATS

Number of entries to be predicted         : 418

Relevant model file                       : kaggle_out_xgboost_sex_pclass_age_fare.csv
Number of entries common with final model : 401

Relevant model file                       : kaggle_out_svc_names.csv
Number of entries common with final model : 411

Relevant model file                       : kaggle_out_svc_tickets.csv
Number of entries common with final model : 290


Comments : The correlation of the third model with the first 2 is very small, which makes it a good candidate for ensembling with them.

However, we have an idea here :

We have 3 models which do not perform greatly by themselves, but which have reasonably small correlation with each other. If we combine these 3 by ensembling, shouldn't we get a model that is more powerful than these 3 individual ones and that, at the same time, preserves the low correlation with the first 2 powerful models ?


### Ensemble of bottom 3 models

In [25]:
# Prediction files of the three weakest models.
files = [
    'kaggle_out_svc_tickets.csv',
    'kaggle_out_xgboost_cabin_null.csv',
    'kaggle_out_xgboost_parch_sibsp_embarked_s.csv',
]


In [26]:
# Inspect correlations and agreement only: no heatmap, no CSV written.
generate_ensembled_predictions_and_verify_results(
    files,
    generate_corr_heat_map=False,
    generate_csv=False,
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,Survived_or_not_model_1,Survived_or_not_model_2,Survived_or_not_model_3
Survived_or_not_model_1,1.0,0.29343,0.360471
Survived_or_not_model_2,0.29343,1.0,0.207858
Survived_or_not_model_3,0.360471,0.207858,1.0


                              COMMONALITY STATS

Number of entries to be predicted         : 418

Relevant model file                       : kaggle_out_svc_tickets.csv
Number of entries common with final model : 379

Relevant model file                       : kaggle_out_xgboost_cabin_null.csv
Number of entries common with final model : 356

Relevant model file                       : kaggle_out_xgboost_parch_sibsp_embarked_s.csv
Number of entries common with final model : 358


Comments : As expected the correlation between these models is very low. Therefore, let us dump the ensembled model values to a file so that we can use them for ensembling later.

In [27]:
# Same run as before, but this time the ensembled predictions are written
# to a CSV so that they can be fed into a later ensemble as one more model.
generate_ensembled_predictions_and_verify_results(
    files,
    generate_corr_heat_map=False,
    generate_csv=True,
    csv_file='kaggle_out_ensemble_bottom_three.csv',
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,Survived_or_not_model_1,Survived_or_not_model_2,Survived_or_not_model_3
Survived_or_not_model_1,1.0,0.29343,0.360471
Survived_or_not_model_2,0.29343,1.0,0.207858
Survived_or_not_model_3,0.360471,0.207858,1.0


                              COMMONALITY STATS

Number of entries to be predicted         : 418

Relevant model file                       : kaggle_out_svc_tickets.csv
Number of entries common with final model : 379

Relevant model file                       : kaggle_out_xgboost_cabin_null.csv
Number of entries common with final model : 356

Relevant model file                       : kaggle_out_xgboost_parch_sibsp_embarked_s.csv
Number of entries common with final model : 358


### Ensemble with the top 2 models.

Now, we proceed to use this ensembled model of the bottom 3 models as another model to be ensembled with the top 2. Let us see how correlations look.

In [28]:
# The two strongest models plus the ensemble built from the three weakest.
files = [
    'kaggle_out_xgboost_sex_pclass_age_fare.csv',
    'kaggle_out_svc_names.csv',
    'kaggle_out_ensemble_bottom_three.csv',
]


In [29]:
# Inspect correlations and agreement only: no heatmap, no CSV written.
generate_ensembled_predictions_and_verify_results(
    files,
    generate_corr_heat_map=False,
    generate_csv=False,
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,Survived_or_not_model_1,Survived_or_not_model_2,Survived_or_not_model_3
Survived_or_not_model_1,1.0,0.875787,0.268127
Survived_or_not_model_2,0.875787,1.0,0.318947
Survived_or_not_model_3,0.268127,0.318947,1.0


                              COMMONALITY STATS

Number of entries to be predicted         : 418

Relevant model file                       : kaggle_out_xgboost_sex_pclass_age_fare.csv
Number of entries common with final model : 403

Relevant model file                       : kaggle_out_svc_names.csv
Number of entries common with final model : 409

Relevant model file                       : kaggle_out_ensemble_bottom_three.csv
Number of entries common with final model : 303


Comment : The correlation looks to have increased slightly, but we strongly believe that this will yield a better estimate, as we are using a better ensembled model as one of the inputs.

NOTE : Ideally, we should have set aside some part of training data as test data and we could have tested out the performance of the ensembled model as well. I am not proceeding to do that here.

### Generate predictions file

In [30]:
# Final run: write the ensembled predictions to the submission CSV.
generate_ensembled_predictions_and_verify_results(
    files,
    generate_corr_heat_map=False,
    generate_csv=True,
    csv_file='kaggle_out_ensembled_model_predictions.csv',
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,Survived_or_not_model_1,Survived_or_not_model_2,Survived_or_not_model_3
Survived_or_not_model_1,1.0,0.875787,0.268127
Survived_or_not_model_2,0.875787,1.0,0.318947
Survived_or_not_model_3,0.268127,0.318947,1.0


                              COMMONALITY STATS

Number of entries to be predicted         : 418

Relevant model file                       : kaggle_out_xgboost_sex_pclass_age_fare.csv
Number of entries common with final model : 403

Relevant model file                       : kaggle_out_svc_names.csv
Number of entries common with final model : 409

Relevant model file                       : kaggle_out_ensemble_bottom_three.csv
Number of entries common with final model : 303
