
# Basic Overview
Given all the models we have built, the idea is to ensemble them intelligently so as to get a more powerful one.

Source of data : https://www.kaggle.com/c/digit-recognizer/data

In [157]:
import pandas as pd
import numpy as np
import os
import re

In [158]:
import seaborn as sns
import matplotlib.pyplot as plt

### Listing all relevant routines that will be used for ensembling.

In [159]:
def populate_model_files_data(files):
    """Combine the per-model prediction CSVs into a single DataFrame.

    Every file must provide an ``ImageId`` column plus the model's
    predictions. Two layouts are recognised:

    * a ``prediction`` column is present: the file is treated as a validation
      file and its ``label`` column is copied into the result as ``label``;
    * no ``prediction`` column: the predictions are read from a ``label``
      column or, failing that, a ``Label`` column.

    Parameters
    ----------
    files : sequence of str
        Paths of the CSV files, one per model. The predictions of the i-th
        file (1-based) are stored in column ``prediction_model_<i>``.

    Returns
    -------
    pandas.DataFrame
        One ``prediction_model_<i>`` column per file, plus ``ImageId`` (and
        ``label`` for validation files). ``ImageId``/``label`` are overwritten
        on every iteration, so they come from the last file read; values are
        copied positionally, i.e. no alignment on ``ImageId`` is performed.
        An empty DataFrame is returned when ``files`` is empty.

    Raises
    ------
    KeyError
        If a file has none of the ``prediction``/``label``/``Label`` columns,
        or lacks ``ImageId`` (or ``label`` in a validation file).
    """
    master_df = pd.DataFrame()
    for model_number, csv_file in enumerate(files, start=1):
        data_df = pd.read_csv(csv_file, index_col=None)

        # A 'prediction' column only exists in files that also carry the
        # true labels; otherwise the predictions live under 'label'/'Label'.
        has_true_labels = 'prediction' in data_df.columns
        if not has_true_labels:
            if 'label' in data_df.columns:
                data_df = data_df.rename(columns={'label': 'prediction'})
            elif 'Label' in data_df.columns:
                data_df = data_df.rename(columns={'Label': 'prediction'})
            else:
                # Same exception type as the bare column lookup would raise,
                # but naming the offending file.
                raise KeyError(
                    "No 'prediction', 'label' or 'Label' column in {0} "
                    "(found: {1})".format(csv_file, list(data_df.columns)))

        master_df['prediction_model_' + str(model_number)] = data_df['prediction'].values
        master_df['ImageId'] = data_df['ImageId'].values
        if has_true_labels:
            master_df['label'] = data_df['label'].values
    return master_df

In [160]:
def display_corr_info(master_df, generate_corr_heat_map):
    """Show how strongly the model outputs correlate with one another.

    The correlation matrix (``DataFrame.corr`` with its default settings) is
    computed over every column except ``ImageId`` and any column whose name
    contains ``label``. It is rendered with ``display`` and, when
    ``generate_corr_heat_map`` is truthy, also drawn as a seaborn heat map.
    """
    compared_cols = []
    for col in master_df.columns:
        if col in ['ImageId'] or 'label' in col:
            continue
        compared_cols.append(col)

    print("                              CORRELATION MATRIX OF MODEL OUTPUTS")
    corr_matrix = master_df[compared_cols].corr()
    display(corr_matrix)

    if not generate_corr_heat_map:
        return
    fig, ax = plt.subplots(1, 1, figsize=(16, 9))
    sns.heatmap(corr_matrix, ax=ax)

In [161]:
def get_most_frequent_entry_3(a, b, c):
    """Return the majority value among three predictions.

    If at least two of the three values are equal, that shared value is
    returned. If all three differ there is no majority: a notice is printed
    and the first value ``a`` is returned.

    The values only need to support ``==``; they are never added together,
    so non-numeric predictions (e.g. strings or ``None``) are accepted.
    """
    if a == b:
        return a
    if b == c:
        return b
    if a == c:
        return c
    print("All three values are different. Proceeding with the first one")
    return a

In [162]:
def update_ensembled_cols(master_df):
    """Add the ensembled prediction column to ``master_df`` (in place).

    The model columns are those whose name contains ``prediction_model_``.

    * With exactly three model columns, each row is resolved by
      ``get_most_frequent_entry_3`` on ``prediction_model_1..3``.
    * With any other non-zero number of model columns, each row gets the
      value predicted by the most models; ties go to the value that appears
      first in column order. For three models this rule gives the same
      result as the branch above.
    * With no model columns, no ``prediction_ensembled`` column is added.

    Afterwards the column index is printed and ``master_df`` is sorted in
    place by ``ImageId``. Nothing is returned.
    """
    model_cols = [col for col in master_df.columns if 'prediction_model_' in col]
    num_files = len(model_cols)
    if num_files == 3:
        master_df['prediction_ensembled'] = master_df.apply(
            lambda x : get_most_frequent_entry_3(x['prediction_model_1'],
                                                 x['prediction_model_2'],
                                                 x['prediction_model_3']), axis=1)
    elif num_files > 0:
        # Previously any count other than 3 silently left the column out, so
        # later steps failed on the missing 'prediction_ensembled' column.
        # max() keeps the first maximal item, i.e. ties favour earlier models.
        master_df['prediction_ensembled'] = [
            max(votes, key=votes.count)
            for votes in master_df[model_cols].values.tolist()
        ]
    print(master_df.columns)
    master_df.sort_values(by=['ImageId'], inplace=True)

In [163]:
def display_commonalities_stats(master_df, files):
    """Print how often each model agrees with the ensembled prediction.

    After a header and the total row count, one pair of lines is printed per
    entry of ``files``: the file name and the number of rows in which
    ``prediction_model_<i>`` (``i`` being the 1-based position of the file)
    equals ``prediction_ensembled``.
    """
    print("                              COMMONALITY STATS\n")
    print("Number of entries to be predicted         : {0}".format(len(master_df)))
    for model_number, model_file in enumerate(files, start=1):
        print("\nRelevant model file                       : {0}".format(model_file))
        model_col = 'prediction_model_' + str(model_number)
        agrees_with_ensemble = master_df[model_col] == master_df['prediction_ensembled']
        print("Number of entries common with final model : {0}".format(np.sum(agrees_with_ensemble)))

In [164]:
def display_correctness_files(files):
    """Print the correctness report of every prediction file in ``files``.

    A header is printed first, then ``display_correctness_each_file`` is
    called once per file in the given order, and finally a blank separator.
    """
    print("                              CORRECTNESS STATS\n")
    for model_file in files:
        display_correctness_each_file(model_file)
    print("\n")

In [165]:
def display_correctness_each_file(rel_csv_file):
    """Print the share of rows in a CSV whose prediction matches the label.

    The file is read with ``pd.read_csv`` and must contain ``label`` and
    ``prediction`` columns. The file name and the percentage of rows where
    the two columns are equal (two decimals) are printed.
    """
    predictions_df = pd.read_csv(rel_csv_file)
    is_correct = predictions_df['label'] == predictions_df['prediction']
    num_rows = len(predictions_df)
    correctness_percent = np.sum(is_correct) * 100.0 / num_rows
    print("\nRelevant model file                       : {0}".format(rel_csv_file))
    print("Percent of correct predictions            : {:0.2f}".format(correctness_percent))

In [166]:
def dump_predictions_to_csv(master_df, csv_file, on_test_data):
    """Write the ensembled predictions to ``csv_file`` (no index column).

    * ``on_test_data`` falsy: columns ``ImageId``, ``prediction`` (the
      ensembled prediction) and ``label`` are written.
    * ``on_test_data`` truthy: columns ``ImageId`` and ``Label`` are written,
      with every ensembled prediction passed through ``int``.

    ``master_df`` itself is not modified.
    """
    if on_test_data:
        output_df = master_df[['ImageId', 'prediction_ensembled']].copy()
        output_df['prediction_ensembled'] = output_df['prediction_ensembled'].apply(int)
        output_df = output_df.rename(columns={'prediction_ensembled' : 'Label'})
    else:
        selected = master_df[['ImageId', 'prediction_ensembled', 'label']]
        output_df = selected.rename(columns={'prediction_ensembled' : 'prediction'})

    output_df.to_csv(csv_file, index=False)

In [167]:
def generate_ensembled_predictions_and_verify_results(files,
                                                      generate_corr_heat_map=False,
                                                      generate_csv=False,
                                                      csv_file='temp.csv',
                                                      on_test_data=False):
    """Run the whole ensembling pipeline over the given prediction files.

    Steps, in order:

    1. load the per-model predictions (``populate_model_files_data``);
    2. show the correlation between model outputs, optionally as a heat map
       (``display_corr_info``);
    3. unless ``on_test_data`` is truthy, report each file's correctness
       (``display_correctness_files``);
    4. add the ensembled prediction (``update_ensembled_cols``);
    5. report how often each model agrees with the ensemble
       (``display_commonalities_stats``);
    6. if ``generate_csv`` is truthy, write the result to ``csv_file``
       (``dump_predictions_to_csv``).

    Nothing is returned.
    """
    ensemble_df = populate_model_files_data(files)

    # Correlation amongst the predictors, as a matrix and optionally a heat map.
    display_corr_info(ensemble_df, generate_corr_heat_map)

    if not on_test_data:
        display_correctness_files(files)

    # Core step: pick the ensembled prediction for every row.
    update_ensembled_cols(ensemble_df)

    # How often does each individual model agree with the ensemble?
    display_commonalities_stats(ensemble_df, files)

    if not generate_csv:
        return
    dump_predictions_to_csv(ensemble_df, csv_file, on_test_data)


### Starting with the ensembler.

We start with the default ensembler with our 3 best models.

In [168]:
# Prediction files of the three models to ensemble (validation_* CSVs),
# in the order that determines their prediction_model_<i> column number.
files = [
    'validation_boosting_trees_xgboost.csv',
    'validation_randomforest_sklearn.csv',
    'validation_dnn_tensorflow.csv',
]


In [169]:
# Ensemble the files listed in `files` and save the result so that it can be
# scored like any single model's output.
generate_ensembled_predictions_and_verify_results(
    files=files,
    generate_csv=True,
    csv_file='validation_ensembled_model.csv',
    generate_corr_heat_map=False,
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,prediction_model_1,prediction_model_2,prediction_model_3
prediction_model_1,1.0,0.974931,0.970737
prediction_model_2,0.974931,1.0,0.966139
prediction_model_3,0.970737,0.966139,1.0


                              CORRECTNESS STATS


Relevant model file                       : validation_boosting_trees_xgboost.csv
Percent of correct predictions            : 97.23

Relevant model file                       : validation_randomforest_sklearn.csv
Percent of correct predictions            : 96.90

Relevant model file                       : validation_dnn_tensorflow.csv
Percent of correct predictions            : 97.12


All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
Al

### Let us see how our ensembled model performs

In [170]:
# Score the ensembled output written by the previous step.
display_correctness_each_file(rel_csv_file='validation_ensembled_model.csv')


Relevant model file                       : validation_ensembled_model.csv
Percent of correct predictions            : 97.52


### Comments

The results are in line with expectations: the ensembled model reaches 97.52% accuracy on the validation set, a slight improvement over the best individual model (97.23%).

### Making predictions on the test data with the ensembled model

In [171]:
  # Per-model prediction files for the test data, listed in the same model
  # order as the validation files above.
  # NOTE: this rebinds `files`, replacing the validation file list.
  files = [
        'submission_boosting_trees_xgboost.csv',
        'submission_randomforest_sklearn.csv',
        'submission_dnn_tensorflow.csv'
        ]


In [172]:
# Ensemble the files listed in `files` as test data: the per-file correctness
# report is skipped and the output is written with ImageId/Label columns.
generate_ensembled_predictions_and_verify_results(
    files=files,
    on_test_data=True,
    generate_csv=True,
    csv_file='submission_ensembled_model.csv',
    generate_corr_heat_map=False,
)

                              CORRELATION MATRIX OF MODEL OUTPUTS


Unnamed: 0,prediction_model_1,prediction_model_2,prediction_model_3
prediction_model_1,1.0,0.977069,0.967903
prediction_model_2,0.977069,1.0,0.966542
prediction_model_3,0.967903,0.966542,1.0


All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All three values are different. Proceeding with the first one
All thre