# Investigating Misclassifications by Model

**Methods:**
>1. Load and concat data
>2. Identify misclassified cases
>3. Investigate the respective predictions by model

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.svm as skl_svm
import sklearn.cross_validation as skl_cv
import seaborn as sns
import os
import sys

base_path = '/home/lundi/Python/MNIST/'
sys.path.append(base_path + '/libraries/')

import time
import glob

import MNIST_model_functions as mmf
MNIST_model_functions = mmf.MNIST_model_functions()

## 1. Load and concat data

In [2]:
prediction_data_v1 = pd.DataFrame()

for filename in glob.glob(base_path + '/data/prediction_results/2016.11.7-*_results.csv'):
    prediction_data_v1 = pd.concat([prediction_data_v1, pd.read_csv(filename)])
prediction_data_v1 = prediction_data_v1.rename(columns = {'Unnamed: 0': 'datum_index'})
#prediction_data_v1 = prediction_data_v1.drop(['Unnamed: 0'], axis = 1)

## 2. Identify misclassified cases

In [3]:
prediction_data_v1.loc[prediction_data_v1['datum_index'] == 0]

Unnamed: 0,datum_index,Actual,Predicted,0,1,2,3,4,5,6,7,8,9,is_misclassified,Model
0,0,1,1,0.00079,0.97929,0.00323,0.00273,0.00052,0.00199,0.00121,0.00044,0.00931,0.00051,False,SVC_Poly
0,0,1,1,0.0,0.99959,5e-05,0.00013,1e-05,8e-05,1e-05,1e-05,0.00011,1e-05,False,GBM
0,0,1,1,0.0038,0.66675,0.04661,0.04493,0.00104,0.02654,0.00275,0.0005,0.20441,0.00267,False,LR
0,0,1,1,0.0,0.93533,0.01,0.00733,0.00067,0.018,0.00533,0.00133,0.02067,0.00133,False,RF


I will calculate the fraction of misclassifications for each datum

In [4]:
average_misclassification_fraction = prediction_data_v1.groupby(['datum_index'])['is_misclassified'].mean().reset_index()
average_misclassification_fraction = average_misclassification_fraction.rename(columns = {'is_misclassified': 'misclassified_frac'})

Now, I will merge this data onto the original prediction data

In [5]:
prediction_data_v2 = pd.merge(prediction_data_v1, average_misclassification_fraction, on = ['datum_index'])

Now I will grab cases where the misclassification is 0.5

In [6]:
split_classified_data = prediction_data_v2.loc[prediction_data_v2['misclassified_frac'] == 0.5].drop(['misclassified_frac'], axis=1)

## 3. Investigate the respective predictions by model

Let's predict which models predict together. To do this, I will pivot the table to yield predictions

In [7]:
split_classified_pivot_data = pd.pivot_table(split_classified_data[['datum_index','Actual','Predicted','is_misclassified','Model']], 
                   values = ['is_misclassified'], index = ['datum_index'], columns = ['Model'])

In [8]:
split_classified_pivot_data.corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,is_misclassified,is_misclassified,is_misclassified,is_misclassified
Unnamed: 0_level_1,Model,GBM,LR,RF,SVC_Poly
Unnamed: 0_level_2,Model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
is_misclassified,GBM,1.0,-0.156348,-0.49453,-0.435672
is_misclassified,LR,-0.156348,1.0,-0.359623,-0.335972
is_misclassified,RF,-0.49453,-0.359623,1.0,-0.203399
is_misclassified,SVC_Poly,-0.435672,-0.335972,-0.203399,1.0


A -1.0 here means that they don't predict the same at all. So GBM predicts quite differently from RF and SVC_Poly

Let's see the misclassifications by model and by number (using the overall data)

In [9]:
cm = sns.light_palette("green", as_cmap=True)

prediction_data_v2.groupby(['Model','Actual'])['is_misclassified'].mean().unstack().style.background_gradient(cmap=cm)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GBM,0.0384615,0.0322581,0.0880734,0.0979167,0.0628931,0.098081,0.0503876,0.0770751,0.106695,0.0753138
LR,0.0384615,0.0358423,0.12844,0.135417,0.0880503,0.200426,0.0445736,0.106719,0.171548,0.142259
RF,0.0222672,0.0250896,0.066055,0.0916667,0.048218,0.0746269,0.0329457,0.0731225,0.0857741,0.0983264
SVC_Poly,0.01417,0.0197133,0.0605505,0.0729167,0.0356394,0.0597015,0.0290698,0.0533597,0.0878661,0.0690377


In [10]:
prediction_data_for_stacking = prediction_data_v2[[u'datum_index', u'Actual', u'Predicted', u'0', u'1', u'2', u'3', u'4',
       u'5', u'6', u'7', u'8', u'9', u'Model']]

In [24]:
model_weights_by_number = pd.DataFrame(
             {'0': [0.04, 0.04, 0.022, 0.014],
              '1': [0.032, 0.035, 0.025, 0.02],
              '2': [0.088, 0.128, 0.066, 0.06],
              '3': [0.097, 0.135, 0.091, 0.073],
              '4': [0.063, 0.088, 0.048, 0.035],
              '5': [0.098, 0.2, 0.075, 0.06],
              '6': [0.05, 0.044, 0.032, 0.03],
              '7': [0.077, 0.106, 0.073, 0.053],
              '8': [0.106, 0.17, 0.087, 0.087],
              '9': [0.075, 0.14, 0.098, 0.069]},
        index = ['SVC_Poly','GBM', 'LR', 'RF'])

In [25]:
stacked_prediction = MNIST_model_functions.calculate_prediction_by_probs_with_weights(prediction_data_for_stacking, weights = model_weights_by_number)

In [26]:
import sklearn.metrics as skl_metrics
actuals = prediction_data_v1[['datum_index','Actual']].drop_duplicates()
new_pred_data = MNIST_model_functions.calculate_misclassifications_by_number(actuals, stacked_prediction)

new_confusion_matrix = pd.DataFrame(skl_metrics.confusion_matrix(
                            y_true = new_pred_data['Actual'].astype(str), 
                            y_pred = new_pred_data['Predicted'].astype(str)))

original_data = prediction_data_v1[['datum_index','Actual','Predicted','is_misclassified']].drop_duplicates()
original_confusion_matrix = pd.DataFrame(skl_metrics.confusion_matrix(
                            y_true = original_data['Actual'].astype(str), 
                            y_pred = original_data['Predicted'].astype(str)))

print 1.0 - original_data['is_misclassified'].mean()
original_confusion_matrix - new_confusion_matrix

0.855161516854


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,18,0,0,0,3,1,7,0,0,1
1,1,13,1,-1,1,1,2,1,1,2
2,7,10,15,7,5,3,13,8,11,4
3,6,5,7,23,0,19,3,2,12,3
4,1,0,4,0,28,-1,2,1,3,14
5,9,9,2,22,11,13,11,3,28,9
6,7,2,0,0,4,-4,31,1,0,0
7,2,8,10,2,8,0,2,17,3,24
8,1,13,8,16,7,21,4,4,20,11
9,5,6,5,4,21,5,2,20,7,14


In [27]:
print 1.0 - new_pred_data['is_misclassified'].mean()
new_confusion_matrix

0.935612877425


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,471,0,2,1,0,6,4,0,6,4
1,0,535,2,6,0,2,2,2,9,0
2,1,2,512,4,6,0,1,9,9,1
3,1,0,14,438,0,7,1,2,11,6
4,0,0,2,1,441,1,2,1,2,27
5,1,0,3,8,0,441,5,0,6,5
6,3,0,4,1,6,14,480,1,6,1
7,1,1,8,1,2,1,1,475,1,15
8,2,1,5,9,1,9,1,2,440,8
9,3,1,3,7,6,3,0,6,3,446
