In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
    #for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import Some Tensorflow and Keras related packages & some model APIs

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.image import ImageDataGenerator
from keras.utils import np_utils
import matplotlib.pyplot as plt
from keras.applications.xception import preprocess_input


# Model
This version uses an Xception Net as the base model. The final layer is removed and replaced with 2 Dense (FC) layers, with a dropout layer (65% and 25%) before each of them respectively. The base layers are frozen and the 2 FC layers are trained for 8 epochs till convergence. Then, the base layers are unfrozen and the entire model is trained with a very low learning rate (1e-05) for 20 epochs. 

For this error analysis, we load the model we had trained earlier, for a total of 30 epochs. 

In [None]:
# Restore the previously trained classifier from its saved .h5 file.
model_path = '../input/pretrained-model-5/model_5.h5'
model = tf.keras.models.load_model(model_path)

# Input Pipeline

In the following section, an input pipeline is built. We are only concerned with the validation data, because that is what the error analysis is performed on: the test data has no labels, and the training data has already been learnt by the model.

In [None]:
# Build the validation label table from val_annotations.txt.
# The file is tab-separated with no header row; only the image file name and
# its class id are kept, the X/Y/H/W columns are discarded.
annotation_columns = ['File', 'Class', 'X', 'Y', 'H', 'W']
val_data = pd.read_csv(
    '/kaggle/input/image-detect/val/val_annotations.txt',
    sep='\t',
    header=None,
    names=annotation_columns,
).drop(columns=['X', 'Y', 'H', 'W'])
val_data.head(5)

In [None]:
# Data generator for the validation images (the only generator this notebook creates).
# No augmentation is configured: the only transformation is the `preprocess_input`
# function imported from keras.applications.xception.

valid_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)


In [None]:
# Model / data parameters
num_classes = 200          # number of classes
# Input width and height. They are passed to the generators as target_size=(n_x, n_y);
# both are 150, so the order they are passed in does not affect the result.
n_x, n_y = 150, 150
n_c = 3                    # number of channels ('rgb')

In [None]:
# Creating the generators

# Validation generator: file names ('File') and class labels ('Class') come from
# val_data, created earlier. Shuffling is disabled for this generator.
validation_generator = valid_datagen.flow_from_dataframe(
    val_data,
    directory='/kaggle/input/image-detect/val/images/',
    x_col='File',
    y_col='Class',
    target_size=(n_x, n_y),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=256,
    shuffle=False,
    seed=42,
)




In [None]:
# Creates a dictionary mapping class ids (wnids.txt) to a short word description (words.txt)
words = pd.read_csv('/kaggle/input/image-detect/words.txt', sep='\t', header=None, names=['Class', 'Words'])
word_id = pd.read_csv('/kaggle/input/image-detect/wnids.txt', sep='\t', header=None, names=['Class'])['Class'].values

# Lookup table: class id -> first comma-separated entry of its description.
# The value is read directly from the column instead of parsing Series.to_string()
# output, which avoids display-formatting artefacts and a full scan of words.txt
# for every class id.
first_word = (
    words.drop_duplicates('Class')      # keep the first row if an id is listed twice
         .set_index('Class')['Words']
         .str.split(',')
         .str[0]
         .str.strip()
)

# Ids that do not appear in words.txt fall back to the raw id, so the mapping always
# holds a usable title instead of the repr of an empty Series.
id2words = {ids: first_word.get(ids, ids) for ids in word_id}
    
#print(id2words)


# Visualizing the Generator Images

This section creates a grid to visualize the generator inputs after preprocessing (the validation generator applies no augmentation).

In [None]:
from PIL import Image
from matplotlib import cm
from mpl_toolkits.axes_grid1 import ImageGrid
import math
%matplotlib inline

def _grid_title(label):
    """Turn one label into a title string.

    `label` may be a class id / free text (str), an integer class index, or a
    one-hot / score vector. Indices are translated through the notebook-level
    `class_mapping`, and class ids through `id2words`; ids without a
    description are shown as-is.
    """
    if isinstance(label, str):
        return id2words.get(label, label)
    label = np.asarray(label)
    index = int(np.argmax(label)) if label.ndim else int(label)
    class_id = class_mapping[index]
    return id2words.get(class_id, class_id)


def show_grid(image_list,nrows,ncols,label_list=None,show_labels=False,savename=None,figsize=(10,10),showaxis='off'):
    """Display images in an nrows x ncols grid.

    Parameters
    ----------
    image_list : list or array
        A list of images accepted by ``imshow``, or an array indexed as
        (image, row, column, channel). Arrays with 1 or 3 channels are split
        into a list of per-image arrays.
    nrows, ncols : int
        Grid dimensions. If fewer than nrows*ncols images are supplied the
        remaining cells are left empty.
    label_list : sequence, optional
        One label per image, used for the titles when `show_labels` is True.
        Entries may be strings, integer class indices or one-hot vectors
        (see `_grid_title`). When omitted, the notebook-level `y_int` is used,
        which matches the previous behaviour of this function.
    show_labels : bool
        Whether to put a title above each image.
    savename : str, optional
        If given, the figure is saved to this path.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    showaxis : str
        Passed to ``ax.axis`` for every populated cell (default 'off').
    """
    if not isinstance(image_list, list):
        if image_list.shape[-1] == 1:
            # Drop the singleton channel axis so imshow receives 2-D images.
            image_list = [image_list[i, :, :, 0] for i in range(image_list.shape[0])]
        elif image_list.shape[-1] == 3:
            image_list = [image_list[i, :, :, :] for i in range(image_list.shape[0])]

    if show_labels and label_list is None:
        # Legacy fallback: titles used to be read from this global unconditionally.
        label_list = y_int

    n_images = len(image_list)
    fig = plt.figure(None, figsize, frameon=False)
    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                     nrows_ncols=(nrows, ncols),
                     axes_pad=0.3,  # pad between axes in inch.
                     share_all=True,
                     )
    for i in range(nrows * ncols):
        ax = grid[i]  # The grid object works as a list of axes.
        if i >= n_images:
            # Fewer images than cells: leave the cell blank instead of raising IndexError.
            ax.axis('off')
            continue
        ax.imshow(image_list[i], cmap='Greys_r')
        ax.axis(showaxis)
        if show_labels and i < len(label_list):
            ax.set_title(_grid_title(label_list[i]))
    if savename is not None:
        plt.savefig(savename, bbox_inches='tight')

In [None]:
# Reverse lookup: integer class index (position in the one-hot labels) -> class id.
class_mapping = {v:k for k,v in validation_generator.class_indices.items()}

x,y = next(validation_generator)
#Get class int val from one hot encoded labels
y_int = np.argmax(y,axis=-1)

# Visualising the validation dataset.
# The Xception preprocess_input scales pixels to [-1, 1], while imshow expects float
# RGB data in [0, 1] and clips anything outside that range. Map the batch back to
# [0, 1] for display only; `x` itself is left untouched.
x_display = np.clip((x + 1.0) / 2.0, 0.0, 1.0)
show_grid(x_display,4,8,label_list=y,show_labels=True,figsize=(20,10))

In [None]:
# Re-create the validation generator for the error analysis: same data and labels
# (val_data), but with shuffling enabled (seed=42) and batches of 1000 images.
validation_generator = valid_datagen.flow_from_dataframe(
    val_data,
    directory='/kaggle/input/image-detect/val/images/',
    x_col='File',
    y_col='Class',
    target_size=(n_x, n_y),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=1000,
    shuffle=True,
    seed=42,
)



# Error Analysis

We get predictions and perform 2 analyses:

* Randomly visualizing some of the incorrect labels
* Frequency of incorrect predictions for every class

Since our validation generator randomly shuffles the inputs, any sequential subset of the inputs that we consider should give us a good approximation of the general performance of our model.

In [None]:
# Draw one batch from the generator and decode its one-hot targets back to class ids.
x, y = next(validation_generator)
y_int = np.argmax(y, axis=-1)
labels = [class_mapping[class_index] for class_index in y_int]

print(labels[:5])
print(len(labels))

In [None]:
# Run the model on the sampled batch and decode every prediction to its class id.
pred = model.predict(x)
predicted_class_indices = np.argmax(pred, axis=1)

# Use a dedicated name for the index -> class id lookup. Rebinding `labels` to this
# dict would overwrite the ground-truth label list built from the same batch.
index_to_class = {index: class_id for class_id, index in validation_generator.class_indices.items()}
predictions = [index_to_class[k] for k in predicted_class_indices]

In [None]:
# Show the first five predicted class ids, then the number of predictions.
print(predictions[:5], len(predictions), sep='\n')

In [None]:
# Ground-truth class ids for the batch, decoded from the integer class indices.
labels = [class_mapping[class_index] for class_index in y_int]

print(labels[:5])
print(len(labels))

In [None]:
# Positions in the batch where the predicted class id differs from the true one.
wrong_preds = [idx for idx in range(len(predictions)) if predictions[idx] != labels[idx]]

# Number of errors, then the error rate over the batch.
error_count = len(wrong_preds)
print(error_count)
print(error_count/len(labels))

In [None]:
# Gather, for every misclassified position, the image plus the predicted and true
# class ids, and the word descriptions of both.
wrong_images = [x[idx] for idx in wrong_preds]
predicted_labels = [predictions[idx] for idx in wrong_preds]
actual_labels = [labels[idx] for idx in wrong_preds]
predicted_classes = [id2words[class_id] for class_id in predicted_labels]
actual_classes = [id2words[class_id] for class_id in actual_labels]
    

In [None]:
# Inspect one misclassified image together with its predicted and true descriptions.
# The requested index is clamped so the cell still runs when the batch produced
# fewer than 52 wrong predictions.
i = min(51, len(wrong_images) - 1)
if i < 0:
    print("No incorrect predictions to display.")
else:
    # The batch was scaled to [-1, 1] by the Xception preprocess_input; imshow clips
    # float RGB data to [0, 1], so map the image back to that range before plotting.
    plt.imshow(np.clip((wrong_images[i] + 1.0) / 2.0, 0.0, 1.0))
    print(f"Predicted Label: {id2words[predicted_labels[i]]}")
    print(f"Correct Label: {id2words[actual_labels[i]]}")

In [None]:
# Count, for each true class, how many images of the batch were misclassified.
unique, counts = np.unique(actual_classes, return_counts=True)

# Two-column table with one row per class: description, number of errors.
frequencies = np.array([unique, counts]).T

print(frequencies)