In [1]:
# basic
import os,datetime
# numerical 
import numpy as np
# visualisation
import matplotlib.pyplot as plt
# model_func
from tools.model_func import *

Using TensorFlow backend.


In [2]:
# GPU selection for this kernel: CUDA orders devices by PCI bus id and
# only the device with id "1" is made visible.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

# Functions

### get_cascade_sm (cascaded softmax)

In [3]:
# imports
from scipy.special import softmax
import scipy.sparse as sp
import pickle

# function
def get_cascade_sm(y_preds,IN_DIR):
    """Convert per-level scores into cascaded softmax probabilities.

    The first level is a plain row-wise softmax of ``y_preds[0]``. For every
    following level, the scores of each parent's children are soft-maxed among
    themselves and multiplied by the cascaded probability of that parent from
    the previous level.

    Parameters
    ----------
    y_preds : sequence of 2-D arrays
        One ``(n_samples, n_classes_at_level)`` score array per level.
    IN_DIR : str
        Directory containing ``parent_to_child.pkl``. The unpickled object is
        indexed by level and yields a mapping ``parent column -> child columns``
        of the next level.

    Returns
    -------
    list of numpy.ndarray
        One dense array per level, each with the same shape as the matching
        entry of ``y_preds``. Child columns not listed under any parent are 0.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; IN_DIR must
    # only point at trusted data.
    # ``child_to_siblings.pkl`` was previously loaded here but never used, so it
    # is no longer read.
    with open(os.path.join(IN_DIR,'parent_to_child.pkl'),'rb') as fh:
        parent_to_child = pickle.load(fh)

    cascade_sm = [softmax(y_preds[0],axis=1)]
    for i in range(len(y_preds)-1):
        parent_probs = cascade_sm[-1]
        child_scores = y_preds[i+1]
        n_samples = parent_probs.shape[0]
        n_children = child_scores.shape[1]
        sample_idx = np.arange(n_samples)

        data = []
        row_ind = []
        col_ind = []
        for key,val in parent_to_child[i].items():
            # softmax restricted to the siblings under this parent
            child_sm = softmax(child_scores[:,val],axis=1)
            data.append(np.multiply(child_sm,parent_probs[:,key,np.newaxis]).reshape(-1))
            row_ind.append(np.repeat(sample_idx,len(val)))
            col_ind.append(np.tile(val,n_samples))

        if not data:
            # no parent/child links at this level: nothing receives probability
            cascade_sm.append(np.zeros((n_samples,n_children)))
            continue

        data = np.concatenate(data)
        row_ind = np.concatenate(row_ind)
        col_ind = np.concatenate(col_ind)
        # An explicit shape keeps the output aligned with the level's label
        # space even when the highest-indexed columns are nobody's child.
        level_sm = sp.csr_matrix((data,(row_ind,col_ind)),
                                 shape=(n_samples,n_children)).toarray()
        cascade_sm.append(level_sm)
    return cascade_sm

### get_pAtk (p@k)

In [4]:
def get_pAtk(y_true,y_pred,k,per_label=False):
    """Fraction of samples whose true class is among the first ``k`` predictions.

    Parameters
    ----------
    y_true : array-like, shape (n_samples, n_labels)
        Per-sample label scores; the true class of a row is its argmax.
    y_pred : array-like, shape (n_samples, >=k)
        Class indices per sample; only the first ``k`` columns are inspected.
    k : int
        Number of leading predictions to consider.
    per_label : bool, default False
        If True, return one value per label instead of the overall mean.

    Returns
    -------
    float or numpy.ndarray
        The overall hit rate, or an array of length ``n_labels`` with the hit
        rate over the samples of each true label (``nan`` for labels that have
        no samples).
    """
    # Coerce once so list / DataFrame inputs work in both branches.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_true_sparse = np.argmax(y_true,axis=1)
    Atk = np.any(y_true_sparse[:,np.newaxis]==y_pred[:,:k],axis=1).astype(int)
    if not per_label:
        return np.mean(Atk)

    lab = []
    for i in range(y_true.shape[-1]):
        mask = y_true_sparse==i
        # avoid the "mean of empty slice" warning for labels with no samples
        lab.append(np.mean(Atk[mask]) if mask.any() else np.nan)
    return np.array(lab)

# confusion matrix

- confusion matrix with classes in different hierarchy
- plot cascade softmax matrix in colour by true classes
- find the relationship between the number of training examples per class and accuracy

In [14]:
# Run configuration: input data directory and the trained-model output directory.
IN_DIR = "data/sic_hierarchy"
MODEL_DIR = "outputs/190730_202748_attention"
# The model name is whatever follows the last underscore of MODEL_DIR.
model_name = MODEL_DIR.rsplit('_', 1)[-1]

In [15]:
# load model
from keras.models import model_from_json
import tensorflow as tf

In [16]:
# Load the inputs and one label array per hierarchy level, then rebuild the
# network and restore its trained weights.
# NOTE(review): get_input / get_embedding_layer / get_model come from
# tools.model_func; their exact contracts are not visible here.
x_train,y_trains,x_test,y_tests = get_input(mode = 'cat',in_dir = IN_DIR)
_,max_sequence_length = x_test.shape
labels_dims = [y.shape[-1] for y in y_tests]  # number of classes at each level
embedding_layer = get_embedding_layer(IN_DIR)
model = get_model(model_name, max_sequence_length, labels_dims, embedding_layer)
# Pass the path itself: load_weights expects a file path, and handing it an
# open() handle left that file object unclosed.
model.load_weights(os.path.join(MODEL_DIR,'weights.h5'))

In [17]:
# Raw model outputs for the test inputs (progress shown via verbose=1).
yp = model.predict(
    x_test,
    verbose=1,
)



# p@k for cascaded softmax

In [18]:
# Cascaded probabilities per hierarchy level, then class indices per sample
# ordered from most to least probable.
y_probs = get_cascade_sm(yp, IN_DIR)
y_inds = []
for level_probs in y_probs:
    ascending = np.argsort(level_probs, axis=1)
    y_inds.append(ascending[:, ::-1])

In [19]:
# One output line per hierarchy level: p@1 followed by p@5.
for i, level_true in enumerate(y_tests):
    level_ranked = y_inds[i]
    p_at_1 = get_pAtk(level_true, level_ranked, 1)
    p_at_5 = get_pAtk(level_true, level_ranked, 5)
    print(p_at_1, p_at_5)

0.6737790355571744 0.9211241559023788
0.6118902182745384 0.8459189105886968
0.5084689960492028 0.7464414805682423
0.49161506346493317 0.7364804281431253


# visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
from sklearn.metrics import confusion_matrix
from collections import Counter

In [None]:
# Hierarchy level to visualise (index into the per-level lists).
ii = 0
y_prob, y_ind = y_probs[ii], y_inds[ii]
y_true, y_train = y_tests[ii], y_trains[ii]

In [None]:
# train class distribution: number of training samples per class
L = y_prob.shape[-1]
train_labels = np.argmax(y_train, axis=1)
class_train_count = Counter(train_labels)
counts = []
for label in range(L):
    # Counter returns 0 for classes that never occur in the training set
    counts.append(class_train_count[label])
plt.bar(range(L), counts)
plt.show()

In [None]:
# confusion matrix between the true class and the top-1 predicted class
# `labels` must be passed by keyword: it is keyword-only in current
# scikit-learn releases, so the positional form fails there.
cm = confusion_matrix(np.argmax(y_true,axis=1),
                      np.argmax(y_prob,axis=1),
                      labels=np.arange(y_true.shape[-1]))
# normalise each row by its total; a row whose total is 0 becomes NaN (0/0)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
df_cm = pd.DataFrame(cm, index = range(L),
                     columns = range(L))
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=False)

## effect of class imbalance

In [None]:
# per label pAtk, plotted against the number of training samples per class
k = 5
pat1 = get_pAtk(y_true,y_ind,1,per_label=True)
# `k` was defined but the call hard-coded 5; the variable name pat5 is kept so
# later cells that refer to it keep working.
pat5 = get_pAtk(y_true,y_ind,k,per_label=True)
counts = np.array(counts)
ind = np.argsort(counts)[::-1]  # classes ordered from most to least frequent

fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

# left axis: per-class p@1 and p@k
ax1.plot(pat1[ind],'g--',alpha=0.5,label='p@1')
ax1.plot(pat5[ind],'b-',alpha=0.5,label='p@{}'.format(k))
ax1.set_xlabel('classes sorted by train count (descending)')
ax1.set_ylabel(r'p@k',color='g')
ax1.tick_params(axis='y', labelcolor='g')
ax1.set_ylim(0)
ax1.legend()

# right axis: training counts on a log scale
ax2.plot(counts[ind],'r')
ax2.set_ylabel('train counts',color='r')
ax2.tick_params(axis='y', labelcolor='r')
ax2.set_yscale('log')
plt.title('class imbalance and class pAtk')