In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, re
import pickle

from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
import scipy.sparse

Using TensorFlow backend.


In [2]:
# Output directory of the model run being evaluated; the prediction file is read from here.
MODEL_DIR = 'outputs/190725_160501_xmlcnn'
# Directory with the prepared inputs: x/y arrays, mlb_* pickles and the tokenizer pickle.
IN_DIR = 'data/sic_hierarchy'

In [3]:
# Label set to evaluate; selects the `mlb_<mode>*` pickle and the `y_*_<mode>*` label files.
mode = 'hierarchy'
# os.listdir returns entries in arbitrary order, so sort the matches to make
# `mlb_dirs[0]` deterministic (get_input sorts its file listings as well).
mlb_dirs = sorted(os.path.join(IN_DIR,d) for d in os.listdir(IN_DIR) if d.startswith('mlb_{}'.format(mode)))

In [4]:
# Inspect which prepared files are available in the input directory.
os.listdir(IN_DIR)

['y_test_cat1.npz',
 'y_train_hierarchy.npz',
 'mlb_cat2.pkl',
 'embedding_matrix.npy',
 'mlb_cat1.pkl',
 'y_train_cat0.npz',
 'y_train_cat1.npz',
 'x_train.npy',
 'y_train_cat3.npz',
 'mlb_hierarchy.pkl',
 'mlb_cat3.pkl',
 'y_test_cat2.npz',
 'y_test_cat0.npz',
 'mlb_cat0.pkl',
 'y_train_cat2.npz',
 'y_test_hierarchy.npz',
 'y_test_cat3.npz',
 'x_test.npy',
 'tokenizer.pkl']

# Load inputs, labels and predictions

In [5]:
# from run_model.py
def get_input(in_dir,mode):
    """Load the input arrays and the label matrices for the train and test splits.

    Parameters
    ----------
    in_dir : str
        Directory containing ``x_train.npy`` / ``x_test.npy`` and the sparse
        ``y_train_*`` / ``y_test_*`` ``.npz`` label files.
    mode : str
        Label-file selector: every file whose name starts with
        ``y_train_<mode>`` (resp. ``y_test_<mode>``) is loaded.

    Returns
    -------
    tuple
        ``(x_train, y_trains, x_test, y_tests)``. The ``x_*`` values are the
        arrays stored in the ``.npy`` files; the ``y_*`` values are lists of
        dense matrices, one per matching file, in sorted file-name order.
    """
    def _load_split(x_name, y_prefix):
        # Inputs first, then every label file carrying the requested prefix,
        # visited in sorted name order so the list order is reproducible.
        features = np.load(os.path.join(in_dir, x_name))
        label_paths = []
        for name in sorted(os.listdir(in_dir)):
            if name.startswith(y_prefix):
                label_paths.append(os.path.join(in_dir, name))
        labels = [scipy.sparse.load_npz(path).todense() for path in label_paths]
        return features, labels

    x_train, y_trains = _load_split('x_train.npy', 'y_train_{}'.format(mode))
    x_test, y_tests = _load_split('x_test.npy', 'y_test_{}'.format(mode))
    return x_train,y_trains,x_test,y_tests

In [6]:
# Show the mlb pickle path(s) matched for the selected mode.
mlb_dirs

['data/sic_hierarchy/mlb_hierarchy.pkl']

In [7]:
# Load the pickled label binarizer for the selected mode (its `classes_` are used below).
# The context manager guarantees the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code - only load artifacts you trust.
with open(mlb_dirs[0],'rb') as mlb_file:
    mlb = pickle.load(mlb_file)

In [8]:
# Load the pickled tokenizer used to turn index sequences back into text.
# The context manager guarantees the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code - only load artifacts you trust.
with open(os.path.join(IN_DIR,'tokenizer.pkl'),'rb') as tok_file:
    tok = pickle.load(tok_file)

In [9]:
# Map each class position in the binarizer's `classes_` to its class label.
cls_dict = dict(enumerate(mlb.classes_))

In [10]:
# Load the train and test inputs together with their label matrices for the selected mode.
x_train,y_trains,x_test,y_tests = get_input(IN_DIR,mode)

In [11]:
# Load the model's saved predictions (np.loadtxt yields a float array by default).
# NOTE(review): the cells below treat each row as class indices for one test
# sample, best prediction first - confirm against the code that wrote this file.
y_pred = np.loadtxt(os.path.join(MODEL_DIR,'pred_outputs0.txt'))

In [12]:
# Column index of the largest entry in each row of the first test label matrix,
# flattened to a 1-D array. np.argmax returns only the first maximum per row.
# NOTE(review): a row with several labels set would be reduced to a single
# class here - confirm that each test sample carries exactly one label.
y_true = np.argmax(y_tests[0],axis=1).A.flatten()

In [17]:
# Drop the zero entries from every test sequence before decoding it back to text.
# NOTE(review): presumably 0 is the padding id - confirm against the preprocessing.
sequences = [xx[xx!=0] for xx in x_test]

In [19]:
# Decode every test sample back to text and translate its true class and its
# first five predicted classes from indices into sic codes.
rows = []
gen = tok.sequences_to_texts_generator(sequences)
for idx, true_idx in enumerate(y_true):
    record = {'text': next(gen), 'true': cls_dict[true_idx]}
    # Only the first five entries of the prediction row are kept.
    for rank, cls_idx in enumerate(y_pred[idx][:5]):
        record['pred{}'.format(rank)] = cls_dict[cls_idx]
    rows.append(record)

In [20]:
# `rows` is a list of per-sample dicts, so the DataFrame constructor is the
# idiomatic entry point; `DataFrame.from_dict` is intended for dict input.
df = pd.DataFrame(rows)

In [21]:
# Preview the first rows: decoded text, true code and the five predicted codes.
df.head()

Unnamed: 0,pred0,pred1,pred2,pred3,pred4,text,true
0,G47791,G47789,G47910,N82990,G47799,stanley gibbons – the home of stamp collecting...,G47789
1,S96020,N82990,M70229,G47910,G46450,par des coiffeurs pour des coiffeurs cette dev...,S96020
2,J62090,J62020,N82990,M70229,L68310,we specialise in implementing integrating and ...,J62020
3,N82990,N81210,M70229,N78109,N81299,agenda screening services is the market leader...,M70229
4,G47620,G47610,G47710,G47190,G47910,rely on ryman the uk 's leading stationer and ...,G47610


# Per-hierarchy-level accuracy

In [83]:
# p@1
# cats[k] counts the rows whose top prediction agrees with the true code at
# level k: first character, first 3 characters, first 5 characters, full code.
cats = [0, 0, 0, 0]
for record in rows:
    top, gold = record['pred0'], record['true']
    level_hits = (
        top[0] == gold[0],
        top[:3] == gold[:3],
        top[:5] == gold[:5],
        top == gold,
    )
    for level, hit in enumerate(level_hits):
        if hit:
            cats[level] += 1

In [84]:
# Percentage of rows matched at each level: [first char, first 3 chars, first 5 chars, full code].
np.array(cats)/len(rows)*100

array([68.45736782, 62.38056544, 51.81428451, 50.17582448])

In [85]:
# p@5
# A row counts as a hit at a level when any of its top-TOP_K predictions agrees
# with the true code at that level. One loop over the level keys replaces four
# copy-pasted loops that differed only in the comparison.
TOP_K = 5
# One key function per level, coarsest first: first character, first 3
# characters, first 5 characters, and the full code.
level_keys = [
    lambda code: code[0],
    lambda code: code[:3],
    lambda code: code[:5],
    lambda code: code,
]
cats = [0]*len(level_keys)
for row in rows:
    true = row['true']
    preds = [row['pred{}'.format(i)] for i in range(TOP_K)]
    for level, key in enumerate(level_keys):
        # any() stops at the first matching prediction, so each row adds at
        # most one hit per level.
        if any(key(pred) == key(true) for pred in preds):
            cats[level] += 1

In [86]:
# Percentage of rows matched at each level: [first char, first 3 chars, first 5 chars, full code].
np.array(cats)/len(rows)*100

array([86.98618622, 81.77099386, 75.49805262, 74.91874247])

# text

In [23]:
# Show the tokenizer's word -> integer index mapping.
# NOTE(review): this displays the entire vocabulary; consider showing only the
# first few entries to keep the notebook output readable.
tok.word_index

{'<UNK>': 1,
 'and': 2,
 'the': 3,
 'in': 4,
 'of': 5,
 'a': 6,
 'to': 7,
 'for': 8,
 'is': 9,
 'uk': 10,
 'we': 11,
 'with': 12,
 'our': 13,
 'services': 14,
 'your': 15,
 'you': 16,
 'are': 17,
 'on': 18,
 "'s": 19,
 'from': 20,
 'at': 21,
 'online': 22,
 'more': 23,
 'all': 24,
 'range': 25,
 'or': 26,
 'business': 27,
 'based': 28,
 'leading': 29,
 'service': 30,
 'quality': 31,
 'home': 32,
 'london': 33,
 'free': 34,
 'over': 35,
 'products': 36,
 'design': 37,
 'an': 38,
 'company': 39,
 'find': 40,
 'today': 41,
 'one': 42,
 'shop': 43,
 'as': 44,
 'offer': 45,
 '’': 46,
 'across': 47,
 'welcome': 48,
 'provide': 49,
 'have': 50,
 'great': 51,
 'solutions': 52,
 'delivery': 53,
 'call': 54,
 'offers': 55,
 'us': 56,
 'r': 57,
 'can': 58,
 'by': 59,
 'including': 60,
 's': 61,
 'local': 62,
 'that': 63,
 'professional': 64,
 'best': 65,
 'hire': 66,
 'ltd': 67,
 'management': 68,
 'providing': 69,
 'new': 70,
 'years': 71,
 'high': 72,
 'get': 73,
 'it': 74,
 'care': 75,
 'comme