# SIC code hierarchy

In [1]:
import numpy as np
import os,re
import pandas as pd
import pickle
import json
import collections

In [2]:
df = pd.read_pickle('data/sic.pkl')

In [3]:
df.head()

Unnamed: 0_level_0,text,categories,train/test
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
453844,welcome to a1 lifts . professional and reliabl...,[33190],train
914843,"our friendly service , surprising range of fre...",[47110],train
326959,"wherever you want to get to in life , lloyds b...",[64191],train
541943,stanley gibbons – the home of stamp collecting...,[47789],train
445157,are you looking for high-quality dentistry in ...,[86230],train


In [5]:
len(set([' '.join(d) for d in df.categories.values]))

538

In [6]:
DATA_PATH = 'data/sic_hierarchy.json'
with open(DATA_PATH) as json_file:  
    hierarchy = json.load(json_file)

In [8]:
hierarchy['A0121x']

{'title': 'Growing of grapes',
 'includes': ['growing of wine grapes and table grapes in vineyards'],
 'excludes': ['manufacture of wine, see ##11.02']}

In [7]:
alpha_look = {}
for key,item in hierarchy.items():
    if key[3]!='x' or key[1]=='x':
        continue
    num = key[1:3]
    if num in alpha_look.keys():
        print('repeated key:{}'.format(key))
        break
    alpha_look[num]=key[0]

In [33]:
depth = [[] for _ in range(7)]
for key in hierarchy.keys():
    ind = key.find('x')
    depth[ind].append(key)
depth = [d for d in depth if d]

In [34]:
len(depth)

5

In [36]:
len(depth[-1])

191

# metrics from predictions

In [27]:
import os
import numpy as np
import scipy.sparse as sp

In [29]:
MODEL_DIR = 'outputs/190715_215817_attentionxml'
IN_DIR = 'data/dl_sic/'

ind = np.loadtxt(os.path.join(MODEL_DIR,'prediction_10_ind.txt'))
logits = np.loadtxt(os.path.join(MODEL_DIR,'prediction_10_logits.txt'))
pred = (logits>0).astype(int)
y_true =sp.load_npz(os.path.join(IN_DIR,'y_train.npz'))

## p@k (multi-label)

In [22]:
y_true

<588992x538 sparse matrix of type '<class 'numpy.int64'>'
	with 588992 stored elements in Compressed Sparse Row format>

In [25]:
k = 1
data = ind*pred

In [23]:
logits

array([[-1.714, -1.798, -2.037, ..., -3.565, -3.641, -3.724],
       [-0.279, -0.35 , -2.628, ..., -3.486, -3.758, -3.895],
       [ 0.313, -1.697, -2.297, ..., -5.657, -5.734, -5.765],
       ...,
       [-0.899, -1.038, -1.948, ..., -3.909, -4.204, -4.523],
       [ 0.263, -1.839, -2.119, ..., -4.064, -4.127, -4.508],
       [ 1.276, -3.641, -3.956, ..., -6.19 , -6.262, -6.312]])

In [26]:
np.mean(np.sum(data!=0,axis=1)!=0)

0.4254483962321813

In [None]:
y_pred = sp.coo_matrix((data, (row, col)), shape=(4, 4))

## p@k (my own, p@k per label)

In [37]:
ll = np.split(y_true.T.indices, y_true.T.indptr)

In [38]:
len(ll)

588994

In [35]:
y_true.shape

(588992, 538)

In [41]:
y_true.T.indptr

array([     0,      1,      2, ..., 588990, 588991, 588992], dtype=int32)

In [42]:
(I,J,V) = sp.find(y_true)

In [45]:
np.split(I,J)

[array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),
 array([], dtype=int32),


In [15]:
df.head()

Unnamed: 0_level_0,text,categories,train/test
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
453844,welcome to a1 lifts . professional and reliabl...,[33190],train
914843,"our friendly service , surprising range of fre...",[47110],train
326959,"wherever you want to get to in life , lloyds b...",[64191],train
541943,stanley gibbons – the home of stamp collecting...,[47789],train
445157,are you looking for high-quality dentistry in ...,[86230],train


In [4]:
df['sic']=df['categories'].apply(lambda x:x[0])

In [8]:
df['cat0']=df['sic'].apply(lambda x:[alpha_look[x[0:2]]])
df['cat1']=df['sic'].apply(lambda x:[x[0:2]])
df['cat2']=df['sic'].apply(lambda x:[x[:4]])
df['cat3']=df['categories']


In [9]:
df.head()

Unnamed: 0_level_0,text,categories,train/test,sic,cat0,cat1,cat2,cat3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
453844,welcome to a1 lifts . professional and reliabl...,[33190],train,33190,[C],[33],[3319],[33190]
914843,"our friendly service , surprising range of fre...",[47110],train,47110,[G],[47],[4711],[47110]
326959,"wherever you want to get to in life , lloyds b...",[64191],train,64191,[K],[64],[6419],[64191]
541943,stanley gibbons – the home of stamp collecting...,[47789],train,47789,[G],[47],[4778],[47789]
445157,are you looking for high-quality dentistry in ...,[86230],train,86230,[Q],[86],[8623],[86230]


In [10]:
df['categories'] = df.apply(lambda x:[x['cat0'][0],x['cat1'][0],x['cat2'][0],x['cat3'][0]],axis=1)

In [11]:
df.head()

Unnamed: 0_level_0,text,categories,train/test,sic,cat0,cat1,cat2,cat3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
453844,welcome to a1 lifts . professional and reliabl...,"[C, 33, 3319, 33190]",train,33190,[C],[33],[3319],[33190]
914843,"our friendly service , surprising range of fre...","[G, 47, 4711, 47110]",train,47110,[G],[47],[4711],[47110]
326959,"wherever you want to get to in life , lloyds b...","[K, 64, 6419, 64191]",train,64191,[K],[64],[6419],[64191]
541943,stanley gibbons – the home of stamp collecting...,"[G, 47, 4778, 47789]",train,47789,[G],[47],[4778],[47789]
445157,are you looking for high-quality dentistry in ...,"[Q, 86, 8623, 86230]",train,86230,[Q],[86],[8623],[86230]


In [12]:
df.to_pickle('data/sic_hiararchy.pkl')