In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import scipy.sparse as sp
from tools.model_func import *
import seaborn as sns

## Check whether each prediction is a child of the parent-level prediction

In [None]:
# Directory the hierarchy files (e.g. parent_to_child.pkl) are read from.
IN_DIR = 'data/sic_hierarchy'

In [None]:
# Load the parent -> children lookup; it is indexed below as
# child_dict[level][parent].
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(IN_DIR, 'parent_to_child.pkl'), 'rb') as fh:
    child_dict = pickle.load(fh)

# NOTE(review): `dirs` is only built further down, in the baseline-comparison
# section (filtered from `args`), so that cell has to be run before this one.
d = dirs[0]
# Predictions (and logits) of the first selected run; `preds` is indexed below
# as preds[level][sample, rank].
preds, logits = get_preds(d, get_logits=True)

In [None]:
# Top-1 consistency: for every sample and every level below the first, mark
# whether the top-ranked prediction is a child of the top-ranked prediction
# one level above.
N_H = len(preds)
N = len(preds[0])
is_children = np.zeros((N, N_H))
is_children[:, 0] = 1  # the first level has no parent, so it always counts
for H in range(1, N_H):
    upper, lower = preds[H-1], preds[H]
    for i in range(N):
        if lower[i, 0] in child_dict[H-1][upper[i, 0]]:
            is_children[i, H] = 1

In [None]:
# Share of samples whose flags are non-zero on every level from 0 through H.
for H in range(N_H):
    consistent = is_children[:, :H+1].all(axis=1)
    print('Consistent prediction H{}: {:.2f}%'.format(H, np.mean(consistent)*100))

In [None]:
# Top-K consistency: walk down the hierarchy starting from the top-1 prediction
# of the first level; at each level take the best-ranked of the K predictions
# that is a child of the current parent.
#   is_children[i, H] = 1-based rank of the prediction that was accepted
#   is_children[i, H:] = -1 from the first level where none of the K fits
N = len(preds[0])
N_H = len(preds)
K = len(preds[0][0])
broken_hierarchy = []  # sample indices whose chain could not be completed
is_children = np.zeros((N, N_H))
is_children[:, 0] = 1
for i in range(N):
    parent = preds[0][i, 0]
    for H in range(1, N_H):
        allowed = child_dict[H-1][parent]
        for k in range(K):
            children = preds[H][i, k]
            if children in allowed:
                is_children[i, H] = k + 1
                break
        else:
            # no candidate matched: flag this level and everything below it
            is_children[i, H:] = -1
            broken_hierarchy.append(i)
            break
        # the accepted prediction becomes the parent for the next level
        parent = children

In [None]:
# Share of samples with no -1 marker on any level from 0 through H.
for H in range(N_H):
    unbroken = (is_children[:, :H+1] != -1).all(axis=1)
    print('Consistent prediction H{}: {:.2f}%'.format(H, np.mean(unbroken)*100))

# Baseline comparison

In [None]:
# Dataset directory for this section; runs are filtered on it via the `input` column.
IN_DIR = 'data/sic_hierarchy'
# Alternative dataset (uncomment to switch):
# IN_DIR = 'data/amazon_hierarchy_2'

In [None]:
# Keep only the runs recorded for the current IN_DIR with mode 'cat' and
# loss 'binary'.
# NOTE(review): `args` is not defined in this notebook; presumably it is a
# DataFrame exported by the `tools.model_func` star import -- confirm.
selected = (
    (args['input'] == IN_DIR)
    & (args['mode'] == 'cat')
    & (args['loss'] == 'binary')
)
df = args[selected]
# Sorted run directories, leaving out every one whose path contains 'bert'.
dirs = [d for d in sorted(df.dir.to_list()) if 'bert' not in d]
print(dirs)
df

## Basic metrics

In [None]:
# Only the last of get_input's four return values is kept.
# NOTE(review): get_output=[0,0,0,1] presumably requests just that output -- confirm.
_, _, _, y_tests = get_input(
    mode='cat',
    in_dir=IN_DIR,
    sparse=True,
    get_output=[0, 0, 0, 1],
)
# Stack all entries of y_tests side by side into a single CSR matrix.
trues = sp.hstack(y_tests, format='csr')

In [None]:
# Cut-offs passed to every metric.
ks = [1, 3, 5, 4]
# Metric label -> metric function; the labels are printed as row-group headers
# when the table is emitted.
metrics = dict([
    ('P   @', get_multilabel_pAtk),
    ('nDCG@', get_nDCGAtk),
    ('mAcc@', get_macro_acc),
])
# One result per run directory, in the order of `dirs`.
results = []
for run_dir in dirs:
    results.append(get_multi_label_metrics(run_dir, trues, y_tests, metrics, ks=ks))

In [None]:
# for latex
N_H = len(results[0][0])
for i,metric in enumerate(metrics.keys()):
    print(metric+'k')
    for H in range(N_H):
        for j,k in enumerate(ks):
            print('H{} k{} :'.format(H,k),end='')
            print('&'.join(['{:.2f}'.format(result[i][H][j]*100) for result in results]))

## Per-layer metrics

In [None]:
# Reload the test labels so this section can be run on its own; only the last
# of get_input's four return values is kept.
_, _, _, y_tests = get_input(
    mode='cat', in_dir=IN_DIR, sparse=True,
    get_output=[0, 0, 0, 1],
)

In [None]:
# Cut-offs passed to every metric.
ks = [1, 3, 5]
# Metric label -> metric function. Entries that are currently switched off:
#     '   P@': get_pAtk
#     'nDCG@': get_nDCGAtk
metrics = {'mAcc@': get_macro_acc}
# One result per run directory; the last two entries of `dirs` are skipped.
results = []
for run_dir in dirs[:-2]:
    results.append(get_per_H_metrics(run_dir, y_tests, metrics, ks=ks))

In [None]:
# for latex
N_H = len(results[0][0])
for i,metric in enumerate(metrics.keys()):
    print(metric+'k')
    for H in range(N_H):
        for j,k in enumerate(ks):
            print('H{} k{} :'.format(H,k),end='')
            print('&'.join(['{:.2f}'.format(result[i][H][j]*100) for result in results]))

In [None]:
# Overall macro acc@k per run: the per-level scores of the first metric are
# averaged with weights proportional to each level's number of label columns
# (y_tests[level].shape[1]).
k = 1
# The last axis of the results is the *position* of k inside `ks`, not k-1
# (which only coincides for k == 1). Raises ValueError if k was not evaluated.
k_idx = ks.index(k)
for model in results:
    D = len(model[0])  # number of hierarchy levels
    atks = [model[0][h][k_idx] for h in range(D)]
    n_labels = [y_tests[h].shape[1] for h in range(D)]
    cc = sum(n_labels)
    oo = sum(a * n / cc for a, n in zip(atks, n_labels))
    print(oo*100)