In [None]:
import os,re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import scipy.sparse as sp
from tools.model_func import get_input
import seaborn as sns
from sklearn.metrics import f1_score,precision_score
import warnings

# METRICS

In [None]:
def get_sparse_k(y_true,y_pred,k,include_rank=False):
    """Encode the first k predicted label ids of every row as a sparse matrix.

    Args:
        y_true: matrix-like object; only its ``shape`` (m, n) is used.
        y_pred: 2-D array of label ids, one row per sample; only the first
            ``k`` columns are read.
        k: number of leading predictions to keep per row.
        include_rank: when True the stored value is the 1-based position of
            the prediction within its row (1..k); otherwise every stored
            value is 1.

    Returns:
        scipy.sparse.csr_matrix of shape (m, n) with an entry at
        (row, predicted label id) for each kept prediction.
    """
    num_rows,num_cols = y_true.shape
    # every row index appears k times, once per kept prediction
    row_idx = np.arange(num_rows).repeat(k)
    col_idx = y_pred[:,:k].ravel()
    if include_rank:
        values = np.tile(np.arange(1,k+1),num_rows)
    else:
        values = np.ones_like(row_idx)
    return sp.csr_matrix((values,(row_idx,col_idx)),shape=(num_rows,num_cols))
# categorical classification
def get_top_k_accuracy(y_true,y_pred,k):
    """Per-sample count of true labels that appear in the top-k predictions.

    ``y_true`` is a sparse indicator matrix and ``y_pred`` holds ranked label
    ids per row. Returns a flat array with one value per row.
    """
    top_k = get_sparse_k(y_true,y_pred,k)
    # element-wise product keeps only predictions that are also true labels
    hits_per_row = y_true.multiply(top_k).sum(axis=1)
    return hits_per_row.A1
# multi-label classification
def get_nDCGAtk(y_true,y_pred,k):
    """Per-sample nDCG@k for ranked multi-label predictions.

    Args:
        y_true: sparse indicator matrix of true labels (assumed binary).
        y_pred: 2-D array of label ids ranked best-first, one row per sample.
        k: cut-off rank.

    Returns:
        1-D float array with one nDCG value per row. Rows without any true
        label get 0.

    The ideal DCG is computed per row from that row's own label count
    (capped at k). When every row has the same number of labels this gives
    exactly the value the previous implementation produced from row 0 alone.
    Natural logarithms are used throughout; the base cancels in the ratio.
    """
    pred = get_sparse_k(y_true,y_pred,k,include_rank=True)
    # stored values are 1-based ranks; turn them into discounts 1/log(rank+1)
    pred.data = 1/np.log(pred.data+1)
    dcg = y_true.multiply(pred).sum(axis=1).A1
    # number of true labels per row that can contribute within the cut-off
    labs_per_row = np.minimum(y_true.sum(axis=1).A1,k).astype(int)
    discounts = 1/np.log(np.arange(k)+2)
    # ideal_dcg[c] = best achievable DCG with c relevant labels in the top k
    ideal_dcg = np.array([discounts[:c].sum() for c in range(k+1)])
    norm_const = ideal_dcg[labs_per_row]
    ndcg = np.zeros(len(dcg),dtype=float)
    has_labs = norm_const>0
    ndcg[has_labs] = dcg[has_labs]/norm_const[has_labs]
    return ndcg
def get_pAtk(y_true,y_pred,k):
    """Per-sample precision@k: hits among the top-k predictions divided by k.

    ``y_true`` is a sparse indicator matrix and ``y_pred`` holds ranked label
    ids per row. Returns a flat array with one value per row.
    """
    top_k = get_sparse_k(y_true,y_pred,k)
    hits_per_row = y_true.multiply(top_k).sum(axis=1)
    return (hits_per_row/k).A1
def get_micro_F1(y_true,y_pred,k):
    """Micro-averaged F1 of the top-k predictions against ``y_true``.

    The top-k label ids in ``y_pred`` are converted to a sparse indicator
    matrix and scored with ``sklearn.metrics.f1_score(average='micro')``.
    Scores the ``y_true`` argument itself (previously a notebook-global
    ``trues`` was read, so the argument was silently ignored).
    """
    pred = get_sparse_k(y_true,y_pred,k)
    return f1_score(y_true,pred,average='micro')
def get_macro_precision(y_true,y_pred,k):
    """Per-label precision of the top-k predictions against ``y_true``.

    Calls ``sklearn.metrics.precision_score`` with ``average=None``, i.e. the
    result is one score per label rather than an aggregate. Scores the
    ``y_true`` argument itself (previously a notebook-global ``trues`` was
    read, so the argument was silently ignored).
    """
    pred = get_sparse_k(y_true,y_pred,k)
    return precision_score(y_true,pred,average=None)
def get_macro_F1(y_true,y_pred,k):
    """Per-label F1 of the top-k predictions against ``y_true``.

    Calls ``sklearn.metrics.f1_score`` with ``average=None``, i.e. the result
    is one score per label rather than an aggregate. Scores the ``y_true``
    argument itself (previously a notebook-global ``trues`` was read, so the
    argument was silently ignored).
    """
    pred = get_sparse_k(y_true,y_pred,k)
    return f1_score(y_true,pred,average=None)
# for HS
def get_entire_H(y_true,y_pred,k):
    """Check per sample whether the k smallest predicted ids equal the k smallest true ids.

    Args:
        y_true: sparse indicator matrix; every row must hold exactly
            ``y_pred.shape[1]`` non-zero entries, otherwise the reshape
            below raises ``ValueError``.
        y_pred: 2-D array of predicted label ids, one row per sample.
        k: number of leading (smallest) label ids to compare.

    Returns:
        Boolean array with one entry per sample.

    Uses its own arguments (previously the notebook globals ``preds`` and
    ``trues`` were read). The true label ids are taken from the CSR index
    array with sorted indices, so the row-by-row ordering does not depend on
    how a particular scipy version orders the output of ``sp.find``.
    """
    # assume |y_pred|=the correct k
    sorted_pred = np.sort(y_pred,axis=1)
    true_csr = sp.csr_matrix(y_true,copy=True)
    # drop explicit zeros and order ids within each row before reading them out
    true_csr.eliminate_zeros()
    true_csr.sort_indices()
    true_labs = true_csr.indices.reshape(sorted_pred.shape)
    return np.all(true_labs[:,:k]==sorted_pred[:,:k],axis=1)

# INPUT HELPER

In [None]:
def get_args(in_dir):
    """Gather the ``args.csv`` tables of all runs below ``in_dir``.

    Every entry of ``in_dir`` is treated as a run directory. Entries that lack
    either ``train.log`` or ``args.csv`` are skipped. Both files are parsed
    with ``pd.read_csv`` and tagged with a ``dir`` column holding the run
    path; only the argument tables end up in the result.

    Returns:
        The concatenated argument tables (index renumbered by the concat),
        restricted to rows whose ``mode`` column equals ``'cat'``.
    """
    run_dirs = sorted(os.path.join(in_dir,name) for name in os.listdir(in_dir))
    collected = defaultdict(list)
    for run_dir in run_dirs:
        log_path = os.path.join(run_dir,'train.log')
        args_path = os.path.join(run_dir,'args.csv')
        if not (os.path.exists(log_path) and os.path.exists(args_path)):
            continue
        log_df = pd.read_csv(log_path)
        arg_df = pd.read_csv(args_path)
        # the run mode is read from the first row of the argument table
        run_mode = arg_df.loc[0,'mode']
        arg_df['dir'] = run_dir
        log_df['dir'] = run_dir
        collected[run_mode].append(log_df)
        collected['args'].append(arg_df)
    all_args = pd.concat(collected['args'], ignore_index = True, sort = False)
    return all_args[all_args['mode']=='cat']

In [None]:
def order_by_probs(model_dir,offsets):
    """Merge per-level predictions into one top-5 ranking per sample.

    Reads every ``pred_outputs*`` file in ``model_dir`` (label ids) and the
    matching ``pred_probs*`` files (falling back to ``pred_logits*`` when no
    probability files exist), shifts the ids of the i-th file by
    ``offsets[i]``, concatenates all files column-wise and keeps, per row, the
    five ids with the highest score, best first. The result is cached in
    ``combined_pred_outputs.txt`` inside ``model_dir`` and re-used on later
    calls.

    Args:
        model_dir: directory holding the prediction files of one run.
        offsets: per-file shift added to the label ids; indexed by the
            position of the file in sorted order.

    Returns:
        Integer array of label ids. The cache is loaded with ``dtype=int`` so
        that a cached result has the same dtype as a freshly computed one
        (previously the cached branch returned floats).

    Raises:
        Exception: if no cache exists and ``model_dir`` contains 'FastText'.
    """
    # need to change categorical ones to probs instead of logits later
    top_k = 5
    out_dir = os.path.join(model_dir,'combined_pred_outputs.txt')
    if os.path.exists(out_dir):
        # the cache is written with fmt='%d', so it always parses as int
        return np.loadtxt(out_dir,dtype=int)
    if 'FastText' in model_dir:
        raise Exception('FastText predition does not exist: {}'.format(out_dir))
    names = sorted(os.listdir(model_dir))
    pred_dirs = [os.path.join(model_dir,n) for n in names if n.startswith('pred_outputs')]
    prob_dirs = [os.path.join(model_dir,n) for n in names if n.startswith('pred_probs')]
    if not prob_dirs:
        prob_dirs = [os.path.join(model_dir,n) for n in names if n.startswith('pred_logits')]
    # shift each level's ids into the combined label space
    preds = [np.loadtxt(pred_dir,dtype=int)+offsets[i] for i,pred_dir in enumerate(pred_dirs)]
    probs = [np.loadtxt(prob_dir) for prob_dir in prob_dirs]
    preds = np.concatenate(preds,axis=1)
    probs = np.concatenate(probs,axis=1)
    # combined top k prediction: ascending argsort read backwards gives best-first
    inds = np.argsort(probs,axis=1)[:,:-(top_k+1):-1]
    preds = np.take_along_axis(preds, inds, axis=1)
    np.savetxt(out_dir,preds,fmt='%d')
    print('SAVE COMBINED PREDICTIONS:\n{}'.format(out_dir))
    return preds

In [None]:
def order_per_H(model_dir,offsets):
    """Rank the best prediction of every level by its score.

    Reads only the first column of every ``pred_outputs*`` file in
    ``model_dir`` (label ids) and of the matching ``pred_probs*`` files
    (falling back to ``pred_logits*`` when no probability files exist). The
    ids of the i-th file are shifted by ``offsets[i]``; per sample the ids are
    then ordered by descending score. The result is cached in
    ``per_H_pred_outputs.txt`` inside ``model_dir`` and re-used on later calls.

    Args:
        model_dir: directory holding the prediction files of one run.
        offsets: per-file shift added to the label ids; indexed by the
            position of the file in sorted order.

    Returns:
        Integer array with one column per prediction file. The cache is
        loaded with ``dtype=int`` so that a cached result has the same dtype
        as a freshly computed one (previously the cached branch returned
        floats).
    """
    out_dir = os.path.join(model_dir,'per_H_pred_outputs.txt')
    if os.path.exists(out_dir):
        # the cache is written with fmt='%d', so it always parses as int
        return np.loadtxt(out_dir,dtype=int)
    names = sorted(os.listdir(model_dir))
    pred_dirs = [os.path.join(model_dir,n) for n in names if n.startswith('pred_outputs')]
    prob_dirs = [os.path.join(model_dir,n) for n in names if n.startswith('pred_probs')]
    if not prob_dirs:
        prob_dirs = [os.path.join(model_dir,n) for n in names if n.startswith('pred_logits')]
    # one column per file: its first-column id, shifted into the combined label space
    preds = np.vstack([np.loadtxt(pred_dir,dtype=int,usecols=0)+offsets[i] for i,pred_dir in enumerate(pred_dirs)]).T
    probs = np.vstack([np.loadtxt(prob_dir,usecols=0) for prob_dir in prob_dirs]).T
    # ascending argsort read backwards gives best-first
    inds = np.argsort(probs,axis=1)[:,::-1]
    preds = np.take_along_axis(preds, inds, axis=1)
    np.savetxt(out_dir,preds,fmt='%d')
    print('SAVE PER H PREDICTIONS:\n{}'.format(out_dir))
    return preds

In [None]:
def get_preds(model_dir,y_tests,mode):
    """Load the ranked predictions of one run in the combined label space.

    Args:
        model_dir: directory holding the prediction files of one run.
        y_tests: sequence of per-level label matrices; only ``shape[1]`` of
            each is used, to derive the id offset of every level.
        mode: ``'top_probs'`` (see ``order_by_probs``) or ``'top_per_H'``
            (see ``order_per_H``).

    Returns:
        The array returned by the selected ordering function.

    Raises:
        ValueError: if ``mode`` is not one of the two supported values
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if mode not in ('top_probs','top_per_H'):
        raise ValueError("unknown mode {!r}; expected 'top_probs' or 'top_per_H'".format(mode))
    cnts = [y.shape[1] for y in y_tests]
    # running totals: offsets[i] is the first combined id of level i
    offsets = [0] + np.cumsum(cnts).tolist()
    if mode == 'top_probs':
        return order_by_probs(model_dir,offsets)
    return order_per_H(model_dir,offsets)

In [None]:
def get_categorical_preds(model_dir,topk=5):
    """Load the first ``topk`` prediction columns of every ``pred_outputs*`` file.

    Files are visited in sorted path order. Each file yields one integer
    array reshaped to ``(-1, topk)``.

    Returns:
        List with one array per prediction file.
    """
    pred_paths = sorted(
        os.path.join(model_dir,name)
        for name in os.listdir(model_dir)
        if name.startswith('pred_outputs')
    )
    loaded = []
    for pred_path in pred_paths:
        top_cols = np.loadtxt(pred_path,dtype=int,usecols=np.arange(topk))
        # reshape keeps the result 2-D even when the file holds a single row
        loaded.append(top_cols.reshape([-1,topk]))
    return loaded

# get metrics

In [None]:
# get groups
def get_groups(y_trains,num_groups = 3):
    """Split labels into frequency groups and summarise each group.

    Labels (numbered across all matrices in ``y_trains`` laid side by side)
    are walked from rarest to most frequent while a running training count is
    kept. Whenever the running count exceeds ``total/num_groups`` the current
    group is closed, the label that caused the overflow starts the next group
    and the running count restarts at 0.

    Args:
        y_trains: sequence of sparse label matrices; column sums give the
            training frequency of each label.
        num_groups: divisor used for the per-group count target.

    Returns:
        (groups, df): ``groups`` is a list of lists of label ids; ``df`` has
        one row per group with its training count, share of training
        instances, frequency of its last label, a textual frequency bound,
        label count and share of labels.
    """
    # training frequency of every label
    train_cnts = np.hstack([y.sum(axis=0).A1 for y in y_trains])
    # label ids from rarest to most frequent; sorted() keeps ties in id order
    labs_by_freq = sorted(range(len(train_cnts)),key=lambda lab:train_cnts[lab])
    target = sum(train_cnts)/num_groups
    groups = []
    current = []
    running = 0
    for lab in labs_by_freq:
        running += train_cnts[lab]
        closes_group = running>target and len(groups)<num_groups
        if not closes_group:
            current.append(lab)
            continue
        groups.append(current)
        current = [lab]
        running = 0
    groups.append(current)
    # summary table
    cut_off = [0]+[train_cnts[g[-1]] for g in groups]
    t_bound = ['${} < t < {}$'.format(lo,hi) for lo,hi in zip(cut_off[:-1],cut_off[1:])]
    df = pd.DataFrame({
        'group':['G{}'.format(g) for g in range(len(groups))],
        'num_train':[sum([train_cnts[lab] for lab in group]) for group in groups],
    })
    df['perc_train'] = df['num_train']/df['num_train'].sum()*100
    df['num_train_cut_off'] = cut_off[1:]
    df['t_bound'] = t_bound
    df['num_labels'] = [len(g) for g in groups]
    df['perc_labels'] = df['num_labels']/df['num_labels'].sum()*100
    return groups,df

In [None]:
# we do not care about the std, only micro averaging is used!! :D
def get_categorical_scores(d,y_tests,metrics,ks):
    """Score each hierarchy level of one run separately.

    Args:
        d: run directory; the text after its last '_' is stored as the model name.
        y_tests: per-level true label matrices, indexed like the prediction files.
        metrics: mapping from a column-name template (formatted with k) to a
            function ``(y_true, y_pred, k)`` returning per-sample values.
        ks: cut-offs to evaluate; the largest one decides how many prediction
            columns are loaded.

    Returns:
        DataFrame with one row per level ('H0', 'H1', ...) and one column per
        formatted metric name holding the mean over samples, plus 'dir' and
        'model'.
    """
    level_preds = get_categorical_preds(d,max(ks))
    df = pd.DataFrame()
    for H,pred in enumerate(level_preds):
        df.loc[H,'H']='H{}'.format(H)
        for key,func in metrics.items():
            for k in ks:
                column = key.format(k)
                # func returns the metric per sample; keep only its mean
                df.loc[H,column] = func(y_tests[H],pred,k).mean()
    df['dir'] = d
    df['model'] = d.split('_')[-1]
    return df

In [None]:
def get_micro_scores(d,trues,y_tests,metrics,ks,mode):
    """Compute sample-averaged scores for one run.

    Args:
        d: run directory; the text after its last '_' is stored as the model name.
        trues: true label matrix in the combined label space.
        y_tests: per-level label matrices, forwarded to ``get_preds``.
        metrics: mapping from a column-name template (formatted with k) to a
            function ``(y_true, y_pred, k)``; ``.mean()`` of its result is stored.
        ks: cut-offs to evaluate.
        mode: prediction ordering mode forwarded to ``get_preds``.

    Returns:
        Single-row DataFrame with 'dir', 'model' and one column per distinct
        formatted metric name.
    """
    preds = get_preds(d,y_tests,mode)
    row = {'dir':d,'model':d.split('_')[-1]}
    for key,func in metrics.items():
        for k in ks:
            column = key.format(k)
            # templates without a placeholder format to the same name for every k;
            # such a metric is computed once, for the first k only
            if column not in row:
                row[column] = func(trues,preds,k).mean()
    return pd.DataFrame.from_dict([row])

In [None]:
# we want the std here :]
def get_macro_scores(d,trues,y_tests,metrics,ks,mode,groups):
    """Compute per-label scores for one run and tag every label with its group.

    Args:
        d: run directory; the text after its last '_' is stored as the model name.
        trues: true label matrix in the combined label space.
        y_tests: per-level label matrices, forwarded to ``get_preds``.
        metrics: mapping from a column-name template (formatted with k) to a
            function ``(y_true, y_pred, k)`` whose result is indexed by label id.
        ks: cut-offs to evaluate.
        mode: prediction ordering mode forwarded to ``get_preds``.
        groups: list of lists of label ids.

    Returns:
        DataFrame indexed by label id ('lab_ind') with a group tag 'G', one
        column per formatted metric name, and 'dir' / 'model'.
    """
    preds = get_preds(d,y_tests,mode)
    label_ids = []
    for members in groups:
        label_ids.extend(members)
    df = pd.DataFrame(index = label_ids)
    df.index.name = 'lab_ind'
    for G,members in enumerate(groups):
        df.loc[members,'G']='G{}'.format(G)
    for key,func in metrics.items():
        for k in ks:
            per_label = func(trues,preds,k)
            for members in groups:
                df.loc[members,key.format(k)] = per_label[members]
    df['dir'] = d
    df['model'] = d.split('_')[-1]
    return df

# GET ENTIRE H CORRECT

In [None]:
# Argument tables of all 'cat'-mode runs found under 'outputs' (see get_args).
args = get_args('outputs')

In [None]:
# Build `dff`: one row per run with the fraction of test samples whose
# first H+1 levels are all predicted correctly (see get_entire_H).
warnings.filterwarnings('ignore')
dff = pd.DataFrame()
df = args
df = df[df['mode']=='cat']
# index by (input, loss) so the runs of one dataset/loss pair can be looked up together
df = df.set_index(['input','loss']).sort_index()
df_ind = 0
for i,data in enumerate(['sic_hierarchy','amazon_hierarchy_2']):
    # only the first dataset is processed: the loop stops as soon as i == 1
    if i==1:
        break
    in_dir = 'data/{}'.format(data)
    _,y_trains,_,y_tests = get_input(mode='cat', in_dir = in_dir, sparse = True, get_output= [0,1,0,1])
    # true labels of all levels side by side, in the combined label space
    trues = sp.hstack(y_tests).tocsr()
    groups,_ = get_groups(y_trains,num_groups = 3)
    for loss in ['binary','categorical','masked_categorical']:
        dirs = sorted(df.loc[(in_dir,loss)].dir.to_list())
        for d in dirs:
            dff.loc[df_ind,'dir']=d
            dff.loc[df_ind,'loss']=loss
            dff.loc[df_ind,'model']=d.split('_')[-1].split('.')[0]
            # keep as many top predictions as there are levels
            preds = get_preds(d,y_tests,'top_probs')[:,:len(y_tests)]
            for H in range(preds.shape[1]):
                score = get_entire_H(trues,preds,H+1).mean()
                dff.loc[df_ind,'H{}'.format(H)]=score
            df_ind+=1
            print('.',end='')
# df = pd.concat(dfs)
# df.to_pickle('outputs/dfs/macro.pkl')

In [None]:
# Reshape the entire-H scores into a (loss, model) indexed table.
df2 = dff
# df2 = df2[df2['model']!='bert']
df2 = df2[df2['dir']!='outputs/190810_151748_bert'] # this bert does worst
df2 = df2.set_index(['loss','model']).drop(columns=['dir'])

In [None]:
# Difference of every loss to the 'binary' rows, scaled by 100.
(df2 - df2.loc['binary'])*100

In [None]:
# Entire-H scores scaled by 100.
df2*100

# MICRO SCORES

In [None]:
# Configuration for the micro-score run.
args = get_args('outputs')
warnings.filterwarnings('ignore')
# one list of cut-offs per dataset, in the same order as `datas`
kss = [[1,3,5,4],[1,3,5]]
# column-name template -> metric function(y_true, y_pred, k)
metrics = {
    'P@{}':get_pAtk,
    'nDCG@{}':get_nDCGAtk,
    # the F1 entry ignores the requested k and always scores the top-1 prediction
    'F1':lambda y_true,y_pred,k:get_micro_F1(y_true,y_pred,1)
}
datas = ['sic_hierarchy','amazon_hierarchy_2']
# (loss, prediction ordering mode) pairs to evaluate
loss_mode = [
    ('binary','top_probs'),
    ('categorical','top_probs'),
    ('masked_categorical','top_probs'),
]

In [None]:
# start
# Collect one single-row score frame per run into `dfs`, then concatenate.
dfs=[]
df = args
df = df[df['mode']=='cat']
df = df.set_index(['input','loss']).sort_index()
df_ind = 0
for i,data in enumerate(datas):
    in_dir = 'data/{}'.format(data)
    ks = kss[i]
    _,y_trains,_,y_tests = get_input(mode='cat', in_dir = in_dir, sparse = True, get_output= [0,1,0,1])
    # true labels of all levels side by side, in the combined label space
    trues = sp.hstack(y_tests).tocsr()
    groups,_ = get_groups(y_trains,num_groups = 3)
    for loss,mode in loss_mode:
        # skip dataset/loss pairs for which no run exists
        if (in_dir,loss) not in list(df.index.values):
            continue
        else:
            print(data,loss,mode)
        dirs = sorted(df.loc[(in_dir,loss)].dir.to_list())
        for d in dirs:
            df1 = get_micro_scores(d,trues,y_tests,metrics,ks,mode)
            df1['loss']=loss
            df1['data']=data
            df1['mode']=mode
            dfs.append(df1)
            print('.',end='')
        print()
df = pd.concat(dfs)
# NOTE(review): saving is disabled here, while the following cell reads
# 'outputs/dfs/micro.pkl' - that file must already exist from an earlier run.
# df.to_pickle('outputs/dfs/micro.pkl')

In [None]:
# Micro-score table: rows (loss, metric), columns (data, model), values in percent.
df = pd.read_pickle('outputs/dfs/micro.pkl')
models = ['xmlcnn','attentionxml','attention','bert']
indexs = ['loss','data','model']
display_metrics = ['F1','P@1','P@3','P@4','nDCG@1','nDCG@3','nDCG@5']

df2 = df
df2 = df2[df2['dir']!='outputs/190810_151748_bert'] # this bert does worst
df2 = df2[df2['model'].isin(models)]
# df2['loss'] = df2['loss']+','+df2['mode']
df2 = df2[indexs+display_metrics]
# long format: one row per (loss, data, model, metric)
df2 = df2.melt(
    id_vars = indexs,
    value_vars = display_metrics,
    var_name = 'metric',
    value_name = 'score',
)
df2 = df2.set_index(['loss','data','metric','model'])
# order datasets/models as listed above, move them to the columns and scale by 100
df2 = (df2.reindex(datas,level=1).reindex(models,level=-1).unstack([1,-1])*100)
df2

In [None]:
# for latex
# Move the dataset level back into the rows, keep one column per model and
# print the 'categorical' block as LaTeX with compact cell separators.
df3 = df2.stack(-2).reset_index().set_index(['loss','data','metric']).sort_index().droplevel(0,1).reindex(datas,level=1)[models]
df3.index.names = [None] * len(df3.index.names)
print(df3.loc['categorical'].to_latex(float_format='%.2f').replace(' ','').replace('&',' & '))

In [None]:
def _color_red_or_green(val):
    """Return a CSS colour rule for a table cell.

    Values of -1 or below are red, values of 1 or above are green and
    everything else is black.
    """
    shade = 'red' if val <= -1 else ('green' if val >= 1 else 'black')
    return 'color: %s' % shade

In [None]:
# Categorical minus binary scores, coloured with _color_red_or_green and shown with two decimals.
(df2.loc['categorical']-df2.loc['binary']).style.applymap(_color_red_or_green).format('{:.2f}')
# df2.map(lambda x:'{:.2f}'.format(x))

# categorical score

In [None]:
# Per-level P@1 of the categorical and masked-categorical runs.
args = get_args('outputs')
warnings.filterwarnings('ignore')
# one list of cut-offs per dataset
kss = [[1],[1]]
metrics = {
    'P@{}':get_pAtk,
}
# `mode` used to be read from whatever an earlier cell's loop left behind,
# so this cell failed on a fresh kernel. It is now set explicitly to the value
# that loop ends with; the column is only a tag and is dropped again when the
# results are tabulated.
mode = 'top_probs'
dfs=[]
df = args
df = df[df['mode']=='cat']
df = df.set_index(['input','loss']).sort_index()
df_ind = 0
for i,data in enumerate(['sic_hierarchy','amazon_hierarchy_2']):
    # only the first dataset is processed: the loop stops as soon as i == 1
    if i==1:
        break
    in_dir = 'data/{}'.format(data)
    ks = kss[i]
    _,y_trains,_,y_tests = get_input(mode='cat', in_dir = in_dir, sparse = True, get_output= [0,1,0,1])
    for loss in ['categorical','masked_categorical']:
        dirs = sorted(df.loc[(in_dir,loss)].dir.to_list())
        for d in dirs:
            df1 = get_categorical_scores(d,y_tests,metrics,ks)
            df1['loss']=loss
            df1['input']=in_dir
            df1['mode']=mode
            dfs.append(df1)
            print('.',end='')
# df = pd.concat(dfs)
# df.to_pickle('outputs/dfs/macro.pkl')

In [None]:
# Combine the per-run frames, drop the bert runs and index by (loss, model, level).
df = pd.concat(dfs)
df2 = df[df['model']!='bert'].set_index(['loss','model','H']).drop(columns=['dir','mode','input'])

In [None]:
# One column per hierarchy level.
df2.unstack(-1)

In [None]:
# Masked-categorical minus categorical, scaled by 100, one column per hierarchy level.
((df2.loc['masked_categorical'] - df2.loc['categorical'])*100).unstack(-1)

# MACRO SCORES

In [None]:
# Argument tables of all 'cat'-mode runs found under 'outputs' (see get_args).
args = get_args('outputs')

In [None]:
# Compute per-label precision/F1 for every run and store them in 'outputs/dfs/macro.pkl'.
warnings.filterwarnings('ignore')
# one list of cut-offs per dataset, in the same order as `datas`
kss = [[4],[3]]
# column name -> per-label metric function(y_true, y_pred, k)
metrics = {
    'precision':get_macro_precision,
    'F1':get_macro_F1
}
datas = ['sic_hierarchy','amazon_hierarchy_2']
# (loss, prediction ordering mode) pairs to evaluate
loss_mode = [
    ('binary','top_probs'),
    ('categorical','top_probs'),
    ('masked_categorical','top_probs'),
]
# function
dfs = []
df = args
df = df[df['mode']=='cat']
df = df.set_index(['input','loss']).sort_index()
for i,data in enumerate(datas):
    in_dir = 'data/{}'.format(data)
    ks = kss[i]
    _,y_trains,_,y_tests = get_input(mode='cat', in_dir = in_dir, sparse = True, get_output= [0,1,0,1])
    # true labels of all levels side by side, in the combined label space
    trues = sp.hstack(y_tests).tocsr()
    groups,_ = get_groups(y_trains,num_groups = 3)
    for loss,mode in loss_mode:
        index = (in_dir,loss)
        # skip dataset/loss pairs for which no run exists
        if (in_dir,loss) not in list(df.index.values):
            continue
        else:
            print(data,loss,mode)
        dirs = sorted(df.loc[(in_dir,loss)].dir.to_list())
        # the FastText directory of the dataset is evaluated together with the binary runs
        if loss == 'binary':
            dirs = ['outputs/{}_c_FastText'.format(data)] + dirs
        for d in dirs:
            print('.',end='')
            df1 = get_macro_scores(d,trues,y_tests,metrics,ks,mode,groups)
            df1['loss']=loss
            df1['input']=in_dir
            df1['mode']=mode
            dfs.append(df1)
        print()
df = pd.concat(dfs)
df.to_pickle('outputs/dfs/macro.pkl')

## TABLE : macro scores

In [None]:
# Reload the per-label macro scores written by the macro-score loop.
df = pd.read_pickle('outputs/dfs/macro.pkl')

In [None]:
# Macro-score table: mean over labels (and runs) per (loss, input, model), one column per model.
models = ['FastText','xmlcnn','attentionxml','attention','bert']
indexs = ['loss','input','model']
display_metrics = ['precision','F1']
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
df2 = df
df2 = df2[df2['dir']!='outputs/190810_151748_bert'] # this bert does worst
df2 = df2.drop(columns=['dir','mode']).groupby(indexs).mean().reset_index()
# long format: one row per (loss, input, model, metric)
df2 = df2.melt(    
    id_vars = indexs,
    value_vars = display_metrics,
    var_name = 'metric',
    value_name = 'score')
df2 = df2.set_index(indexs+['metric'])
# order datasets, models and metrics as listed above
df2 = df2.reindex(datas,level=1).reindex(models,level=2).reindex(display_metrics,level=-1)
# move the model level to the columns
df2 = df2.unstack(-2)
df2

In [None]:
# Categorical minus binary macro scores, scaled by 100, coloured with _color_red_or_green.
((df2.loc['categorical']-df2.loc['binary'])*100).style.applymap(_color_red_or_green).format('{:.2f}')

In [None]:
# TABLE
# Print, per dataset, the mean per-label precision and F1 (in percent) of each
# model as '&'-separated values.
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
models = ['FastText','xmlcnn','attentionxml','attention']
metrics = ['precision','F1']
# Build the input frame from `df` here instead of reading a `df1` left behind
# by another cell. The binary-loss filter mirrors the frame built in the
# plotting section for the same model list - presumably the frame this table
# was originally run on (FastText is only evaluated with the binary runs);
# confirm against the reported numbers.
table_df = df.reset_index()
table_df = table_df[table_df['model'].isin(models)]
table_df = table_df[table_df['loss']=='binary']
for data in datas:
    print(data)
    # a separate name keeps `df2` (the macro table) intact for the LaTeX cell below
    data_df = table_df[table_df.input==data]
    srs = (data_df.groupby('model')[metrics].mean()*100).to_dict()
    for metric in metrics:
        print('{:10}'.format(metric),end=':')
        print('&'.join(['{:.2f}'.format(srs[metric][key]) for key in models]))

In [None]:
# Print the 'categorical' block of df2 (scaled by 100) as LaTeX; each number is
# wrapped by the float format string and cell separators are compacted.
print(((df2.loc['categorical'])*100).to_latex(float_format='\tg{%.2f}',escape=False).replace(' ','').replace('&',' & '))




# PLOT : macro scores

In [None]:
# Reload the per-label macro scores for plotting.
df = pd.read_pickle('outputs/dfs/macro.pkl')

In [None]:
# Display names used in axis labels: metric column -> y label text.
metric_dict = {
    'precision':'Precision',
    'F1':'F1 score',
}
# Dataset input directory -> dataset name shown in the x label.
data_dict = {
    'data/sic_hierarchy':'SIC Code',
    'data/amazon_hierarchy_2':'AmazonCat-13k',
}

In [None]:
# model performance of binary loss
# One figure per (dataset, metric): grouped bars of the mean per-label score
# for each label group, one bar per model.
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
models = ['FastText','xmlcnn','attentionxml','attention']
metrics = ['precision','F1']
# function
df1 = df.reset_index()
df1 = df1[df1['model'].isin(models)]
df1 = df1[df1['loss']=='binary']
for data in datas:
    df2 = df1[df1.input==data]
    for metric in metrics:
        # share of rows per label group in percent, shown in the x tick labels
        cnts = (df2['G'].value_counts()/df2['G'].value_counts().sum()*100).to_dict()
        groups = sorted(cnts.keys())
        # plot
        fig,ax = plt.subplots()
        bar = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df2,
            ax = ax, 
            palette=sns.color_palette("Set3"),
            order=groups,
            hue_order = models,
            edgecolor = 'k',
            linewidth=1,
            ci=None
           )
        # legend in a single row above the axes
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=len(models), mode="expand", borderaxespad=0.)
        ax.set_ylabel(metric_dict[metric])
        ax.set_xlabel('{} label groups (% labels in group)'.format(data_dict[data]))
        ax.set_xticklabels(['{} ({:.2f}%)'.format(key,cnts[key]) for key in groups])
        ax.set_ylim(0,1)
#         ax.axhline(y=0, color='k')
        plt.show()

In [None]:
# model performance of categorical loss (bin as ref)
# One figure per (dataset, metric): filled bars show the categorical-loss
# scores, dashed unfilled bars drawn on the same axes show the binary-loss
# scores for reference.
# PARAMS
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
models = ['xmlcnn','attentionxml','attention']
metrics = ['precision','F1']
# function
df1 = df.reset_index()
df1 = df1[df1['model'].isin(models)]
df1 = df1.set_index(['loss','input','model','G','lab_ind'])
df1 = df1.drop(columns=['dir','mode'])
# binary-loss rows (reference) and categorical-loss rows (plotted filled)
df1b = (df1.loc['binary']).reset_index()
df1 = (df1.loc['categorical']).reset_index()
# fig,axes = plt.subplots(2,2,sharey=True,figsize=(10,8))
for j,data in enumerate(datas):
    df2 = df1[df1.input==data]
    for i,metric in enumerate(metrics):
        fig,ax = plt.subplots()
#         ax = axes[i,j]
        # share of rows per label group in percent, shown in the x tick labels
        cnts = (df2['G'].value_counts()/df2['G'].value_counts().sum()*100).to_dict()
        groups = sorted(cnts.keys())
        # plot
        bar1 = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df2,
            ax = ax, 
            palette=sns.color_palette("Set3")[1:],
            order=groups,
            hue_order = models,
            ci=None,
            edgecolor = 'k',
            linewidth=1,
           )
        # reference bars: same grouping, binary-loss data, outline only
        bar2 = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df1b[df1b.input==data],
            ax = ax, 
            palette=sns.color_palette("Reds")[1:],
            order=groups,
            hue_order = models,
            ci=None,
            fill=False,
            alpha=0.5,
            edgecolor = 'k',
            linewidth=1,
            linestyle='--',
           )
        # legend in a single row above the axes
        ax.legend(models,bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=len(models), mode="expand", borderaxespad=0.)
        ax.set_ylabel('{}'.format(metric_dict[metric]))
        ax.set_xlabel('{} label groups (% labels in group)'.format(data_dict[data]))
        ax.set_xticklabels(['{} ({:.2f}%)'.format(key,cnts[key]) for key in groups])
        ax.axhline(y=0, color='k')
        ax.set_ylim(0,1)
        plt.show()
# plt.tight_layout()
# plt.show()

In [None]:
# model performance of HS (bin as ref)
# 2x2 grid (rows: metrics, columns: datasets): filled bars show the
# masked-categorical scores, dashed unfilled bars the binary-loss reference.
# PARAMS
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
models = ['xmlcnn','attentionxml','attention']
metrics = ['precision','F1']
# function
df1 = df.reset_index()
df1 = df1[df1['model'].isin(models)]
df1 = df1.set_index(['loss','input','model','G','lab_ind'])
df1 = df1.drop(columns=['dir','mode'])
# binary-loss rows (reference) and masked-categorical rows (plotted filled)
df1b = (df1.loc['binary']).reset_index()
df1 = (df1.loc['masked_categorical']).reset_index()
fig,axes = plt.subplots(2,2,sharey=True,figsize=(10,8))
for j,data in enumerate(datas):
    df2 = df1[df1.input==data]
    # the second dataset is skipped, so the right-hand column of the grid stays empty
    if j==1:
        continue
    for i,metric in enumerate(metrics):
#         fig,ax = plt.subplots()
        ax = axes[i,j]
        # share of rows per label group in percent, shown in the x tick labels
        cnts = (df2['G'].value_counts()/df2['G'].value_counts().sum()*100).to_dict()
        groups = sorted(cnts.keys())
        # plot
        bar1 = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df2,
            ax = ax, 
            palette=sns.color_palette("Set3")[1:],
            order=groups,
            hue_order = models,
            ci=None,
            edgecolor = 'k',
            linewidth=1,
           )
        # reference bars: same grouping, binary-loss data, outline only
        bar2 = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df1b[df1b.input==data],
            ax = ax, 
            palette=sns.color_palette("Reds")[1:],
            order=groups,
            hue_order = models,
            ci=None,
            fill=False,
            alpha=0.5,
            edgecolor = 'k',
            linewidth=1,
            linestyle='--',
           )
        # legend in a single row above the axes
        ax.legend(models,bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=len(models), mode="expand", borderaxespad=0.)
        ax.set_ylabel('{}'.format(metric_dict[metric]))
        ax.set_xlabel('{} label groups (% labels in group)'.format(data_dict[data]))
        ax.set_xticklabels(['{} ({:.2f}%)'.format(key,cnts[key]) for key in groups])
        ax.set_ylim(0,1)
#         plt.show()
plt.tight_layout()
plt.show()

In [None]:
# difference, cat - bin
# 2x2 grid (rows: metrics, columns: datasets) of the per-label score
# difference categorical minus binary, averaged per label group and model.
# PARAMS
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
models = ['xmlcnn','attentionxml','attention']
metrics = ['precision','F1']
# function
df1 = df.reset_index()
df1 = df1[df1['model'].isin(models)]
df1 = df1.set_index(['loss','input','model','G','lab_ind'])
df1 = df1.drop(columns=['dir','mode'])
# index-aligned subtraction: rows are matched on (input, model, G, lab_ind)
df1 = (df1.loc['categorical']-df1.loc['binary']).reset_index()
fig,axes = plt.subplots(2,2,sharey=True,figsize=(10,8))
for j,data in enumerate(datas):
    df2 = df1[df1.input==data]
    for i,metric in enumerate(metrics):
#         fig,ax = plt.subplots()
        ax = axes[i,j]
        # share of rows per label group in percent, shown in the x tick labels
        cnts = (df2['G'].value_counts()/df2['G'].value_counts().sum()*100).to_dict()
        groups = sorted(cnts.keys())
        # plot
        bar = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df2,
            ax = ax, 
            palette=sns.color_palette("Set3")[1:],
            order=groups,
            hue_order = models,
            edgecolor = 'k',
            linewidth=1,
            ci=None
           )
        # legend in a single row above the axes
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=len(models), mode="expand", borderaxespad=0.)
        # raw string: '\D' is not a valid escape sequence in a normal string literal
        ax.set_ylabel(r'$\Delta$ {}'.format(metric_dict[metric]))
        ax.set_xlabel('{} label groups (% labels in group)'.format(data_dict[data]))
        ax.set_xticklabels(['{} ({:.2f}%)'.format(key,cnts[key]) for key in groups])
        ax.axhline(y=0, color='k',linewidth=1)
        ax.set_ylim(-0.02,0.09)
#         plt.show()
plt.tight_layout()
plt.show()

In [None]:
# difference, HS - bin
# 2x2 grid (rows: metrics, columns: datasets) of the per-label score
# difference masked-categorical minus binary, averaged per label group and model.
# PARAMS
datas = ['data/sic_hierarchy','data/amazon_hierarchy_2']
models = ['xmlcnn','attentionxml','attention']
metrics = ['precision','F1']
# function
df1 = df.reset_index()
df1 = df1[df1['model'].isin(models)]
df1 = df1.set_index(['loss','input','model','G','lab_ind'])
df1 = df1.drop(columns=['dir','mode'])
# index-aligned subtraction: rows are matched on (input, model, G, lab_ind)
df1 = (df1.loc['masked_categorical']-df1.loc['binary']).reset_index()
fig,axes = plt.subplots(2,2,sharey=True,figsize=(10,8))
for j,data in enumerate(datas):
    df2 = df1[df1.input==data]
    for i,metric in enumerate(metrics):
#         fig,ax = plt.subplots()
        ax = axes[i,j]
        # share of rows per label group in percent, shown in the x tick labels
        cnts = (df2['G'].value_counts()/df2['G'].value_counts().sum()*100).to_dict()
        groups = sorted(cnts.keys())
        # plot
        bar = sns.barplot(
            x = 'G',
            y=metric,
            hue='model',
            data = df2,
            ax = ax, 
            palette=sns.color_palette("Set3")[1:],
            order=groups,
            hue_order = models,
            edgecolor = 'k',
            linewidth=1,
            ci=None
           )
        # legend in a single row above the axes
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,
               ncol=len(models), mode="expand", borderaxespad=0.)
        # raw string: '\D' is not a valid escape sequence in a normal string literal
        ax.set_ylabel(r'$\Delta$ {}'.format(metric_dict[metric]))
        ax.set_xlabel('{} label groups (% labels in group)'.format(data_dict[data]))
        ax.set_xticklabels(['{} ({:.2f}%)'.format(key,cnts[key]) for key in groups])
        ax.axhline(y=0, color='k',linewidth=1)
        ax.set_ylim(-0.02,0.09)
#         plt.show()
plt.tight_layout()
plt.show()