In [None]:
import numpy as np
import os,datetime,re
import fasttext
from tools.model_func import get_input
import pickle
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
# Number of ranked predictions kept per sample: the default top_k of
# get_prediction and the number of columns produced by get_correct.
PRED_SIZE = 5

# Functions

### create FT data from sequence

In [3]:
def make_ft(out_dir,xs,ys):
    """Write one FastText-format line per sample to the file ``out_dir``.

    Each line is ``__label__<j>_<y> ... <token> <token> ...`` where ``j`` is
    the position of the label array inside ``ys`` and ``y`` is that array's
    value for the sample.

    Args:
        out_dir: path of the text file to (over)write.
        xs: sequence of input arrays; only ``xs[0]`` is used. It is indexed as
            ``xs[0][i,:]`` and trailing zeros of each row are dropped
            (assumes a 2-D zero-padded array of token ids -- TODO confirm).
        ys: sequence of per-sample label containers, each indexable by sample.
    """
    tokens = xs[0]
    s = tokens.shape[0]
    # Report progress roughly every 1%. The step must never be 0, otherwise
    # `i % step` raises ZeroDivisionError for inputs with fewer than 100 rows.
    step = max(1, s//100)
    with open(out_dir,'w') as f:
        for i in range(s):
            labels = ' '.join('__label__{}_{}'.format(j,y[i]) for j,y in enumerate(ys))
            # 'b' trims zeros from the back only, so leading/inner zeros are kept.
            words = ' '.join(np.trim_zeros(tokens[i,:],'b').astype(str))
            f.write('{} {}\n'.format(labels,words))
            if i%step==0:
                print('{:.2f}%'.format(i/s*100),end='\r')
    print('saved to: {}'.format(out_dir))
def create_FT_data(in_dir,mode):
    """Export the data set at ``in_dir`` as FastText train/test text files.

    Files are written to ``data/FT/<name>_<mode[0]>.train.txt`` and
    ``data/FT/<name>_<mode[0]>.test.txt`` where ``<name>`` is the last
    '/'-separated component of ``in_dir``.

    Args:
        in_dir: data set location, forwarded to ``get_input``.
        mode: forwarded to ``get_input``; its first element is used in the
            output file name.
    """
    dataset = in_dir.rsplit('/',1)[-1]
    prefix = 'data/FT/{}_{}'.format(dataset,mode[0])
    x_trains,y_trains,x_tests,y_tests = get_input(in_dir = in_dir, mode = mode,get_output=[1,1,1,1],sparse = True)
    # Reduce every label matrix to one class index per row before anything is
    # written (``.A1`` flattens the matrix result of argmax to a 1-D array).
    train_ids = [y.argmax(axis=1).A1 for y in y_trains]
    test_ids = [y.argmax(axis=1).A1 for y in y_tests]
    jobs = (
        ('TRAIN','.train.txt',x_trains,train_ids),
        ('TEST','.test.txt',x_tests,test_ids),
    )
    for title,suffix,inputs,targets in jobs:
        print(title)
        make_ft(prefix + suffix,inputs,targets)

In [4]:
# create_FT_data('data/sic_hierarchy','hierarchy')

### train FastText model from data

In [5]:
def run_fastText(in_dir,out_dir = 'FastText/models',lr=None,loss='ova',epoch=5,save_model = True,save_pred = True):
    """Train a supervised FastText model on a FastText-format text file.

    Args:
        in_dir: path of the training file; the part of its file name before
            the first '.' is used as the data set name in the model file name.
        out_dir: directory the model file is written to.
        lr: learning rate. ``None`` selects 0.1 for 'ova' and 1.0 for 'hs'.
        loss: 'ova' or 'hs'.
        epoch: number of training epochs.
        save_model: when true, the trained model is saved to ``out_dir``.
        save_pred: accepted for backward compatibility; not used.

    Returns:
        ``(model, model_dir)`` where ``model_dir`` is the path the model is
        saved to (it is returned even when ``save_model`` is false).

    Raises:
        ValueError: if ``loss`` is not one of the supported values.
    """
    if loss not in ('ova','hs'):
        raise ValueError('Unkown loss: {}'.format(loss))
    data = in_dir.split('/')[-1].split('.')[0]
    # Format the timestamp on its own: putting the data name inside the
    # strftime pattern would interpret any '%' it contains as a directive.
    stamp = datetime.datetime.now().strftime('%y%m%d_%H%M%S')
    model_dir = os.path.join(out_dir,'{}_{}_{}.bin'.format(stamp,data,loss))
    if lr is None:
        lr = 0.1 if loss == 'ova' else 1.0
    if save_model and out_dir:
        # Create the target directory before training so a missing directory
        # cannot make the save fail after the (expensive) training run.
        os.makedirs(out_dir,exist_ok=True)
    model = fasttext.train_supervised(
        input=in_dir,
        epoch=epoch,
        lr=lr,
        wordNgrams=2,
        minCount=1,
        loss = loss,
        )
    if save_model:
        model.save_model(model_dir)
        print('Model saved to:\n{}'.format(model_dir))
    return model,model_dir

In [6]:
# model,model_dir = run_fastText('data/FT/amazon_hierarchy_2_c.train.txt',loss='ova')
# model,model_dir = run_fastText('data/FT/sic_hierarchy_c.train.txt',loss='ova')

## test FastText models

In [7]:
# input
def get_test_data(test_dir):
    """Read a FastText-format file and split every line into labels and text.

    Args:
        test_dir: path of the file; it is decoded as ISO-8859-1.

    Returns:
        ``(true_labels, contents)``: for each line, the set of
        ``__label__...`` tokens found in it and the remaining text with those
        tokens removed and surrounding whitespace stripped.
    """
    # Raw string: '\S' in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    label_pattern = re.compile(r'__label__\S+')
    true_labels = []
    contents = []
    with open(test_dir,'r',encoding = "ISO-8859-1") as f:
        for line in f.read().splitlines():
            true_labels.append(set(label_pattern.findall(line)))
            contents.append(label_pattern.sub('',line).strip())
    return true_labels,contents
def get_prediction(model_dir,contents,top_k = PRED_SIZE,save_predictions = True,save_logits= False):
    """Load a FastText model and predict the top-k labels of every text.

    Args:
        model_dir: path of the saved model file.
        contents: sequence of texts to classify.
        top_k: number of labels requested per text.
        save_predictions: pickle the predicted labels to
            ``<model path without extension>_pred_outputs.pkl``.
        save_logits: pickle the second output of ``model.predict`` to
            ``<model path without extension>_pred_logits.pkl``.

    Returns:
        ``(preds, logits)``: two lists with one entry per text, holding the
        first and second element returned by ``model.predict``.
    """
    model = fasttext.load_model(model_dir)
    preds = []
    logits = []
    s = len(contents)
    # Progress step of about 1%; kept >= 1 so fewer than 100 texts do not
    # cause a modulo-by-zero.
    step = max(1, s//100)
    for i in range(s):
        p,l = model.predict(contents[i],k=top_k)
        preds.append(p)
        logits.append(l)
        if i%step==0:
            print('{:.2f}%'.format(i/s*100),end='\r')
    # splitext removes only the final extension; splitting on the first '.'
    # would truncate paths such as './models/x.bin' or 'run.v2/x.bin'.
    stem = os.path.splitext(model_dir)[0]
    pred_path = stem+'_pred_outputs.pkl'
    logit_path = stem+'_pred_logits.pkl'
    if save_predictions:
        with open(pred_path, 'wb') as f:
            pickle.dump(preds, f)
        print('SAVE PREDICTION TO:\n{}'.format(pred_path))
    if save_logits:
        with open(logit_path, 'wb') as f:
            pickle.dump(logits, f)
        print('SAVE LOGITS TO:\n{}'.format(logit_path))
    return preds,logits

### evaluation

In [8]:
def get_correct(true_labels,raw_preds,match=None,pred_size=None):
    """Build a 0/1 hit matrix for ranked predictions.

    Args:
        true_labels: per-sample collections of true label strings.
        raw_preds: per-sample ranked sequences of predicted label strings.
        match: when given, only predictions containing this substring are
            considered; ranks are counted among the kept predictions.
        pred_size: number of ranks (columns) to evaluate; defaults to the
            module-level ``PRED_SIZE``.

    Returns:
        Array of shape ``(len(true_labels), pred_size)`` whose entry
        ``[i, k]`` is 1 when the k-th kept prediction of sample i is one of
        its true labels and 0 otherwise.
    """
    if pred_size is None:
        pred_size = PRED_SIZE
    s = len(true_labels)
    outputs = np.zeros(shape=(s,pred_size))
    for i in range(s):
        k=0
        for pred in raw_preds[i]:
            if match is not None and match not in pred:
                continue
            if k>=pred_size:
                # More predictions than columns: ranks beyond pred_size are
                # ignored instead of indexing past the end of the row.
                break
            if pred in true_labels[i]:
                outputs[i,k]=1
            k+=1
    return outputs

## misc

In [9]:
# get all labels
def get_all_labels(true_labels):
    """Return the set of ``(level, class_id)`` pairs found in ``true_labels``.

    Every label string is split on '_' and its last two fields are converted
    to integers, e.g. ``'__label__2_17'`` becomes ``(2, 17)``.
    """
    def _to_pair(label):
        fields = label.split('_')
        return int(fields[-2]),int(fields[-1])

    return {_to_pair(label) for sample in true_labels for label in sample}
def get_lab_to_ind_dict(labs,per_hierarchy):
    """Map ``'__label__<level>_<id>'`` strings to output indices.

    Args:
        labs: collection of ``(level, class_id)`` pairs.
        per_hierarchy: when true, each label maps to its own pair. Otherwise
            each label maps to ``class_id`` plus an offset equal to the number
            of pairs in ``labs`` whose level is lower.

    Returns:
        dict from label string to index (or to the pair itself).
    """
    name = '__label__{}_{}'
    if per_hierarchy:
        return {name.format(lab[0],lab[1]): lab for lab in labs}
    levels = np.array([level for level,_ in labs])
    # offsets[h] = how many labels belong to levels 0..h-1
    offsets = [0]
    for level in range(max(levels)):
        offsets.append(offsets[-1] + np.sum(levels==level))
    return {name.format(lab[0],lab[1]): lab[1]+offsets[lab[0]] for lab in labs}

# Predict

In [10]:
# Print copy-pasteable TEST_DIR / MODEL_DIR pairs: every file under data/FT
# whose name contains 'test', combined with every file under FastText/models
# whose name contains 'bin' and whose path contains the data set name
# (the test file name up to its first '.').
print('Existing Test Data:\n')
dd = 'data/FT'
md = 'FastText/models'
data_dirs = [os.path.join(dd,d) for d in os.listdir(dd) if 'test' in d]
model_dirs = [os.path.join(md,d) for d in os.listdir(md) if 'bin' in d]
for data_dir in data_dirs:
    for model_dir in model_dirs:
        if data_dir.split('/')[-1].split('.')[0] in model_dir:
            print('TEST_DIR = \'{}\''.format(data_dir))
            print('MODEL_DIR = \'{}\''.format(model_dir))
            print()

Existing Test Data:

TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
MODEL_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova.bin'

TEST_DIR = 'data/FT/amazon_hierarchy_2_c.test.txt'
MODEL_DIR = 'FastText/models/190825_173230_amazon_hierarchy_2_c_ova.bin'



In [11]:
# Test file and trained model to evaluate.
TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
MODEL_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova.bin'
# Split the test file into per-sample label sets and label-free text.
true_labels,contents = get_test_data(TEST_DIR)
# Default arguments: top PRED_SIZE labels per sample, and the predictions are
# pickled next to the model file.
raw_preds,_ = get_prediction(MODEL_DIR,contents)




SAVE PREDICTION TO:
FastText/models/190823_195436_sic_hierarchy_c_ova_pred_outputs.pkl


# make FT data like DL data

## normal

In [13]:
# Print copy-pasteable TEST_DIR / PRED_DIR pairs: every file under data/FT
# whose name contains 'test', combined with every file under FastText/models
# whose name contains 'pred' and whose path contains the data set name
# (the test file name up to its first '.').
print('Existing Test Data:\n')
dd = 'data/FT'
md = 'FastText/models'
data_dirs = [os.path.join(dd,d) for d in os.listdir(dd) if 'test' in d]
model_dirs = [os.path.join(md,d) for d in os.listdir(md) if 'pred' in d]
for data_dir in data_dirs:
    for model_dir in model_dirs:
        if data_dir.split('/')[-1].split('.')[0] in model_dir:
            print('TEST_DIR = \'{}\''.format(data_dir))
            print('PRED_DIR = \'{}\''.format(model_dir))
            print()

Existing Test Data:

TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
PRED_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova_pred_outputs.pkl'



In [14]:
# Test file and the pickled predictions previously written by get_prediction.
TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
PRED_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova_pred_outputs.pkl'
true_labels,contents = get_test_data(TEST_DIR)
# Use a context manager so the file handle is closed right after loading.
# NOTE: pickle can execute arbitrary code; only load files produced locally.
with open(PRED_DIR,'rb') as f:
    raw_preds = pickle.load(f)

In [15]:
# Collect every (level, class id) pair that occurs in the test labels, then
# map each '__label__<level>_<id>' string to one flat index across levels.
labs = get_all_labels(true_labels)
lab_to_ind_dict = get_lab_to_ind_dict(labs,per_hierarchy=False)

In [17]:
# convert raw_preds to numbers
s = len(raw_preds)
k = 5
# -1 marks "no prediction"; every slot that is filled below gets an index.
preds = np.ones(shape = (s,k))*-1
for i in range(s):
    for j,lab in enumerate(raw_preds[i][:k]):
        # lab_to_ind_dict is built from the test labels only, so the model can
        # predict a label it does not contain. Such labels are left at -1 so
        # the check below reports them instead of the loop raising KeyError.
        preds[i,j]=lab_to_ind_dict.get(lab,-1)
# check no empty
print('missing {} entries'.format((preds==-1).sum()))

missing 0 entries


In [18]:
# save
SAVE_DIR = 'outputs'
# Data set name = test file name up to its first '.'.
data_name = TEST_DIR.split('/')[-1].split('.')[0]
dd = os.path.join(SAVE_DIR,data_name+'_FastText')
# makedirs also creates SAVE_DIR itself when it does not exist yet, and
# exist_ok makes re-running the cell safe.
os.makedirs(dd,exist_ok=True)
dd = os.path.join(dd,'pred_outputs.txt')
# One row per sample, one integer index per rank.
np.savetxt(dd,preds.astype(int),fmt='%d')
print('SAVED TO: {}'.format(dd))

SAVED TO: outputs/sic_hierarchy_c_FastText/pred_outputs.txt


## per hierarchy

In [20]:
# Print copy-pasteable TEST_DIR / MODEL_DIR pairs: every file under data/FT
# whose name contains 'test', combined with every file under FastText/models
# whose name contains 'bin' and whose path contains the data set name
# (the test file name up to its first '.').
print('Existing Test Data:\n')
dd = 'data/FT'
md = 'FastText/models'
data_dirs = [os.path.join(dd,d) for d in os.listdir(dd) if 'test' in d]
model_dirs = [os.path.join(md,d) for d in os.listdir(md) if 'bin' in d]
for data_dir in data_dirs:
    for model_dir in model_dirs:
        if data_dir.split('/')[-1].split('.')[0] in model_dir:
            print('TEST_DIR = \'{}\''.format(data_dir))
            print('MODEL_DIR = \'{}\''.format(model_dir))
            print()

Existing Test Data:

TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
MODEL_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova.bin'

TEST_DIR = 'data/FT/amazon_hierarchy_2_c.test.txt'
MODEL_DIR = 'FastText/models/190825_173230_amazon_hierarchy_2_c_ova.bin'



In [21]:
# Test file and trained model to evaluate.
TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
MODEL_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova.bin'
true_labels,contents = get_test_data(TEST_DIR)
labs = get_all_labels(true_labels)
# Request up to 2000 labels per sample (capped at the number of distinct test
# labels) instead of the default top_k; nothing is written to disk.
raw_preds,_ = get_prediction(MODEL_DIR,contents,top_k = min(2000,len(labs)),save_predictions = False,save_logits= False)




99.96%

In [22]:
# Collect every (level, class id) pair that occurs in the test labels, then
# map each '__label__<level>_<id>' string back to its (level, class id) pair.
labs = get_all_labels(true_labels)
lab_to_ind_dict = get_lab_to_ind_dict(labs,per_hierarchy=True)

In [23]:
# Number of distinct labels observed at each hierarchy level
# (list index = level, covering levels 0..max level).
a, _ =zip(*list(labs))
a = np.array(a)
cnts = [np.sum(a==i)for i in range(max(a)+1)]
cnts

[18, 77, 453, 538]

In [24]:
k = 5
# Take the sample count from raw_preds before sizing the buffers; relying on
# an `s` left over from an earlier cell breaks on a fresh kernel.
s = len(raw_preds)
# One (s, k) matrix per hierarchy level, pre-filled with -1 ("no prediction").
pp = [np.ones((s,k))*-1 for _ in cnts]
# Progress step of about 1%, kept >= 1 to avoid a modulo-by-zero.
step = max(1, s//100)
for i in range(s):
    # Resolve each predicted label to its (level, class id) pair once.
    pairs = [lab_to_ind_dict[x] for x in raw_preds[i]]
    Hs = np.array([h for h,_ in pairs])
    Ps = np.array([p for _,p in pairs])
    for H in range(len(cnts)):
        # First k class ids predicted for level H, in ranking order.
        top = Ps[np.argwhere(Hs==H)[:k].flatten()]
        # Fewer than k predictions for this level leave the remaining slots
        # at -1 instead of raising a shape-mismatch error.
        pp[H][i,:len(top)] = top
    if i%step==0:
        print('{:.2f}%'.format(i/s*100),end='\r')

99.96%

In [25]:
# -1 is the fill value the per-level matrices were initialised with, so any
# remaining -1 is a rank slot that received no prediction.
for H,pred in enumerate(pp):
    print('H{} missing {} entries'.format(H,(pred==-1).sum()))

H0 missing 0 entries
H1 missing 0 entries
H2 missing 0 entries
H3 missing 0 entries


In [26]:
# save
SAVE_DIR = 'outputs'
# Data set name = test file name up to its first '.'.
data_name = TEST_DIR.split('/')[-1].split('.')[0]
d = os.path.join(SAVE_DIR,data_name+'_FastText')
# makedirs also creates SAVE_DIR itself when it does not exist yet, and
# exist_ok makes re-running the cell safe.
os.makedirs(d,exist_ok=True)
# One file per hierarchy level: pred_outputs<level>.txt
for H,pred in enumerate(pp):
    dd = os.path.join(d,'pred_outputs{}.txt'.format(H))
    np.savetxt(dd,pred.astype(int),fmt='%d')

In [29]:
# List the files in the output directory `d` used by the save cell. The
# original listed `model_dir`, a variable left over from another cell that
# does not reliably point at this directory on a fresh run.
sorted(os.path.join(d,name) for name in os.listdir(d))

['outputs/sic_hierarchy_c_FastText/pred_outputs.txt',
 'outputs/sic_hierarchy_c_FastText/pred_outputs0.txt',
 'outputs/sic_hierarchy_c_FastText/pred_outputs1.txt',
 'outputs/sic_hierarchy_c_FastText/pred_outputs2.txt',
 'outputs/sic_hierarchy_c_FastText/pred_outputs3.txt']

# from saved predictions 

In [None]:
# TEST_DIR = 'data/FT/sic_hierarchy_c.test.txt'
# MODEL_DIR = 'FastText/models/190823_195436_sic_hierarchy_c_ova_pred.pkl'
TEST_DIR = 'data/FT/amazon_hierarchy_2_c.test.txt'
# NOTE(review): get_prediction writes '<model>_pred_outputs.pkl', while this
# path ends in '_pred.pkl' -- confirm the file exists under this name.
MODEL_DIR = 'FastText/models/190825_173230_amazon_hierarchy_2_c_ova_pred.pkl'
true_labels,contents = get_test_data(TEST_DIR)
# Use a context manager so the file handle is closed right after loading.
# NOTE: pickle can execute arbitrary code; only load files produced locally.
with open(MODEL_DIR,'rb') as f:
    raw_preds = pickle.load(f)
# 0/1 hit matrix: preds[i, k] == 1 when the k-th prediction of sample i is one
# of its true labels.
preds = get_correct(true_labels,raw_preds)



In [None]:
# metrics (all reported in percent, for k = 1, 3, 4, 5)
# classification p@k: share of samples with at least one hit in the top k.
print('classification p@k:',end='')
print(['{:.2f}'.format(preds[:,:k].any(axis=1).mean()*100) for k in [1,3,4,5]])
# multi-label p@k: mean of the hit matrix over all samples and the top k ranks.
print('multi-label p@k   :',end='')
print(['{:.2f}'.format(preds[:,:k].mean()*100) for k in [1,3,4,5]])
print('nDCGAtk           :',end='')
dcgs = []
# Discount each rank r (0-based) by 1/log(r+2).
dcg= preds/np.log(np.arange(PRED_SIZE)+2)
# NOTE(review): uses the label count of the first sample for every sample --
# assumes all samples have the same number of true labels; confirm.
num_labs = len(true_labels[0])
for k in [1,3,4,5]:
    # Ideal DCG: hits in the first min(k, num_labs) ranks.
    norm_const = (1/np.log(np.arange(min(k,num_labs))+2)).sum()
    dcgs.append(dcg[:,:k].sum(axis=1).mean()/norm_const)
print(['{:.2f}'.format(dcg*100) for dcg in dcgs])

### look at stats on each H

In [None]:
# look at stats on each H
# For each of the 4 hierarchy levels, keep only predictions whose label string
# contains '__label__<H>_' and report classification p@k and nDCG@k in percent.
# Note: this overwrites the global `preds` with the hit matrix of the last level.
for H in range(4):
    preds = get_correct(true_labels,raw_preds,'__label__{}_'.format(H))
    print('classification p@k:',end='')
    print(['{:.2f}'.format(preds[:,:k].any(axis=1).mean()*100) for k in [1,3,4,5]])
    dcgs = []
    # Discount each rank r (0-based) by 1/log(r+2).
    dcg= preds/np.log(np.arange(PRED_SIZE)+2)
    # The ideal DCG is computed for a single relevant label per level.
    num_labs = 1
    print('nDCG@k            :',end='')
    for k in [1,3,4,5]:
        norm_const = (1/np.log(np.arange(min(k,num_labs))+2)).sum()
        dcgs.append(dcg[:,:k].sum(axis=1).mean()/norm_const)
    print(['{:.2f}'.format(dcg*100) for dcg in dcgs])

In [None]:
# sneaky check of missing preds
# For every sample, collect the second-to-last '_'-separated field (the level)
# of each predicted label.
woop = []
for i,raw_pred in enumerate(raw_preds):
    woop.append(set([l.split('_')[-2] for l in raw_pred]))
# Record samples whose predictions cover fewer distinct levels than the number
# of true labels of the first sample, then print how many there are.
nooo = []
for i,w in enumerate(woop):
    if len(w)<len(true_labels[0]):
        nooo.append(i)
print(len(nooo))

### get macro-average p@k

In [None]:
# get total count of each label
# cnts[label] = number of test samples whose true label set contains `label`.
cnts = defaultdict(int)
s = len(true_labels)
for i in range(s):
    for lab in true_labels[i]:
        cnts[lab]+=1

In [None]:
s = len(true_labels)
for k in [1,4]:
    # corrAtk[label] = number of samples where `label` is a true label and is
    # among the top-k predictions.
    corrAtk = defaultdict(int)
    for i in range(s):
        # Inspect rank j, not always rank 0: the original tested
        # raw_preds[i][0] on every iteration, so the k=4 figure was just the
        # top-1 hit counted four times. min() guards short prediction lists.
        for j in range(min(k,len(raw_preds[i]))):
            if raw_preds[i][j] in true_labels[i]:
                corrAtk[raw_preds[i][j]] +=1
    # Per-label hit rate, then the unweighted (macro) mean in percent.
    accAtk = {lab:corrAtk[lab]/cnts[lab] for lab in cnts.keys()}
    print((np.array([val for key,val in accAtk.items()])).mean()*100)

### get macro-average acc@k

In [None]:
# get total count of each label
# cnts[label] = number of test samples whose true label set contains `label`.
cnts = defaultdict(int)
s = len(true_labels)
for i in range(s):
    for lab in true_labels[i]:
        cnts[lab]+=1

In [None]:
# get total acc of each label
# pp[H] = hit matrix restricted to predictions whose label string contains
# '__label__<H>_'; the number of levels is taken from the first sample's
# label count.
pp = []
for H in range(len(true_labels[0])):
    pp.append(get_correct(true_labels,raw_preds,'__label__{}_'.format(H)))

In [None]:
# corrAtk[label] = total number of top-k hits credited to `label`.
k=1
corrAtk = defaultdict(int)
for i in range(s):
    # The position j of a label in the sorted label strings is used as the
    # index into pp -- assumes one true label per level and that string order
    # equals level order; TODO confirm.
    for j,lab in enumerate(sorted(true_labels[i])):
        corrAtk[lab]+=pp[j][i,:k].sum()

In [None]:
# Per-label accuracy: hits credited to the label divided by its sample count.
accAtk = {lab:val/cnts[lab] for lab,val in corrAtk.items()}

In [None]:
# Unweighted mean of the per-label accuracies, in percent.
print((np.array([val for key,val in accAtk.items()])).mean()*100)

In [None]:
# Same unweighted mean, restricted to the labels whose string contains
# '__label__<H>_', printed once per level.
for H in range(len(true_labels[0])):
    print((np.array([val for key,val in accAtk.items() if '__label__{}_'.format(H) in key])).mean()*100)

### look at stats on each label

In [None]:
# cnts[label] = number of test samples whose true label set contains `label`.
cnts = defaultdict(int)
s = len(true_labels)
for i in range(s):
    for lab in true_labels[i]:
        cnts[lab]+=1

In [None]:
# precisions[H] = classification p@k (percent) for k = 1, 3, 5, computed from
# the predictions whose label string contains '__label__<H>_'.
precisions = []
for H in range(4):
    preds = get_correct(true_labels,raw_preds,'__label__{}_'.format(H))
    precisions.append([preds[:,:k].any(axis=1).mean()*100 for k in [1,3,5]])

In [None]:
# perc[H] = share of the distinct test labels whose string contains
# '__label__<H>_'.
perc = np.array([sum([1 for l in cnts.keys() if '__label__{}_'.format(H) in l ])/len(cnts) for H in range(4)])

In [None]:
# Per-level precisions weighted by each level's share of distinct labels and
# summed over levels: one value per k.
(np.array(precisions)*perc[:,np.newaxis]).sum(axis=0)

In [None]:
# p@k per label
# corr[label] = list of the ranks at which `label` was predicted correctly.
# `match` optionally restricts the predictions to labels containing that
# substring; ranks are counted among the kept predictions only.
corr = defaultdict(list)
match = None
s = len(true_labels)
for i in range(s):
    k=0
    for pred in raw_preds[i]:
        if match is not None and match not in pred:
            continue
        if pred in true_labels[i]:
            corr[pred].append(k)
        k+=1

In [None]:
# pAt[label] = share of the samples carrying `label` for which it was
# predicted correctly at a rank below k.
k = 5
pAt = dict()
for key,cnt in cnts.items():
    if cnt==0:
        continue
    pAt[key]=(np.array(corr[key])<k).sum()/cnt

In [None]:
# Unweighted mean of the per-label values, in percent with two decimals.
'{:.2f}'.format(np.array(list(pAt.values())).mean()*100)

In [None]:
# Per-label values sorted in ascending order.
plt.plot(sorted(list(pAt.values())))

In [None]:
# get_test_data only parses the FastText line format, so it is reused here to
# read the label sets of the training file; the text part is discarded.
train_labs,_ = get_test_data('data/FT/sic_hierarchy_c.train.txt')

In [None]:
# train_cnts[label] = number of training samples whose label set contains
# `label`.
train_cnts = defaultdict(int)
s = len(train_labs)
for i in range(s):
    for lab in train_labs[i]:
        train_cnts[lab]+=1

In [None]:
# Labels ordered from rarest to most frequent in the training data.
sorted_labs = sorted(train_cnts.items(), key=lambda kv: kv[1])
sl = [l[0] for l in sorted_labs]

In [None]:
# Per-label p@k ordered by training frequency (rarest label first). pAt only
# holds labels that occur in the evaluated test set, so training labels
# without an entry are skipped instead of raising KeyError.
plt.plot([pAt[l] for l in sl if l in pAt])