# Train the large ULMFiT + SentencePiece arXiv category classifier on arXiv abstracts with class weights

This notebook fine-tunes the model trained in the `04-train-large-ulmfit-sp` notebook for one epoch with a class-weighted loss.

In [1]:
# Work from the repository root so the relative paths defined below resolve.
%cd ~/paperswithcode/paper-extractor

/home/ubuntu/paperswithcode/paper-extractor


In [2]:
import pandas as pd, numpy as np
from pathlib import Path

# All inputs for this experiment live under a single directory (relative to the cwd).
DATA_PATH = Path("notebooks") / "shared-notebooks" / "arxiv-class"
TRAIN_PATH = DATA_PATH.joinpath("arxiv-tag-classifier-data.json")
TEST_PATH = DATA_PATH.joinpath("classifier.tsv")

In [3]:
from fastai.text import *

# Locations of the ULMFiT baseline artefacts and of this experiment's model directory.
BASE_DIR = Path("./models/ulmfit_baseline")
VOCAB_PATH = BASE_DIR.joinpath("data_lm_export_vocab.pkl")
MODELS_PATH = DATA_PATH.joinpath("models")

# SentencePiece processor pointing at the model/vocab files stored under BASE_DIR/tmp.
processor = SPProcessor(
    sp_model=BASE_DIR.joinpath("tmp", "spm.model"),
    sp_vocab=BASE_DIR.joinpath("tmp", "spm.vocab"),
    n_cpus=8,
    mark_fields=True,
)
# Vocabulary exported alongside the baseline model.
vocab = Vocab.load(VOCAB_PATH)

In [4]:
# Load the previously saved classification data ("data_clas_abs.pkl" under MODELS_PATH)
# with a batch size of 128 and 16 loader workers.
data_clas = load_data(MODELS_PATH, "data_clas_abs.pkl", bs=128, num_workers=16)

In [5]:
def set_seed(seed=None):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed. When ``None`` (the default) the function does
            nothing and leaves all RNG state and cuDNN flags untouched.

    Returns:
        None.
    """
    if seed is None:
        return
    # Local import: the notebook never imports `random` explicitly.
    import random

    # Seed every RNG a training run may draw from, not only torch/numpy.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Disable cuDNN autotuning and request deterministic kernels so repeated
    # runs with the same seed are comparable.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [12]:
# Training frame; its `categories` column is used below to derive the class weights.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train_df = pd.read_pickle(DATA_PATH / "train_df.pkl.gz")

In [13]:
# Peek at the label format: each row holds one string of space-separated category tags.
train_df.categories.head()

0    hep-ph astro-ph hep-ex nucl-ex nucl-th
1                           math.KT math.AT
2                           math.NT math.CO
3                                   math.CO
4                                    hep-ph
Name: categories, dtype: object

In [14]:
from collections import Counter
# Tally how often every category tag occurs; each `categories` value is a
# single string of space-separated tags.
c = Counter(
    tag
    for tags in train_df.categories
    for tag in tags.split(' ')
)

In [27]:
# Total number of category-tag occurrences over all training rows.
ac = sum(c.values())

In [30]:
# Fraction of all tag occurrences represented by a count of 147875.
# NOTE(review): 147875 is hard-coded — presumably the count of the most frequent
# category; confirm against `c.most_common(1)`.
147875 / ac

0.05591595534134918

In [76]:
def f(x, floor=1000, exponent=0.5):
    """Dampen a raw count: clip it from below, then raise it to a power.

    Args:
        x: Raw count (a scalar number).
        floor: Values of ``x`` below this are replaced by it before the power
            is taken. Defaults to 1000.
        exponent: Power applied to the clipped value. Defaults to 0.5, i.e.
            a square root.

    Returns:
        ``np.power(x_clipped, exponent)`` where ``x_clipped`` is ``x`` raised
        to at least ``floor``.
    """
    # Clipping gives every count below the floor the same damped value.
    if x < floor:
        x = floor
    return np.power(x, exponent)
# Normalise the damped counts so that the per-category shares add up to 1.
n = sum(f(count) for count in c.values())
d = {
    tag: f(count) / n
    for tag, count in c.items()
}
# Counter view of the shares, so `most_common` can rank them.
s = Counter(d)

In [82]:
# Ratio of the largest share to the 20th-smallest share. Judging by the weight
# listing printed further down, 20 categories share the smallest value (they sit
# at the count floor), so this is presumably the max/min ratio — TODO confirm.
s.most_common(1)[0][1] / s.most_common()[-20][1]

12.16038650701531

In [83]:
# Largest normalised share, i.e. the value for the top-ranked category in `s`.
s.most_common(1)[0][1]

0.021683353602918804

In [81]:
# Sum over categories of (occurrences per training row) * (1 / share): the average,
# per training row, of the summed inverse shares of its tags. The result (~188) is
# the scale by which the raw inverse-share weights would need to be divided to
# bring this average to 1.
sum([c[x] / len(train_df) / s[x] for x in c])

188.14946944422888

In [86]:
# Per-category positive weight: inverse share divided by a constant of 188.
# NOTE(review): 188 appears to be a hand-rounded copy of the ~188.149 value printed
# by an earlier cell; it is not recomputed, so it goes stale if the data changes.
ss = {k: 1 / v / 188 for k, v in s.items()}

In [89]:
# List every category with its weight, largest weight first.
Counter(ss).most_common()

[('cmp-lg', 2.983067478247558),
 ('adap-org', 2.983067478247558),
 ('dg-ga', 2.983067478247558),
 ('funct-an', 2.983067478247558),
 ('patt-sol', 2.983067478247558),
 ('atom-ph', 2.983067478247558),
 ('chem-ph', 2.983067478247558),
 ('mtrl-th', 2.983067478247558),
 ('acc-phys', 2.983067478247558),
 ('supr-con', 2.983067478247558),
 ('ao-sci', 2.983067478247558),
 ('plasm-ph', 2.983067478247558),
 ('comp-gas', 2.983067478247558),
 ('q-bio.OT', 2.983067478247558),
 ('cs.GL', 2.983067478247558),
 ('bayes-an', 2.983067478247558),
 ('cs.OS', 2.983067478247558),
 ('stat.OT', 2.983067478247558),
 ('econ.TH', 2.983067478247558),
 ('econ.GN', 2.983067478247558),
 ('econ.EM', 2.9522301170196155),
 ('q-fin.TR', 2.870458019295076),
 ('q-fin.PM', 2.845536961690038),
 ('q-fin.EC', 2.8174762971327314),
 ('nlin.CG', 2.7391810145560704),
 ('q-bio', 2.7277055513866273),
 ('solv-int', 2.7040691179026126),
 ('q-fin.MF', 2.6284828388234383),
 ('q-fin.CP', 2.5974110250767937),
 ('q-bio.SC', 2.575056838140844

In [88]:
# Sanity check: the average, per training row, of the summed weights of its tags.
# It should come out close to 1 given the 188 scaling used for `ss`.
sum([c[x] / len(train_df) * ss[x] for x in c])

1.00079505023526

In [94]:
# One positive-class weight per label, ordered like the training set's class list,
# as a float32 tensor on the GPU.
pos_weight = torch.tensor(
    [ss[label] for label in data_clas.train_dl.y.classes],
    dtype=torch.float32,
).cuda()

In [103]:
# Fix the RNG seeds before the learner is built.
set_seed(42)
# Width passed to the classifier via `lin_ftrs`: twice the number of classes.
lin_ftrs = [len(data_clas.valid_dl.y.classes) * 2] # 352
# AWD-LSTM text classifier; `to_fp16()` switches the learner to half precision.
learn = text_classifier_learner(data_clas, AWD_LSTM, lin_ftrs=lin_ftrs).to_fp16()
# Multi-label F-beta with beta=1 (F1). The variable name suggests micro averaging —
# presumably the library default; TODO confirm.
micro_f1 = MultiLabelFbeta(learn, beta=1.0)
learn.metrics = [micro_f1]

In [104]:
# Swap in a BCE-with-logits loss whose positive term is scaled per class by `pos_weight`.
learn.loss_func = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
# Alternative kept for reference: BCEWithLogitsFlat(pos_weight=pos_weight)

In [105]:
# Start from the checkpoint of the previously trained classifier; the trailing `;`
# suppresses the cell output.
learn.load("arxiv_large_class_sp30k_1_1_ft_1_1_6_abstracts.pkl");

In [106]:
# Unfreeze all layers and train for one one-cycle epoch with the weighted loss,
# using a learning-rate range from 2e-5 up to 2e-3.
learn.unfreeze()
learn.fit_one_cycle(1, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,multi_label_fbeta,micro_fbeta,time
0,0.013375,0.016208,0.656061,36:22,


In [107]:
# Save the weighted-loss model under its own "pos_weight" checkpoint name.
learn.save("arxiv_large_class_sp30k_1_1_ft_1_1_6_pos_weight_1_abstracts.pkl")

In [8]:
# NOTE(review): this cell's execution count (In [8]) is lower than the cells above,
# so it and the training cells after it look like leftovers from an earlier run.
# Running the notebook top to bottom would keep training the weighted model here.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,multi_label_fbeta,micro_fbeta,time
0,0.022476,0.021277,0.610026,17:22,


In [9]:
# Freeze everything except the last two layer groups, then train one epoch with a
# learning-rate range from 2.5e-3 to 5e-3.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,multi_label_fbeta,micro_fbeta,time
0,0.020063,0.020329,0.637803,24:33,


In [9]:
# Old results: same code as the previous cell, apparently kept only for the output
# recorded from an earlier run. NOTE(review): re-running it repeats that training step.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,multi_label_fbeta,micro_fbeta,time
0,0.019959,0.019981,0.640732,14:45,


In [10]:
# Unfreeze all layers and train six one-cycle epochs with a learning-rate range
# from 2e-5 up to 2e-3.
learn.unfreeze()
learn.fit_one_cycle(6, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,multi_label_fbeta,micro_fbeta,time
0,0.01874,0.020213,0.643172,41:41,
1,0.017784,0.019404,0.653917,42:13,
2,0.017556,0.018888,0.659366,40:29,
3,0.017756,0.018624,0.664363,37:27,
4,0.016774,0.018277,0.669639,41:15,
5,0.016667,0.018294,0.670396,40:32,


In [11]:
# NOTE(review): this writes to the same checkpoint name that is loaded earlier in
# this notebook. Running all cells in order would overwrite that base checkpoint
# with the further-trained model — confirm this cell is a leftover before re-running.
learn.save("arxiv_large_class_sp30k_1_1_ft_1_1_6_abstracts.pkl")

In [None]:
# %%javascript
# IPython.notebook.save_notebook()