# Train ULMFiT + sentencepiece arXiv categories classifier on arXiv full text

TODO: update description
This notebook is based on notebook 01-. It contains code for training an arXiv category classifier using ULMFiT with a SentencePiece unigram tokenization model. Both the tokenizer and the language model were trained on a corpus of 64K+ machine learning papers. In this notebook we train the classifier (without fine-tuning) on arXiv data, using only titles and abstracts to predict categories. We use papers published before 2020 as the training set and papers published in 2020 or later as the validation set, excluding the arXiv test set from both.

In [34]:
%cd ~/Development/arxiv-classifier

/Users/mhl10/Development/arxiv-classifier


In [35]:
import pandas as pd, numpy as np
from pathlib import Path

# Project-relative locations of the input files.
DATA_PATH = Path("./data")
# JSON dump that is loaded into `train` and later split into train/validation frames.
TRAIN_PATH = DATA_PATH.joinpath("arxiv-tag-classifier-data.json")
# Tab-separated file that is loaded into `test`.
TEST_PATH = DATA_PATH.joinpath("classifier.tsv")

In [36]:
import json

# Parse the training-data JSON dump into plain Python objects.
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's default text encoding, which differs between machines.
with TRAIN_PATH.open(encoding="utf-8") as f:
    train = json.load(f)

In [37]:
# Load the test listing: a headerless TSV whose three columns are named here.
print(f"TEST_PATH: {TEST_PATH}")
test = pd.read_csv(
    TEST_PATH,
    sep="\t",
    header=None,
    names=["arxiv_id", "tags", "output"],
)

TEST_PATH: data/classifier.tsv


In [38]:
from fastai.text import *

# Previously used paths (disabled):
# BASE_DIR = Path("./models/ulmfit_baseline")
# VOCAB_PATH = BASE_DIR / "data_lm_export_vocab.pkl"
MODELS_PATH = Path("./models")

# SentencePiece processor built from the spm.model / spm.vocab files under MODELS_PATH.
processor = SPProcessor(
    sp_model=MODELS_PATH / "spm.model",
    sp_vocab=MODELS_PATH / "spm.vocab",
    n_cpus=4,
    mark_fields=True,
)
# Notebook 01 also loaded the vocab at this point, but it does not seem to be used anywhere:
# vocab = Vocab.load(VOCAB_PATH)

In [39]:
import re

# Matches an optional trailing version suffix (e.g. "v2") at the end of an arXiv id.
# Raw string: "\d" inside a normal string literal is an invalid escape sequence.
aidv_re = re.compile(r"(v\d+)?$")
# regex=True is passed explicitly: pandas >= 2.0 defaults to literal matching
# and raises if a compiled pattern is given without regex=True.
test_no_version = test.arxiv_id.str.replace(aidv_re, "", regex=True)

In [40]:
# Build one DataFrame from the parsed JSON; later cells read its arxiv_id,
# date, title, abstract and categories columns.
all_df = pd.DataFrame(train)

In [41]:
# Parse the raw date column into datetimes so the split below can use .dt.year.
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0, where it has no effect.
all_df["date"] = pd.to_datetime(all_df["date"], infer_datetime_format=True)

In [42]:
# Drop every paper whose id appears in the test listing (ids compared without version suffix).
filtered = all_df.loc[~all_df["arxiv_id"].isin(test_no_version)]

In [43]:
# Chronological split: rows dated before 2020 go to training, all other rows to validation.
# The validation mask is the negation of the training mask, so any row for which
# the "< 2020" comparison is False (including missing dates) ends up in validation.
train_df = filtered.loc[filtered["date"].dt.year.lt(2020)]
valid_df = filtered.loc[~filtered["date"].dt.year.lt(2020)]

In [44]:
# TBD: add fulltext column
# Columns whose text is fed to the model.
text_cols = ["title", "abstract"]
# Validation text list, tokenized with the shared SentencePiece processor.
valid_tl = TextList.from_df(
    valid_df,
    MODELS_PATH,
    cols=text_cols,
    processor=processor,
)

In [45]:
# Training text list, built with the same columns and processor as the validation list.
train_tl = TextList.from_df(
    train_df,
    MODELS_PATH,
    cols=text_cols,
    processor=processor,
)

In [32]:
# Pair the train/validation text lists, take labels from the "categories"
# column (split on spaces), and batch everything into a DataBunch of size 64.
data_clas = (
    ItemLists(MODELS_PATH, train_tl, valid_tl)
    .label_from_df(["categories"], label_delim=" ")
    .databunch(bs=64)
)

In [33]:
# Persist the processed DataBunch so tokenization does not have to be repeated.
# presumably written relative to the DataBunch's path (MODELS_PATH) — TODO confirm
data_clas.save("data_clas_abs.pkl")

In [15]:
# Complement of `filtered`: the papers whose id appears in the test listing.
test_df = all_df.loc[all_df["arxiv_id"].isin(test_no_version)]

In [16]:
# Snapshot the three splits as pickles under DATA_PATH (".gz" file names).
train_df.to_pickle(DATA_PATH.joinpath("train_df.pkl.gz"))
valid_df.to_pickle(DATA_PATH.joinpath("valid_df.pkl.gz"))
test_df.to_pickle(DATA_PATH.joinpath("test_df.pkl.gz"))

In [176]:
def set_seed(seed=None):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed. When ``None`` the function is a no-op and leaves
            every RNG and cuDNN setting untouched.
    """
    if seed is None:
        return

    # Imported locally so the function does not depend on a module-level import.
    import random

    # Seed every RNG a training run may draw from; Python's own `random`
    # module was previously left unseeded.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Force deterministic cuDNN kernels and disable the benchmark auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [177]:
# NOTE(review): absolute, machine-specific path; in the visible cells it is only
# referenced by the commented-out load_encoder call further down.
spulmfit_path=Path("/home/ubuntu/paperswithcode/paper-extractor/models/ulmfit_baseline")

# Fix the RNG seeds before the learner is constructed.
set_seed(42)
# Text classifier learner using the AWD_LSTM architecture on top of data_clas.
learn = text_classifier_learner(data_clas, AWD_LSTM)

In [46]:
# TODO: may need to ask Marcin about this
# try substituting this with lm.pth per Marcin
# Marcin: "I cannot find pretrained-on-papers_enc.pkl; as far as I remember it’s the same as in lm.pth or it’s just the encoder from lm.pth"
# NOTE(review): the recorded output of this cell is a NameError ('learn' is not
# defined), i.e. it was last executed before the learner cell; re-run in order.
# NOTE(review): verify how load_encoder resolves this argument — presumably the
# name is looked up relative to the learner's model directory with ".pth"
# appended, in which case this would not point at ./models/lm.pth. Also confirm
# that lm.pth holds encoder-only weights. TODO confirm against the fastai docs.
learn.load_encoder( MODELS_PATH / "lm.pth")
# learn.load_encoder(spulmfit_path / "models"/ "pretrained-on-papers_enc.pkl")

NameError: name 'learn' is not defined

In [179]:
# First training pass: one one-cycle epoch with learning rate 1e-2.
# presumably most layer groups are still frozen at this point — TODO confirm
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,time
0,0.033447,0.028418,36:16


In [180]:
# Second pass: adjust which layer groups are frozen, then train one epoch with
# learning rates given by slice(2.5e-3, 5e-3).
# presumably freeze_to(-2) leaves the last two layer groups trainable — confirm against the fastai docs
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,time
0,0.025298,0.024094,51:55


In [181]:
# Final pass: unfreeze the model and train six epochs with learning rates
# given by slice(2e-5, 2e-3).
learn.unfreeze()
learn.fit_one_cycle(6, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,time
0,0.021932,0.02214,1:33:16
1,0.019765,0.020655,1:44:28
2,0.01843,0.020145,1:49:05
3,0.018067,0.019065,1:38:02
4,0.017113,0.018982,1:37:19
5,0.018216,0.018868,1:48:41


In [182]:
# Save the trained classifier under this name.
# NOTE(review): the name ends in ".pkl"; confirm which extension and directory fastai actually writes.
learn.save("arxiv_class_sp30k_1_1_6_abstracts.pkl")