# Train ULMFiT + sentencepiece arXiv categories classifier on arXiv full text

TODO: update description
This notebook is based on notebook 01-. It contains code for training an arXiv categories classifier using ULMFiT with a SentencePiece unigram tokenization model. Both the tokenizer and the language model were trained on a corpus of 64K+ machine learning papers. In this notebook we train a classifier (without fine-tuning) on arXiv data, using only titles and abstracts to predict categories. We use papers published before 2020 as the training set and papers published in 2020 or later as the validation set, excluding the arXiv test set from both sets.

In [None]:
import json
import numpy  as np
import pandas as pd
import re
from fastai.text import *
from pathlib     import Path

In [None]:
# Text processor built from the SentencePiece model and vocabulary files
# stored under models/. The same instance is passed to both TextList.from_df
# calls later in the notebook.
# NOTE(review): n_cpus=10 is hard-coded — confirm it suits the target machine.
# NOTE(review): mark_fields=True presumably inserts a field marker before each
# text column (title / abstract) — confirm against the fastai version in use.
processor = SPProcessor(
    sp_model=Path('models/spm.model'),
    sp_vocab=Path('models/spm.vocab'), 
    n_cpus=10, 
    mark_fields=True)

In [None]:
# Load the raw arXiv records that the classifier dataset is built from.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
# NOTE: despite its name, `data_string` holds the parsed JSON object, not a str.
with Path('data/arxiv-tag-classifier-data.json').open(encoding='utf-8') as f:
    data_string = json.load(f)

In [None]:
# Read the test-set listing: a tab-separated file without a header row whose
# three columns are given the names arxiv_id, tags and output here.
test = pd.read_csv(Path("data/classifier.tsv"), 
                   sep="\t", 
                   header=None, 
                   names=["arxiv_id", "tags", "output"])

In [None]:
# Matches an optional trailing version suffix such as "v2" on an arXiv id.
# Raw string: "\d" inside a plain literal is an invalid escape sequence.
aidv_re = re.compile(r"(v\d+)?$")
# Strip the version suffix from every test-set id before the ids are matched
# against `all_df.arxiv_id`.
# regex=True is required: pandas >= 2.0 defaults to literal replacement and
# raises ValueError when handed a compiled pattern without it.
test_no_version = test.arxiv_id.str.replace(aidv_re, "", regex=True)

In [None]:
# Tabulate the loaded JSON data; every later filter and split starts from
# this frame.
all_df = pd.DataFrame(data_string)

In [None]:
# Parse the `date` column into datetimes so its year can drive the
# train/validation split.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 —
# confirm the pinned pandas version before touching this argument.
all_df.date = pd.to_datetime(all_df.date, infer_datetime_format=True)

In [None]:
# Drop every paper whose id appears in the (version-stripped) test listing,
# so no test paper ends up in the training or validation data.
filtered = all_df[~all_df.arxiv_id.isin(test_no_version)]

In [None]:
# Temporal split on publication year: rows dated before 2020 form the
# training set; every remaining row (the negated mask) forms the validation
# set.
train_df = filtered.loc[filtered.date.dt.year < 2020]
valid_df = filtered.loc[~(filtered.date.dt.year < 2020)]

In [None]:
# TBD: add fulltext column
# Columns fed to the classifier as text; only title and abstract for now.
text_cols = ["title", "abstract"]
# Validation text list, built from `valid_df` with the shared SentencePiece
# processor and rooted at the current directory.
valid_tl = TextList.from_df(valid_df, 
                            Path("."), 
                            cols=text_cols, 
                            processor=processor)

In [None]:
# Training text list, built from `train_df` with the same text columns,
# processor and root path as the validation list.
train_tl = TextList.from_df(train_df, 
                            Path("."), 
                            cols=text_cols, 
                            processor=processor)

In [None]:
# Pair the training and validation text lists, label each item from the
# space-delimited "categories" column, and wrap the result in a DataBunch
# with batch size 64.
data_clas = (
    ItemLists(Path("."), train_tl, valid_tl)
    .label_from_df(["categories"], label_delim=" ")
    .databunch(bs=64)
)

In [None]:
# Rows of the full dataset whose id is in the test listing (the complement of
# `filtered`).
# NOTE(review): `test_df` is not referenced by any other visible cell.
test_df = all_df[all_df.arxiv_id.isin(test_no_version)]

In [None]:
def set_seed(seed=None):
    """Seed NumPy and PyTorch so that a run can be repeated.

    When ``seed`` is ``None`` nothing is touched. Otherwise both random
    number generators are seeded with ``seed`` and cuDNN is switched to
    deterministic mode with benchmarking disabled.
    """
    if seed is None:
        return

    np.random.seed(seed)
    torch.manual_seed(seed)

    # Benchmark mode is turned off alongside enabling deterministic mode.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Seed the RNGs first so that building the learner is repeatable, then create
# the text classifier on `data_clas` with the AWD_LSTM architecture.
set_seed(42)
learn = text_classifier_learner(data_clas, AWD_LSTM)

In [None]:
# Load encoder weights saved under the name "lm" into the classifier.
# NOTE(review): presumably the encoder of the language model trained on the
# paper corpus; where the file is looked up depends on the learner's
# path/model_dir — confirm.
learn.load_encoder( Path("lm") )
print('done: load_encoder')

In [None]:
# Stage 1: one epoch of one-cycle training with a learning rate of 1e-2.
# NOTE(review): no unfreeze call precedes this, so presumably only the layers
# fastai leaves trainable by default are updated here — confirm.
learn.fit_one_cycle(1, 1e-2)

In [None]:
# Stage 2: freeze_to(-2), then one more epoch with learning rates given as a
# slice from 2.5e-3 to 5e-3.
# NOTE(review): presumably this makes the last two layer groups trainable and
# spreads the slice across them — confirm against the fastai docs.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

In [None]:
# Stage 3: unfreeze the model and train for six epochs with learning rates
# given as a slice from 2e-5 to 2e-3.
learn.unfreeze()
learn.fit_one_cycle(6, slice(2e-3/100, 2e-3))

In [None]:
# Earlier checkpoint name, kept for reference:
#learn.save("arxiv_class_sp30k_1_1_6_abstracts.pkl")
# Save the trained learner's weights under the given name.
# NOTE(review): fastai presumably appends its own extension to this name, so
# the ".pkl" suffix may end up in the middle of the file name — confirm.
learn.save("reproduce_abstract_model.pkl")


In [None]:
# Interactive inspection only: lists the attributes of the learner. The
# result is not stored or used by any other cell.
dir(learn)

In [None]:
# Export the learner to a single file for later inference.
# NOTE(review): the target path is presumably resolved relative to the
# learner's own path (the current directory here) — confirm.
learn.export('models/reproduce_abstract_model_b.pkl')