This toy version is to verify we can train a model.

## Install Python Libraries
- FastAI versions < 1.0.61 try to download a missing Wikipedia base model.
- Sentencepiece is an external option for FastAI

In [None]:
%pip install fastai==1.0.61
%pip install sentencepiece==0.1.83
# Kernel -> Restart Kernel and Clear All Outputs...

## Settings

In [None]:
model_id      = 'm3'
dataset_id    = f'ds2'
dataset_name  = f'{dataset_id}-mixed-large'
dataset_types = ['train', 'validate']
use_num_cpus  = 4
batch_size    = 32

## Get Data
- Copy a model pre-trained on arXiv machine learning papers to this VM.
- Copy new arXiv papers to this VM, to adapt the model to all subject categories.

In [1]:
import os
if not os.path.exists('models'):
    os.makedirs('models')
if not os.path.exists(f'data/{dataset_name}'):
    os.makedirs(f'data/{dataset_name}')

In [2]:
from google.cloud import storage
storage_client = storage.Client()
bucket = storage_client.get_bucket('arxiv-development-classifier')

In [3]:
bucket.get_blob('models/abstract-spm.model').download_to_filename('models/abstract-spm.model')
bucket.get_blob('models/abstract-spm.vocab').download_to_filename('models/abstract-spm.vocab')
bucket.get_blob('models/abstract-lm.pth').download_to_filename('models/abstract-lm.pth')

In [None]:
blobs = bucket.list_blobs(prefix=f"data/{dataset_id}")
for item in blobs:
  print(item.name)

In [4]:
for dataset_type in dataset_types:
    tmp=f'data/{dataset_name}/{dataset_name}-{dataset_type}.json'
    print(f'Downloading: {tmp}')
    bucket.get_blob(tmp).download_to_filename(tmp)

## Train
- Create a model and save to disk.
- To monitor GPU use in a terminal:
```
watch -n 9 nvidia-smi
```
- To log GPU use:
```
nvidia-smi --query-gpu=timestamp,pstate,temperature.gpu,utilization.gpu,utilization.memory,memory.total,memory.free,memory.used --format=csv -l 60 > nvidia-m3.log
```

In [5]:
import json
import numpy  as np
import pandas as pd
import re
import torch
from fastai.text import *
from pathlib     import Path 

In [6]:
processor = SPProcessor(
    sp_model=Path('models/abstract-spm.model'),
    sp_vocab=Path('models/abstract-spm.vocab'), 
    n_cpus=use_num_cpus, 
    mark_fields=True)

In [7]:
train_df = pd.read_json('data/train.json')
train_tl = TextList.from_df(train_df, Path('.'), cols=["fulltext"], processor=processor)

In [8]:
validate_df = pd.read_json('data/validate.json')
validate_tl = TextList.from_df(validate_df, Path('.'), cols=["fulltext"], processor=processor)

In [9]:
data_clas = ItemLists(Path('.'), train_tl, validate_tl)\
    .label_from_df(["primary_category"])\
    .databunch(bs=batch_size)

In [10]:
torch.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(42)
learn = text_classifier_learner(data_clas, AWD_LSTM)
x = learn.load_encoder( Path('abstract-lm') ) # models subdir is the default in fastai

In [11]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,1.012685,1.131623,0.468085,02:21


In [12]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(5e-3/2., 5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.550785,0.885119,0.617021,02:23


In [13]:
learn.unfreeze()
learn.fit_one_cycle(6, slice(2e-3/100, 2e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.434553,0.722576,0.702128,02:36
1,0.438091,0.574864,0.87234,02:40
2,0.402751,0.441914,0.87234,02:40
3,0.363952,0.358679,0.914894,02:40
4,0.33098,0.295906,0.893617,02:43
5,0.309518,0.267687,0.87234,02:44


In [14]:
learn.export(f'models/{model_id}-{dataset_name}.pkl')

## Check Model Files
- Verify that the model files can be loaded and a prediction can be made.

In [15]:
learn = load_learner(path=Path('models'),file=Path(f'{model_id}-{dataset_name}'))

In [16]:
s1 = 'In the Minimal'
s2 = 'Supersymmetric Extension of the Standard Model (MSSM) two complex Higgs'
inputs = [s1, s2]
probabilities = learn.predict(inputs)[2].numpy()
list(zip(learn.data.classes, probabilities))

[('astro-ph.GA', 0.0016822844),
 ('cs.HC', 0.052237447),
 ('hep-ph', 0.8532866),
 ('math.PR', 0.09279356)]

## Copy Model to GS

In [17]:
from google.cloud import storage
storage_client = storage.Client()
bucket = storage_client.get_bucket('arxiv-development-classifier')

In [None]:
blobs = bucket.list_blobs(prefix=f"models/{model_id}")
for item in blobs:
  print(item.name)

In [None]:
tmp = f'models/{model_id}-{dataset_name}.pkl'
blob = bucket.blob(tmp)
blob.upload_from_filename(tmp)

## Debug

In [None]:
torch.cuda.get_device_name(0)

In [None]:
# This should take 10 seconds per gpu, to initialize.
#   If it takes 5 minutes, that's bad.
#   Maybe a CUDA communication issue with the GPU.
%time torch.rand(2).cuda()

In [None]:
import gc
gc.collect()

In [None]:
import torch
torch.cuda.empty_cache()