# 2 Unsupervised Language Model

- 2.1 Initialisation
- 2.2 Language Model Training
- 2.3 Validation

## 2.1 Initialisation

In [None]:
# Automatically reload edited modules before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd
from pathlib import Path
from functools import partial

from utils import tok_fixed, tok_variable, get_model_LM
import sys; sys.path.append("../tools"); from config import *

In [None]:
# Read the genome CSV lazily in chunks of NROWS_TRAIN + NROWS_VAL rows.
# Only the first chunk is materialised here; `df_iter` stays open so that
# later cells can pull further chunks from it.
df_iter = pd.read_csv(
    HUMAN / 'human_genome_data_fa.csv',
    chunksize=NROWS_TRAIN + NROWS_VAL,
)
df = next(df_iter)

# Training split: the first NROWS_TRAIN rows of the chunk.
# Validation split: the NROWS_VAL rows that immediately follow them.
df_tr = df.iloc[:NROWS_TRAIN]
df_va = df.iloc[NROWS_TRAIN:NROWS_TRAIN + NROWS_VAL]

In [None]:
# Model configuration; a copy of this dict is handed to get_model_LM.
# NOTE(review): the keys look like fastai's AWD-LSTM language-model config
# (sizes, padding token, dropout probabilities, weight tying) — confirm
# against get_model_LM in utils.
MODEL_CONFIG = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

# Dropout multiplier passed to get_model_LM alongside MODEL_CONFIG.
DROP_MULT = 0.3

## Generate data

To choose a batch size that fits in GPU (CUDA) memory, we use the fact that the batch size is inversely proportional to the number of features, which is $4^{n\_gram}$. For example, a model with $4^3$ features needs a batch size that is $4^3/4^2 = 4$ times smaller than a model with $4^2$ features.

In [None]:
def _as_experiment(title, tokenised):
    """Bundle a run title with the (xdata, vocab) pair returned by a tokeniser."""
    xdata, vocab = tokenised
    return {'title': title, 'xdata': xdata, 'vocab': vocab}


# One dict per run, with keys 'title', 'xdata' and 'vocab'.
experiments = []

# Fixed-length tokenisation: one experiment per NGRAM_STRIDE entry.
for i, ngram_stride in enumerate(NGRAM_STRIDE):
    experiment = _as_experiment(
        'fixed_{}_{}_rows_{}'.format(*ngram_stride, NROWS_TRAIN),
        tok_fixed(df_tr, df_va, *ngram_stride, bs=BS[i]),
    )
    experiments.append(experiment)

# Variable-length tokenisation: one experiment per MAX_VOCAB entry.
# NOTE(review): this loop indexes BS with the same `i` as the fixed-length
# loop above, so both families share batch sizes — confirm that is intended.
for i, max_vocab in enumerate(MAX_VOCAB):
    experiment = _as_experiment(
        'variable_{}_rows_{}'.format(max_vocab, NROWS_TRAIN),
        tok_variable(df_tr, df_va, max_vocab, bs=BS[i]),
    )
    experiments.append(experiment)

## 2.2 Language Model Training

In [None]:
def train_model(experiment, epochs=10):
    """Train (or resume training of) the language model for one experiment.

    A learner is built from ``experiment['xdata']`` with MODEL_CONFIG and
    DROP_MULT and switched to fp16. If ``HUMAN/models/<title>.pth`` exists,
    the saved weights are loaded first. Training runs a one-cycle schedule,
    then the model is saved under ``<title>`` and its encoder under
    ``<title>_enc``.

    Args:
        experiment: dict with a 'title' (used for checkpoint and log names)
            and 'xdata' (the data handed to get_model_LM).
        epochs: number of epochs passed to ``fit_one_cycle``.

    Returns:
        None.

    Any exception raised while building, loading, training or saving is
    propagated; the GPU cleanup at the end still runs in that case.
    """
    import gc
    import os

    import torch
    from fastai.callbacks.csv_logger import CSVLogger

    config    = MODEL_CONFIG.copy()
    drop_mult = DROP_MULT
    title     = experiment['title']
    data      = experiment['xdata']
    learn     = None  # lets the cleanup below run even if construction fails

    try:
        learn = get_model_LM(data, drop_mult, config)
        learn = learn.to_fp16(dynamic=True)  # mixed-precision training

        # Resume from an existing checkpoint when one is found.
        # NOTE(review): the existence check looks under HUMAN/models while
        # learn.load resolves the name itself — presumably the learner's
        # model directory is HUMAN/models; confirm in get_model_LM.
        model = 'models/' + title + '.pth'
        if os.path.exists(HUMAN/model):
            print('model found: loading model: {}'.format(title))
            learn.load(title)
            learn.data = data

        # Log per-epoch history with CSVLogger in append mode, so resumed
        # runs extend the same 'history_<title>' log.
        learn.callback_fns.append(partial(CSVLogger,
                                          filename='history_' + title,
                                          append=True))

        learn.fit_one_cycle(epochs, 5e-3, moms=(0.8, 0.7))
        learn.save(title)
        learn.save_encoder(title + '_enc')
    finally:
        # Best-effort release of GPU memory, also when training failed or was
        # interrupted: drop local references, collect reference cycles, then
        # hand cached CUDA blocks back to the driver.
        del learn, data
        gc.collect()
        torch.cuda.empty_cache()

# Train every experiment except the first two, 16 epochs each.
# NOTE(review): the [2:] offset skips experiments 0 and 1 — presumably they
# were already trained in an earlier session; confirm before a full re-run.
remaining = experiments[2:]
for experiment in remaining:
    print(experiment['title'])
    train_model(experiment, epochs=16)

In [None]:
# Display the 'xdata' entry of whichever experiment the name `experiment`
# was last bound to by an earlier loop.
experiment['xdata']

---

## 2.3 Validation

In [None]:
def _epoch_end_losses(losses, n_epochs):
    """Return one training loss per epoch: the last loss recorded in each epoch.

    `losses` is split into `n_epochs` near-equal consecutive slices and the
    final element of each slice is taken, so the result has exactly
    `n_epochs` entries (or none when there is nothing to sample from).
    """
    n_iters = len(losses)
    if n_epochs <= 0 or n_iters == 0:
        return []
    return [losses[max((epoch + 1) * n_iters // n_epochs - 1, 0)]
            for epoch in range(n_epochs)]


def plot_losses(learn):
    """Plot per-epoch losses and metrics recorded on `learn.recorder`.

    The top panel shows the validation loss of every epoch together with the
    training loss recorded at the end of that epoch; the bottom panel shows
    `learn.recorder.metrics` per epoch.

    Args:
        learn: object exposing `recorder.val_losses` (one value per epoch),
            `recorder.losses` (one value per training iteration) and
            `recorder.metrics` (one entry per epoch).

    Returns:
        None; the figure is drawn through matplotlib.
    """
    import matplotlib.pyplot as plt

    recorder = learn.recorder
    val_losses = recorder.val_losses
    epochs = list(range(len(val_losses)))

    # Sample the training loss at epoch boundaries so it lines up with the
    # validation loss and always matches the x axis in length.
    train_losses = _epoch_end_losses(recorder.losses, len(epochs))

    fig, (loss_ax, metric_ax) = plt.subplots(2, 1, figsize=(8, 12))

    loss_ax.plot(epochs, val_losses, label='Validation loss')
    loss_ax.plot(epochs[:len(train_losses)], train_losses, label='Training loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend(loc='upper right')

    metric_ax.plot(epochs, recorder.metrics)
    metric_ax.set_xlabel('Epoch')
    metric_ax.set_ylabel('Accuracy')

def train_sequence(experiment):
    """Train one language model incrementally over the remaining CSV chunks.

    A learner is built once from the initial train/validation frames. Every
    further chunk pulled from `df_iter` is then tokenised, swapped in as the
    learner's data and trained for a single one-cycle epoch, with the
    learning rate divided by 1.5 for each chunk already processed. Losses
    are plotted after every chunk.

    NOTE(review): `experiment` is not used in the body, and
    `generate_variable_vocab` is not among the names imported in the visible
    import cell — confirm where it comes from before calling this.
    """
    model_config = MODEL_CONFIG.copy()
    initial_data, _ = generate_variable_vocab(df_tr, df_va, 64)
    learn = get_model_LM(initial_data, DROP_MULT, model_config)
    learn = learn.to_fp16(dynamic=True)  # mixed-precision training

    base_lr = 5e-3
    for step, chunk in enumerate(df_iter):
        chunk_data, _ = generate_variable_vocab(chunk, df_va, 64)
        learn.data = chunk_data

        # Decay the learning rate geometrically with the chunk index.
        lr_iter = base_lr / 1.5**step
        print(f'Learning Rate: {lr_iter}')
        learn.fit_one_cycle(1, lr_iter, moms=(0.8, 0.7))

        plot_losses(learn)

---


In [None]:
# %load_ext tensorboard

In [None]:
# import tensorboardX
# from fastai.callbacks.tensorboard import LearnerTensorboardWriter
# from pathlib import Path

# project_id = 'exp1'
# tboard_path = Path('./logs/' + project_id)
# learn.callback_fns.append(partial(LearnerTensorboardWriter, 
#                                     base_dir=tboard_path, 
#                                     name='run1'))


---