# 2 Unsupervised Language Model

1. Initialisation
2. Training
3. Evaluation

In [1]:
# Automatically re-import edited local modules before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [9]:
import pandas as pd
from pathlib import Path
from functools import partial

from utils import generate_fixed_vocab, generate_variable_vocab, get_model_LM
import sys; sys.path.append("../tools"); from config import *

In [3]:
# Read the CSV lazily in chunks sized to hold one training split plus one
# validation split, and take only the first chunk here. `df_iter` stays
# positioned after that chunk, so later iteration over it resumes from there.
df_iter = pd.read_csv(HUMAN/'human_genome_data_fa.csv', chunksize=NROWS_TRAIN+NROWS_VAL)
df = next(df_iter)

# Training set: the first NROWS_TRAIN rows of the chunk.
# Validation set: the NROWS_VAL rows that immediately follow the training rows.
# `.iloc` makes the positional (not label-based) slicing explicit.
df_tr = df.iloc[:NROWS_TRAIN]
df_va = df.iloc[NROWS_TRAIN:NROWS_TRAIN+NROWS_VAL]

## 2.1 Experiment 1: Fixed Length Data

In [4]:
# Model settings handed to get_model_LM by train_model for every experiment.
# NOTE(review): the *_p entries look like dropout probabilities — confirm
# against get_model_LM before tuning them.
MODEL_CONFIG = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

# Passed to get_model_LM alongside MODEL_CONFIG as its drop_mult argument.
DROP_MULT = 0.3

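Generate the data and vocabulary for each experiment (fixed-length settings first, then variable-length settings).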

In [5]:
# Each entry is a dict with keys 'title', 'xdata' and 'vocab'.
experiments = []

# Fixed-length experiments: one per entry of NGRAM_STRIDE. The entry is
# unpacked both into the title and into the generate_fixed_vocab call.
for ngram_stride in NGRAM_STRIDE:
    title = 'fixed_{}_{}'.format(*ngram_stride)
    xdata, vocab = generate_fixed_vocab(df_tr, df_va, *ngram_stride)
    experiment = {'title': title, 'xdata': xdata, 'vocab': vocab}
    experiments.append(experiment)

# Variable-length experiments: one per entry of MAX_VOCAB.
for max_vocab in MAX_VOCAB:
    title = 'variable_{}'.format(max_vocab)
    xdata, vocab = generate_variable_vocab(df_tr, df_va, max_vocab)
    experiment = {'title': title, 'xdata': xdata, 'vocab': vocab}
    experiments.append(experiment)

In [10]:
def train_model(experiment, n_cycles=1, lr_find=False, lr=2e-2, moms=(0.8, 0.7)):
    """
    Train a language model for one experiment, then save it and its encoder.

    Parameters
    ----------
    experiment : dict
        Must provide 'xdata' (handed to get_model_LM) and 'title' (used as the
        name for both the saved model and the saved encoder).
    n_cycles : int
        First argument to fit_one_cycle.
    lr_find : bool
        When True, run the learning-rate finder and plot its result before
        fitting.
    lr : float
        Learning rate handed to fit_one_cycle (default keeps the previous
        hard-coded 2e-2).
    moms : tuple
        `moms` argument for fit_one_cycle (default keeps the previous
        hard-coded (0.8, 0.7)).

    Returns
    -------
    None. The learner is discarded after saving.
    """
    import gc
    from fastai.callbacks.csv_logger import CSVLogger

    # Give get_model_LM a private copy of the settings. The module-level
    # MODEL_CONFIG used to be passed directly; the KeyError: 'tie_weights'
    # recorded for this cell is consistent with get_model_LM removing keys
    # from the dict it receives, which breaks every call after the first.
    # A shallow copy is enough because all values are plain scalars.
    config      = dict(MODEL_CONFIG)
    drop_mult   = DROP_MULT

    data  = experiment['xdata']
    learn = get_model_LM(data, drop_mult, config)
    learn = learn.to_fp16(dynamic=True)  # switch the learner to mixed-precision training

    # log training metrics to CSV, appending across runs
    learn.callback_fns.append(partial(CSVLogger, append=True))

    if lr_find:
        learn.lr_find()
        learn.recorder.plot()

    learn.fit_one_cycle(n_cycles, lr, moms=moms)

    learn.save(experiment['title'])
    learn.save_encoder(experiment['title'])

    # Free GPU memory: drop the local references, collect any reference cycles
    # still holding the learner alive, and only then release cached CUDA blocks.
    del learn
    del data
    gc.collect()
    torch.cuda.empty_cache()

# Train every prepared experiment in list order; the printed title marks
# where each run's output begins.
for experiment in experiments:
    print(experiment['title'])
    train_model(experiment=experiment, n_cycles=1)

fixed_3_1


KeyError: 'tie_weights'

---


In [None]:
def freememory():
    """
    Run garbage collection to free up memory.

    Returns None; the number of collected objects is discarded.
    """
    import gc
    gc.collect()

# Collect garbage first so that tensors kept alive only by unreachable
# reference cycles are released, and only then ask PyTorch to give its
# now-unused cached CUDA blocks back. In the opposite order, memory freed by
# the collection would stay in the cache.
freememory()
torch.cuda.empty_cache()

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
import tensorboardX
from fastai.callbacks.tensorboard import LearnerTensorboardWriter
from pathlib import Path

# Register a TensorBoard writer on the learner; logs go under ./logs/<project_id>.
# NOTE(review): no earlier cell leaves a global `learn` behind (train_model
# deletes its own local one), so this cell needs a learner created elsewhere.
project_id = 'exp1'
tboard_path = Path('./logs') / project_id
tboard_writer_fn = partial(LearnerTensorboardWriter, base_dir=tboard_path, name='run1')
learn.callback_fns.append(tboard_writer_fn)

---

In [None]:
# Scratch run: build a data bunch from rows 20000-39999 of `df` and train a
# fresh learner for two cycles.
# NOTE(review): `path`, `df_val` and `tok` are not defined in any cell shown
# here, and `model_vocab` is only created in a later cell — confirm the
# intended execution order.
data = GenomicTextLMDataBunch.from_df(
    path,
    df[20000:40000],
    df_val,
    bs=3000,
    tokenizer=tok,
    vocab=model_vocab,
    max_vocab=80000,
    chunksize=10000,
    text_cols=0,
    label_cols=1,
)

# Same values as MODEL_CONFIG, kept as a separate dict for this cell.
config = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

drop_mult = 0.3

learn = get_model_LM(data, drop_mult, config)
learn = learn.to_fp16(dynamic=True)

learn.fit_one_cycle(2, 5e-3, moms=(0.8, 0.7))

In [None]:
# Load the stored vocabulary array and wrap it in a GenomicVocab.
# NOTE(review): the data-bunch cells consume `model_vocab`, so this cell has
# to run before them.
vocab_file = path/'human_vocab_3m1s.npy'
voc = np.load(vocab_file)
model_vocab = GenomicVocab(voc)

In [None]:
# Continue training on each remaining chunk yielded by df_iter, one cycle per
# chunk, dividing the base learning rate by 1.5 for every chunk already seen.
count = 0
lr = 5e-3
for df in df_iter:
    data = GenomicTextLMDataBunch.from_df(path, df, df_val, bs=800, tokenizer=tok, vocab=model_vocab, max_vocab=80000,
                                  chunksize=20000, text_cols=0, label_cols=1)
    learn.data = data
    lr_iter = lr/1.5**count
    print(f'Learning Rate: {lr_iter}')
    # Train with the decayed rate that was just printed. Previously the
    # undecayed `lr` was passed, so the schedule was reported but never applied.
    learn.fit_one_cycle(1, lr_iter, moms=(0.8,0.7))
    count += 1

In [None]:
# Save the learner as 'human_3m1s2' and its encoder separately as 'human_3m1s_enc2'.
learn.save('human_3m1s2')
learn.save_encoder('human_3m1s_enc2')

In [None]:
# Load the 'human_3m1s2' checkpoint into the learner; the trailing ';' suppresses the cell's output.
learn.load('human_3m1s2');

In [None]:
# Convert the learner with to_fp32 — presumably back to full precision after the to_fp16 training above; confirm.
learn = learn.to_fp32();

In [None]:
# Save the learner under a separate name marking it as the fp32 version.
learn.save('human_3m1s2_fp32')