# 3 Task Based Language Model

1. Initialisation
2. Training
3. Fine-tuning
4. Evaluation

Dataset of interest:
    1. Long non-coding RNA (lncRNA) vs. messenger RNA (mRNA)
        - 

## 3.1 Initialisation
### 3.1.1 Imports 

In [None]:
# Pin this process to a single GPU. The variable is exported before torch is
# imported so it is guaranteed to be in place before any CUDA initialisation.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import torch

In [None]:
# Reload edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
from pathlib import Path
from functools import partial

from utils import tok_fixed, tok_variable, get_model_LM
import sys; sys.path.append("../tools"); from config import *
from utils import *

### 3.1.2 mRNA/lncRNA Data initialisation

In [4]:
# Load only the two columns used below: the raw sequence and the record name.
data_df = pd.read_csv(HUMAN/'lncRNA.csv', usecols=['Sequence','Name'])

# Record names are matched as literal substrings (regex=False) so that the '.'
# in 'TRAIN.fa' is not treated as a regex wildcard; rows with a missing name
# are skipped (na=False) instead of breaking the boolean mask.
# partition_data is a project helper; it presumably adds the 'set' column that
# is used for the train/valid split below -- confirm against utils.

# data for LM fine-tuning
df_ulm              = data_df[data_df['Name'].str.contains('TRAIN.fa', regex=False, na=False)].pipe(partition_data)
df_tr_,df_va_       = df_ulm[df_ulm['set'] == 'train'], df_ulm[df_ulm['set'] == 'valid']

# dfs for classification
df_clas             = data_df[data_df['Name'].str.contains('train16K', regex=False, na=False)].pipe(partition_data)
# Target is the text before the first '.' in the name, minus its last character.
# assign() builds a new frame, so no column is written into a filtered slice.
df_clas             = df_clas.assign(Target=df_clas['Name'].map(lambda x : x.split('.')[0][:-1]))
df_tr,df_va         = df_clas[df_clas['set'] == 'train'], df_clas[df_clas['set'] == 'valid']
df_te               = data_df[data_df['Name'].str.contains('TEST500', regex=False, na=False)]

## 3.2 LM Fine-Tuning

In [5]:
%%time
def make_experiments(df_tr, df_va):
    """Tokenise the train/valid frames once per tokenisation setting.

    One experiment is built for every entry of ``NGRAM_STRIDE`` (fixed-length
    tokens, via ``tok_fixed``) followed by one for every entry of
    ``MAX_VOCAB`` (variable-length tokens, via ``tok_variable``).

    Args:
        df_tr: training dataframe forwarded to the tokenisers.
        df_va: validation dataframe forwarded to the tokenisers.

    Returns:
        A list of dicts with the keys ``'title'``, ``'xdata'`` and ``'vocab'``,
        fixed-length experiments first.
    """
    runs = []

    # Fixed-length tokenisation: one run per NGRAM_STRIDE entry.
    for idx, params in enumerate(NGRAM_STRIDE):
        name = 'fixed_{}_{}_rows_{}'.format(*params, NROWS_TRAIN)
        xdata, vocab = tok_fixed(df_tr, df_va, *params, bs=BS[idx])
        runs.append({'title': name, 'xdata': xdata, 'vocab': vocab})

    # Variable-length tokenisation: one run per vocabulary size.
    # NOTE(review): idx restarts at 0 here, so BS[idx] reuses the batch sizes
    # of the first fixed-length settings -- confirm this is intended.
    for idx, vocab_size in enumerate(MAX_VOCAB):
        name = 'variable_{}_rows_{}'.format(vocab_size, NROWS_TRAIN)
        xdata, vocab = tok_variable(df_tr, df_va, vocab_size, bs=BS[idx])
        runs.append({'title': name, 'xdata': xdata, 'vocab': vocab})

    return runs

# Build the LM fine-tuning experiments from the TRAIN.fa train/valid partitions.
experiments = make_experiments(df_tr_, df_va_)

CPU times: user 50.3 s, sys: 37 s, total: 1min 27s
Wall time: 2min 35s


In [8]:
# Model configuration that tune_model copies and hands to get_model_LM.
TUNE_CONFIG = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}
# Dropout multiplier passed to get_model_LM together with the config above.
TUNE_DROP_MULT = 0.25

def tune_model(experiment, epochs=1):
    """Fine-tune the language model for one tokenisation experiment.

    A learner is built from ``experiment['xdata']``. When
    ``HUMAN/models/<title>.pth`` exists, the checkpoint named after the
    experiment title is loaded first. The learner is then trained for
    ``epochs`` epochs and the model and its encoder are saved under the
    ``tune_`` prefix. Training history is appended to
    ``history_tune_<title>`` through fastai's ``CSVLogger``.

    Args:
        experiment: dict with at least the keys ``'title'`` and ``'xdata'``.
        epochs: number of epochs passed to ``learn.fit``.

    Returns:
        None.
    """
    title = experiment['title']
    data = experiment['xdata']

    # A copy of the shared config is handed over, never the global dict itself.
    learn = get_model_LM(data, TUNE_DROP_MULT, TUNE_CONFIG.copy())
    # Switch the learner to half precision; dynamic=True is forwarded to fastai.
    learn = learn.to_fp16(dynamic=True)

    # Start from the existing checkpoint for this title when there is one.
    checkpoint = 'models/' + title + '.pth'
    if os.path.exists(HUMAN/checkpoint):
        print('model found: loading model: {}'.format(title))
        learn.load(title)
        learn.data = data

    # Log per-epoch history to CSV, appending to any earlier log.
    from fastai.callbacks.csv_logger import CSVLogger
    history_logger = partial(CSVLogger,
                             filename='history_tune_' + title,
                             append=True)
    learn.callback_fns.append(history_logger)

    learn.fit(epochs=epochs, wd=1e-4)

    tuned_name = 'tune_' + title
    learn.save(tuned_name)
    learn.save_encoder(tuned_name + '_enc')

    # Drop the local references, then ask PyTorch to release cached GPU memory.
    del learn
    del data
    torch.cuda.empty_cache()

# Only the last experiment in the list is run here; widen the slice to
# fine-tune the other tokenisation settings as well.
for experiment in experiments[-1:]:
    print(experiment['title'])
    tune_model(experiment, epochs=4)

fixed_3_3_rows_20000
model found: loading model: fixed_3_3_rows_20000


epoch,train_loss,valid_loss,accuracy,time
0,3.792576,3.783734,0.080727,16:35
1,3.761145,3.761088,0.085415,16:35
2,3.72849,3.733666,0.091369,16:36
3,3.694636,3.712957,0.096546,16:36


fixed_5_5_rows_20000
model found: loading model: fixed_5_5_rows_20000


epoch,train_loss,valid_loss,accuracy,time
0,6.33997,6.326385,0.025775,10:17
1,6.295557,6.293774,0.027928,10:17
2,6.252541,6.266128,0.030771,10:16
3,6.218936,6.248549,0.032542,10:17


fixed_7_7_rows_20000
model found: loading model: fixed_7_7_rows_20000


epoch,train_loss,valid_loss,accuracy,time
0,8.909459,8.897055,0.013352,10:43
1,8.832302,8.84511,0.015462,10:43
2,8.788483,8.812013,0.017471,10:43
3,8.73033,8.791214,0.018979,10:42


variable_64_rows_20000
model found: loading model: variable_64_rows_20000


epoch,train_loss,valid_loss,accuracy,time
0,3.302855,3.2835,0.234347,20:42
1,3.23832,3.233262,0.238526,20:42
2,3.206257,3.205786,0.243254,20:44
3,3.167009,3.169769,0.245685,20:44


variable_1024_rows_20000
model found: loading model: variable_1024_rows_20000


epoch,train_loss,valid_loss,accuracy,time
0,5.840509,5.790755,0.102125,11:29
1,5.620809,5.586918,0.12521,11:29
2,5.445714,5.413337,0.146633,11:28
3,5.33603,5.320811,0.160544,11:28


variable_16384_rows_20000
model found: loading model: variable_16384_rows_20000


epoch,train_loss,valid_loss,accuracy,time


KeyboardInterrupt: 

## 3.3 Classification

In [None]:
%%time
def make_experiments(df_tr, df_va):
    """Tokenise the classification frames once per tokenisation setting.

    Same layout as the language-model experiments, except that every
    tokeniser call uses a batch size of 400 and ``clas=True``.

    Args:
        df_tr: training dataframe forwarded to the tokenisers.
        df_va: validation dataframe forwarded to the tokenisers.

    Returns:
        A list of dicts with the keys ``'title'``, ``'xdata'`` and ``'vocab'``:
        one per ``NGRAM_STRIDE`` entry, then one per ``MAX_VOCAB`` entry.
    """
    runs = []

    # Fixed-length tokenisation.
    for params in NGRAM_STRIDE:
        name = 'fixed_{}_{}_rows_{}'.format(*params, NROWS_TRAIN)
        xdata, vocab = tok_fixed(df_tr, df_va, *params, bs=400, clas=True)
        runs.append({'title': name, 'xdata': xdata, 'vocab': vocab})

    # Variable-length tokenisation.
    for vocab_size in MAX_VOCAB:
        name = 'variable_{}_rows_{}'.format(vocab_size, NROWS_TRAIN)
        xdata, vocab = tok_variable(df_tr, df_va, vocab_size, bs=400, clas=True)
        runs.append({'title': name, 'xdata': xdata, 'vocab': vocab})

    return runs

# Rebuild `experiments` from the train16K classification partitions; this
# replaces the LM fine-tuning list created earlier in the notebook.
experiments = make_experiments(df_tr, df_va)

In [None]:
# Model configuration used by tune_classifier when it calls get_model_clas.
CLAS_CONFIG = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.4,
    'hidden_p': 0.2,
    'input_p': 0.6,
    'embed_p': 0.1,
    'weight_p': 0.5,
}
# Dropout multiplier passed to get_model_clas together with the config above.
CLAS_DROP_MULT = 0.5

def tune_classifier(experiment, epochs=1):
    """Train the classifier head for one tokenisation experiment.

    A classifier learner is built from ``experiment['xdata']`` and an encoder
    is loaded into it. The fine-tuned encoder saved by the LM fine-tuning
    step (``tune_<title>_enc``) is preferred; if that file is absent, the
    original ``<title>_enc`` encoder is used. The learner is then frozen,
    trained with one-cycle scheduling, and saved under the ``clas_`` prefix.

    Args:
        experiment: dict with at least the keys ``'title'`` and ``'xdata'``.
        epochs: number of epochs passed to ``learn.fit_one_cycle``.

    Returns:
        The trained learner, so it can be evaluated afterwards. Bind the
        result to a name when calling from a notebook cell.
    """
    # Pass a private copy so the learner cannot mutate the shared CLAS_CONFIG.
    config      = CLAS_CONFIG.copy()
    drop_mult   = CLAS_DROP_MULT
    title       = experiment['title']

    data  = experiment['xdata']
    learn = get_model_clas(data, drop_mult, config, max_len=4000*70)

    # Prefer the encoder written by the LM fine-tuning step. The existence
    # check uses the same directory fastai's load_encoder reads from.
    encoder       = title + '_enc'
    tuned_encoder = 'tune_' + encoder
    if os.path.exists(learn.path/learn.model_dir/(tuned_encoder + '.pth')):
        encoder = tuned_encoder
    print('loading encoder: {}'.format(encoder))
    learn.load_encoder(encoder)
    learn = learn.to_fp16(dynamic=True)

    # Log per-epoch history to CSV, appending to any earlier log.
    # NOTE(review): the file name has no '_' after 'history_clas', unlike
    # 'history_tune_'; kept unchanged so existing logs keep being appended to.
    from fastai.callbacks.csv_logger import CSVLogger
    learn.callback_fns.append(partial(CSVLogger,
                                      filename='history_clas' + title,
                                      append=True))

    # Train with the learner frozen, as set by learn.freeze().
    learn.freeze()
    learn.fit_one_cycle(epochs, 5e-2, moms=(0.8, 0.7))
    learn.save('clas_' + title)
    learn.save_encoder('clas_' + title + '_enc')

    return learn

# Keep the trained learner at notebook scope for the evaluation section.
learn = tune_classifier(experiments[1], epochs=4)

In [None]:
# Re-declaration of the classifier configuration; edit the values here to
# change the settings used by the tune_classifier run that follows.
CLAS_CONFIG = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.4,
    'hidden_p': 0.2,
    'input_p': 0.6,
    'embed_p': 0.1,
    'weight_p': 0.5,
}
CLAS_DROP_MULT = 0.5
# Re-run the classifier training for the second experiment with the config
# declared in this cell.
tune_classifier(experiments[1], epochs=4)

## 3.4 Evaluation
We now evaluate the classification performance of every trained model on the `TEST500` dataset.
All models were trained unsupervised for 10 epochs, then fine-tuned for a further 8 epochs on long-read ncRNA and mRNA data. We plot a confusion matrix for each model, as well as a comparative accuracy plot.

In [None]:
# NOTE(review): `learn` must already be bound at notebook scope when this cell
# runs; get_scores is presumably provided by one of the star-imports. Confirm
# which trained classifier is meant to be scored here.
get_scores(learn)

## 3.5 Explainability
We would like to visualise the model's attention at the intersection between coding and non-coding regions.
- We would like to identify a subset of the model's embedding dimensions that captures most of the variance between coding and non-coding regions
- We could call this the `coding neuron`