# Human Genome Language Model
1. Initialisation
    - 1.1 Imports
    - 1.2 Data processing
2. Tokenisation
    - 2.1 Fixed Length Tokenisation
    - 2.2 Variable Length Tokenisation
3. Experiments
    - 3.1 Comparing fixed and variable length tokenisation strategies

---
# 1. Initialisation
## 1.1 Imports and config
Libraries used include:
- `fastai` for access to the ULMFiT model API
- `pandas` and `matplotlib` for data handling and plotting

Handy tricks include:
- `autoreload` functionality so that updates to library code are automatically reloaded any time a cell is run.
- `InteractiveShell` is used for multiple outputs from a single cell.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
from fastai import *
from fastai.text import *
import sentencepiece as spm
import pandas as pd, matplotlib.pyplot as plt
import numpy as np
# import cupy as np

# Imports from GenomicULMFiT repo
sys.path.append("../../../Genomic-ULMFiT/")
sys.path.append("../..")

from src.processing import process_fasta
from src.config import GRCH38_P13, GENOME
from utils import *

In [None]:
# Candidate project roots; point HOME at whichever machine is in use.
HOME_LOCAL = '/home/jovyan/ml_genomics'
HOME_REMOTE = '/home/athon/'

# Root currently selected (kept as a plain string).
HOME = HOME_REMOTE

# Locations derived from the selected root.
THESIS = Path(HOME, 'thesis/')
HUMAN = Path(THESIS, 'data/human/')

In [None]:
# Dataset params
NROWS_TRAIN = 10000
NROWS_VAL = 10000
BATCH_SIZE = 100

# Tokenisation - fixed: (ngram, stride) combinations to try
NGRAM_STRIDE = [(3, 1), (5, 3), (7, 5)]

# Tokenisation - variable: vocabulary sizes, one per fixed ngram length
# (4**k is the number of distinct k-mers over a four-letter alphabet)
MAX_VOCAB = [4 ** k for k in (3, 5, 7)]

## 1.2 Data Initialisation
This process assumes data has already been read and processed into `.csv` format via the `process_fasta` script.
Data is read from `human_genome_data_fa.csv` in chunks to limit the amount of data held in memory. Each subsequent chunk can then be fetched with `next(df_iter)`.

In [None]:
# Read the CSV lazily: each chunk holds exactly enough rows for one train + validation split.
df_iter = pd.read_csv(HUMAN/'human_genome_data_fa.csv', chunksize=NROWS_TRAIN+NROWS_VAL)
df = next(df_iter)

# The first NROWS_TRAIN rows of the chunk are the training split;
# the NROWS_VAL rows that follow them are the validation split.
df_tr = df[:NROWS_TRAIN]
df_va = df[NROWS_TRAIN:NROWS_TRAIN+NROWS_VAL]

---
# 2. Tokenisation
## 2.1 Fixed length tokenisation
- Technique of original author was to iterate over entire genome sequence with for loops.
- We need to vectorise this.

In [None]:
# from tools.tokenizers import seq_tokenizer, vec_tokenizer

def seq_tokenizer(t, ngram, stride):
    """Split a sequence into fixed-length tokens with a plain Python loop.

    Args:
        t: Sequence string; it is upper-cased before tokenising.
        ngram: Number of characters per token (must be >= 1).
        stride: Offset between the starts of consecutive tokens (must be
            >= 1). Ignored when ``ngram == 1``, where every character
            becomes its own token.

    Returns:
        A tuple ``(toks, elapsed)``: ``toks`` is the list of tokens, each
        exactly ``ngram`` characters long (a shorter trailing remainder is
        dropped, and an input shorter than ``ngram`` yields ``[]``);
        ``elapsed`` is the wall-clock time in seconds spent tokenising.

    Raises:
        ValueError: If ``ngram`` is < 1, or ``stride`` is < 1 while
            ``ngram > 1``.
    """
    import time

    if ngram < 1:
        raise ValueError("ngram must be a positive integer")

    # Start the clock before branching so the elapsed time is defined on
    # every path, including ngram == 1.
    start = time.time()
    t = t.upper()
    if ngram == 1:
        toks = list(t)
    else:
        if stride < 1:
            raise ValueError("stride must be a positive integer")
        # Stop at the last start index that still leaves a full window, so
        # no short token is ever produced and empty input simply gives [].
        last_start = len(t) - ngram
        toks = [t[i:i + ngram] for i in range(0, last_start + 1, stride)]

    return toks, time.time() - start

def vec_tokenizer(a, ngram=3, stride=1, padnum=0):
    """Split a sequence into fixed-length tokens using NumPy indexing.

    Every window of ``ngram`` characters starting at positions
    ``0, stride, 2*stride, ...`` becomes one token; a trailing remainder
    shorter than ``ngram`` is dropped. The input's case is left unchanged.

    Args:
        a: Sequence string (one character per position).
        ngram: Number of characters per token (must be >= 1).
        stride: Offset between the starts of consecutive tokens (must be >= 1).
        padnum: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(toks, elapsed)``: ``toks`` is a list of ``str`` tokens and
        ``elapsed`` is the wall-clock time in seconds spent tokenising.

    Raises:
        ValueError: If ``ngram`` or ``stride`` is smaller than 1.
    """
    import time

    if ngram < 1 or stride < 1:
        raise ValueError("ngram and stride must be positive integers")

    start = time.time()

    # Number of full windows that fit; zero for empty or too-short input.
    n_chars = len(a)
    n_windows = (n_chars - ngram) // stride + 1 if n_chars >= ngram else 0
    if n_windows == 0:
        return [], time.time() - start

    chars = np.array(list(a))                        # 1-D array of single characters
    starts = np.arange(n_windows) * stride           # first index of every window
    idx = starts[:, None] + np.arange(ngram)         # (n_windows, ngram) index grid
    windows = chars[idx]                             # fresh C-contiguous copy, always in bounds

    # Reinterpret each row of `ngram` one-character cells as a single
    # `ngram`-character string; this joins the characters without a Python loop.
    toks = windows.view('U{}'.format(ngram)).ravel().tolist()
    return toks, time.time() - start

# Test how well the vectorisation works
def test_tokenizers():
    """Time both tokenisers on growing prefixes of `df` and plot the timings.

    For 1, 10, 100, 1000 and 10000 rows, the first column of `df` is joined
    into one string and tokenised (ngram 3, stride 1) by `seq_tokenizer` and
    `vec_tokenizer`; the elapsed times each reports are drawn as two lines.
    """
    timings = {'seq': [], 'vec': []}
    for n_rows in (1, 10, 100, 1000, 10000):
        first_column = df.head(n_rows).values[:, 0]
        sequence = ''.join(''.join(row) for row in first_column)

        _, seq_elapsed = seq_tokenizer(sequence, 3, 1)
        _, vec_elapsed = vec_tokenizer(sequence, 3, 1)
        timings['seq'].append(seq_elapsed)
        timings['vec'].append(vec_elapsed)

    _, ax = plt.subplots(figsize=(12, 6))
    pd.DataFrame(timings).plot(ax=ax)
    ax.set_title('Sequential vs Vectorised Tokenisation')
    ax.set_ylabel('Time')


test_tokenizers()

### Vectorised technique for faster fixed Length tokenisation

In [None]:
import time

class FixedLengthGenomicTokenizer(BaseTokenizer):
    """
    Fixed length tokenisation for DNA.

    A sequence is upper-cased and cut into tokens of `ngram` characters whose
    start positions are `stride` characters apart.
    """
    def __init__(self, lang='en', ngram=5, stride=2):
        """Store the language tag, the token length and the step between tokens."""
        self.lang = lang
        self.ngram = ngram
        self.stride = stride

    def tokenizer(self, t):
        """Return the list of fixed-length tokens for the sequence `t`.

        Only full-length tokens are returned: a trailing remainder shorter
        than `ngram` is dropped, and input shorter than `ngram` (including
        the empty string) yields an empty list instead of raising.
        When `ngram` is 1 every character is a token and `stride` is ignored.
        """
        t = t.upper()
        if self.ngram == 1:
            return list(t)
        # Bound the range at the last start that still leaves a full window,
        # so no short token is built and no post-hoc trimming is needed.
        last_start = len(t) - self.ngram
        return [t[i:i + self.ngram] for i in range(0, last_start + 1, self.stride)]

    def add_special_cases(self, toks):
        """Do nothing; special-case tokens are not registered by this tokenizer."""
        pass
    

In [None]:
def generate_fixed_vocab(df_train, df_val, ngram=3, stride=1):
    """Build a fixed-length-token databunch, save its vocabulary and return it.

    Args:
        df_train: Training dataframe (text in column 0, label in column 1).
        df_val: Validation dataframe with the same layout.
        ngram: Token length handed to `GenomicTokenizer`.
        stride: Step between tokens handed to `GenomicTokenizer`.

    Returns:
        The databunch vocabulary (`data.vocab.itos`), which is also written to
        `HUMAN/fixed_vocab_<ngram>m<stride>s.npy`.
    """
    # Tokeniser with every fastai pre/post rule and special case switched off
    tokenizer_factory = partial(GenomicTokenizer, ngram=ngram, stride=stride)
    tok = Tokenizer(tokenizer_factory,
                    n_cpus=40,
                    pre_rules=[],
                    post_rules=[],
                    special_cases=[])

    # Vocabulary cap: one entry per possible ngram over four letters, plus one
    vocab_cap = (4**ngram) + 1
    data = GenomicTextLMDataBunch.from_df(
        HUMAN, df_train, df_val,
        bs=BATCH_SIZE, tokenizer=tok, chunksize=NROWS_TRAIN,
        text_cols=0, label_cols=1, max_vocab=vocab_cap,
    )

    # Persist the vocabulary next to the data
    itos = data.vocab.itos
    vocab_file = HUMAN / 'fixed_vocab_{}m{}s.npy'.format(ngram, stride)
    np.save(vocab_file, itos)
    return itos


# One vocabulary per (ngram, stride) setting
vocabs = []
for ngram_stride in NGRAM_STRIDE:
    fixed_ngram, fixed_stride = ngram_stride
    vocabs.append(generate_fixed_vocab(df_tr, df_va, fixed_ngram, fixed_stride))

In [None]:
def generate_variable_vocab(df_train, df_val, size=128):
    """Build a SentencePiece-processed databunch, save its vocabulary and return it.

    Args:
        df_train: Training dataframe (text in column 0, label in column 1).
        df_val: Validation dataframe with the same layout.
        size: Vocabulary size given to `SPProcessor` and used as `max_vocab`.

    Returns:
        The databunch vocabulary (`data.vocab.itos`), which is also written to
        `HUMAN/variable_vocab_<size>tok.npy`.
    """
    # SentencePiece processor with fastai's pre/post rules switched off
    processor_options = dict(char_coverage=1, vocab_sz=size, n_cpus=40,
                             pre_rules=[], post_rules=[])
    sp_proc = SPProcessor(**processor_options)

    data = GenomicTextLMDataBunch.from_df(HUMAN, df_train, df_val,
                                          bs=BATCH_SIZE, processor=sp_proc,
                                          chunksize=NROWS_TRAIN, text_cols=0,
                                          label_cols=1, max_vocab=size)

    # Persist the vocabulary next to the data
    itos = data.vocab.itos
    vocab_file = HUMAN / 'variable_vocab_{}tok.npy'.format(size)
    np.save(vocab_file, itos)
    return itos


# Append one variable-length vocabulary per target size
for max_vocab in MAX_VOCAB:
    variable_vocab = generate_variable_vocab(df_tr, df_va, size=max_vocab)
    vocabs.append(variable_vocab)

In [None]:
# Side-by-side preview: one column per vocabulary, first 20 entries of each
pd.DataFrame(vocabs).transpose().head(20)

---

# Experimental Setup
We propose to explore model performance across different fixed and variable token parameters.
For fixed length tokenisation, the parameters of interest are the number of base pairs per token, also known as `ngram`.
Additionally, the number of base pairs between the start of one token and the start of the next is known as `stride`. As an example, take the sequence of base pairs:
$$\text{TCTGGCGACAACCAGGGA}$$

Using fixed length tokenisation of size **_3_** and stride **_3_** (i.e. non-overlapping tokens), we have the following outputs:
$$\text{[TCT],[GGC],[GAC],[AAC],[CAG],[GGA]}$$

For parameters: `{size:3, stride:1}`
$$\text{[TCT],[CTG],[TGG],[GGC],[GCG],[CGA],[GAC],[ACA],[CAA],[AAC],[ACC],[CCA],[CAG],[AGG],[GGG],[GGA]}$$

For parameters: `{size:5, stride:3}`
$$\text{[TCTGG],[GGCGA],[GACAA],[AACCA],[CAGGG]}$$

For parameters: `{size:7, stride:5}`
$$\text{[TCTGGCG],[CGACAAC],[ACCAGGG]}$$


## Incorporating colour

In [None]:
from IPython.display import HTML as html_print
from pylab import *

def printc(s, color='white'):
    """Display `s` in the notebook as an HTML text element styled with `color`."""
    markup = "<text style=color:{}>{}</text>".format(color, s)
    display(html_print(markup))

def colour_gradient(n):
    """Preview an `n`-level viridis colour map by printing 'test' once per level."""
    cmap = cm.get_cmap('viridis', n)    # PiYG

    for level in range(cmap.N):
        # The colour map returns rgba; only the rgb part is converted to hex
        red, green, blue = cmap(level)[:3]
        hex_code = matplotlib.colors.rgb2hex((red, green, blue))
        printc('test', str(hex_code))
        
def construct_html(wordlist, newline=50):
    """Display `(text, colour)` pairs as monospace coloured HTML.

    Args:
        wordlist: Iterable of `(text, colour)` pairs.
        newline: Number of pairs shown per displayed line; after every
            `newline` pairs the accumulated markup is displayed and a new
            line is started. Whatever remains is displayed at the end.
    """
    template = "<text style=font-family:monospace;color:{1}>{0}</text>"
    pieces = []
    for position, pair in enumerate(wordlist):
        if position and position % newline == 0:
            # Line is full: flush it and start collecting the next one
            display(html_print("".join(pieces)))
            pieces = []
        pieces.append(template.format(*pair))
    display(html_print("".join(pieces)))

def hexcols(n=100):
    """Return the hex colour code of every level of an `n`-level viridis colour map."""
    cmap = cm.get_cmap('viridis', n)
    codes = []
    for level in range(cmap.N):
        rgb = cmap(level)[:3]  # drop the alpha channel
        codes.append(matplotlib.colors.rgb2hex(rgb))
    return codes

# Demo: the first 1600 characters from the first 10 rows, one colour per character position
i = 10
test = ''.join(''.join(row) for row in df.head(i).values[:, 0])
test = test[:1600]
construct_html(list(zip(test, hexcols(len(test)))), newline=80)

In [None]:
# tokenise data
def tok_data(df_train, df_val, tok='Fixed', params=None):
    """Tokenise train and val data with provided tokenisation technique and parameters.

    Args:
        df_train: Training dataframe (text in column 0, label in column 1).
        df_val: Validation dataframe with the same layout.
        tok: 'Fixed' for fixed-length tokens or 'Variable' for SentencePiece.
        params: For 'Fixed', an `(ngram, stride)` pair; for 'Variable', the
            vocabulary size.

    Returns:
        The `GenomicTextLMDataBunch` built from the two dataframes.

    Raises:
        ValueError: If `tok` is neither 'Fixed' nor 'Variable'.
    """
    if tok == 'Fixed':
        ngram, stride = params
        tokenizer = Tokenizer(partial(GenomicTokenizer, ngram=ngram, stride=stride), n_cpus=40,
                              pre_rules=[], post_rules=[], special_cases=[])
        return GenomicTextLMDataBunch.from_df(HUMAN, df_train, df_val, bs=BATCH_SIZE,
                                              tokenizer=tokenizer, chunksize=NROWS_TRAIN,
                                              text_cols=0, label_cols=1,
                                              max_vocab=((4**ngram)+1))

    if tok == 'Variable':
        size = params
        sp_proc = SPProcessor(char_coverage=1, vocab_sz=size, n_cpus=40,
                              pre_rules=[], post_rules=[])
        # Use the dataframes passed in by the caller, not the module-level
        # df_tr / df_va, so the function honours its own arguments.
        # NOTE(review): this branch uses bs=NROWS_TRAIN//10 while the Fixed
        # branch uses BATCH_SIZE; left unchanged, confirm which is intended.
        return GenomicTextLMDataBunch.from_df(HUMAN, df_train, df_val, bs=NROWS_TRAIN//10,
                                              processor=sp_proc, chunksize=NROWS_TRAIN,
                                              text_cols=0, label_cols=1, max_vocab=size)

    # Fail loudly instead of hitting an unbound local at the return
    raise ValueError("tok must be 'Fixed' or 'Variable', got {!r}".format(tok))


test_sp = tok_data(df_tr, df_va, tok='Variable', params=512)

In [None]:
# Concatenate the string form of every item in the databunch's `x`
tok_str = ''.join(str(test_sp.x[i]) for i in range(len(test_sp.x)))
# Keep only the space-separated pieces made up solely of the letters A, C, T and G
tokens  = [tok for tok in tok_str.split(' ') if set(tok).issubset('ACTG')]

In [None]:
def plot_tok_dist(tokens, sample=50):
    """Plot three views of a token list: counts, a random sample of counts, and lengths.

    Args:
        tokens: List of token strings.
        sample: Number of distinct tokens to draw at random for the middle
            panel. It is capped at the number of distinct tokens available,
            so small vocabularies no longer make the sampling step raise.
    """
    fig, axs = plt.subplots(nrows=3, figsize=(12,12))

    # Count once and reuse; tail(-1) drops the first (most frequent) entry
    counts = pd.Series(tokens).value_counts().tail(-1)

    counts.plot(ax=axs[0], cmap='viridis')
    axs[0].set_title('Token distribution.')
    axs[0].set_ylabel('Count')

    # Never ask for more rows than exist
    n_sample = min(sample, len(counts))
    counts.sample(n_sample).sort_values(ascending=False).plot.bar(ax=axs[1], cmap='viridis')
    axs[1].set_title('MC Sample of Token Distribution, sample size {}.'.format(n_sample))
    axs[1].set_ylabel('Count')

    token_lengths = pd.Series([len(token) for token in tokens])
    token_lengths.value_counts().sort_index().plot.bar(ax=axs[2], cmap='viridis')
    axs[2].set_title('Token Length Distribution')
    axs[2].set_ylabel('Count')

    plt.tight_layout()


plot_tok_dist(tokens)

## Colouring Bases

In [None]:
# pd.Series(tokens).value_counts().tail(-1)
def vis_tokens(toks=tokens[-1000:], by='length'):
    """Display tokens as coloured HTML, with the colour chosen per token.

    Args:
        toks: Tokens to display. The default is the last 1000 entries of the
            module-level `tokens`, captured when this function is defined.
        by: 'length' colours each token by its number of characters;
            'likelihood' colours it by how often it occurs in the
            module-level `tokens`.

    Raises:
        ValueError: If `by` is neither 'length' nor 'likelihood'.
    """
    if by == 'length':
        ys = [len(token) for token in toks]

    elif by == 'likelihood':
        counts = pd.Series(tokens).value_counts()
        # NOTE(review): the token 'C' is pinned to 2000 instead of its real
        # count, presumably to stop one very frequent token from dominating
        # the colour scale; confirm.
        ys = [counts[str(token)] if token != 'C' else 2000 for token in toks]

    else:
        # Previously an unknown mode fell through to an unbound `ys`
        raise ValueError("by must be 'length' or 'likelihood', got {!r}".format(by))

    if not ys:
        # Nothing to colour; avoids calling max() on an empty list
        return

    ymax = max(ys)
    cmap = cm.get_cmap('viridis', ymax)

    hexs = [matplotlib.colors.rgb2hex(cmap(y)[:3]) for y in ys]
    construct_html([x for x in zip(toks,hexs)], newline=50)

### Colour by token length

In [None]:
# Show the default token slice, coloured by each token's character length
vis_tokens(by='length')

### Colour by token likelihood

In [None]:
# Show the default token slice, coloured by each token's count in `tokens`
vis_tokens(by='likelihood')

---

# 3. Experiments

## 3.1 Experiment 1: Fixed Length Data

In [None]:
# Language-model hyperparameters shared by the experiments
MODEL_CONFIG = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

# Dropout multiplier used alongside MODEL_CONFIG
DROP_MULT = 0.3

Generate data

In [None]:
experiments = []

# Fixed length: one experiment per (ngram, stride) pair
for ngram_stride in NGRAM_STRIDE:
    experiment = {
        'title': 'fixed_{}_{}'.format(*ngram_stride),
        # 'vocab': generate_fixed_vocab(df_tr, df_va, *ngram_stride),
        'xdata': tok_data(df_tr, df_va, 'Fixed', ngram_stride),
    }
    experiments.append(experiment)

# Variable length: one experiment per vocabulary size
for max_vocab in MAX_VOCAB:
    experiment = {
        'title': 'variable_{}'.format(max_vocab),
        # 'vocab': generate_variable_vocab(df_tr, df_va, max_vocab),
        'xdata': tok_data(df_tr, df_va, 'Variable', max_vocab),
    }
    experiments.append(experiment)

Save experiments

In [None]:
import pickle as pkl

# The `with` block closes the file on exit, so no explicit close() is needed
with open('experiments.pkl', 'wb') as out:
    pkl.dump(experiments, out)

Load experiments

In [1]:
import sys
# Raise the interpreter's recursion limit to 30000.
# NOTE(review): presumably needed so the pickled experiments below can be
# loaded without hitting the recursion limit; confirm.
sys.setrecursionlimit(30000)

In [None]:
import pickle as pkl

# Only load pickles written by this notebook: unpickling can execute arbitrary code.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('experiments.pkl', 'rb') as infile:
    experiments = pkl.load(infile)

In [None]:
def train_model(experiment, n_cycles=1, lr_find=False):
    """Train a language model for one experiment and save it under the experiment's title.

    Args:
        experiment: Mapping with 'xdata' (the databunch handed to
            `get_model_LM`) and 'title' (name used for the saved model and
            encoder).
        n_cycles: First argument passed to `fit_one_cycle`.
        lr_find: When true, run the learning-rate finder and plot its
            recorder before training.
    """
    model_config = {
        'emb_sz': 400,
        'n_hid': 1150,
        'n_layers': 3,
        'pad_token': 0,
        'qrnn': False,
        'output_p': 0.25,
        'hidden_p': 0.1,
        'input_p': 0.2,
        'embed_p': 0.02,
        'weight_p': 0.15,
        'tie_weights': True,
        'out_bias': True,
    }
    dropout_multiplier = 0.3

    databunch = experiment['xdata']
    learner = get_model_LM(databunch, dropout_multiplier, model_config)
    learner = learner.to_fp16(dynamic=True)  # convert model weights to 16-bit float

    # Log training metrics to CSV, appending to any existing log
    from fastai.callbacks.csv_logger import CSVLogger
    learner.callback_fns.append(partial(CSVLogger, append=True))

    if lr_find:
        learner.lr_find()
        learner.recorder.plot()

    learner.fit_one_cycle(n_cycles, 2e-2, moms=(0.8, 0.7))

    run_name = experiment['title']
    learner.save(run_name)
    learner.save_encoder(run_name)

    # Drop the local references, then release cached CUDA memory
    del learner, databunch
    torch.cuda.empty_cache()


for experiment in experiments:
    print(experiment['title'])
    train_model(experiment, n_cycles=5)

In [None]:
# Release unused cached GPU memory held by PyTorch's allocator
torch.cuda.empty_cache()

In [None]:
# Release unused cached GPU memory held by PyTorch's allocator
torch.cuda.empty_cache()

---


In [None]:
def freememory():
    """
    Run garbage collection to free up memory.
    """
    import gc
    gc.collect()


# Collect garbage first so that memory belonging to now-unreachable objects
# goes back to PyTorch's cache, then release that cache. In the opposite
# order, anything freed by the collector would stay cached.
freememory()
torch.cuda.empty_cache()

In [None]:
%load_ext tensorboard

In [None]:
import tensorboardX
from fastai.callbacks.tensorboard import LearnerTensorboardWriter
from pathlib import Path

# TensorBoard logs for this project are written under ./logs/<project_id>
project_id = 'exp1'
tboard_path = Path('./logs/' + project_id)
# Register a TensorBoard writer callback on the learner.
# NOTE(review): `learn` is only assigned in a later cell of the visible
# notebook; confirm this cell is meant to run after it.
learn.callback_fns.append(partial(LearnerTensorboardWriter, 
                                    base_dir=tboard_path, 
                                    name='run1'))

---

In [None]:
# NOTE(review): `path`, `df_val` and `tok` are not assigned at module level
# anywhere in the visible notebook, and `model_vocab` is only assigned in a
# later cell; confirm the intended execution order and definitions.
# NOTE(review): `df` is read with a chunk size of NROWS_TRAIN+NROWS_VAL rows,
# so the slice 20000:40000 may be empty; confirm.
data = GenomicTextLMDataBunch.from_df(
    path, df[20000:40000], df_val,
    bs=3000, tokenizer=tok, vocab=model_vocab, max_vocab=80000,
    chunksize=10000, text_cols=0, label_cols=1,
)

# Language-model hyperparameters
config = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}

drop_mult = 0.3

# Build the learner, switch it to 16-bit floats and train for two cycles
learn = get_model_LM(data, drop_mult, config)
learn = learn.to_fp16(dynamic=True)

learn.fit_one_cycle(2, 5e-3, moms=(0.8, 0.7))

In [None]:
# Load a previously saved vocabulary array and wrap it in a GenomicVocab.
# NOTE(review): the vocab cells above save to HUMAN/'fixed_vocab_{ngram}m{stride}s.npy',
# not 'human_vocab_3m1s.npy'; confirm this file exists under `path`.
voc = np.load(path/'human_vocab_3m1s.npy')
model_vocab = GenomicVocab(voc)

In [None]:
count = 0
lr = 5e-3
# Train for one cycle on each remaining chunk of the CSV.
# NOTE: the loop variable rebinds the module-level `df` to each chunk in turn.
for df in df_iter:
    data = GenomicTextLMDataBunch.from_df(path, df, df_val, bs=800, tokenizer=tok, vocab=model_vocab, max_vocab=80000,
                                  chunksize=20000, text_cols=0, label_cols=1)
    learn.data = data
    # Divide the base learning rate by 1.5 for every chunk already trained on
    lr_iter = lr/1.5**count
    print(f'Learning Rate: {lr_iter}')
    # Train with the decayed rate that was just reported; previously the
    # undecayed `lr` was passed, so the decay was printed but never applied.
    learn.fit_one_cycle(1, lr_iter, moms=(0.8,0.7))
    count += 1

In [None]:
# Save the full learner and, separately, its encoder
learn.save('human_3m1s2')
learn.save_encoder('human_3m1s_enc2')

In [None]:
# Load the learner saved as 'human_3m1s2' (trailing ';' suppresses the cell output)
learn.load('human_3m1s2');

In [None]:
# Convert the learner back to 32-bit floats
learn = learn.to_fp32();

In [None]:
# Save the learner under a separate name after the fp32 conversion
learn.save('human_3m1s2_fp32')