In [1]:
from torchtext import *
from fastai import *
import torch
from fastai.text import *
from torchtext.datasets import Multi30k, LanguageModelingDataset
from models import *
from torchtext.data import *
from tqdm import tqdm, trange, tqdm_notebook
import os
import csv
from itertools import chain
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
class LMDataSet(data.Dataset):
    """Language-modeling dataset built from one column of a tab-separated file.

    The file is parsed with ``csv.reader`` (tab delimiter). Its first row is
    treated as a header and skipped; for every other row the second column
    (index 1) is run through ``text_field.preprocess``. All resulting tokens
    are concatenated into one long list that becomes the dataset's single
    example.
    """

    def __init__(self, path, text_field, newline_eos=True,
                 encoding='utf-8', **kwargs):
        """Create the dataset from a TSV file and a field.

        Arguments:
            path: Path to the tab-separated data file (first row = header).
            text_field: The field that will be used for text data.
            newline_eos: Whether to append an <eos> token after the text of
                every data row. Default: True.
            encoding: Text encoding used to open the file. Default: 'utf-8'.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.

        Raises:
            IndexError: if a data row has fewer than two columns.
        """
        fields = [('text', text_field)]
        text = []
        # The csv module documentation asks for files to be opened with
        # newline='' so that the reader does its own newline handling
        # (matters for line breaks embedded in quoted fields).
        with open(path, encoding=encoding, newline='') as f:
            reader = csv.reader(f, delimiter='\t')
            # Skip the header row; the default makes this a no-op on an
            # empty file instead of raising StopIteration.
            next(reader, None)
            for row in reader:
                text += text_field.preprocess(row[1])
                if newline_eos:
                    text.append(u'<eos>')

        # Use the bare `Example` name, as get_sample() does, so the
        # constructor does not depend on the module-level name `data`.
        examples = [Example.fromlist([text], fields)]
        super(LMDataSet, self).__init__(
            examples, fields, **kwargs)

    def get_sample(self, ratio=0.2) :
        """Return a new Dataset holding only the leading part of the text.

        Arguments:
            ratio: Fraction of the token list to keep, counted from the
                start (``int(len(text) * ratio)`` tokens). Default: 0.2.

        Returns:
            A ``Dataset`` (not an ``LMDataSet``) with a single example and
            the same fields as this dataset.
        """
        text = self.examples[0].text
        text_sample = text[:int(len(text) * ratio)]
        fields = list(self.fields.items())
        ex_sample = Example.fromlist([text_sample], fields)
        return Dataset([ex_sample], fields)

In [3]:
def split_train_valid(lm_dataset, ratio=0.8) :
    """Cut a single-example dataset into a leading and a trailing part.

    Arguments:
        lm_dataset: Dataset whose first example carries the token list in
            its ``text`` attribute and whose ``fields`` is a dict.
        ratio: Fraction of tokens (from the start) that goes to the first
            returned dataset. Default: 0.8.

    Returns:
        A ``(train, valid)`` tuple of ``Dataset`` objects, each holding one
        example and the same fields as ``lm_dataset``.
    """
    tokens = lm_dataset.examples[0].text
    boundary = int(len(tokens) * ratio)
    field_pairs = list(lm_dataset.fields.items())

    # Build both examples first, then wrap each one in its own Dataset.
    head_example = Example.fromlist([tokens[:boundary]], field_pairs)
    tail_example = Example.fromlist([tokens[boundary:]], field_pairs)
    train_part = Dataset([head_example], field_pairs)
    valid_part = Dataset([tail_example], field_pairs)
    return train_part, valid_part

In [4]:
# Build the language-modeling dataset from the training TSV, using a Field
# constructed with its default arguments.
lm_ds = LMDataSet('data/train_full.tsv', Field())

In [5]:
# Build the 'text' field's vocabulary from the dataset's first example
# (its whole token list), passing max_size=60000 to build_vocab.
lm_ds.fields['text'].build_vocab([lm_ds[0].text], max_size=60000)

In [6]:
# Keep only the first 1% of the tokens as a separate, much smaller dataset.
lm_ds_sample = lm_ds.get_sample(0.01)

In [7]:
# Split the full token list with the default ratio: first 80% -> train,
# remaining 20% -> valid.
# NOTE(review): the training logs further down report 109 batches per epoch,
# far fewer than the full corpus would give at bs=64 / bptt=50 — they look
# like they were produced from a split of the 1% sample; confirm which
# dataset is intended here.
train, valid = split_train_valid(lm_ds)

In [8]:
# Token counts of the train split, the valid split and the full dataset.
len(train[0].text), len(valid[0].text), len(lm_ds[0].text)

(34793900, 8698476, 43492376)

In [9]:
# Second and third positional arguments of the BPTTIterator constructors
# below (presumably batch size and back-propagation-through-time length —
# TODO confirm against the torchtext version in use).
bs = 64
bptt = 50

In [10]:
# Vocabulary attached to the 'text' field of the train split; its length is
# used further down to size the language models.
vocabbb = train.fields['text'].vocab

In [11]:
# One BPTTIterator per split, both built with the same bs and bptt values.
# NOTE(review): the validation iterator is created with the same (default)
# keyword arguments as the training one — confirm that is intended.
train_it = BPTTIterator(train, bs, bptt)
valid_it = BPTTIterator(valid, bs, bptt)

In [12]:
def load_pretrained_lm(vocab) :
    """Build an AWD_LSTM language model and load the downloaded wt103-1 weights.

    Arguments:
        vocab: Target vocabulary; ``len(vocab)`` sizes the model and
            ``vocab.itos`` is handed to ``convert_weights`` together with
            the vocabulary shipped alongside the weights.

    Returns:
        The language model after ``load_state_dict`` with the converted
        weights.

    Raises:
        IndexError: if the downloaded folder has no ``.pth`` or ``.pkl`` file.
    """
    lm = get_language_model(AWD_LSTM, len(vocab))
    model_path = untar_data('https://s3.amazonaws.com/fast-ai-modelzoo/wt103-1', data=False)
    weights_file, itos_file = [list(model_path.glob(f'*.{ext}'))[0] for ext in ['pth', 'pkl']]

    # SECURITY NOTE: unpickling can execute arbitrary code; this is only
    # acceptable because the archive comes from the hard-coded URL above.
    # The context manager closes the handle, which the previous bare
    # open() call never did.
    with open(itos_file, 'rb') as itos_handle:
        old_itos = pickle.load(itos_handle)
    old_stoi = {v:k for k,v in enumerate(old_itos)}

    # map_location keeps every tensor on the storage it is deserialised to
    # instead of moving it to the device it was saved from.
    wgts = torch.load(weights_file, map_location=lambda storage, loc: storage)
    wgts = convert_weights(wgts, old_stoi, vocab.itos)
    lm.load_state_dict(wgts)
    return lm

In [13]:
class Databunch:
    """Minimal container pairing a training loader with a validation loader."""

    def __init__(self, train_dl, valid_dl):
        """Store the two loaders unchanged."""
        self.train_dl, self.valid_dl = train_dl, valid_dl

    @property
    def train_ds(self):
        """The ``dataset`` attribute of the training loader."""
        loader = self.train_dl
        return loader.dataset

    @property
    def valid_ds(self):
        """The ``dataset`` attribute of the validation loader."""
        loader = self.valid_dl
        return loader.dataset

In [14]:
class mLearner:
    """Bundles a model, its optimizer, a loss function and the data.

    Freezing works on the optimizer's ``param_groups``: every parameter in a
    group has its ``requires_grad`` flag switched on or off.
    """

    def __init__(self, model, opt, loss_func, data):
        self.model = model
        self.opt = opt
        self.loss_func = loss_func
        self.data = data

    @staticmethod
    def _set_requires_grad(groups, flag):
        """Assign ``flag`` to ``requires_grad`` of every parameter in ``groups``."""
        for group in groups:
            for param in group['params']:
                param.requires_grad = flag

    def freeze_to(self, n):
        """Freeze groups ``[:n]`` and unfreeze groups ``[n:]``.

        ``n`` follows slice semantics (negative values count from the end)
        and must be smaller than the number of parameter groups.
        """
        groups = self.opt.param_groups
        assert n < len(groups)
        self._set_requires_grad(groups[:n], False)
        self._set_requires_grad(groups[n:], True)

    def unfreeze(self):
        """Make every parameter group trainable."""
        self.freeze_to(0)

    def freeze(self):
        """Turn off ``requires_grad`` for every parameter of every group."""
        self._set_requires_grad(self.opt.param_groups, False)

In [15]:
def get_model_param_groups(model) :
    """Split ``model``'s parameters into four optimizer parameter groups.

    The groups are, in order: the parameters of ``rnns`` layers '0', '1' and
    '2' of sub-module '0' (one group each), followed by one group holding
    the parameters of sub-module '1' and of the ``encoder`` inside
    sub-module '0'.

    Arguments:
        model: The model to read parameters from. (Previously this argument
            was ignored and the global ``lm_pretrained`` was used instead.)

    Returns:
        A list of four ``{'params': [...]}`` dicts suitable for a
        ``torch.optim`` optimizer. Parameters are materialised into lists so
        the result can be iterated more than once.
    """
    backbone = model._modules['0']
    head = model._modules['1']
    rnns = backbone._modules['rnns']

    groups = []
    for i in range(3):
        groups.append({'params': list(rnns._modules[f'{i}'].parameters())})

    # A parameter object shared by both modules (e.g. tied weights, if the
    # model uses them) is kept only once: optimizers warn about duplicate
    # entries inside a group.
    last_group, seen = [], set()
    for module in (head, backbone._modules['encoder']):
        for param in module.parameters():
            if id(param) not in seen:
                seen.add(id(param))
                last_group.append(param)
    groups.append({'params': last_group})
    return groups

In [19]:
def fit_awd_lstm(epochs, learn, cuda=True, show_info=True, grad_clip=0.1, alpha=2., beta=1., record=True, one_cycle=True,
                 cut_frac = 0.1, n_max = 0.01, ratio=32, discr=True, discr_rate=2.6):
    """Train ``learn.model`` for ``epochs`` epochs and validate after each one.

    Arguments:
        epochs: Number of passes over ``learn.data.train_dl``.
        learn: Object exposing ``model``, ``opt``, ``loss_func`` and ``data``
            (with ``train_dl`` / ``valid_dl``). The model must return a
            3-tuple ``(pred, raw_out, out)``; batches must expose ``text``
            and ``target`` tensors, which are transposed before use.
        cuda: If True, move the model and every batch to the GPU. If False,
            nothing is moved.
        show_info: Print one summary line per epoch.
        grad_clip: Max gradient norm; falsy disables clipping.
        alpha: Weight of the activation regularisation term (on ``out[-1]``).
        beta: Weight of the temporal activation regularisation term
            (on ``raw_out[-1]``).
        record: If True, return the recorded histories; otherwise return None.
        one_cycle: If True, set the learning rate of every parameter group
            from the slanted-triangular schedule before each batch.
        cut_frac: Fraction of all iterations spent increasing the rate.
        n_max: Peak learning rate of the schedule.
        ratio: Peak rate divided by the lowest rate of the schedule.
        discr: If True, the (up to) three groups below the last one get the
            last group's rate divided by ``discr_rate ** i`` (i = 0, 1, 2,
            counting backwards from the second-to-last group).
        discr_rate: Divisor base for the discriminative rates.

    Returns:
        When ``record`` is True, a dict with per-epoch lists under
        ``'train_loss'``, ``'valid_loss'``, ``'train_acc'``, ``'valid_acc'``
        (plain floats) and a per-iteration list under ``'lrs'`` (empty when
        ``one_cycle`` is False). The reported training loss includes the
        regularisation terms; the validation loss does not.
    """
    # number of batches in one epoch for training and validation data
    train_size = len(learn.data.train_dl)
    valid_size = len(learn.data.valid_dl)

    # total iterations and length of the warm-up phase of the schedule
    total_iterations = epochs * train_size
    cut = int(total_iterations * cut_frac)

    # Histories always exist, so appending never depends on `record`
    # (the old code hit a NameError on `lrs` when record=False).
    lrs, train_losses, val_losses, train_accs, valid_accs = [], [], [], [], []

    if cuda:
        learn.model.cuda()

    for epoch in range(epochs):
        # Python floats: accumulating the loss tensors themselves would keep
        # every batch's autograd history alive for the whole epoch.
        train_loss, valid_loss, train_acc, valid_acc = 0., 0., 0., 0.

        # training mode (activates dropout)
        learn.model.train()
        batches = tqdm_notebook(learn.data.train_dl, leave=False,
                                total=train_size, desc=f'Epoch {epoch} training')

        for batch_num, batch in enumerate(batches):
            groups = learn.opt.param_groups

            if one_cycle:
                iteration = (epoch * train_size) + batch_num
                assert(total_iterations >= iteration)
                new_lr = _slanted_triangular_lr(iteration, total_iterations, cut,
                                                cut_frac, n_max, ratio)
                for group in groups:
                    group['lr'] = new_lr
                lrs.append(new_lr)

            if discr:
                # Derive the lower rates from the last group's rate, which is
                # never divided. Dividing each group's own rate in place (as
                # before) compounds on every batch when one_cycle is False.
                head_lr = groups[-1]['lr']
                for i in range(min(3, len(groups) - 1)):
                    groups[-(i + 2)]['lr'] = head_lr / (discr_rate) ** i

            xb, yb = _batch_tensors(batch, cuda)
            pred, raw_out, out = learn.model(xb)
            loss = learn.loss_func(pred, yb)

            # activation regularisation
            if alpha != 0.:
                loss += alpha * out[-1].float().pow(2).mean()

            # temporal activation regularisation
            if beta != 0.:
                h = raw_out[-1]
                if len(h) > 1:
                    loss += beta * (h[:, 1:] - h[:, :-1]).float().pow(2).mean()

            train_loss += loss.item()
            train_acc += _accuracy(pred, yb)

            loss.backward()
            if grad_clip:
                torch.nn.utils.clip_grad_norm_(learn.model.parameters(), grad_clip)
            learn.opt.step()
            learn.opt.zero_grad()

        train_loss = train_loss / train_size
        train_acc = train_acc / train_size

        # eval mode so that dropout is not applied
        learn.model.eval()
        with torch.no_grad():
            batches = tqdm_notebook(learn.data.valid_dl, leave=False,
                                    total=valid_size, desc=f'Epoch {epoch} validation')
            for batch in batches:
                xb, yb = _batch_tensors(batch, cuda)
                pred = learn.model(xb)[0]
                valid_loss += learn.loss_func(pred, yb).item()
                valid_acc += _accuracy(pred, yb)

        valid_loss = valid_loss / valid_size
        valid_acc = valid_acc / valid_size

        if show_info :
            print("Epoch {:.0f} training loss : {:.3f}, train accuracy : {:.3f}, validation loss : {:.3f}, valid accuracy : {:.3f}".format(epoch, train_loss, train_acc, valid_loss, valid_acc))

        train_losses.append(train_loss)
        val_losses.append(valid_loss)
        train_accs.append(train_acc)
        valid_accs.append(valid_acc)

    if record :
        # The accuracy entries are the per-epoch histories; the old code
        # returned only the last epoch's scalars under these keys.
        return {'train_loss' : train_losses, 'valid_loss' : val_losses, 'train_acc': train_accs, 'valid_acc' : valid_accs, 'lrs' : lrs}


def _slanted_triangular_lr(iteration, total_iterations, cut, cut_frac, n_max, ratio):
    """Learning rate of the slanted-triangular schedule at ``iteration``."""
    if iteration < cut:
        p = iteration / cut
    else:
        # Without a warm-up phase (cut == 0) the original divisor is zero;
        # decay linearly over the whole run in that case.
        decay_len = cut * (1 / cut_frac - 1) if cut > 0 else total_iterations
        p = max(1 - ((iteration - cut) / decay_len), 0) if decay_len > 0 else 1.0
    return n_max * ((1 + p * (ratio - 1)) / ratio)


def _batch_tensors(batch, cuda):
    """Return the transposed ``text`` and ``target`` of ``batch``, on GPU if ``cuda``."""
    xb, yb = batch.text.t(), batch.target.t()
    if cuda:
        xb, yb = xb.cuda(), yb.cuda()
    return xb, yb


def _accuracy(pred, yb):
    """Fraction of positions where the argmax over dim 2 equals ``yb``, as a float."""
    return (torch.argmax(pred, dim=2) == yb).float().mean().item()

In [17]:
# Two models sized to the same vocabulary: `lm` comes straight from
# get_language_model, `lm_pretrained` additionally has the downloaded
# weights loaded by load_pretrained_lm.
lm = get_language_model(AWD_LSTM, len(vocabbb))
lm_pretrained = load_pretrained_lm(vocabbb)

In [18]:
lr = 0.01

# Each optimizer is built from the model it is paired with below. `opt` used
# to be built from lm_pretrained, so `learner` held an optimizer for the
# wrong model's parameters.
opt = torch.optim.Adam(get_model_param_groups(lm), lr=lr)
opt_pretrained = torch.optim.Adam(get_model_param_groups(lm_pretrained), lr=lr)

# NOTE: this rebinds the module-level name `data`. The LMDataSet class
# statement refers to `data.Dataset`, so that cell cannot be re-run after
# this one without re-running the imports first.
data = Databunch(train_it, valid_it)
loss_func = CrossEntropyFlat()

# Both learners share the same data and loss function.
learner = mLearner(lm, opt, loss_func, data)
learner_pretrained = mLearner(lm_pretrained, opt_pretrained, loss_func, data)

In [20]:
# Freeze every optimizer parameter group except the last one; only the last
# group keeps requires_grad=True.
learner_pretrained.freeze_to(-1)

In [33]:
# Train the pretrained learner for one epoch with the default settings;
# `info` receives the value returned by fit_awd_lstm.
info = fit_awd_lstm(1, learner_pretrained)

total iterations : 109, cut : 10


HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=109, style=ProgressStyle(description_w…

HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=28, style=ProgressStyle(description_…

Epoch 0 training loss : 5.930, train accuracy : 0.171, validation loss : 5.259, valid accuracy : 0.220


In [33]:
# Train the same learner object for three more epochs (it is not re-created
# in between); `info` is overwritten with the result of this run.
info = fit_awd_lstm(3, learner_pretrained)

total iterations : 327, cut : 32


HBox(children=(IntProgress(value=0, description='Epoch 0 training', max=109, style=ProgressStyle(description_w…



HBox(children=(IntProgress(value=0, description='Epoch 0 validation', max=28, style=ProgressStyle(description_…

Epoch 0 training loss : 4.484, train accuracy : 0.275, validation loss : 5.133, valid accuracy : 0.240


HBox(children=(IntProgress(value=0, description='Epoch 1 training', max=109, style=ProgressStyle(description_w…



HBox(children=(IntProgress(value=0, description='Epoch 1 validation', max=28, style=ProgressStyle(description_…

Epoch 1 training loss : 4.197, train accuracy : 0.291, validation loss : 5.131, valid accuracy : 0.247


HBox(children=(IntProgress(value=0, description='Epoch 2 training', max=109, style=ProgressStyle(description_w…



HBox(children=(IntProgress(value=0, description='Epoch 2 validation', max=28, style=ProgressStyle(description_…

Epoch 2 training loss : 3.807, train accuracy : 0.327, validation loss : 5.172, valid accuracy : 0.251


In [47]:
def plot_training(info) :
    """Plot the recorded training metrics, one subplot per metric.

    Arguments:
        info: Either the metrics of a single run (a dict keyed by
            'train_loss', 'valid_loss', 'train_acc', 'valid_acc', 'lrs'), or
            a dict mapping a model name to such a metrics dict, in which
            case every model is drawn as a labelled line on each subplot.

    Each metric value may be a sequence or a single number; entries are
    converted with ``float`` before plotting.
    """
    metrics = ['train_loss', 'valid_loss', 'train_acc', 'valid_acc', 'lrs']

    # A single run is itself a dict, so checking isinstance(info, dict) alone
    # cannot tell it apart from a {name: run} mapping; look for metric keys.
    if not isinstance(info, dict) or any(met in info for met in metrics):
        info = {'': info}

    fig, ax = plt.subplots(len(metrics), figsize=(10,10))
    for model, run in info.items():
        for i, met in enumerate(metrics):
            values = run[met]
            try:
                len(values)
            except TypeError:
                # a lone number (or 0-dim tensor) instead of a history
                values = [values]
            values = [float(v) for v in values]
            ax[i].plot(range(len(values)), values, label=model)
            # only the learning rates are recorded per iteration
            ax[i].set_xlabel('iterations' if met == 'lrs' else 'epochs')
            ax[i].set_ylabel(met)
            if model:
                ax[i].legend()
    fig.tight_layout()
    fig.show()