In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai import *
from fastai.text import *
from scipy.spatial.distance import cosine as dist

## Load and Preprocess Data

In [None]:
import numpy as np
import json

class prepareData:
    """Load an acronym dataset and build (context window, label) training pairs.

    The JSON file is expected to contain a list of records, each with the keys
    ``'acronym'`` (integer position of the acronym inside ``'tokens'``),
    ``'tokens'`` (list of string tokens) and ``'expansion'`` (the label).

    After :meth:`preprocessData` has run, ``X`` holds one context-window string
    per record and ``Y`` holds the matching integer label id.
    """

    def __init__(self, filename):
        """Read ``filename`` and start with empty ``X`` / ``Y`` lists."""
        self.data = self.loadData(filename)
        self.X = []
        self.Y = []

    def loadData(self, filename):
        """Return the parsed content of the JSON file ``filename``."""
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(filename, encoding='utf-8') as f:
            return json.load(f)

    def getLength(self):
        """Return the number of context windows currently stored in ``X``."""
        return len(self.X)

    def preprocessData(self, radius=5):
        """Fill ``X`` with context windows and ``Y`` with integer label ids.

        For every record the window covers tokens ``[acronym - radius,
        acronym + radius)``, clipped to the token list. The upper bound is
        exclusive, so with the default ``radius=5`` the window contains up to
        5 tokens before the acronym, the acronym itself and up to 4 after it.
        Tokens are joined with a single space and a trailing space is kept.

        Label ids are assigned in order of first appearance of each expansion,
        so the mapping is the same on every run (iterating a ``set`` of strings
        is not stable across interpreter restarts).

        Calling this method again rebuilds ``X`` and ``Y`` from scratch instead
        of appending duplicates, which keeps the notebook cell re-runnable.
        """
        windows = []
        labels = []
        label_ids = {}  # expansion -> integer id, in first-seen order

        for record in self.data:
            position = record['acronym']
            tokens = record['tokens']
            low = max(position - radius, 0)
            up = min(position + radius, len(tokens))
            windows.append(''.join(tokens[j] + ' ' for j in range(low, up)))
            # setdefault evaluates len() before inserting, so a new expansion
            # receives the next free id and a known one keeps its existing id.
            labels.append(label_ids.setdefault(record['expansion'], len(label_ids)))

        self.X = windows
        self.Y = labels


In [None]:
import pandas as pd
# Build the (context window, label) pairs and collect them in a DataFrame.
# `prepareData` is the class defined in this notebook, so it is called directly
# (no `dataClass` module is imported anywhere). preprocessData() has to run
# before X and Y are read, otherwise both are still the empty lists set in __init__.
data = prepareData('dataset.json')
data.preprocessData()
df = pd.DataFrame({'sentence': data.X, 'label': data.Y})
df.head()

## Train language model

In [None]:
from fastai.text import * 
# Language-model data built from the 'sentence' column: 10% of the rows are
# held out for validation and the items are labelled via label_for_lm().
# The split is seeded so that a Restart & Run All reproduces the same
# train/validation partition.
data_lm = (TextList.from_df(df, cols='sentence')
                .split_by_rand_pct(0.1, seed=42)
                .label_for_lm()
                .databunch(bs=48))
data_lm.show_batch()

In [None]:
# Create the AWD_LSTM language-model learner on data_lm, convert it with
# to_fp16(), then run the learning-rate finder.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.5)
learn = learn.to_fp16()
learn.lr_find()

In [None]:
# Plot what the recorder captured during the lr_find() run above.
# NOTE(review): skip_end=15 presumably drops the last 15 recorded points from the plot — confirm against the fastai docs.
learn.recorder.plot(skip_end=15)

In [None]:
# Batch size, and a base learning rate of 1e-2 scaled by bs/20
bs = 48
lr = 1e-02 * (bs / 20)

In [None]:
# fit the language model (`learn` is the language-model learner, not the classifier)
# with fit_one_cycle: cycle length 1, maximum learning rate `lr`
learn.fit_one_cycle(1, lr, moms=(0.8,0.7))

In [None]:
# unfreeze all layers and then train the language model some more:
# cycle length 5 at a tenth of the learning rate used in the previous fit
learn.unfreeze()
learn.fit_one_cycle(5, lr/10, moms=(0.8,0.7))

In [None]:
# save the fine-tuned language model and, separately, its encoder
# ('fine_tuned_enc_10' is the name the classifier later passes to load_encoder).
# NOTE(review): the vocab is not written to disk here; the classification
# phase reuses it in memory through data_lm.vocab.
learn.save('fine_tuned_10')
learn.save_encoder('fine_tuned_enc_10')

## Classification Phase

In [None]:
# define dataset for classification: same sentences, labelled with the 'label'
# column, and built with the language model's vocab (data_lm.vocab).
# The 10% validation split is seeded so that a Restart & Run All reproduces
# the same partition.
data_clas = (TextList.from_df(df, cols=['sentence'], vocab=data_lm.vocab)
             .split_by_rand_pct(0.1, seed=42)
             .label_from_df(cols='label')
             .databunch(bs=48))

data_clas.show_batch()

In [None]:
# Initialise the AWD_LSTM text classifier (tracking accuracy), convert it with
# to_fp16(), load the encoder saved by the language-model phase, then freeze.
learn_c = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5, metrics=[accuracy])
learn_c = learn_c.to_fp16()
learn_c.load_encoder('fine_tuned_enc_10')
learn_c.freeze()

In [None]:
# Run the learning-rate finder on the frozen classifier and plot what the recorder captured.
# NOTE(review): skip_end=15 presumably drops the last 15 recorded points from the plot — confirm.
learn_c.lr_find()
learn_c.recorder.plot(skip_end=15)

In [None]:
# Learning rate for the classifier; this rebinds the `lr` used for the language model above.
# NOTE(review): presumably chosen from the lr_find plot — confirm.
lr=1e-2

In [None]:
# Train the classifier (frozen by the earlier learn_c.freeze() call) with fit_one_cycle: cycle length 3 at `lr`
learn_c.fit_one_cycle(3,lr, moms=(0.8,0.7))

In [None]:
# plot the recorded losses, then the learning-rate schedule together with momentum.
# Two separate statements: writing them as one tuple expression makes the
# notebook display a meaningless tuple of the two return values as cell output.
learn_c.recorder.plot_losses()
learn_c.recorder.plot_lr(show_moms=True)

In [None]:
# unfreeze and train more: cycle length 2, with the learning rate given as a
# slice from lr/10/(2.6**4) up to lr/10.
# NOTE(review): presumably the slice spreads different rates across layer groups — confirm against the fastai docs.
learn_c.unfreeze()
learn_c.fit_one_cycle(2, slice(lr/10/(2.6**4),lr/10), moms=(0.8,0.7))