<a href="https://colab.research.google.com/github/askaranam/TransferLearning_NLP/blob/master/ULMFiT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A jupyter notebook created in Google Colab for experimenting with transfer learning for text classification using 
AWD_LSTM and Transformer architectures using fastai library in python


In [0]:
# Installing Pytorch and fast ai
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
# installing FastAi (latest version)
!pip install fastai

In [0]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [0]:
# importing dataset from sklearn datasets
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
df = pd.DataFrame({'label':dataset.target, 'text':dataset.data})
df.shape

In [0]:
# Using data from two classes (specifically, classes: 1, 10) for the purpose of binary classification
df = df[df['label'].isin([1,10])]
df = df.reset_index(drop = True)
df['label'].value_counts()

In [0]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

In [0]:
# removing unwanted characters
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ")

# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

df['text'] = detokenized_doc

In [0]:
from sklearn.model_selection import train_test_split

# split data into training and validation set
df_trn, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)
df_trn.shape, df_val.shape

In [0]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [0]:
# Pretrained weights for each architecture, it could be AWD_LSTM or Transformer
learn = language_model_learner(data_lm, AWD_LSTM , drop_mult=0.7)

In [0]:
# Using One Cycle Policy for regularization 
# from https://arxiv.org/abs/1803.09820
learn.fit_one_cycle(2, 1e-12)

In [0]:
# save the encoder
learn.save_encoder('ft_enc')

In [0]:
# now use Classification model to build the classifier with fine tuned encoder 
learn = text_classifier_learner(data_clas, drop_mult=0.7, arch = AWD_LSTM)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)


In [15]:
# get predictions
preds, targets = learn.get_preds()

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,215,23
1,19,217
