# Transfer Learning Using BERT
Adapted from [this tutorial](https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f) using [bert-base-cased](https://huggingface.co/bert-base-cased). Rather than predicting the masked word in a sentence, the new model predicts the masked label. This was implemented in the hopes that transfer learning would significantly improve modeling outcomes, but performance was on-par with the much more efficient fastText model. Included here only for reference.

In [1]:
#install hugging face transformers - if needed
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [2]:
#imports
import pandas as pd
import numpy as np
from torch import nn, cuda, no_grad, save, backends, manual_seed
from torch import device as dvc
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import gc
from google.colab import files

In [3]:
#for reproducible results: https://vandurajan91.medium.com/random-seeds-and-reproducible-results-in-pytorch-211620301eba
#DataLoaders may still introduce randomness: https://pytorch.org/docs/stable/notes/randomness.html
random_seed = 42
manual_seed(random_seed)
cuda.manual_seed(random_seed)
backends.cudnn.deterministic = True
backends.cudnn.benchmark = False
np.random.seed(random_seed)

In [10]:
#load in data
train = pd.read_csv("spacy_train.csv", dtype={"grade_reduced": str})
val = pd.read_csv("spacy_val.csv", dtype={"grade_reduced": str})
test = pd.read_csv("spacy_test.csv", dtype={"grade_reduced": str})

In [5]:
class ClassificationDataset(Dataset):
    """
    Creates a PyTorch compatible Dataset from the given DataFrame to be used with the BertClassifier model

    Variables:
    df: DataFrame with the x and y columns
    x_col: Str: The text feature to be modeled on
    y_col: Str: The target classification column
    bert_type: Str: Name of the pre-trained Bert model to use
    labels: Dict: Dictionary converting the labels in the y_col of the DataFrame to numeric
    """
    def __init__(self, df, x_col, y_col, tokenizer, labels):

        self.labels = [labels[label] for label in df[y_col]]
        self.texts = [tokenizer(x, 
                                padding = 'max_length', 
                                max_length = 512, 
                                truncation = True,
                                return_tensors="pt")
                       for x in df[x_col]]

    #number of "rows"              
    def __len__(self):
        return len(self.texts)

    #return one x, y pair (to the DataLoader)
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [6]:
class BertClassifier(nn.Module):
    """
    Neural Network Classifier 
    bert_type: Str: Name of the pre-trained Bert model to use
    num_labels: Int: Number of target classes
    """

    def __init__(self, bert_type, num_labels, dropout=1):

        #just a pytorch thing, keep it!
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(bert_type)
        self.dropout = nn.Dropout(dropout) # default keeps all values, likely needs to be adjusted to avoid overfitting
        self.linear = nn.Linear(768, num_labels)
        self.relu = nn.ReLU()
        self.losses = {'train' : [],
                       'val' : [],
                       'test' : []}
        self.accuracy = {'train' : [],
                         'val' : [],
                         'test' : []}

    def forward(self, input_id, mask): # mask tells us which tokens are not [PAD]
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

    def update_loss(self, loss_type, value):
        self.losses[loss_type].append(value)

    def update_acc(self, acc_type, value):
        self.accuracy[acc_type].append(value)

In [7]:
def evaluate(model, data, device, criterion=None, batch_size=2, result_type=None):
    """
    Takes in a neural net and set of data and returns the performance of the model on the data
    """
    dataloader = DataLoader(data, batch_size=batch_size)

    total_acc = 0
    total_loss = 0

    with no_grad():
        for input, label in dataloader:
            label = label.to(device)
            mask = input['attention_mask'].to(device)
            input_id = input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            total_acc += (output.argmax(dim=1) == label).sum().item()

            if criterion:
                total_loss += criterion(output, label.long()).item()

    if result_type:
        print(f"{result_type.title()} Accuracy: {round(total_acc / len(data), 3)}")
    if criterion:
        print(f"{result_type.title()} Loss: {round(total_loss / len(data), 3)}")

    if criterion:
        return total_acc, total_loss
    return total_acc


In [8]:
def train_model(model, train, val, device, batch_size, learning_rate, epochs):
    """
    Takes in a model, training data, and validation data and trains the given model
    Returns None, but changes are saved in the model itself 
    """

    #dataloader to iterate through the training dataset
    tr_dataloader = DataLoader(train, batch_size=batch_size, shuffle=True)

    #loss criterion
    criterion = nn.CrossEntropyLoss()

    #use GPU if available
    if cuda.is_available():
        criterion = criterion.cuda()

    #optimizer - automatically cuda if model has been switched to cuda
    optimizer = Adam(model.parameters(), lr=learning_rate)


    for epoch_num in range(epochs):

        #model training
        total_loss_train = 0
        total_acc_train = 0

        for train_input, train_label in tqdm(tr_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            #reset gradients and backwards propagate
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        #training epoch update
        model.update_acc('train', total_acc_train / len(train))
        model.update_loss('train', total_loss_train / len(train))

        print(f"Epoch: {epoch_num + 1}")
        print(f"Train Loss: {round(total_loss_train / len(train), 3)}")
        print(f"Train Accuracy: {round(total_acc_train / len(train), 3)}")

        #run validation + printed update
        total_acc_val, total_loss_val = evaluate(model, val, device, criterion, batch_size, result_type="Validation")
        print()

        #update val loss/acc in model
        model.update_acc('train', total_acc_train / len(train))
        model.update_loss('val', total_loss_val / len(val))

        #clear cache to hopefully avoid runtime errors
        cuda.empty_cache()
        gc.collect()
    
    # total_loss_val = 0
    # total_acc_val = 0

    # #no back propagation so no need for gradients - much faster
    # with no_grad():
    #   for val_input, val_label in tqdm(val_dataloader):
    #     val_label = val_label.to(Device)
    #     mask = val_input['attention_mask'].to(device)
    #     input_id = val_input['input_ids'].squeeze(1).to(device)

    #     output = model(input_id, mask)

    #     batch_loss = criterion(output, val_label.long())
    #     total_loss_val += batch_loss.item()

    #     acc = (output.argmax(dim=1) == val_label).sum()
    #     total_acc_val += acc.item()


In [10]:
# #create datasets
# bert_type = 'bert-base-uncased'
# x_col = 'text_combined'
# y_col = 'grade_reduced'
# labels = {x : i for i, x in enumerate(sorted(train['grade_reduced'].unique(), key=lambda x: int(x.split('.')[1])))}
# tokenizer = BertTokenizer.from_pretrained(bert_type)

# train_dataset = ClassificationDataset(train, x_col=x_col, y_col=y_col, tokenizer=tokenizer, labels=labels )
# val_dataset = ClassificationDataset(val, x_col=x_col, y_col=y_col, tokenizer=tokenizer, labels=labels )
# test_dataset = ClassificationDataset(test, x_col=x_col, y_col=y_col, tokenizer=tokenizer, labels=labels )


In [11]:
#create lemmatized datasets
bert_type = 'bert-base-uncased'
x_col = 'lemmatized_text_combined'
y_col = 'grade_reduced'
labels = {x : i for i, x in enumerate(sorted(train['grade_reduced'].unique(), key=lambda x: int(x.split('.')[1])))}
tokenizer = BertTokenizer.from_pretrained(bert_type)

train_lemma_dataset = ClassificationDataset(train, x_col=x_col, y_col=y_col, tokenizer=tokenizer, labels=labels )
val_lemma_dataset = ClassificationDataset(val, x_col=x_col, y_col=y_col, tokenizer=tokenizer, labels=labels )
test_lemma_dataset = ClassificationDataset(test, x_col=x_col, y_col=y_col, tokenizer=tokenizer, labels=labels )

In [12]:
#create model
model = BertClassifier(bert_type=bert_type, num_labels=len(labels), dropout=.5)

#use GPU if available
if cuda.is_available():
    device = dvc("cuda")
    model = model.cuda()
else:
    device = dvc("cpu")

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
cuda.empty_cache()
gc.collect()

279

In [14]:
#train model and print progress
train_model(model, train_lemma_dataset, val_lemma_dataset, device, batch_size=16, epochs=5, learning_rate=1e-6)

100%|██████████| 5738/5738 [1:15:44<00:00,  1.26it/s]


Epoch: 1
Train Loss: 0.13
Train Accuracy: 0.273
Validation Accuracy: 0.301
Validation Loss: 0.118



100%|██████████| 5738/5738 [1:15:48<00:00,  1.26it/s]


Epoch: 2
Train Loss: 0.114
Train Accuracy: 0.313
Validation Accuracy: 0.315
Validation Loss: 0.112



100%|██████████| 5738/5738 [1:15:46<00:00,  1.26it/s]


Epoch: 3
Train Loss: 0.11
Train Accuracy: 0.328
Validation Accuracy: 0.323
Validation Loss: 0.11



100%|██████████| 5738/5738 [1:15:45<00:00,  1.26it/s]


Epoch: 4
Train Loss: 0.107
Train Accuracy: 0.339
Validation Accuracy: 0.324
Validation Loss: 0.109



100%|██████████| 5738/5738 [1:15:46<00:00,  1.26it/s]


Epoch: 5
Train Loss: 0.104
Train Accuracy: 0.351
Validation Accuracy: 0.332
Validation Loss: 0.108



In [15]:
test_acc = evaluate(model, test_lemma_dataset, device, batch_size=2, result_type="Test")
model.update_acc('test', test_acc)

Test Accuracy: 0.337


In [16]:
#https://www.honchosearch.com/blog/seo/how-to-download-files-from-google-colab/#:~:text=To%20download%20a%20file%20for,then%20this%20won't%20work.&text=Once%20executed%2C%20this%20will%20download%20the%20file%20directly%20to%20your%20downloads.
save(model, 'model.pt')
files.download('model.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>