# Fine-Tuning BERT for CoLA

In [None]:
# First we install hub and the Hugging Face transformers package in this
# runtime environment.

!pip install hub
!pip install transformers

**Note**: Restart the Colab runtime after installation, since a few packages have been updated; otherwise you may get an error (<font color="red">FileNotFoundError</font>).

In [None]:
# First we import hub.
import hub

# Fetch the CoLA dataset through hub under the tag "activeloop/CoLA" and
# print its schema to see how the data is structured. The training loop
# later in this notebook reads the "sentence" and "labels" entries.
ds = hub.Dataset("activeloop/CoLA")
print(ds.schema)

In [None]:
# Now that we have the dataset, let us fetch the BERT model.
from transformers import BertForSequenceClassification, BertConfig

# Load the pretrained 'bert-base-uncased' checkpoint into a BERT model
# with a sequence-classification head.
# NOTE(review): no num_labels is passed, so the library default is used —
# confirm it matches the CoLA label set.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Keep the model's configuration object handy for later inspection.
configuration = model.config

In [None]:
# Now we have both our data and our model. It is time to set up the
# input pipeline, for which we need the tokenizer that matches the model
# checkpoint ('bert-base-uncased').
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Let us look at how an input sequence is generated for BERT, using the
# simple sentence "Hello World!": encode it to token ids, then decode the
# ids back to text.

example = "Hello World!"
encoded_input = tokenizer(example)
print("Encoded input tokens: ", encoded_input["input_ids"])
decoded_input = tokenizer.decode(encoded_input["input_ids"])
print("Decoded input sequence: ", decoded_input)

# One can observe how the tokenizer converts words into ids by mapping them
# to the vocab file, and adds special sentinel tokens like [CLS] and [SEP].

In [None]:
import torch
from transformers import AdamW
from tqdm import tqdm

def train_model(dataset, model, batch_size=16, epochs=1):
    """Fine-tune ``model`` on ``dataset`` and print the mean loss per epoch.

    The dataset is walked in order in consecutive slices of ``batch_size``
    samples; the last slice may be shorter. Sentences are tokenized with
    the notebook-level ``tokenizer`` and the model is optimized with AdamW
    at a learning rate of 5e-5.

    Args:
        dataset: Dataset to train on. It must expose ``shape[0]`` (number
            of samples), ``dataset["sentence"][i].compute()`` (one
            sentence) and ``dataset["labels", start:stop].compute()``
            (the labels of a slice).
        model: Model called as
            ``model(input_ids, attention_mask=..., labels=...)`` whose
            first output is the training loss. It is moved to the GPU
            when one is available and switched to training mode.
        batch_size: Number of samples per optimization step.
        epochs: Number of passes over the dataset.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # we set up the device so that we can use GPU hardware acceleration.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()

    # now we define our optimizer
    optim = AdamW(model.parameters(), lr=5e-5)

    # Use the dataset that was passed in rather than a notebook global.
    num_samples = dataset.shape[0]
    # Ceiling division: a trailing partial batch is included, but no empty
    # batch is produced when num_samples is a multiple of batch_size.
    num_batches = (num_samples + batch_size - 1) // batch_size

    for epoch in range(epochs):
        print("Epoch: ", epoch+1, flush=True)
        total_loss = 0.0    # running sum of the batch losses in this epoch

        for batch in tqdm(range(num_batches)):
            # first we fetch a batch of raw data
            start = batch * batch_size
            stop = min(start + batch_size, num_samples)
            batch_sentences = [
                dataset["sentence"][i].compute() for i in range(start, stop)
            ]
            batch_labels = dataset["labels", start:stop].compute()

            # now we need to preprocess the raw data
            batch_sentences = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
            # NOTE(review): assumes the stored labels already have a dtype
            # the model's loss accepts — confirm against the schema.
            batch_labels = torch.as_tensor(batch_labels)

            # the following steps train the model
            optim.zero_grad()
            input_ids = batch_sentences['input_ids'].to(device)
            attention_mask = batch_sentences['attention_mask'].to(device)
            batch_labels = batch_labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs[0]
            loss.backward()
            optim.step()

            # .item() yields a plain float, so the running total does not
            # keep every batch's autograd graph alive for the whole epoch.
            total_loss += loss.item()

        # Report the mean over the batches, not the raw sum.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        print("Average Loss:", avg_loss)

In [None]:
# Fine-tune the model on the CoLA dataset loaded above: 10 epochs with
# batches of 32 sentences.
train_model(ds, model, batch_size=32, epochs=10)