# Finetuning the BERT-base-german-cased model

About the model: https://huggingface.co/bert-base-german-cased

In this example, we will fine-tune a transformer model for a multiclass text classification problem, e.g., classifying sentences/news headlines into provided categories.

Requirements:

    Python 3.6 and above
    PyTorch, Transformers and general-purpose Python ML libraries (e.g. pandas)
    A GPU-enabled setup (recommended; the code falls back to the CPU when no GPU is available)

### Setup

In [None]:
# install modules

!pip install torch 
!pip install transformers

In [1]:
# import libraries 

import json
import pandas as pd
import torch
import transformers

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU.
# (check options if using Google Colab or Cloud!)

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Load preprocessed corpus

In [None]:
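# Load the preprocessed dataset into a pandas DataFrame named `df` here.
# The cells below expect a `TITLE` column (the text to classify) and an
# `ENCODE_CAT` column (the category encoded as an integer class id),
# so remember to encode the categories appropriately!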

### Prepare the dataset & dataloader

In [None]:
# Key settings used by the cells below
MAX_LEN = 512  # passed to the dataset as the maximum tokenized length
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 4, 2
EPOCHS = 1  # number of passes of the training loop
LEARNING_RATE = 1e-5  # learning rate handed to the optimizer

In [None]:
# Load the pretrained tokenizer for the "bert-base-german-cased" checkpoint
# via the transformers library; it is handed to the dataset class below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

In [None]:
# create "triage" dataset class (i.e. data ETL pipeline)
# accepts the dataframe and generates tokenized output needed by the model

class Triage(Dataset):
    """Dataset that tokenizes the ``TITLE`` column of a dataframe on access.

    Args:
        dataframe: DataFrame providing a ``TITLE`` column (text) and an
            ``ENCODE_CAT`` column (integer class id) for every row.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``max_length``, ``padding`` and ``truncation`` and returns
            a mapping with ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded title and target of the row at position ``index``.

        Returns:
            dict with long tensors ``ids`` (token ids), ``mask`` (attention
            mask) and ``targets`` (class id).
        """
        # Positional access: works even when the dataframe index was not
        # reset after sampling/splitting.
        title = str(self.data.TITLE.iloc[index])
        # Collapse runs of whitespace into single spaces.
        title = " ".join(title.split())
        # `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True`; calling the tokenizer directly replaces
        # the older `encode_plus` entry point.
        inputs = self.tokenizer(
            title,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        target = int(self.data.ENCODE_CAT.iloc[index])

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long),
        }

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return self.len

In [None]:
# Creating the dataset size + dataloader for the neural network

# The dataloader loads data into the model in a controlled manner. 
# This is needed because all the data from the dataset cannot be 
# loaded into memory at once, hence the amount of data loaded 
# and then passed to the neural network needs to be controlled.

# 80/20 split; the fixed random_state keeps the split reproducible
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

# The training and validation functions iterate over these loaders.
# Training batches are shuffled; the evaluation order is kept fixed.
training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
testing_loader = DataLoader(testing_set, batch_size=VALID_BATCH_SIZE, shuffle=False)

### Loading the German BERT model 

In [None]:
# load the required model from the transformers library

# A sequence-classification head is needed for this task: the masked-LM head
# produces vocabulary logits for every token, which cannot be scored against
# a single category label per example.
from transformers import AutoModelForSequenceClassification

# assumes ENCODE_CAT holds integer class ids starting at 0 -- TODO confirm
# against the preprocessing step
num_labels = int(df.ENCODE_CAT.max()) + 1

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-german-cased",
    num_labels=num_labels,
)
model.to(device)

In [None]:
# Adam optimizer over all model parameters, plus the cross-entropy loss used
# to compare model outputs with the target categories

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.CrossEntropyLoss()

### Fine-tuning the model

Here we define a training function that trains the model on the training dataset created above. It is called once per epoch; an epoch is one complete pass of the training data through the network, and `EPOCHS` sets how many such passes are run.

The following events happen in this function to fine-tune the neural network:

    The dataloader passes data to the model based on the batch size.
    Subsequent output from the model and the actual category are compared to calculate the loss.
    Loss value is used to optimize the weights of the neurons in the network.
    After every 5000 steps the loss value is printed in the console.


In [None]:
# Function to count the correct predictions in a batch (used to calculate the accuracy of the model)

def calcuate_accu(big_idx, targets):
    """Return how many entries of ``big_idx`` equal the matching ``targets`` entry."""
    hits = big_idx == targets
    return hits.sum().item()

In [None]:
# Defining the training function, run on the 80% training split, for fine-tuning the German BERT model

def train(epoch):
    """Run one pass over ``training_loader`` and print loss/accuracy.

    Uses the notebook-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: Index of the current epoch; only used in the printed summary.

    Raises:
        ValueError: If ``training_loader`` yields no batches.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask)
        # Hugging Face models return an output object (or a tuple) instead of
        # a bare tensor; the loss needs the logits tensor itself.
        if isinstance(outputs, (tuple, list)):
            logits = outputs[0]
        else:
            logits = getattr(outputs, 'logits', outputs)

        loss = loss_function(logits, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(logits.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        # Running averages, reported on the first step and every 5000 after it
        if step % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if nb_tr_examples == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("training_loader did not yield any batches")

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

In [None]:
# Call the training function once per epoch, EPOCHS times in total.
for epoch in range(EPOCHS):
    train(epoch)

### Validating the model

During the validation stage we pass unseen data (the test split) to the model. This step shows how well the model performs on data it was not trained on. In this example, we held out 20% of the data when the dataset was split above. It may be advisable to use k-fold cross-validation instead.

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and return the accuracy in percent.

    Uses the notebook-level ``loss_function`` and ``device``.

    Args:
        model: Module called as ``model(ids, mask)``; may return a logits
            tensor, a tuple whose first item is the logits, or an object with
            a ``logits`` attribute.
        testing_loader: Iterable of batches with ``ids``, ``mask`` and
            ``targets`` tensors.

    Returns:
        Percentage of correctly classified examples.

    Raises:
        ValueError: If ``testing_loader`` yields no batches.
    """
    model.eval()
    # These accumulators must exist before the loop updates them.
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for step, data in enumerate(testing_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)

            outputs = model(ids, mask)
            # Pull the logits out of a Hugging Face output object/tuple.
            # No squeeze(): it would drop the batch dimension of a
            # single-example batch and break the argmax over dim=1.
            if isinstance(outputs, (tuple, list)):
                logits = outputs[0]
            else:
                logits = getattr(outputs, 'logits', outputs)

            loss = loss_function(logits, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(logits.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            if step % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

    if nb_tr_examples == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("testing_loader did not yield any batches")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu


In [None]:
# Evaluate the model on the held-out test loader and report the accuracy
# (in percent) returned by valid().
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging the dataloader created for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

### Saving the trained model

In [None]:
# saving the files for inference

import os

output_dir = './models'
output_model_file = './models/pytorch_germanbert_news.bin'

# torch.save does not create missing parent directories
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): this pickles the whole model object, so loading it later needs
# the same class definitions to be importable, and torch.load should only be
# used on trusted files. Saving model.state_dict() or calling
# model.save_pretrained(output_dir) is the more portable option.
model_to_save = model
torch.save(model_to_save, output_model_file)

# save_vocabulary expects a directory (plus an optional filename prefix), not
# a target file path; it returns the paths of the files it wrote.
output_vocab_file = tokenizer.save_vocabulary(output_dir, filename_prefix='germanbert_news')[0]

print('All files saved')
print('Stage completed')

### Inference using new data

In [None]:
# Placeholder cell: inference on new data is not implemented here yet.
...