
# Multi label classification with Bert
Use oversampling with the same model to increase accuracy

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
DRIVE_PATH = "/content/drive/MyDrive/Competitive-programming-problems-classification/Github/Competitive-programming-problems-classification/backend"
# DRIVE_PATH = "/Users/abraham/Library/CloudStorage/GoogleDrive-abraham.murillo7443@alumnos.udg.mx/My Drive/Competitive-programming-problems-classification/Github/Competitive-programming-problems-classification/backend"
CHECKPOINTS_DIR = f"{DRIVE_PATH}/training"

In [3]:
! pip install -q pytorch-lightning

In [4]:
! pip install -q transformers

In [5]:
# Import all libraries
import pandas as pd
import numpy as np
import re

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#handling html data
from bs4 import BeautifulSoup

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

BERT_MODEL_NAME = 'bert-base-cased'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load dataset

In [6]:
startingTopics = [
    "sortings",
    "strings",
    # "greedy",
    "number theory",
    "math",
    "graphs",
    "geometry",
    "data structures",
]

def getTopicIndexMap():
  map = {}
  for index, topic in enumerate(startingTopics):
    map[topic] = int(index)
  return map

topicIndexMap = getTopicIndexMap()
topicIndexMap

{'sortings': 0,
 'strings': 1,
 'number theory': 2,
 'math': 3,
 'graphs': 4,
 'geometry': 5,
 'data structures': 6}

In [7]:
import json 

problems = []
for topic in startingTopics:
  codeforcesProblems = json.load(open(f"{DRIVE_PATH}/data/codeforces-{topic}.json"))
  problems.extend(codeforcesProblems)
  # omegaupProblems = json.load(open(f"data/omegaup-{topic}.json"))
  # problems.extend(omegaupProblems)

texts = []
categories = []

problemIndexesByCategory = {}
for index, problem in enumerate(problems):
    validTopics = [topic for topic in problem['topics'] if topic in startingTopics]
    text = problem['history']
    
    texts.append(text)
    categories.append(validTopics)

    for topic in validTopics:
      if topic not in problemIndexesByCategory:
        problemIndexesByCategory[topic] = []
      problemIndexesByCategory[topic].append(index)

In [8]:
# pibcLens = [(len(indexes), topic) for topic, indexes in problemIndexesByCategory.items()]
# morePopularTopic = max(pibcLens)
# print(morePopularTopic)

# for topic, indexes in problemIndexesByCategory.items():
#     moreIndexes = np.random.choice(indexes, morePopularTopic[0] - len(indexes), replace=True)
#     for index in moreIndexes:
#         texts.append(texts[index])
#         categories.append(categories[index])
#     print(f"For {topic}({len(indexes)}) add {len(moreIndexes)} problems, total now", len(texts))

In [9]:
df = pd.DataFrame({'text': texts, 'category': categories})
print(df.shape)
df

(6242, 2)


Unnamed: 0,text,category
0,You are participating in Yet Another Tournamen...,[sortings]
1,An array a is called ugly if it contains at le...,"[math, sortings]"
2,Let' s call a string balanced if all character...,"[sortings, strings]"
3,A company of n people is planning a visit to t...,[sortings]
4,This is an interactive problem. Anya has gathe...,"[graphs, sortings]"
...,...,...
6237,N ladies attend the ball in the King' s palace...,"[data structures, sortings]"
6238,There are several days left before the fiftiet...,[data structures]
6239,Everyone knows that long ago on the territory ...,[data structures]
6240,This is yet another problem dealing with regul...,"[data structures, sortings, strings]"


In [10]:
df.shape

(6242, 2)

In [11]:
# Encode the tags(labels) in a binary format in order to be used for training
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
 
binary_categories = mlb.fit_transform(df['category'])

print(binary_categories.shape)
binary_categories

(6242, 7)


array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 1, 1],
       [1, 0, 0, ..., 0, 0, 0]])

# CPProblemsDataset

In [12]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

MAX_TEXT_LEN = 512

def tokenize(text):
  return tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True, # Add [CLS] [SEP]
            max_length= MAX_TEXT_LEN,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True, # Differentiates padded vs normal token
            truncation=True, # Truncate data beyond max length
            return_tensors = 'pt' # PyTorch Tensor format
        )

class CPProblemsDataset(Dataset):
    def __init__(self, texts, labels):
        self.text = texts
        self.labels = labels
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item_idx):
        text = self.text[item_idx]
        inputs = tokenize(text)
        
        input_ids = inputs['input_ids'].flatten()
        attn_mask = inputs['attention_mask'].flatten()
        #token_type_ids = inputs["token_type_ids"]
        
        return {
            'input_ids': input_ids ,
            'attention_mask': attn_mask,
            'label': torch.tensor(self.labels[item_idx], dtype=torch.float)
            
        }

# CPModule

In [13]:
class CPModule(pl.LightningDataModule):
    def __init__(self, x_train, y_train, x_val, y_val, x_test, y_test, batch_size = 16):
        super().__init__()
        self.train_text = x_train
        self.train_label = y_train
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.batch_size = batch_size

    def setup(self, stage = None):
        self.train_dataset = CPProblemsDataset(texts = self.train_text, labels = self.train_label)
        self.val_dataset = CPProblemsDataset(texts = self.val_text, labels = self.val_label)
        self.test_dataset = CPProblemsDataset(texts = self.test_text, labels = self.test_label)
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size = self.batch_size, shuffle = True, num_workers = 2)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size = 16)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size = 16)


# CP Classifier

In [14]:
class CPClassifier(pl.LightningModule):
    # Set up the classifier
    def __init__(self, num_classes, steps_per_epoch = None, num_epochs = 3, lr = 2e-5 ):
        super().__init__()

        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict = True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes) # outputs = number of labels
        self.steps_per_epoch = steps_per_epoch
        self.num_epochs = num_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()
        
    def forward(self, input_ids, attn_mask):
        output = self.bert(input_ids = input_ids, attention_mask = attn_mask)
        output = self.classifier(output.pooler_output)  

        return output
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        
        return {"loss": loss, "predictions": outputs, "labels": labels }

    def validation_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        
        return loss

    def test_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr = self.lr)
        warmup_steps = self.steps_per_epoch//3
        total_steps = self.steps_per_epoch * self.num_epochs - warmup_steps

        scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,total_steps)

        return [optimizer], [scheduler]

# Split data in train, test and validation

In [15]:
from sklearn.model_selection import train_test_split

# First Split for Train and Test
x_train, x_test, y_train, y_test = train_test_split(texts, binary_categories, test_size=0.1, random_state=RANDOM_SEED, shuffle=True)

# Next split Train in to training and validation
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

# Training :(

In [65]:
# Initialize the parameters that will be use for training
NUM_EPOCHS = 10
BATCH_SIZE = 15
LEARNING_RATE = 2e-05

module = CPModule(x_train, y_train, x_val, y_val, x_test, y_test, BATCH_SIZE)
module.setup()

In [66]:
# Instantiate the classifier model
steps_per_epoch = len(x_train)//BATCH_SIZE
model = CPClassifier(
    num_classes = len(startingTopics), 
    steps_per_epoch = steps_per_epoch, 
    num_epochs = NUM_EPOCHS, 
    lr = LEARNING_RATE
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [67]:
# Initialize Pytorch Lightning callback for Model checkpointing

# saves a file like: input/CP-epoch=02-val_loss=0.32.ckpt
val_loss_checkpoint = ModelCheckpoint(
    dirpath = CHECKPOINTS_DIR,
    monitor = 'val_loss', # monitored quantity
    filename = "no_greedy-val_loss-{epoch:02d}-{val_loss:.2f}",
    save_top_k = 2, #  save the top 3 models
    mode = 'min', # mode of the monitored quantity  for optimization
)

latest_checkpoint = ModelCheckpoint(
    dirpath = CHECKPOINTS_DIR,
    filename = "no_greedy-latest-{epoch:02d}-{step}",
    monitor = 'step', # monitored every step
    mode = 'max',
    save_top_k=2, #  save the top 3 models
)


In [68]:
# Instantiate the Model Trainer
trainer = pl.Trainer(
    max_epochs = NUM_EPOCHS, 
    gpus = 1, 
    callbacks = [val_loss_checkpoint, latest_checkpoint],
    enable_checkpointing = True,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [69]:
# Using a previous version already trained 
latest_version = f"{CHECKPOINTS_DIR}/no_greedy-val_loss-epoch=07-val_loss=0.41.ckpt"

In [None]:
# Train the Classifier Model
trainer.fit(model, module, ckpt_path = latest_version)

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/drive/MyDrive/Competitive-programming-problems-classification/Github/Competitive-programming-problems-classification/backend/training/no_greedy-val_loss-epoch=07-val_loss=0.41.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 108 M 
1 | classifier | Linear            | 5.4 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.263   Total estimated model params size (MB)
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint file at /content/drive/MyDrive/Competitive-programming-problems-classification/Github/Competitive-prog

Sanity Checking: 0it [00:00, ?it/s]

Training: 300it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# Evaluate the model performance

In [22]:
# Evaluate the model performance on the test dataset
trainer.test(model, datamodule = module)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.4067067503929138
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.4067067503929138}]

# Setup test dataset for BERT

In [23]:
best_model_path = val_loss_checkpoint.best_model_path
best_model_path

'/content/drive/MyDrive/Competitive-programming-problems-classification/Github/Competitive-programming-problems-classification/backend/training/no_greedy-val_loss-epoch=07-val_loss=0.41.ckpt'

In [24]:
from torch.utils.data import TensorDataset

# Tokenize all questions in x_test
input_ids = []
attention_masks = []

for text in x_test:
    encoded_text = tokenize(text)
    
    # Add the input_ids from encoded question to the list.    
    input_ids.append(encoded_text['input_ids'])
    # Add its attention mask 
    attention_masks.append(encoded_text['attention_mask'])
    
# Now convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y_test)

# Set the batch size.  
TEST_BATCH_SIZE = 64  

# Create the DataLoader.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)

In [25]:
pred_data[0][0].shape

torch.Size([512])

In [26]:
flat_pred_outs = 0
flat_true_labels = 0

In [27]:
# Put model in evaluation mode
model = model.to(device) # moving model to cuda
model.eval()

# Tracking variables 
pred_outs, true_labels = [], []
#i=0
# Predict 
for batch in pred_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
  
    # Unpack the inputs from our dataloader
    b_input_ids, b_attn_mask, b_labels = batch
 
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        pred_out = model(b_input_ids, b_attn_mask)
        pred_out = torch.sigmoid(pred_out)
        # Move predicted output and labels to CPU
        pred_out = pred_out.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        #i+=1
        # Store predictions and true labels
        #print(i)
        #print(outputs)
        #print(logits)
        #print(label_ids)
    pred_outs.append(pred_out)
    true_labels.append(label_ids)

In [28]:
pred_outs[0][0]

array([0.24029681, 0.07392873, 0.1726837 , 0.23748383, 0.11509492,
       0.17493628, 0.6267794 ], dtype=float32)

In [29]:
# Combine the results across all batches. 
flat_pred_outs = np.concatenate(pred_outs, axis=0)

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [30]:
flat_pred_outs.shape , flat_true_labels.shape

((625, 7), (625, 7))

In [31]:
# convert probabilities into 0 or 1 based on a threshold value
def classify(pred_prob, threshold):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        for tag_label in tag_label_row:
            if tag_label >= threshold:
                temp.append(1) # Infer tag value as 1 (present)
            else:
                temp.append(0) # Infer tag value as 0 (absent)
        y_pred.append(temp)

    return y_pred

In [32]:
flat_pred_outs[3]

array([0.25296727, 0.06746758, 0.10736085, 0.740096  , 0.48290464,
       0.20477144, 0.09837015], dtype=float32)

In [33]:
flat_true_labels[3]

array([0, 0, 0, 1, 1, 0, 0])

# Predictions of Tags in Test set

The predictions are in terms of logits (probabilities for each of the 8 tags). Hence we need to have a threshold value to convert these probabilities to 0 or 1.

Let's specify a set of candidate threshold values. We will select the threshold value that performs the best for the test set.

In [44]:
thresholds = np.arange(0.3, 0.51,0.01)
thresholds

array([0.3 , 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 ,
       0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 ])

In [45]:
from sklearn import metrics
scores=[] # Store the list of f1 scores for prediction on each threshold

#convert labels to 1D array
y_true = flat_true_labels.ravel() 

for threshold in thresholds:
    #classes for each threshold
    pred_bin_label = classify(flat_pred_outs, threshold) 

    #convert to 1D array
    y_pred = np.array(pred_bin_label).ravel()

    scores.append(metrics.f1_score(y_true, y_pred))

In [46]:
# find the optimal threshold
opt_thresh = thresholds[scores.index(max(scores))]
print(f'Optimal Threshold Value = {opt_thresh}')

Optimal Threshold Value = 0.31


# Performance Score Evaluation

In [47]:
#predictions for optimal threshold
y_pred_labels = classify(flat_pred_outs, opt_thresh)
y_pred = np.array(y_pred_labels).ravel() # Flatten

In [48]:
y_pred_prob = []
for row, pred in enumerate(flat_pred_outs):
  y_pred_prob.append([prob for col, prob in enumerate(pred) if y_pred_labels[row][col] == 1])

In [49]:
print(metrics.classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      3403
           1       0.53      0.62      0.57       972

    accuracy                           0.80      4375
   macro avg       0.71      0.73      0.72      4375
weighted avg       0.81      0.80      0.80      4375



In [50]:
y_pred = mlb.inverse_transform(np.array(y_pred_labels))
y_act = mlb.inverse_transform(flat_true_labels)

df = pd.DataFrame({
    'Text': x_test, 
    'Actual topics': y_act,
    'Predicted topics': y_pred,
    'Probability': y_pred_prob,
})

In [57]:
df.sample(20)

Unnamed: 0,Text,Actual topics,Predicted topics,Probability
414,In problems on strings one often has to find a...,"(data structures, strings)","(data structures, strings)","[0.38354817, 0.6675674]"
66,"Stepan has n pens. Every day he uses them, and...","(number theory,)","(math, sortings)","[0.3641933, 0.31432465]"
144,"In the last war of PMP, he defeated all his op...","(data structures,)","(data structures, graphs)","[0.43917316, 0.35740724]"
421,Arkady has got an infinite plane painted in co...,"(data structures,)","(math,)",[0.400309]
99,"Even if the world is full of counterfeits, I s...","(math,)","(math, number theory)","[0.71770084, 0.44452074]"
387,"You are given two integers l and r , where l <...","(math, number theory)","(data structures, math)","[0.3424146, 0.49920917]"
273,Easy and hard versions are actually different ...,"(data structures, sortings)","(data structures, math)","[0.41313773, 0.41461298]"
98,Consider an infinite triangle made up of layer...,"(graphs, math, sortings)","(data structures, graphs)","[0.37404794, 0.3893599]"
295,Caisa is now at home and his son has a simple ...,"(math, number theory)","(data structures, graphs)","[0.45393652, 0.38731015]"
40,Consider an infinite triangle made up of layer...,"(graphs, math, sortings)","(data structures, graphs)","[0.37404794, 0.3893599]"


In [None]:
empty = df[df['Predicted topics'] == ()]
empty