
# Multi-label classification with BERT
Apply oversampling with the same model to increase accuracy.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project files under DRIVE_PATH can be read.
# force_remount=True is passed; presumably this redoes the mount even when a
# previous one is still active -- confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Base directory of the project's backend on the mounted Google Drive.
# The per-topic dataset files are read from "<DRIVE_PATH>/data/".
DRIVE_PATH = "/content/drive/MyDrive/Competitive-programming-problems-classification/Github/Competitive-programming-problems-classification/backend"

In [None]:
! pip install -q transformers

In [None]:
# Import all libraries
import pandas as pd
import numpy as np
import re

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#handling html data
from bs4 import BeautifulSoup

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

# Name of the pretrained BERT checkpoint used throughout the notebook.
BERT_MODEL_NAME = 'bert-base-cased'

# Single seed shared by NumPy and PyTorch so that runs are repeatable.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load dataset

In [None]:
# Topic tags kept as labels: one Codeforces JSON file is loaded per entry, and
# any tag on a problem that is not listed here is dropped.
topics = [
    "sortings", "strings", "greedy", "number theory",
    "math", "graphs", "geometry", "data structures",
]

topics

['sortings',
 'strings',
 'greedy',
 'number theory',
 'math',
 'graphs',
 'geometry',
 'data structures']

In [None]:
import json

# Collect the problems of every topic into one list (one JSON file per topic).
# NOTE(review): a problem tagged with several of the selected topics may be
# present in more than one topic file and would then appear here repeatedly --
# confirm how the files were produced and deduplicate if necessary.
problems = []
for topic in topics:
  # The context manager closes the file as soon as it has been parsed; the
  # previous json.load(open(...)) form left every handle open.
  with open(f"{DRIVE_PATH}/data/codeforces-{topic}.json", encoding="utf-8") as problems_file:
    codeforcesProblems = json.load(problems_file)
  problems.extend(codeforcesProblems)
  # omegaUp problems are currently disabled:
  # omegaupProblems = json.load(open(f"data/omegaup-{topic}.json"))
  # problems.extend(omegaupProblems)

# Build two parallel lists: the problem statement and its selected topic tags.
texts = []
categories = []
for problem in problems:
    # Keep only the tags that belong to the selected `topics`.
    validTopics = [topic for topic in problem['topics'] if topic in topics]
    # The statement text is stored under the 'history' key.
    text = problem['history']

    texts.append(text)
    categories.append(validTopics)

In [None]:
# One row per loaded problem: the statement text and its list of topic tags.
frame_columns = {'text': texts, 'category': categories}
df = pd.DataFrame(frame_columns)
df

Unnamed: 0,text,category
0,You are participating in Yet Another Tournamen...,[sortings]
1,An array a is called ugly if it contains at le...,"[math, sortings]"
2,Let' s call a string balanced if all character...,"[sortings, strings]"
3,A company of n people is planning a visit to t...,[sortings]
4,This is an interactive problem. Anya has gathe...,"[graphs, sortings]"
...,...,...
6237,N ladies attend the ball in the King' s palace...,"[data structures, sortings]"
6238,There are several days left before the fiftiet...,[data structures]
6239,Everyone knows that long ago on the territory ...,[data structures]
6240,This is yet another problem dealing with regul...,"[data structures, sortings, strings]"


In [None]:
# Show the (rows, columns) size of the assembled frame.
df.shape

(7742, 2)

In [None]:
# Encode each problem's tag list as a multi-hot (0/1) vector for training.
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# One row per problem, one column per label; the column order is the one
# exposed by mlb.classes_.
binary_categories = mlb.fit_transform(df['category'])

# Store one vector per row. The Series is given df's own index so the vectors
# stay attached to the right rows even if df is later filtered or re-indexed;
# a Series with a fresh 0..n-1 index would be aligned by label on assignment
# and could silently yield NaN or shifted rows.
df['binary_categories'] = pd.Series(list(binary_categories), index=df.index)
df.head()

Unnamed: 0,text,category,binary_categories
0,You are participating in Yet Another Tournamen...,"[greedy, sortings]","[0, 0, 0, 1, 0, 0, 1, 0]"
1,An array a is called ugly if it contains at le...,"[math, sortings]","[0, 0, 0, 0, 1, 0, 1, 0]"
2,Let' s call a string balanced if all character...,"[greedy, sortings, strings]","[0, 0, 0, 1, 0, 0, 1, 1]"
3,A company of n people is planning a visit to t...,"[greedy, sortings]","[0, 0, 0, 1, 0, 0, 1, 0]"
4,This is an interactive problem. Anya has gathe...,"[graphs, greedy, sortings]","[0, 0, 1, 1, 0, 0, 1, 0]"


In [None]:
DATAFRAME_EXPERIMENT_NAME = "oversampling"