
# Multi label classification with Bert
Use oversampling with the same model to increase accuracy

In [1]:
# Import all libraries
import pandas as pd
import numpy as np
import re

# Huggingface transformers
import transformers
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#handling html data
from bs4 import BeautifulSoup

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
%matplotlib inline

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

BERT_MODEL_NAME = 'bert-base-cased'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
import TextCleaner

# Load dataset

In [3]:
topics = [
    "sortings",
    "strings",
    "greedy",
    "number theory",
    "math",
    "graphs",
    "geometry",
    "data structures",
]

topics

['sortings',
 'strings',
 'greedy',
 'number theory',
 'math',
 'graphs',
 'geometry',
 'data structures']

In [4]:
import json 

problems = []
for topic in topics:
  codeforcesProblems = json.load(open(f"data/codeforces-{topic}.json"))
  problems.extend(codeforcesProblems)
  # omegaupProblems = json.load(open(f"data/omegaup-{topic}.json"))
  # problems.extend(omegaupProblems)

data = []
for problem in problems:
    validTopics = [topic for topic in problem['topics'] if topic in topics]
    text = problem['history']

    data.append({
      'title': problem['title'],
      'history': TextCleaner.pretty(text) if len(text) else "",
      'topics': validTopics,
      'url': problem['url'],
    })

In [5]:
df = pd.DataFrame(data)
df

Unnamed: 0,title,history,topics,url
0,Yet Another Tournament,You are participating in Yet Another Tournamen...,"[greedy, sortings]",https://codeforces.com/problemset/problem/1783/C
1,Make it Beautiful,An array a is called ugly if it contains at le...,"[math, sortings]",https://codeforces.com/problemset/problem/1783/A
2,Equal Frequencies,Let' s call a string balanced if all character...,"[greedy, sortings, strings]",https://codeforces.com/problemset/problem/1781/C
3,Going to the Cinema,A company of n people is planning a visit to t...,"[greedy, sortings]",https://codeforces.com/problemset/problem/1781/B
4,Anya's Simultaneous Exhibition,This is an interactive problem. Anya has gathe...,"[graphs, greedy, sortings]",https://codeforces.com/problemset/problem/1779/E
...,...,...,...,...
7737,Ball,N ladies attend the ball in the King' s palace...,"[data structures, sortings]",https://codeforces.com/problemset/problem/12/D
7738,Exposition,There are several days left before the fiftiet...,[data structures],https://codeforces.com/problemset/problem/6/E
7739,Bindian Signalizing,Everyone knows that long ago on the territory ...,[data structures],https://codeforces.com/problemset/problem/5/E
7740,Longest Regular Bracket Sequence,This is yet another problem dealing with regul...,"[data structures, greedy, sortings, strings]",https://codeforces.com/problemset/problem/5/C


In [6]:
df.shape

(7742, 4)

In [7]:
df['history'] = df['history'].apply(lambda x: x.replace('\t', '').replace('\n', '').replace('\"', ''))

In [8]:
dfclean = df[['title', 'history', 'topics', 'url']]

In [9]:
dfclean.to_csv("data/codeforces.csv")

In [12]:
result = dfclean.to_json(orient="records")
parsed = json.loads(result)
with open('data/codeforces.json', 'w') as f:
  json.dump(parsed, f, indent=2)

In [10]:
df2 = pd.read_csv("data/codeforces.csv")
df2

Unnamed: 0.1,Unnamed: 0,title,history,topics,url
0,0,Yet Another Tournament,You are participating in Yet Another Tournamen...,"['greedy', 'sortings']",https://codeforces.com/problemset/problem/1783/C
1,1,Make it Beautiful,An array a is called ugly if it contains at le...,"['math', 'sortings']",https://codeforces.com/problemset/problem/1783/A
2,2,Equal Frequencies,Let' s call a string balanced if all character...,"['greedy', 'sortings', 'strings']",https://codeforces.com/problemset/problem/1781/C
3,3,Going to the Cinema,A company of n people is planning a visit to t...,"['greedy', 'sortings']",https://codeforces.com/problemset/problem/1781/B
4,4,Anya's Simultaneous Exhibition,This is an interactive problem. Anya has gathe...,"['graphs', 'greedy', 'sortings']",https://codeforces.com/problemset/problem/1779/E
...,...,...,...,...,...
7737,7737,Ball,N ladies attend the ball in the King' s palace...,"['data structures', 'sortings']",https://codeforces.com/problemset/problem/12/D
7738,7738,Exposition,There are several days left before the fiftiet...,['data structures'],https://codeforces.com/problemset/problem/6/E
7739,7739,Bindian Signalizing,Everyone knows that long ago on the territory ...,['data structures'],https://codeforces.com/problemset/problem/5/E
7740,7740,Longest Regular Bracket Sequence,This is yet another problem dealing with regul...,"['data structures', 'greedy', 'sortings', 'str...",https://codeforces.com/problemset/problem/5/C
