In [None]:
# !pip install -q transformers datasets

In [None]:
# from datasets import load_dataset

# dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup


In [None]:
# Load the raw training table from CSV and preview its first rows.
df = pd.read_csv("../input/train-csv/train.csv")
df.head()

In [None]:
# Merge title and abstract into one free-text column; popping the two source
# columns removes them from the frame as they are combined.
df['text'] = df.pop('Title') + " " + df.pop('Abstract')

# Normalise the bracketed, comma-separated category string, then expand it
# into one indicator column per distinct category value.
category_strings = df['Categories'].str.replace(', ', ',').str.strip('[]')
categories_df = category_strings.str.get_dummies(sep=',')

# Swap the raw category column for its indicator columns.
df = pd.concat([df.drop(columns='Categories'), categories_df], axis=1)
df.head()
# column_names_list = df.columns.tolist()
# print(column_names_list)

In [None]:
# English stop-word list from NLTK.
# NOTE(review): `sw` is not referenced by clean_text below, so no stop words
# are actually removed — confirm whether stop-word removal was intended.
sw = stopwords.words('english')

def clean_text(text):
    """Normalize a raw title/abstract string before tokenization.

    Steps, in order: lowercase; drop URLs; drop HTML tags; replace every run
    of characters other than letters and ``? . ! , ¿`` with a single space;
    delete the remaining characters listed in ``punctuations``; collapse
    whitespace.

    URLs and tags are removed *before* the character filter. Filtering first
    turns ``http://a.b/c`` into ``http a.b c`` and ``<b>`` into `` b ``, so
    the URL/tag patterns can no longer match and their fragments stay in the
    text.

    Stop words are not removed.

    Args:
        text: Raw text. Any value that is not a ``str`` (for example the NaN
            pandas produces for a missing value) is treated as empty.

    Returns:
        The cleaned, lowercase, single-spaced string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Removing URLs (replaced by a space so neighbouring words stay separate).
    text = re.sub(r"http\S+", " ", text)

    # Removing html tags. A letter is required right after "<" or "</" so that
    # inequalities such as "p < 0.05 ... q > 1" are not swallowed as a tag.
    text = re.sub(r"</?[a-z][^<>]*>", " ", text)

    # Replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "¿").
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text)

    # Removing punctuations (deleted outright, not replaced by a space).
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    text = text.translate(str.maketrans('', '', punctuations))

    # Collapse runs of whitespace and trim both ends.
    return " ".join(text.split())

In [None]:
# Clean every title+abstract string in place of the raw text.
df['text'] = df['text'].apply(clean_text)

In [None]:
# Strip any single quotes surrounding column names (presumably left on the
# indicator columns by the category parsing — verify against df.columns).
df.columns = df.columns.str.strip("'")
new_column_name = 'title_summary'  # Specify the new name you want

# Rename the combined text column by name rather than by position, so this
# cell does not silently rename the wrong column if the CSV column order
# differs from what was assumed.
df = df.rename(columns={'text': new_column_name})
df.head()

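Only the training CSV is loaded in this notebook; it is not split into separate validation or test sets here. (The commented-out SemEval dataset at the top is the one that contains 3 splits: one for training, one for validation and one for testing.)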

In [None]:
# Fine-grained category labels, in the order used to build the domain groups.
target_cols = ["math.AT", "stat.AP", "cs.AR", "math.QA", "q-bio.MN", "eess.AS", "eess.IV", "stat.ME", "econ.GN", "eess.SP", 
               "q-fin.RM", "cs.LG", "cs.CR", "q-bio.BM", "q-fin.GN", "q-fin.MF", "q-fin.PR", "math.CV", "cs.LO", "econ.TH", 
               "math.CO", "cs.AI", "math.AC", "q-bio.CB", "q-fin.CP", "cs.CL", "cs.DC", "math.LO", "math.NT", "cs.SD", 
               "q-fin.TR", "cs.CV", "stat.ML", "q-fin.EC", "econ.EM", "cs.CE", "stat.CO", "math.PR", "q-bio.NC", "math.AP", 
               "cs.OS", "cs.NI", "cs.IT", "cs.PL", "cs.GT", "cs.DM", "math.IT", "cs.SE", "cs.RO", "stat.TH", "cs.DB", 
               "math.ST", "q-bio.GN", "q-fin.PM", "q-bio.TO", "math.GR", "cs.IR"]

# Group the labels by the part before the dot: {"cs": ["cs.AR", ...], ...}.
# Keys appear in order of first occurrence in target_cols.
prefixes = {}
for col in target_cols:
    prefix, sub_cat = col.split(".")
    prefixes.setdefault(prefix, []).append(col)

print(prefixes)


def get_domain(row, x):
    """Return 1 if the entries of ``row`` selected by ``x`` sum to more than zero, else 0.

    Args:
        row: Indexable record (used with a DataFrame row in this notebook).
        x: Labels selecting the entries of ``row`` to add up.
    """
    return int(row[x].sum() > 0)

# Add one binary column per top-level domain: 1 when the row has at least one
# fine-grained category from that domain, 0 otherwise. Summing the indicator
# columns across axis=1 computes this for all rows at once, instead of calling
# a Python function once per row via DataFrame.apply.
for domain, domain_cols in prefixes.items():
    df[domain] = (df[domain_cols].sum(axis=1) > 0).astype(int)

df.head()

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Token length every example is padded/truncated to (passed to BERTDataset as max_len).
MAX_LEN = 512
# Batch size used by the training DataLoader.
TRAIN_BATCH_SIZE = 16
# NOTE(review): not referenced in the visible cells — no validation loader is built here.
VALID_BATCH_SIZE = 16
# Number of passes over the training loader made by the training loop.
EPOCHS = 3
# Learning rate handed to the optimizer.
LEARNING_RATE = 2e-5
# Tokenizer for the same checkpoint name that BERTClass loads as its encoder.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

In [None]:
# label_columns = df.columns[2:].tolist()
# df[label_columns] = df[label_columns].astype('float')

In [None]:
# Level-1 labels are the domain prefixes, in the dict's insertion order.
level1_cols = list(prefixes)
print(level1_cols)

In [None]:
# Level-2 labels are every remaining column once the id, the text and the
# level-1 domain columns are set aside; frame column order is kept.
non_label_cols = {'Id', 'title_summary', *level1_cols}
level2_cols = [col for col in df.columns if col not in non_label_cols]
level2_cols



In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
class BERTDataset(Dataset):
    """Dataset yielding tokenized text together with two sets of label vectors.

    Args:
        df: DataFrame with a ``title_summary`` text column plus the label
            columns named in ``level1_cols`` and ``level2_cols``.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: Length each example is padded/truncated to by the tokenizer.
        level1_cols: Column names of the level-1 labels.
        level2_cols: Column names of the level-2 labels.
    """

    def __init__(self, df, tokenizer, max_len, level1_cols, level2_cols):
        self.df = df
        self.max_len = max_len
        self.text = df.title_summary
        self.tokenizer = tokenizer
        self.targets_level1 = df[level1_cols].values  # Targets for level 1
        self.targets_level2 = df[level2_cols].values  # Targets for level 2

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the tensors for the row at *position* ``index``.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets_level1`` / ``targets_level2`` (float tensors).
        """
        # Positional lookup: the target arrays are indexed by position, so the
        # text must be too. Plain ``self.text[index]`` is a *label* lookup on
        # the Series and fails (or picks a different row than the targets)
        # whenever the frame's index is not 0..n-1, e.g. after a split/filter.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets_level1': torch.tensor(self.targets_level1[index], dtype=torch.float),
            'targets_level2': torch.tensor(self.targets_level2[index], dtype=torch.float)
        }


In [None]:
# Inspect the indicator matrix for the fine-grained categories listed in target_cols.
df[target_cols].values

In [None]:
# Wrap the full frame as the training dataset.
train_dataset = BERTDataset(
    df=df,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    level1_cols=level1_cols,
    level2_cols=level2_cols,
)

In [None]:
# next(iter(train_dataset))

In [None]:
# Shuffled mini-batches for training, loaded by 4 worker processes into pinned memory.
loader_options = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)
train_loader = DataLoader(train_dataset, **loader_options)

In [None]:
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW

In [None]:
class BERTClass(torch.nn.Module):
    """Pretrained encoder with two chained linear classification heads.

    The level-1 head reads the (dropout-regularised) pooled encoder output.
    The level-2 head reads that same pooled output concatenated with the
    level-1 logits.

    Args:
        num_labels_level1: Output width of the level-1 head.
        num_labels_level2: Output width of the level-2 head.
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
            Defaults to the checkpoint this notebook was written for.
        dropout: Dropout probability applied to the pooled output.
    """

    def __init__(self, num_labels_level1, num_labels_level2,
                 model_name='allenai/scibert_scivocab_uncased', dropout=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        # Take the feature width from the loaded encoder instead of assuming
        # 768, so checkpoints with another hidden size work unchanged.
        hidden_size = self.bert.config.hidden_size
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier_level1 = torch.nn.Linear(hidden_size, num_labels_level1)  # Classifier for level 1
        # Classifier for level 2; its input also carries the level-1 logits.
        self.classifier_level2 = torch.nn.Linear(hidden_size + num_labels_level1, num_labels_level2)

    def forward(self, ids, mask, token_type_ids):
        """Return ``(level1_logits, level2_logits)`` for a batch.

        Both outputs are raw logits; no sigmoid is applied here.
        """
        # With return_dict=False the encoder returns a tuple whose second
        # element is the pooled output.
        outputs = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        pooled_output = self.dropout(outputs[1])

        level1_logits = self.classifier_level1(pooled_output)

        # Concatenate pooled_output with the level-1 logits for the level-2 input.
        level2_input = torch.cat((pooled_output, level1_logits), 1)
        level2_logits = self.classifier_level2(level2_input)

        return level1_logits, level2_logits

# Head widths follow the label column lists: one output per level-1 / level-2 label.
num_labels_level1, num_labels_level2 = len(level1_cols), len(level2_cols)

# Build the model and move it to the selected device (Module.to returns the module itself).
model = BERTClass(num_labels_level1, num_labels_level2).to(device)


In [None]:
def loss_fn(outputs_level1, targets_level1, outputs_level2, targets_level2):
    """Sum of the binary cross-entropy-with-logits losses of both label levels.

    The logits are reshaped to the label width taken from the matching targets
    rather than from notebook-level globals, so the function works on its own.

    Args:
        outputs_level1: Level-1 logits.
        targets_level1: Level-1 targets; last dimension is the label width.
        outputs_level2: Level-2 logits.
        targets_level2: Level-2 targets; last dimension is the label width.

    Returns:
        Scalar tensor: level-1 loss plus level-2 loss, equally weighted.
    """
    loss_fct = torch.nn.BCEWithLogitsLoss()
    loss_level1 = loss_fct(outputs_level1.view(-1, targets_level1.shape[-1]), targets_level1)
    loss_level2 = loss_fct(outputs_level2.view(-1, targets_level2.shape[-1]), targets_level2)
    return loss_level1 + loss_level2  # You can also weigh these losses differently if needed


In [None]:
# Use PyTorch's own AdamW: the AdamW exported by transformers is deprecated in
# favour of torch.optim.AdamW. eps is set to 1e-6 explicitly because that was
# the transformers default (PyTorch defaults to 1e-8), keeping the update rule
# as close as possible to the previous optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=1e-6)

In [None]:
import time

def train(epoch):
    """Run one training pass over ``train_loader`` and print progress.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn``,
    ``device`` and ``train_loader``. Prints the batch loss every 50 batches
    and the mean batch loss with the total time at the end.

    Args:
        epoch: Epoch number; only used in the printed messages.
    """
    model.train()
    total_loss = 0  # Running sum of per-batch losses
    start_time = time.time()  # Reference point for all elapsed-time reports

    long_fields = ('ids', 'mask', 'token_type_ids')
    float_fields = ('targets_level1', 'targets_level2')

    for i, data in enumerate(train_loader):
        # Move the batch to the target device with the dtypes the model and loss expect.
        ids, mask, token_type_ids = [
            data[field].to(device, dtype=torch.long) for field in long_fields
        ]
        targets_level1, targets_level2 = [
            data[field].to(device, dtype=torch.float) for field in float_fields
        ]

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        level1_logits, level2_logits = model(ids, mask, token_type_ids)
        loss = loss_fn(level1_logits, targets_level1, level2_logits, targets_level2)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if i % 50 != 0:
            continue
        elapsed_time = time.time() - start_time
        print(f'Epoch: {epoch}, Batch: {i}, Loss: {loss.item()}, Elapsed Time: {elapsed_time:.2f} seconds')

    avg_loss = total_loss / len(train_loader)
    total_time = time.time() - start_time  # Wall-clock time for the whole epoch
    print(f'Epoch: {epoch}, Average Loss: {avg_loss}, Total Time: {total_time:.2f} seconds')




In [None]:
# Training loop: call train() once per epoch, EPOCHS times in total.
for epoch in range(EPOCHS):
    train(epoch)


In [None]:
# Persist only the model's state_dict (not the module object itself) to model.bin.
torch.save(model.state_dict(), 'model.bin')