Data source: [GitHub repository](https://github.com/Rhitabrat/Youtube-Comments-Categorization), [paper](https://arxiv.org/pdf/2111.01908.pdf)

In [None]:
!pip install sentence-transformers
!pip install transformers

In [1]:
import numpy as np
import pandas as pd

from string import digits
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import re
from tqdm import tqdm, notebook

import time

import os
# Print the full path of every file found under /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
/kaggle/input/youtube-comments-categorization/youtube-comments-categorization.csv
/kaggle/input/youtube-videos-title-description-comments/GBvideos.csv
/kaggle/input/youtube-videos-title-description-comments/GBtext-details.csv
/kaggle/input/youtube-videos-title-description-comments/UScomments.csv
/kaggle/input/youtube-videos-title-description-comments/GB_category_id.json
/kaggle/input/youtube-videos-title-description-comments/US_category_id.json
/kaggle/input/youtube-videos-title-description-comments/UStext-details.csv
/kaggle/input/youtube-videos-title-description-comments/GBcomments.csv
/kaggle/input/youtube-videos-title-description-comments/USvideos.csv


In [None]:
import os
import random
import gc
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

import seaborn as sns
sns.set(style='darkgrid')

from sklearn.model_selection import StratifiedKFold, train_test_split

%matplotlib inline

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import AutoConfig

In [None]:
# Debug switch; it is not read by any of the cells shown in this notebook.
DEBUG = False

# CSV file with the comments and their labels (read below with pd.read_csv).
DATA_PATH = '/kaggle/input/youtube-comments-categorization/youtube-comments-categorization.csv'

def seed_everything(seed, deterministic=True):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy and PyTorch (CPU and all CUDA devices), then configures
    cuDNN.

    Args:
        seed: Integer seed shared by all generators.
        deterministic: When True (default) cuDNN is restricted to
            deterministic kernels and its auto-tuner is switched off, so
            repeated runs pick the same algorithms. Pass False to trade
            reproducibility for speed.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Auto-tuning (benchmark) can select different kernels from run to run,
    # so it is only enabled when determinism is explicitly given up.
    torch.backends.cudnn.deterministic = deterministic
    torch.backends.cudnn.benchmark = not deterministic

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device: ', device.type)

# Single seed shared by the RNG setup and the data splits.
SEED = 97
seed_everything(seed=SEED)

Trying three BERT-based models (BERT, DistilBERT, RoBERTa) without any additional pre-training.

In [None]:
# Checkpoint names of the three candidate encoders; one of them is selected
# as MODEL_PATH and handed to the `from_pretrained` loaders.
BERT = 'bert-base-uncased'
DISTIL_BERT = 'distilbert-base-uncased'
ROBERTA = 'roberta-base'

In [None]:
# Shared settings dictionary; later cells add the tokenizer and training
# hyper-parameters to it.
cfg ={}

# Checkpoint used for both the tokenizer and the encoder in this run.
MODEL_PATH = ROBERTA

In [None]:
# The file is read without a header row, so the two columns are named here.
df = pd.read_csv(DATA_PATH, header=None).set_axis(['comment', 'label'], axis=1)
df.head()

In [None]:
# Number of rows in the loaded DataFrame.
len(df)

In [None]:
# Class distribution over the full dataset.
# The bars are drawn first and the tick labels rotated afterwards: setting tick
# labels before seaborn has created the categorical ticks attaches the text to
# matplotlib's default tick positions, and seaborn then replaces it.
# The class names come straight from the `label` column, so no separate
# `labels` list is created here (that name is used later for the
# label -> id mapping).
_, ax = plt.subplots()
sns.countplot(x='label', data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Comments per class (full dataset)');

In [None]:
# Map each class name to an integer id; ids follow the order listed here.
labels = {
    name: idx
    for idx, name in enumerate(
        ['positive', 'imperative', 'interrogative', 'miscellaneous', 'corrective', 'negative']
    )
}
# A class name missing from the mapping raises KeyError rather than being
# silently encoded.
df['enc_label'] = df['label'].apply(lambda name: labels[name])
df.head()

In [None]:
# 70 % of the rows go to training; the remaining 30 % is halved into
# validation and test. Both splits are stratified on the encoded label.
X_train, X_temp, y_train, y_temp = train_test_split(
    df['comment'],
    df['enc_label'],
    test_size=0.3,
    random_state=SEED,
    stratify=df['enc_label'],
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=SEED,
    stratify=y_temp,
)

In [None]:
# Class counts of the training split (the x axis shows the encoded label ids).
_, ax = plt.subplots()
sns.countplot(x=y_train, ax=ax)
ax.set(title='Training split', xlabel='Encoded label', ylabel='Number of comments');

In [None]:
# Class counts of the validation split (the x axis shows the encoded label ids).
_, ax = plt.subplots()
sns.countplot(x=y_val, ax=ax)
ax.set(title='Validation split', xlabel='Encoded label', ylabel='Number of comments');

In [None]:
# Class counts of the test split (the x axis shows the encoded label ids).
_, ax = plt.subplots()
sns.countplot(x=y_test, ax=ax)
ax.set(title='Test split', xlabel='Encoded label', ylabel='Number of comments');

In [None]:
# Re-assemble each split into a two-column frame (comment, enc_label) with a
# fresh 0..n-1 index.
train_df, val_df, test_df = (
    pd.concat([features, targets], axis=1).reset_index(drop=True)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Record which tokenizer is in use and load it from the selected checkpoint.
cfg['tokenizer'] = dict(name=MODEL_PATH, max_length=256)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [None]:
# Training hyper-parameters, stored in the shared settings dictionary.
cfg.update(
    train_batch_size=32,
    valid_batch_size=16,
    max_length=256,
    epochs=3,
    learning_rate=1e-05,
)

In [None]:
class CommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: DataFrame with a ``comment`` column (text) and an ``enc_label``
            column (integer class id).
        tokenizer: Callable tokenizer. It is invoked as
            ``tokenizer(text, add_special_tokens=True, padding='max_length',
            max_length=max_len, truncation=True)`` and must return a mapping
            containing ``input_ids`` and ``attention_mask``.
        max_len: Length every encoded comment is padded / truncated to.
    """

    def __init__(self, df, tokenizer, max_len):
        self.len = len(df)
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded comment and label at position ``index``.

        Returns:
            Dict with long tensors ``ids``, ``masks``, ``token_type_ids``
            (one entry per token) and the scalar ``targets``.
        """
        # Positional access, so the dataset does not depend on the frame
        # carrying a 0..n-1 index.
        comment = self.df['comment'].iloc[index]
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        inputs = self.tokenizer(
            comment,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        # Not every tokenizer emits segment ids; fall back to all zeros so
        # every item exposes the same keys with per-token shapes.
        if 'token_type_ids' in inputs:
            token_type_ids = inputs['token_type_ids']
        else:
            token_type_ids = [0] * len(ids)

        target = self.df['enc_label'].iloc[index]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'masks': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target, dtype=torch.long)
        }

    def __len__(self):
        """Number of rows captured when the dataset was constructed."""
        return self.len

In [None]:
# One dataset per split, all sharing the tokenizer and the maximum length.
train_data, val_data, test_data = (
    CommentsDataset(split_df, tokenizer, cfg['max_length'])
    for split_df in (train_df, val_df, test_df)
)

In [None]:
def collate_fn(data):
    """Reduce every sample to a ``(text, target)`` pair.

    Args:
        data: Iterable of indexable samples; only positions 0 and 1 are read.

    Returns:
        List of ``(sample[0], sample[1])`` tuples in input order.
    """
    return [(sample[0], sample[1]) for sample in data]

In [None]:
# Only the training loader shuffles; validation and test use the same batch
# size and keep the original order.
train_params = dict(batch_size=cfg['train_batch_size'], shuffle=True, num_workers=0)
valid_params = dict(batch_size=cfg['valid_batch_size'], shuffle=False, num_workers=0)
test_params = dict(batch_size=cfg['valid_batch_size'], shuffle=False, num_workers=0)

# No custom collate function is passed: the datasets return dicts of tensors
# and the loaders are built with their default batching.
train_loader = DataLoader(train_data, **train_params)
val_loader = DataLoader(val_data, **valid_params)
test_loader = DataLoader(test_data, **test_params)

In [None]:
class CommentRelevanceBERT(torch.nn.Module):
    """Transformer encoder followed by LayerNorm and a linear classifier.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        dropout: Accepted for backward compatibility; currently unused
            (no dropout layer is applied).
        num_classes: Size of the output layer. Defaults to ``len(labels)``,
            the notebook-level label -> id mapping.
    """

    def __init__(self, model_name, dropout=True, num_classes=None):
        super(CommentRelevanceBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.model_name = model_name

        # Read the encoder width from the model config instead of probing
        # model-specific submodules; 768 is kept as the fallback the original
        # code used for unknown checkpoints.
        self.in_features = getattr(self.bert.config, 'hidden_size', 768)

        if num_classes is None:
            num_classes = len(labels)

        self.dense = nn.Linear(self.in_features, self.in_features)
        self.activation = nn.ReLU()
        self.layer_norm = nn.LayerNorm(self.in_features)
        # Sized from in_features so encoders that are not 768 wide also work.
        self.classifier = nn.Linear(self.in_features, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Segment ids; only forwarded to the encoder for the
                BERT checkpoint and ignored otherwise.
        """
        if self.model_name == BERT:
            # With return_dict=False the encoder returns a tuple; its second
            # element is used as the sentence representation.
            _, output = self.bert(input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids,
                                  return_dict=False)
        elif self.model_name == ROBERTA:
            _, output = self.bert(input_ids,
                                  attention_mask=attention_mask,
                                  return_dict=False)
        else:
            # DistilBERT, and any other checkpoint: take the first token's
            # hidden state and pass it through the dense + ReLU head.
            # Previously unknown checkpoints left `output` undefined here.
            encoder_outputs = self.bert(input_ids,
                                        attention_mask=attention_mask,
                                        return_dict=False)
            first_token_tensor = encoder_outputs[0][:, 0]
            output = self.activation(self.dense(first_token_tensor))

        output = self.layer_norm(output)
        output = self.classifier(output)
        return output

In [None]:
# Build the classifier on top of the selected checkpoint and move it to the
# active device; the bare name below displays the module layout.
model = CommentRelevanceBERT(MODEL_PATH).to(device)
model

In [None]:
from torch.optim.lr_scheduler import StepLR
from transformers import get_cosine_schedule_with_warmup

In [None]:
# The training loop steps the scheduler once per batch, so the schedule
# length is (batches per epoch) x (number of epochs).
num_warmup_steps = 0
num_training_steps = len(train_loader) * cfg['epochs']

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=cfg['learning_rate'])
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
def calc_accuracy(preds, targets):
    """Return the percentage of positions where ``preds`` equals ``targets``.

    Args:
        preds: Tensor of predicted class ids.
        targets: Tensor of reference class ids, comparable with ``preds``.

    Returns:
        Share of matching positions, scaled to 0-100.
    """
    matches = (preds == targets).cpu().numpy()
    return matches.mean() * 100

In [None]:
def train(model, loss_fn, train_dataloader, val_dataloader=None, epochs=3, evaluation=False,
          optim=None, lr_scheduler=None):
    """Fine-tune ``model`` and print a progress table for every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` that returns class scores.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        train_dataloader: Sized iterable of batches; every batch is a dict
            with ``ids``, ``masks``, ``token_type_ids`` and ``targets`` tensors.
        val_dataloader: Batches for the end-of-epoch evaluation. Required
            when ``evaluation`` is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: When true, ``evaluate`` is run after every epoch.
        optim: Optimizer to step. When omitted, the notebook-level
            ``optimizer`` and (unless ``lr_scheduler`` is given) the
            notebook-level ``scheduler`` are used.
        lr_scheduler: Scheduler stepped once per batch. When ``optim`` is
            given and this is omitted, no scheduler is stepped.

    Raises:
        ValueError: If ``evaluation`` is requested without ``val_dataloader``.
    """
    # Fail before any training happens instead of after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    if optim is None:
        # Backward-compatible default: the objects built in the optimizer cell.
        optim = optimizer
        if lr_scheduler is None:
            lr_scheduler = scheduler

    # Send every batch to the device that holds the model's weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    n_batches = len(train_dataloader)

    print("Start training...\n")
    for epoch in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        # total_loss covers the whole epoch; batch_loss / batch_counts cover
        # the window since the last printed row.
        total_loss, batch_loss, batch_counts = 0, 0, 0
        # evaluate() leaves the model in eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            model.zero_grad()
            input_ids = batch['ids'].to(model_device, dtype=torch.long)
            attention_mask = batch['masks'].to(model_device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(model_device, dtype=torch.long)
            targets = batch['targets'].to(model_device, dtype=torch.long)

            outputs = model(input_ids, attention_mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optim.step()
            if lr_scheduler is not None:
                lr_scheduler.step()

            # Report every 20 batches and once more on the last batch.
            if (step % 20 == 0 and step != 0) or (step == n_batches - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # An empty loader yields NaN rather than a ZeroDivisionError.
        avg_train_loss = total_loss / n_batches if n_batches else float('nan')

        print("-"*70)

        if evaluation:
            val_loss, val_accuracy = evaluate(model, loss_fn, val_dataloader)
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")

            print("-"*70)
        print("\n")

    print("Training complete!")

In [None]:
def evaluate(model, loss_fn, val_dataloader):
    """Compute the average loss and the accuracy of ``model`` on a dataset.

    Both metrics are averaged per example, not per batch, so a smaller final
    batch does not skew the result.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` that returns class scores.
        loss_fn: Callable ``loss_fn(outputs, targets)``. It is assumed to
            return the mean loss of the batch (the default reduction of
            ``torch.nn.CrossEntropyLoss``).
        val_dataloader: Iterable of batch dicts with ``ids``, ``masks``,
            ``token_type_ids`` and ``targets`` tensors.

    Returns:
        Tuple ``(loss, accuracy)`` with the accuracy in percent. Both values
        are NaN when the loader yields no examples.
    """
    model.eval()

    # Send every batch to the device that holds the model's weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss, total_correct, total_examples = 0.0, 0, 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['ids'].to(model_device, dtype=torch.long)
            attention_mask = batch['masks'].to(model_device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(model_device, dtype=torch.long)
            targets = batch['targets'].to(model_device, dtype=torch.long)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, targets)

            batch_size = targets.size(0)
            # Undo the per-batch mean so every example carries equal weight.
            total_loss += loss.item() * batch_size

            _, max_ids = torch.max(outputs, dim=1)
            total_correct += (max_ids == targets).sum().item()
            total_examples += batch_size

    if total_examples == 0:
        return float('nan'), float('nan')

    val_loss = total_loss / total_examples
    val_accuracy = 100.0 * total_correct / total_examples

    return val_loss, val_accuracy

With a learning rate of 1e-03 the model's loss did not decrease at all. After reducing the learning rate to 1e-05, the loss started to decrease.

In [None]:
# Train for exactly cfg['epochs'] epochs. The cosine schedule was sized with
# that value (num_training_steps), so training for more epochs would step the
# scheduler past its end and the learning rate would start rising again.
# To train longer, change cfg['epochs'] and re-run the optimizer/scheduler cell.
train(model, loss_function, train_loader, val_loader, epochs=cfg['epochs'], evaluation=True)

82.25 = dropout 0.3, no layer norm, 3 epochs <br/>
82.31 = no dropout, layer norm, 3 epochs <br/>
82.03 = no dropout, layer norm, batch size = 64, DistilBERT, 5 epochs

In [None]:
# NOTE(review): this pickles the whole module object, so loading the file
# later needs the CommentRelevanceBERT class to be defined; consider saving
# model.state_dict() instead. The file name also says "roberta" regardless of
# which checkpoint MODEL_PATH selects.
output_model_file = 'roberta_youtube_comments.pth'
torch.save(obj=model, f=output_model_file)

In [None]:
# Metrics on the held-out test split, displayed as (loss, accuracy).
test_loss, test_accuracy = evaluate(
    model=model,
    loss_fn=loss_function,
    val_dataloader=test_loader,
)
(test_loss, test_accuracy)

In [None]:
def predict(model, loss_fn, val_dataloader):
    """Return the predicted class id for every example in ``val_dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            token_type_ids)`` that returns class scores.
        loss_fn: Unused; kept so existing ``predict(model, loss_fn, loader)``
            calls keep working.
        val_dataloader: Iterable of batch dicts with ``ids``, ``masks`` and
            ``token_type_ids`` tensors. A ``targets`` entry is not required,
            so unlabelled data can be scored as well.

    Returns:
        List with one predicted class id (NumPy integer) per example, in the
        order the loader produced them.
    """
    model.eval()

    # Send every batch to the device that holds the model's weights.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    preds = []

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['ids'].to(model_device, dtype=torch.long)
            attention_mask = batch['masks'].to(model_device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(model_device, dtype=torch.long)

            outputs = model(input_ids, attention_mask, token_type_ids)

            # Index of the highest score per row is the predicted class.
            _, max_ids = torch.max(outputs, dim=1)
            preds.extend(max_ids.cpu().numpy())

    return preds

In [None]:
# Predict the test split; the two displayed lengths should be equal
# (one prediction per test row).
preds = predict(model=model, loss_fn=loss_function, val_dataloader=test_loader)
(len(preds), len(test_df))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# First ten predictions next to the first ten reference labels.
(preds[:10], y_test.head(10))

In [None]:
# Per-class metrics on the test split for the six encoded label ids (0-5).
report = classification_report(y_test, preds, labels=list(range(6)))
print(report)

In [None]:
# Class counts of the test labels (the x axis shows the encoded label ids).
sns.countplot(x=y_test)