In [None]:
pip install transformers datasets torch scikit-learn

In [None]:
import numpy as np 
import pandas as pd 
import os
from transformers import BertModel,BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import torch
from pathlib import Path
import matplotlib.pyplot as plt
import spacy
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import seaborn as sns
import random
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import re
import unicodedata
from nltk.corpus import stopwords

# Load the cleaned review dataset from the Kaggle input directory
df = pd.read_csv(filepath_or_buffer='/kaggle/input/nlp-cleaned/cleaned_dataset.csv')
# Romanian stop-word list from NLTK, held as a set
stop_words = {*stopwords.words('romanian')}
def preprocess_review(review):
    """Normalise a raw review and cut off any appended English translation.

    Steps, in order:
      1. Unicode NFKC normalisation.
      2. Curly double quotes become straight quotes; cedilla forms of s/t are
         replaced with the comma-below forms.
      3. http/https links are removed.
      4. Everything from the first English-section marker ("english review:",
         "english:" or "[english]", case-insensitive) onwards is dropped.

    Args:
        review: the review text as a ``str``.

    Returns:
        The cleaned review. Text without an English marker is returned in
        full (only steps 1-3 applied).
    """
    # Normalize the text font
    review = unicodedata.normalize('NFKC', review)
    # Replace inconsistent characters
    review = review.replace('“', '"').replace('”', '"')
    review = review.replace('ş', 'ș').replace("Ş", "Ș").replace('ţ', 'ț').replace("Ţ", "Ț")
    # Remove links
    review = re.sub(r'https?://\S+', '', review)
    # Remove included english reviews: cut at the earliest marker, if any.
    # Searching the original string (instead of a lower-cased copy) keeps the
    # match position valid for slicing. When no marker exists the text is left
    # untouched -- slicing with a "not found" index of -1 would silently drop
    # the final character.
    marker = re.search(r'english review:|english:|\[english\]', review, flags=re.IGNORECASE)
    if marker is not None:
        review = review[:marker.start()]

    return review

# Clean every review in place with preprocess_review
df['review'] = df['review'].map(preprocess_review)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
x_train, x_validation, y_train, y_validation = train_test_split(
    *(df[column] for column in ('review', 'label')),
    random_state=42,
    test_size=0.2,
)

In [None]:
def downsample(data=None, test_size=0.2, random_state=42):
    """Balance the classes by downsampling, then split into train/validation.

    Every label is sampled without replacement down to the size of the
    smallest class, and the balanced frame is split with ``train_test_split``.

    Args:
        data: DataFrame with 'review' and 'label' columns. Defaults to the
            notebook-level ``df``.
        test_size: fraction of the balanced rows held out for validation.
        random_state: seed used for both the resampling and the split.

    Returns:
        ``(x_train, x_validation, y_train, y_validation)``. The split is
        returned to the caller; previously it was bound to local names only
        and discarded, so calling the function had no effect.
    """
    if data is None:
        data = df

    # Target size per class: the number of rows of the rarest label
    size = data['label'].value_counts().min()

    # Collect the per-class samples first and concatenate once, instead of
    # growing a DataFrame inside the loop
    balanced_parts = [
        resample(data[data['label'] == label],
                 replace=False,
                 n_samples=size,
                 random_state=random_state)
        for label in data['label'].unique()
    ]
    new_df = pd.concat(balanced_parts)

    x_train, x_validation, y_train, y_validation = train_test_split(
        new_df['review'], new_df['label'], test_size=test_size, random_state=random_state)
    return x_train, x_validation, y_train, y_validation

In [None]:
# Tokenizer for the checkpoint that is fine-tuned below.
# NOTE(review): confirm that do_lower_case=True matches the checkpoint's casing.
tokenizer = BertTokenizer.from_pretrained(
    'readerbench/RoBERT-base',
    truncation=True,
    do_lower_case=True,
)
# Alternative checkpoints (swap in together with the matching model line):
# tokenizer = BertTokenizer.from_pretrained('readerbench/RoBERT-small', do_lower_case=True,truncation=True)
# tokenizer = BertTokenizer.from_pretrained('dumitrescustefan/bert-base-romanian-cased-v1', do_lower_case=True,truncation=True)
# tokenizer = BertTokenizer.from_pretrained('snisioi/bert-legal-romanian-cased-v1', do_lower_case=True,truncation=True)

In [None]:
# Tokenize the train and validation texts with identical settings: special
# tokens added, sequences truncated to 512 tokens, each split padded to its own
# longest sequence, PyTorch tensors returned. Train is encoded first.
encoded_x_train, encoded_x_validation = (
    tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        truncation=True,
        padding='longest',
        max_length=512,
        return_tensors='pt',
    )
    for texts in (x_train, x_validation)
)

In [None]:
# Pull the token ids and attention masks out of each encoding
input_ids_x_train, attention_masks_x_train = (
    encoded_x_train[key] for key in ('input_ids', 'attention_mask'))
input_ids_x_validation, attention_masks_x_validation = (
    encoded_x_validation[key] for key in ('input_ids', 'attention_mask'))

# Turn the label Series of both splits into torch tensors
y_train_vals, y_validation_vals = (
    torch.tensor(split.values) for split in (y_train, y_validation))

# Bundle (input_ids, attention_mask, label) per example for each split
dataset_train = TensorDataset(
    input_ids_x_train, attention_masks_x_train, y_train_vals)
dataset_validation = TensorDataset(
    input_ids_x_validation, attention_masks_x_validation, y_validation_vals)

In [None]:
# Sequence classifier with a 3-label head on top of the pretrained checkpoint;
# attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "readerbench/RoBERT-base",
    num_labels=3,
    output_hidden_states=False,
    output_attentions=False,
)
# Alternative checkpoints (swap in together with the matching tokenizer line):
# model = BertForSequenceClassification.from_pretrained("readerbench/RoBERT-small", num_labels=3, output_attentions=False, output_hidden_states=False)
# model = BertForSequenceClassification.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1", num_labels=3, output_attentions=False, output_hidden_states=False)
# model = BertForSequenceClassification.from_pretrained("snisioi/bert-legal-romanian-cased-v1", num_labels=3, output_attentions=False, output_hidden_states=False)

In [None]:
# Add balanced weights to make up for the difference in items per class.
# `classes` is documented as an ndarray; newer scikit-learn releases validate
# the parameter type and reject a plain Python list, so pass an array.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1, 2]),
    y=df['label'].to_numpy())

# Modify class_weights: keep a per-class mapping so single classes can be tuned
class_weights_dict = {i: w for i, w in enumerate(class_weights)}
# Increase weight for class 1 by 20% on top of its balanced weight
class_weights_dict[1] *= 1.2
# Update weights (ordered by class id so index i is the weight of class i)
adjusted_class_weights = np.array([class_weights_dict[0], class_weights_dict[1], class_weights_dict[2]])

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The loss weights must live on the same device as the logits
weight_tensor = torch.tensor(adjusted_class_weights, dtype=torch.float).to(device)
print("Class weights: ", weight_tensor)

# Define loss function with the modified weights
loss_function = CrossEntropyLoss(weight=weight_tensor)

# Set batch size and number of epochs
batch_size = 16
epochs = 1

# Define seed values
seed_value = 123

# Training batches are drawn in random order; validation keeps dataset order
dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
dataloader_validation = DataLoader(dataset_validation, sampler=SequentialSampler(dataset_validation), batch_size=batch_size)

# Optimizer and a linear decay schedule (no warm-up) over all optimisation steps.
# NOTE(review): lr=1e-7 is far below the learning rates normally used to
# fine-tune BERT-style models -- confirm this value is intentional.
optimizer = AdamW(model.parameters(), lr=1e-7, eps=1e-8)
train_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=train_steps)

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def calculate_metrics(predictions, labels):
    """Print accuracy, confusion matrix and classification report, then plot the matrix.

    Args:
        predictions: 2-D array of per-class scores; the predicted class of a
            row is the argmax along axis 1.
        labels: array of true class ids.

    Returns:
        None. Results are printed and the heatmap is shown with matplotlib.
    """
    predicted_classes = np.argmax(predictions, axis=1).flatten()
    true_classes = labels.flatten()

    # One confusion matrix serves both the text output and the heatmap
    matrix = np.array(metrics.confusion_matrix(true_classes, predicted_classes))

    print("Accuracy: ", metrics.accuracy_score(true_classes, predicted_classes))
    print("Confusion Matrix:", matrix)
    print("Classification Report: ", classification_report(true_classes, predicted_classes))

    # Tick labels for the three classes
    tick_labels = [str(class_id) for class_id in range(3)]

    plt.figure(figsize=(6, 5))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="RdYlBu",
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        cbar=True,
        annot_kws={"size": 15},
    )

    # Axis labels and title
    plt.xlabel("Predicted", fontsize=12)
    plt.ylabel("True", fontsize=12)
    plt.title("Confusion Matrix", fontsize=14)

    plt.tight_layout()
    plt.show()

In [None]:
def evaluate(data):
    """Run the global ``model`` over ``data`` without gradient tracking.

    The loss is computed with the notebook's class-weighted ``loss_function``,
    the same criterion ``train`` optimises, so the training and validation
    losses reported side by side are directly comparable. (The model's
    built-in loss is an unweighted cross-entropy.)

    Relies on the notebook globals ``model``, ``device`` and ``loss_function``.

    Args:
        data: iterable of batches, each a sequence of three tensors in the
            order (input_ids, attention_mask, labels).

    Returns:
        Tuple ``(average_loss, predictions, ground_truth)`` where
        ``average_loss`` is the mean of the per-batch losses, ``predictions``
        is the numpy array of logits for all examples and ``ground_truth`` is
        the numpy array of their labels.

    Raises:
        ValueError: if ``data`` yields no batches.
    """
    model.eval()

    # Per-batch logits and labels, concatenated at the end
    predictions, ground_truth = [], []
    total_loss = 0.0

    with torch.no_grad():
        for batch in data:
            input_ids, attention_mask, labels = (b.to(device) for b in batch)

            # Labels are not passed to the model: the loss comes from
            # loss_function below, not from the model's internal criterion
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            total_loss += loss_function(logits, labels).item()

            predictions.append(logits.cpu().numpy())
            ground_truth.append(labels.cpu().numpy())

    if not predictions:
        raise ValueError("evaluate() received no batches to evaluate")

    # Average over the number of batches processed
    average_loss = total_loss / len(predictions)

    predictions = np.concatenate(predictions, axis=0)
    ground_truth = np.concatenate(ground_truth, axis=0)

    return average_loss, predictions, ground_truth

In [None]:
def train(data, best_loss):
    """Fine-tune the global ``model`` on ``data`` with per-epoch validation.

    After every epoch the model weights are saved under ``models/``, the
    validation set is evaluated, and training stops early once the validation
    loss has failed to improve for ``no_epochs`` consecutive epochs.

    Relies on the notebook globals ``model``, ``optimizer``, ``scheduler``,
    ``loss_function``, ``device``, ``epochs``, ``no_epochs`` and
    ``dataloader_validation``.

    Args:
        data: iterable of batches, each a sequence of three tensors in the
            order (input_ids, attention_mask, labels).
        best_loss: lowest validation loss seen so far (``np.inf`` for a fresh
            run).

    Returns:
        The best validation loss observed, so a later call can resume from it.
    """
    # Epochs since the validation loss last improved. It must be initialised
    # here: the assignments below make the name local to this function, so a
    # first epoch without improvement would otherwise hit an unbound variable.
    counter = 0

    # torch.save does not create missing directories
    Path('models').mkdir(parents=True, exist_ok=True)

    for epoch in tqdm(range(1, epochs+1)):

        model.train()

        # Define total train loss
        total_loss = 0

        progress_bar = tqdm(data, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()

            input_ids, attention_mask, labels = (b.to(device) for b in batch)

            # Labels are not passed to the model: only the logits are needed,
            # the loss comes from the class-weighted loss_function
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # Calculate loss and update the total train loss
            loss = loss_function(logits, labels)
            total_loss += loss.item()
            # Back propagate the loss
            loss.backward()

            # Cap the gradient norm at 1.0 before the update
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Show the batch loss as is; `batch` is the 3-tuple of tensors, so
            # dividing by len(batch) would just scale the value by 1/3
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

        # Save the current Model
        torch.save(model.state_dict(), f'models/ROBERT_base_{epoch}.model')

        tqdm.write(f'\nEpoch: {epoch}')

        # Calculate Average training Loss
        average_loss = total_loss/len(data)

        # Evaluate the Validation Data
        validation_loss, predictions, ground_truth = evaluate(dataloader_validation)

        # Calculate the F1-Score
        predictions_flatten = np.argmax(predictions, axis=1).flatten()
        gt_flatten = ground_truth.flatten()
        validation_f1_score = f1_score(gt_flatten, predictions_flatten, average='weighted')

        # Write the Losses for this epoch
        tqdm.write(f'Train Loss: {average_loss}, Validation Loss: {validation_loss}, Validation F1-Score: {validation_f1_score}')

        # Check for Early Stopping
        if validation_loss < best_loss:
            best_loss = validation_loss
            counter = 0
        else:
            counter += 1

        if counter >= no_epochs:
            print("Early stopping.")
            break

    return best_loss

In [None]:
# Early-stopping state: patience of 5 epochs, no best loss seen yet
no_epochs, best_loss, counter = 5, np.inf, 0

# Directory that receives one checkpoint per epoch
os.makedirs('/kaggle/working/models/', exist_ok=True)

# Move the model to the selected device (GPU when available)
model.to(device)

# Seed every random number generator used during training
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)

In [None]:
# Run the fine-tuning loop on the training batches
train(data=dataloader_train, best_loss=best_loss)

In [None]:
# Score the validation set once more (the loss is not needed here), then print
# accuracy, confusion matrix and classification report and plot the matrix
_, predictions, labels = evaluate(data=dataloader_validation)
calculate_metrics(predictions=predictions, labels=labels)