In [None]:
!pip install emoji
!pip install stop-words
!pip install transformers datasets imblearn seaborn

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import re
import emoji
import string
from nltk.corpus import stopwords
from stop_words import get_stop_words
import nltk
from imblearn.over_sampling import SMOTE
from torch.utils.data import Dataset, DataLoader
import seaborn as sns
import random
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from transformers import AutoTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# Download the NLTK 'punkt' and 'stopwords' resources.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Enable the tqdm integration for pandas.
tqdm.pandas()

In [None]:
# Read the full dataset from the Kaggle input directory and display it.
csv_path = '/kaggle/input/datnlp/final_full_data_main.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Report how many rows carry each label value.
label_counts = df['label'].value_counts()
for count_message, label_value in (
    ("Count of tweets with label 1.0 (sarcastic):", 1.0),
    ("Count of tweets with label 0.0 (non-sarcastic):", 0.0),
):
    print(count_message, label_counts[label_value])

### Setup and Configurations

In [None]:
seed = 42

# Seed each random number generator used in this notebook with the same value.
for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(seed)

# Device configuration: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Load and Split Dataset

In [None]:
# First split: hold out 20% of the data as the test set, stratified by label.
all_texts, all_labels = df['text'], df['label']
X_trainf, X_test, y_trainf, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=seed,
    stratify=all_labels,
)

# Second split: carve 10% of the remaining data off as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainf,
    y_trainf,
    test_size=0.1,
    random_state=seed,
    stratify=y_trainf,
)

# Cast every text split to str.
X_train, X_test, X_valid = (
    text_split.astype(str) for text_split in (X_train, X_test, X_valid)
)

# Give every label split a fresh 0..n-1 index.
y_train, y_test, y_valid = (
    pd.Series(label_split).reset_index(drop=True)
    for label_split in (y_train, y_test, y_valid)
)

### Add Language Tokens

In [None]:
# Marker string appended to a text for each detected language.
language_tokens = dict(
    hindi="<2hi>",
    bengali="<2bn>",
    english="<2en>",
)

def text_is_hindi(text):
    """Return True if any character of ``text`` lies in U+0900..U+097F (Devanagari)."""
    for char in text:
        if "\u0900" <= char <= "\u097F":
            return True
    return False

def text_is_bengali(text):
    """Return True if any character of ``text`` lies in U+0980..U+09FF (Bengali)."""
    for char in text:
        if "\u0980" <= char <= "\u09FF":
            return True
    return False

def assign_language_token(text):
    """Append `` </s> `` and a language marker to ``text``.

    The Hindi check runs first, then the Bengali check; a text matching
    neither receives the English marker.
    """
    if text_is_hindi(text):
        language = "hindi"
    elif text_is_bengali(text):
        language = "bengali"
    else:
        language = "english"
    return f"{text} </s> {language_tokens[language]}"

# Apply language tokens to every split.
X_train, X_valid, X_test = (
    text_split.apply(assign_language_token)
    for text_split in (X_train, X_valid, X_test)
)

### Initialize Tokenizer

In [None]:
# Load the tokenizer for the "xlm-roberta-large" checkpoint with the options below.
tokenizer_options = dict(do_lower_case=False, use_fast=False, keep_accents=True)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large", **tokenizer_options)

class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable, sized collection of input strings.
        labels: Indexable, sized collection of class labels aligned with
            ``texts`` (item ``i`` of each belongs together).
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result provides ``"input_ids"`` and ``"attention_mask"``.
        max_length: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        # A length mismatch would otherwise surface much later as an
        # IndexError in the middle of training, or silently drop samples.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the encoded
            tensors without their leading batch axis) and ``"labels"``
            (a ``torch.long`` scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            # squeeze(0) removes only the leading batch axis; a bare squeeze()
            # would also collapse the sequence axis when max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels,
        }

### Handle Class Imbalance Using Oversampling

In [None]:
ros = RandomOverSampler(random_state=seed)

# The training texts are passed to the sampler as a single-column DataFrame.
train_text_frame = X_train.to_frame()
resampled_text_frame, y_train_resampled = ros.fit_resample(train_text_frame, y_train)

# Report the per-label counts after resampling.
label_counts = y_train_resampled.value_counts()
for count_message, label_value in (
    ("Count of tweets with label 1.0 (sarcastic) after sampling:", 1.0),
    ("Count of tweets with label 0.0 (non-sarcastic) after sampling:", 0.0),
):
    print(count_message, label_counts[label_value])

# Keep only the 'text' column as a Series.
X_train_resampled = resampled_text_frame['text']

### Create Datasets

In [None]:
def _make_dataset(text_series, label_series):
    """Wrap a texts/labels pair in a SarcasmDataset using the shared tokenizer."""
    return SarcasmDataset(
        texts=text_series.tolist(),
        labels=label_series.tolist(),
        tokenizer=tokenizer,
    )


# Training uses the oversampled data; validation and test use the original splits.
train_dataset = _make_dataset(X_train_resampled, y_train_resampled)
valid_dataset = _make_dataset(X_valid, y_valid)
test_dataset = _make_dataset(X_test, y_test)

# Load "xlm-roberta-large" for 2-label sequence classification, then move it to `device`.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=2,
)
model = model.to(device)

### Trainer Initialization

In [None]:
# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only accept `evaluation_strategy`. Pick whichever
# the installed TrainingArguments dataclass declares so this cell runs on both.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Evaluate and checkpoint once per epoch; the best checkpoint (by eval_loss)
# is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    seed=seed,
    **{eval_strategy_arg: "epoch"},
)

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class is the argmax of the logits along axis 1.

    Returns:
        dict with a single ``"accuracy"`` entry.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    accuracy_value = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy_value}


# Early stopping with a patience of 8.
early_stopping = EarlyStoppingCallback(early_stopping_patience=8)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning and keep the returned training summary.
train_results = trainer.train()

### Visualize Training and Validation Loss

In [None]:
log_history = trainer.state.log_history

# Training loss is logged every few steps with a fractional epoch value, while
# eval loss is logged at whole epochs. A log taken at epoch e in (k-1, k] was
# produced during the k-th epoch, so entries are bucketed with ceil (truncating
# with int() would file them under k-1 and shift the training curve by one
# epoch relative to the validation curve).
train_losses_by_epoch = {}
epoch_eval_losses = {}

for entry in log_history:
    if 'epoch' not in entry:
        continue
    # Round first so floating-point noise just above a whole epoch does not
    # push the entry into the next bucket.
    epoch = int(np.ceil(round(entry['epoch'], 6)))
    if 'loss' in entry:
        train_losses_by_epoch.setdefault(epoch, []).append(entry['loss'])
    if 'eval_loss' in entry:
        epoch_eval_losses[epoch] = entry['eval_loss']

# One training value per epoch: the mean of the losses logged during it.
epoch_train_losses = {
    epoch: float(np.mean(losses)) for epoch, losses in train_losses_by_epoch.items()
}

# Only epochs that have both a training and a validation value are reported.
common_epochs = sorted(set(epoch_train_losses.keys()) & set(epoch_eval_losses.keys()))
final_train_losses = [epoch_train_losses[e] for e in common_epochs]
final_eval_losses = [epoch_eval_losses[e] for e in common_epochs]

# Print with the real epoch numbers rather than a fresh 1..n enumeration.
for epoch, train_loss, eval_loss in zip(common_epochs, final_train_losses, final_eval_losses):
    print(f"Epoch {epoch}: Training Loss = {train_loss:.4f}, Validation Loss = {eval_loss:.4f}")

# Plot training and validation loss against epoch on a single set of axes.
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_curves = (
    ("Training Loss", final_train_losses),
    ("Validation Loss", final_eval_losses),
)
for curve_label, curve_values in loss_curves:
    loss_ax.plot(common_epochs, curve_values, label=curve_label, marker="o")
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training and Validation Loss per Epoch")
loss_ax.legend()
loss_ax.grid()
plt.show()

# Predict on the test set; the predicted class is the argmax over axis 1.
test_predictions = trainer.predict(test_dataset)
test_logits = test_predictions.predictions
predicted_labels = np.argmax(test_logits, axis=1)

# Accuracy
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion Matrix, with each cell shown as a share of all test samples
cf_matrix = confusion_matrix(y_test, predicted_labels)
cf_shares = cf_matrix / np.sum(cf_matrix)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cf_shares, annot=True, fmt='.2%', cmap="Reds", ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# Classification Report
report_text = classification_report(y_test, predicted_labels)
print("Classification Report:\n", report_text)