In [None]:
!pip install emoji
!pip install stop-words
!pip install transformers datasets imblearn seaborn

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback,AutoModel
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import re
import emoji
import string
from nltk.corpus import stopwords
from stop_words import get_stop_words
import nltk
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import random
from imblearn.over_sampling import RandomOverSampler

# Fetch the NLTK resources named 'punkt' and 'stopwords'.
# NOTE(review): nothing in the visible cells uses them — confirm they are still needed.
nltk.download('punkt')
nltk.download('stopwords')
# Register tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()

In [None]:
# Load the full dataset; the later cells read its 'text' and 'label' columns.
df = pd.read_csv('/kaggle/input/final-dataset/final_full_data_main.csv')
# Bare expression: shows the frame as the cell output.
df

In [None]:
# Class balance of the whole dataset, keyed by label value.
label_counts = df['label'].value_counts()
print(
    "Count of tweets with label 1.0 (sarcastic):",
    label_counts[1.0],
)
print(
    "Count of tweets with label 0.0 (non-sarcastic):",
    label_counts[0.0],
)

In [None]:
# One seed shared by every random number generator used in this notebook.
seed = 42

# Seed Python's, NumPy's and PyTorch's generators (the calls are independent).
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Hold out 20% of the rows as the test set, stratified on the label.
X_trainf, X_test, y_trainf, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=seed,
)

# Carve 10% of the remaining rows out as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainf,
    y_trainf,
    test_size=0.1,
    stratify=y_trainf,
    random_state=seed,
)

# Cast every text split to plain strings.
X_train, X_test, X_valid = (
    texts.astype(str) for texts in (X_train, X_test, X_valid)
)

# Give each label series a fresh 0..n-1 index.
y_train, y_test, y_valid = (
    pd.Series(labels).reset_index(drop=True)
    for labels in (y_train, y_test, y_valid)
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer belonging to the IndicBERT checkpoint used for the classifier.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

In [None]:
class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of input strings (list, tuple, pandas Series, ...).
        labels: Sequence of class labels, aligned position-by-position with ``texts``.
        tokenizer: Callable used as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors carrying a leading
            batch dimension of size 1.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=150):
        # Materialize as lists so that ``self.texts[idx]`` is always positional.
        # A pandas Series with a shuffled index would otherwise be looked up by
        # index label and return the wrong row (or raise KeyError).
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` with the batch dimension
            removed, plus ``labels`` as a ``torch.long`` scalar tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would also
            # collapse the sequence axis whenever max_length == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Balance the training dataset using Oversampling


In [None]:
# Balance the training split by random oversampling.
ros = RandomOverSampler(random_state=seed)
# The text Series is wrapped in a one-column frame for fit_resample.
X_train_resampled, y_train_resampled = ros.fit_resample(
    X_train.to_frame(),
    y_train,
)
# Unwrap the one-column frame back into the Series of texts.
X_train_resampled = X_train_resampled['text']

# Class counts after resampling.
label_counts12 = y_train_resampled.value_counts()
print(
    "Count of tweets with label 1.0 (sarcastic):",
    label_counts12[1.0],
)
print(
    "Count of tweets with label 0.0 (non-sarcastic):",
    label_counts12[0.0],
)

In [None]:
# Wrap each split (texts, labels) in a SarcasmDataset that shares the tokenizer.
# Order of the pairs: resampled training split, validation split, test split.
train_dataset, valid_dataset, test_dataset = (
    SarcasmDataset(
        texts=split_texts.tolist(),
        labels=split_labels.tolist(),
        tokenizer=tokenizer,
    )
    for split_texts, split_labels in (
        (X_train_resampled, y_train_resampled),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

# Initialize model for classification


In [None]:
# Load the IndicBERT checkpoint with a 2-class classification head and move it to `device`.
model = AutoModelForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=2).to(device)

In [None]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer.
# NOTE(review): `evaluation_strategy` has been renamed in newer transformers
# releases — confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; keep at most two checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=20,
    learning_rate=1e-6,
    weight_decay=0.1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Restore the checkpoint selected by evaluation loss when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    seed=seed,
)

def compute_metrics(p):
    """Compute accuracy for one evaluation/prediction pass of the ``Trainer``.

    Args:
        p: Object exposing ``predictions`` (logits with one row per sample, or a
            tuple whose first element is those logits) and ``label_ids``.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the share of samples whose
        arg-max class equals the reference label.
    """
    # Some models return extra outputs next to the logits; the logits come first.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(logits, axis=1)
    # The Trainer calls this on the evaluation / prediction set, never on the
    # training set, so the metric is named plainly instead of "Train accuracy".
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [None]:
# Assemble the Trainer: model + arguments, the train/validation datasets, the
# metric function, and early stopping with a patience of 3.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=valid_dataset,
    train_dataset=train_dataset,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

# Training the model


In [None]:
# Run fine-tuning and keep whatever `trainer.train()` returns for later inspection.
train_results = trainer.train()

# Extracting training and validation losses by epoch


In [None]:
log_history = trainer.state.log_history

# epoch number (1-based) -> loss
epoch_train_losses = {}
epoch_eval_losses = {}

for entry in log_history:
    if 'epoch' not in entry:
        continue
    raw_epoch = entry['epoch']
    # Step-level training logs carry a fractional epoch (e.g. 0.37 while the
    # first epoch is running). Round UP so they are filed under the epoch in
    # progress; truncating would file them one epoch too early and pair every
    # validation loss with the training loss of the following epoch.
    # The last log encountered within an epoch wins.
    if 'loss' in entry:
        epoch = max(1, int(np.ceil(raw_epoch - 1e-9)))
        epoch_train_losses[epoch] = entry['loss']
    # Evaluation runs at the end of an epoch, so its epoch value is (numerically
    # close to) a whole number.
    if 'eval_loss' in entry:
        epoch = int(round(raw_epoch))
        epoch_eval_losses[epoch] = entry['eval_loss']

# Only consider epochs that have both training and eval losses
common_epochs = sorted(set(epoch_train_losses.keys()) & set(epoch_eval_losses.keys()))
final_train_losses = [epoch_train_losses[e] for e in common_epochs]
final_eval_losses = [epoch_eval_losses[e] for e in common_epochs]

# Report the real epoch numbers rather than a fresh 1..n enumeration.
for epoch, train_loss, eval_loss in zip(common_epochs, final_train_losses, final_eval_losses):
    print(f"Epoch {epoch}: Training Loss = {train_loss:.4f}, Validation Loss = {eval_loss:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(common_epochs, final_train_losses, label="Training Loss", marker="o")
plt.plot(common_epochs, final_eval_losses, label="Validation Loss", marker="o")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss per Epoch")
plt.legend()
plt.grid()
plt.show()

# Final evaluation on the test set


In [None]:
# Predict on the held-out test set and take the arg-max class per sample.
test_predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(test_predictions.predictions, axis=1)

# Overall accuracy against the reference labels.
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Test Accuracy: {accuracy:.4f}")

# Confusion matrix, shown as each cell's share of all test samples.
cf_matrix = confusion_matrix(y_test, predicted_labels)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cf_matrix / cf_matrix.sum(),
    annot=True,
    fmt='.2%',
    cmap='Reds',
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1.
report = classification_report(y_test, predicted_labels)
print("Classification Report:\n", report)