In [None]:
!pip install emoji
!pip install stop-words

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import re
import emoji
import string
from nltk.corpus import stopwords
from stop_words import get_stop_words
import nltk

# Download the NLTK tokenizer data and stop-word corpus, then let tqdm register
# its progress-bar variants (`progress_apply`, ...) on pandas objects.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
tqdm.pandas()

In [None]:
# Bengali corpus: read the CSV, keep the first row for every distinct `text`
# value and renumber the rows from 0.
dfb = (
    pd.read_csv('bengali_dataset.csv')
    .drop_duplicates(subset='text')
    .reset_index(drop=True)
)
dfb

In [None]:
# Hindi corpus: read the CSV, keep the first row for every distinct `text`
# value and renumber the rows from 0.
dfh = (
    pd.read_csv('hindi_dataset.csv')
    .drop_duplicates(subset='text')
    .reset_index(drop=True)
)
dfh

In [None]:
# Inline list of Bengali stop words. It is merged with the English and Hindi
# lists into `stopwords_set` at the end of this cell.
# NOTE(review): several entries appear to be listed more than once; that is
# harmless once the list is folded into a set. A few entries look like
# concatenation/typing artifacts — TODO verify against the list's origin.
bangla_stopwords=["অতএব","অথচ","অথবা","অনুযায়ী","অনেক","অনেকে","অনেকেই","অন্তত","অন্য","অবধি","অবশ্য","অর্থাত","আই","আগামী","আগে",
                  "আগেই","আছে","আজ","আদ্যভাগে","আপনার","আপনি","আবার","আমরা","আমাকে","আমাদের","আমার","আমি","আর","আরও",
                  "ই","ইত্যাদি","ইহা","উচিত","উত্তর","উনি","উপর","উপরে","এ","এঁদের","এঁরা","এই","একই","একটি","একবার","একে","এক্","এখন",
                  "এখনও","এখানে","এখানেই","এটা","এটাই","এটি","এত","এতটাই","এতে","এদের","এব","এবং","এবার","এমন","এমনকী","এমনি","এর",
                  "এরা","এল","এস","এসে","ঐ","ও","ওঁদের","ওঁর","ওঁরা","ওই","ওকে","ওখানে","ওদের","ওর","ওরা","কখনও","কত","কবে","কমনে",
                  "কয়েক","কয়েকটি","করছে","করছেন","করতে","করবে","করবেন","করলে","করলেন","করা","করাই","করায়","করার","করি","করিতে","করিয়া",
                  "করিয়ে","করে","করেই","করেছিলেন","করেছে","করেছেন","করেন","কাউকে","কাছ","কাছে","কাজ","কাজে","কারও","কারণ","কি","কিংবা","কিছু",
                  "কিছুই","কিন্তু","কী","কে","কেউ","কেউই","কেখা","কেন","কোটি","কোন","কোনও","কোনো","ক্ষেত্রে","কয়েক","খুব","গিয়ে","গিয়েছে","গিয়ে",
                  "গুলি","গেছে","গেল","গেলে","গোটা","চলে","চান","চায়","চার","চালু","চেয়ে","চেষ্টা","ছাড়া","ছাড়াও","ছিল","ছিলেন","জন","জনকে","জনের",
                  "জন্য","জন্যওজে","জানতে","জানা","জানানো","জানায়","জানিয়ে","জানিয়েছে","জে","জ্নজন","টি","ঠিক","তখন","তত","তথা","তবু","তবে",
                  "তা","তাঁকে","তাঁদের","তাঁর","তাঁরা","তাঁাহারা","তাই","তাও","তাকে","তাতে","তাদের","তার","তারপর","তারা","তারৈ","তাহলে","তাহা","তাহাতে",
                  "তাহার","তিনঐ","তিনি","তিনিও","তুমি","তুলে","তেমন","তো","তোমার","থাকবে","থাকবেন","থাকা","থাকায়","থাকে","থাকেন","থেকে","থেকেই",
                  "থেকেও","দিকে","দিতে","দিন","দিয়ে","দিয়েছে","দিয়েছেন","দিলেন","দু","দুই","দুটি","দুটো","দেওয়া","দেওয়ার","দেওয়া","দেখতে","দেখা","দেখে",
                  "দেন","দেয়","দ্বারা","ধরা","ধরে","ধামার","নতুন","নয়","না","নাই","নাকি","নাগাদ","নানা","নিজে","নিজেই","নিজেদের","নিজের","নিতে","নিয়ে",
                  "নিয়ে","নেই","নেওয়া","নেওয়ার","নেওয়া","নয়","পক্ষে","পর","পরে","পরেই","পরেও","পর্যন্ত","পাওয়া","পাচ","পারি","পারে","পারেন","পি","পেয়ে",
                  "পেয়্র্","প্রতি","প্রথম","প্রভৃতি","প্রযন্ত","প্রাথমিক","প্রায়","প্রায়","ফলে","ফিরে","ফের","বক্তব্য","বদলে","বন","বরং","বলতে","বলল","বললেন","বলা",
                  "বলে","বলেছেন","বলেন","বসে","বহু","বা","বাদে","বার","বি","বিনা","বিভিন্ন","বিশেষ","বিষয়টি","বেশ","বেশি","ব্যবহার","ব্যাপারে","ভাবে","ভাবেই",
                  "মতো","মতোই","মধ্যভাগে","মধ্যে","মধ্যেই","মধ্যেও","মনে","মাত্র","মাধ্যমে","মোট","মোটেই","যখন","যত","যতটা","যথেষ্ট","যদি","যদিও","যা","যাঁর",
                  "যাঁরা","যাওয়া","যাওয়ার","যাওয়া","যাকে","যাচ্ছে","যাতে","যাদের","যান","যাবে","যায়","যার","যারা","যিনি","যে","যেখানে","যেতে","যেন","যেমন","র",
                  "রকম","রয়েছে","রাখা","রেখে","লক্ষ","শুধু","শুরু","সঙ্গে","সঙ্গেও","সব","সবার","সমস্ত","সম্প্রতি","সহ","সহিত","সাধারণ","সামনে","সি","সুতরাং","সে",
                  "সেই","সেখান","সেখানে","সেটা","সেটাই","সেটাও","সেটি","স্পষ্ট","স্বয়ং","হইতে","হইবে","হইয়া","হওয়া","হওয়ায়","হওয়ার","হচ্ছে","হত","হতে","হতেই",
                  "হন","হবে","হবেন","হয়","হয়তো","হয়নি","হয়ে","হয়েই","হয়েছিল","হয়েছে","হয়েছেন","হল","হলে","হলেই","হলেও","হলো","হাজার","হিসাবে","হৈলে",
                  "হোক","হয়"]

# English stop words come from NLTK, Hindi ones from the `stop_words` package.
english_stopwords = stopwords.words('english')
hindi_stop_words = get_stop_words('hi')

# One combined lookup set covering all three languages.
stopwords_set = {*english_stopwords, *hindi_stop_words, *bangla_stopwords}

In [None]:
# Pre-compiled patterns used by `process_text`.
url_pattern = re.compile(r'((www\.[\S]+)|(https?://[\S]+))')  # www.* and http(s)://* links
mention_pattern = re.compile(r'@\w+')                          # @user mentions
space_pattern = re.compile(r'\s+')                             # any run of whitespace
hashtag_pattern = re.compile(r'#(\w+)')                        # #tag, capturing the tag text

def process_text(text):
    """Normalise one tweet before it is handed to the model tokenizer.

    Steps: lowercase, drop URLs and @mentions, keep hashtag text without the
    leading '#', replace emoji with their textual names via `emoji.demojize`,
    split on whitespace, strip punctuation from both ends of every token, and
    drop empty tokens and tokens found in `stopwords_set`.

    The earlier version iterated over the *characters* of the string, so it
    returned space-separated characters and never matched multi-character
    stop words; this version works on whitespace-delimited words.

    Args:
        text: Raw tweet. Non-string values (e.g. NaN from a CSV) are treated
            as empty text.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = url_pattern.sub('', text)
    text = mention_pattern.sub('', text)
    text = hashtag_pattern.sub(r'\1', text)
    text = emoji.demojize(text)
    # Splitting on whitespace runs also takes care of collapsing repeated spaces.
    tokens = (token.strip(string.punctuation) for token in space_pattern.split(text))
    return ' '.join(token for token in tokens if token and token not in stopwords_set)

In [None]:
# Replace the raw Bengali texts with their cleaned versions, showing a progress
# bar; `tqdm.pandas()` registers `progress_apply` on pandas objects.
tqdm.pandas()
dfb = dfb.assign(text=dfb['text'].progress_apply(process_text))

In [None]:
# Replace the raw Hindi texts with their cleaned versions, showing a progress
# bar; `tqdm.pandas()` registers `progress_apply` on pandas objects.
tqdm.pandas()
dfh = dfh.assign(text=dfh['text'].progress_apply(process_text))

In [None]:
# Stack the Bengali and Hindi frames into one corpus. `ignore_index=True`
# renumbers the rows 0..n-1; without it both inputs contribute their own
# 0-based labels and the combined index contains duplicates.
df = pd.concat([dfb, dfh], axis=0, ignore_index=True)

In [None]:
# Class balance of the combined corpus: number of rows per label value.
label_counts = df['label'].value_counts()

sarcastic_total = label_counts[1.0]
print("Count of tweets with label 1.0 (sarcastic):", sarcastic_total)

non_sarcastic_total = label_counts[0.0]
print("Count of tweets with label 0.0 (non-sarcastic):", non_sarcastic_total)

In [None]:
# Hold out 30% of the corpus as the test set, then take 20% of the remainder as
# the validation set. Both splits are stratified on the label and seeded with 42.
all_texts = df['text'].tolist()
all_labels = df['label']
x_train_full, x_test, y_train_full, y_test = train_test_split(
    all_texts, all_labels, test_size=0.3, random_state=42, stratify=all_labels
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full
)

# Renumber each label Series from 0 so that position i in a text list and
# label i in the matching Series refer to the same example.
y_train, y_valid, y_test = (
    pd.Series(part).reset_index(drop=True) for part in (y_train, y_valid, y_test)
)

# Load IndicBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")

# Define Dataset class for BERT
class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup for sequence classification.

    Args:
        texts: Sequence of strings, one per example.
        labels: Sequence of class labels aligned position-by-position with
            ``texts``. It is copied into a plain list so that lookups are
            positional even when a pandas Series with a non-default index is
            passed in.
        tokenizer: Optional callable following the Hugging Face tokenizer call
            signature. When omitted, the notebook-level ``tokenizer`` is looked
            up each time an item is requested.
        max_length: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=150):
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors of
            length ``max_length`` and a scalar ``labels`` tensor of dtype long.
        """
        # Fall back to the module-level tokenizer when none was injected.
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        encoding = tokenize(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # The tokenizer returns a batch of one; drop only that batch axis.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }

# Wrap each (texts, labels) pair of the three splits in a SarcasmDataset.
train_dataset, valid_dataset, test_dataset = (
    SarcasmDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (x_train, y_train),
        (x_valid, y_valid),
        (x_test, y_test),
    )
)

In [None]:
# Load the IndicBERT checkpoint with a sequence-classification head for two labels
model = AutoModelForSequenceClassification.from_pretrained(
    "ai4bharat/indic-bert",
    num_labels=2,
)

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and logging
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,  # Log every 10 steps
    save_total_limit=2,
    # Optimisation
    learning_rate=1e-5,
    weight_decay=0.1,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch (the two strategies must match),
    # then reload the checkpoint with the best validation loss when training ends
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Stop training if validation loss does not improve for 2 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping],
)

In [None]:
# Train model
train_results = trainer.train()

# Store training and validation losses for plotting, together with the global
# step each value was logged at. Training loss is logged every few steps and
# validation loss once per epoch, so plotting them against list positions
# would put the two curves on unrelated x scales.
losses = trainer.state.log_history
train_steps = [entry['step'] for entry in losses if 'loss' in entry]
train_losses = [entry['loss'] for entry in losses if 'loss' in entry]
eval_steps = [entry['step'] for entry in losses if 'eval_loss' in entry]
eval_losses = [entry['eval_loss'] for entry in losses if 'eval_loss' in entry]

# Predictions and performance metrics on the held-out test split. The
# predictions must come from `test_dataset` because every metric below is
# computed against `y_test`.
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
accuracy = np.mean(predicted_labels == y_test.to_numpy())
print(f"Accuracy Score: {accuracy}")

# Plot training and validation loss against the training step
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.plot(train_steps, train_losses, label='Training Loss', color='blue')
ax1.set_xlabel('Steps')
ax1.set_ylabel('Training Loss', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.grid()

# Create a second y-axis for validation loss (shares the step axis)
ax2 = ax1.twinx()
ax2.plot(eval_steps, eval_losses, label='Validation Loss', color='orange', marker='o')
ax2.set_ylabel('Validation Loss', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')

ax1.set_title('Training and Validation Loss')
plt.show()

# Confusion Matrix
cf_matrix = confusion_matrix(y_test, predicted_labels)

# Plot confusion matrix (each cell as a share of all test examples)
plt.figure(figsize=(8, 6))
sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt='.2%', cmap='Reds')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Classification report
print("Classification Report:\n", classification_report(y_test, predicted_labels))