In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

In [70]:
# Bring in the Hugging Face BERT model and tokenizer classes used by the cells below.
from transformers import BertModel, BertTokenizer
# force_download=True requests a fresh fetch of the files even when a cached copy
# exists; the progress bars recorded below show model.safetensors at ~440 MB.
# NOTE(review): presumably added to repair a broken local cache — confirm, and
# consider dropping it so a "Restart & Run All" does not re-download every time.
BertModel.from_pretrained('bert-base-uncased', force_download = True)
# Last expression of the cell, so the tokenizer's repr is what the notebook displays.
BertTokenizer.from_pretrained('bert-base-uncased', force_download = True)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [74]:
# Fetch the NLTK resources used by TextPreprocessor: the tokenizer data behind
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): no POS-tagging call is visible in this notebook — confirm this
# package is still needed. As the last expression, its return value is the cell output.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\AJIT ASHWATH
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\AJIT ASHWATH
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\AJIT ASHWATH
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\AJIT ASHWATH R\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [76]:
class TextPreprocessor:
    """Normalise raw text into a space-separated string of lemmatised tokens.

    The pipeline is: lower-case, strip every character that is not an ASCII
    letter or whitespace, tokenise, drop English stop words, lemmatise.
    """

    def __init__(self):
        """Create the WordNet lemmatizer and load the English stop-word set."""
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess(self, text):
        """Return the cleaned form of ``text``.

        Args:
            text: The raw text. ``None`` is treated as empty text; any other
                non-string value is converted with ``str()`` first.

        Returns:
            A single string containing the remaining lemmatised tokens
            separated by one space each (``''`` when nothing remains).
        """
        if text is None:
            return ''
        text = str(text).lower()
        # Digits and punctuation are removed outright; whitespace is kept so
        # that word boundaries survive for the tokenizer.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words
        ]
        # Join with a space: joining with '' would fuse every token into one
        # long pseudo-word and erase the word boundaries.
        return ' '.join(tokens)

In [78]:
class SarcasmDataset(Dataset):
    """Torch dataset that tokenises texts on access and yields model-ready tensors.

    Args:
        texts: Indexable collection of texts; each item is passed through ``str()``.
        labels: Indexable collection of labels aligned with ``texts``. A label
            counts as the positive class (1) when it equals the string
            ``"Sarcasm"`` or the number/boolean 1; anything else maps to 0.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``
            and ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length = 128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise example ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened
            to 1-D) and ``'label'`` (a scalar ``torch.long`` tensor, 0 or 1).
        """
        text = str(self.texts[idx])
        raw_label = self.labels[idx]
        # Accept the original "Sarcasm" string as well as numeric/boolean 1,
        # so integer-coded label columns are not silently mapped to all zeros.
        label = 1 if (raw_label == "Sarcasm" or raw_label == 1) else 0
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens = True,
            # Without an explicit max_length, padding="max_length" falls back
            # to the tokenizer's own maximum and the constructor argument
            # would have no effect.
            max_length = self.max_length,
            padding = "max_length",
            truncation = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        return {
            # return_tensors='pt' is requested above; flatten() drops the
            # leading dimension of each returned tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype = torch.long)
        }

In [80]:
class BERTSarcasmClassifier(nn.Module):
    """BERT encoder followed by dropout and a two-way linear head.

    Args:
        dropout: Drop probability applied to the pooled encoder output.
    """

    def __init__(self, dropout=0.3):
        super().__init__()
        try:
            encoder = BertModel.from_pretrained('bert-base-uncased')
        except Exception as load_error:
            # First attempt failed: report it and retry using only local files.
            print(f"Error loading BERT model: {load_error}")
            print("Attempting to load with offline mode...")
            encoder = BertModel.from_pretrained(
                'bert-base-uncased',
                local_files_only=True,
            )
        self.bert = encoder
        self.dropout = nn.Dropout(dropout)
        # 768 input features feeding 2 output logits.
        self.linear = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.linear(self.dropout(encoded.pooler_output))

In [87]:
class SarcasmDetector:
    """Owns the preprocessor, tokenizer and BERT classifier used for sarcasm detection.

    Attributes:
        preprocessor: ``TextPreprocessor`` instance.
        device: ``cuda`` when available, otherwise ``cpu``.
        tokenizer: ``bert-base-uncased`` tokenizer.
        model: The BERT classifier; ``None`` until ``train_bert_model``
            succeeds, and reset to ``None`` if training fails.
        traditional_model: Initialised to ``None``; nothing in this class
            assigns it.

    NOTE(review): the notebook's ``main()`` also calls ``prepare_data``,
    ``train_traditional_model`` and ``predict`` on this class; none of them is
    defined here — confirm whether they live in another cell.
    """

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        try:
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        except Exception as e:
            # First attempt failed: report it and retry using only local files.
            print(f"Error loading tokenizer: {e}")
            print("Attempting to load with offline mode...")
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only = True)
        self.model = None
        self.traditional_model = None

    def evaluate_bert(self, data_loader):
        """Measure the accuracy of ``self.model`` on ``data_loader``.

        Args:
            data_loader: Iterable of batches; each batch is a mapping with
                ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.

        Returns:
            Fraction of correctly classified examples in ``[0, 1]``
            (``0.0`` when the loader yields no examples).

        Raises:
            RuntimeError: If no BERT model is available.
        """
        if self.model is None:
            raise RuntimeError("BERT model is not available; call train_bert_model first.")
        # eval() disables dropout; train_bert_model switches back to train()
        # at the start of every epoch.
        self.model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)
                predicted = self.model(input_ids, attention_mask).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        return correct / total if total else 0.0

    def train_bert_model(self, train_loader, val_loader, epochs = 3):
        """Fine-tune a fresh ``BERTSarcasmClassifier`` and report progress.

        Args:
            train_loader: Batches used for optimisation (same batch layout as
                described in ``evaluate_bert``).
            val_loader: Batches used for the per-epoch validation accuracy.
            epochs: Number of passes over ``train_loader``.

        Any exception raised during training is reported on stdout and
        ``self.model`` is reset to ``None``.
        """
        try:
            self.model = BERTSarcasmClassifier().to(self.device)
            optimizer = torch.optim.AdamW(self.model.parameters(), lr = 2e-5)
            criterion = nn.CrossEntropyLoss()
            for epoch in range(epochs):
                self.model.train()
                total_loss = 0
                correct = 0
                total = 0
                print(f"\nEpoch {epoch + 1} / {epochs}")
                print("Training progress:")
                for batch_idx, batch in enumerate(train_loader):
                    optimizer.zero_grad()
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['label'].to(self.device)
                    outputs = self.model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                    predicted = outputs.argmax(dim=1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
                    if (batch_idx + 1) % 10 == 0:
                        print(f"Batch [{batch_idx + 1} / {len(train_loader)}] "
                              f"Loss: {loss.item():.4f} "
                              f"Accuracy: {100 * correct/total:.2f}%")

                val_accuracy = self.evaluate_bert(val_loader)
                # Guard the summaries so an empty loader does not raise
                # ZeroDivisionError and discard the model via the handler below.
                average_loss = total_loss / max(len(train_loader), 1)
                train_accuracy = 100 * correct / total if total else 0.0
                print(f'Epoch {epoch + 1} Summary:')
                print(f'Average Loss: {average_loss:.4f}')
                print(f'Training Accuracy: {train_accuracy:.2f}%')
                print(f'Validation Accuracy: {100 * val_accuracy:.2f}%')

        except Exception as e:
            # Best-effort: report the failure and leave self.model as None.
            print(f"Error during BERT training: {e}")
            print("Falling back to traditional model only...")
            self.model = None

In [84]:
def main(data_path="C:\\Users\\AJIT ASHWATH R\\Downloads\\Sarcasm.json"):
    """Run the full pipeline: load data, train both models, print sample predictions.

    Args:
        data_path: Location of the dataset file handed to
            ``SarcasmDetector.prepare_data``. The default is the original
            machine-specific absolute path, kept only for backward
            compatibility; pass a path valid on your machine.

    Every exception is caught and reported on stdout; nothing is re-raised and
    the function returns ``None``.

    NOTE(review): the ``SarcasmDetector`` class visible in this notebook does
    not define ``prepare_data``, ``train_traditional_model`` or ``predict``;
    the recorded output of this cell shows the resulting AttributeError being
    reported by the handler below. Confirm where those methods are defined.
    """
    try:
        detector = SarcasmDetector()
        print("Loading and preparing data...")
        data = detector.prepare_data(data_path)
        X_train, X_test, y_train, y_test = train_test_split(
            data['processed_headline'],
            data['is_sarcastic'],
            test_size = 0.2,
            random_state = 42
        )
        print("\nTraining traditional model...")
        detector.train_traditional_model(X_train, y_train)
        print("\nPreparing BERT datasets...")
        train_dataset = SarcasmDataset(X_train.values, y_train.values, detector.tokenizer)
        val_dataset = SarcasmDataset(X_test.values, y_test.values, detector.tokenizer)
        train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
        val_loader = DataLoader(val_dataset, batch_size = 32)
        print("\nTraining BERT model...")
        detector.train_bert_model(train_loader, val_loader)
        test_texts = [
            "Scientists cure cancer with one simple trick",
            "New study shows benefits of exercise",
            "Area man becomes expert in everything after reading single article"
        ]
        print("\nTesting predictions:")
        # train_bert_model leaves detector.model as None when BERT training
        # failed; in that case only the traditional model is used. The choice
        # is the same for every text, so it is made once.
        model_type = 'ensemble' if detector.model is not None else 'traditional'
        for text in test_texts:
            prediction = detector.predict(text, model_type=model_type)
            print(f"\nText: {text}")
            print(f"Prediction: {prediction}")

    except Exception as e:
        print(f"An error occurred: {e}")
        print("Please ensure all required libraries are installed and the data file path is correct.")

# Run the pipeline only when this code executes as the top-level module
# (the output recorded below shows it ran when the cell was executed).
if __name__ == "__main__":
    main()

Using device: cpu
Loading and preparing data...
An error occurred: 'SarcasmDetector' object has no attribute 'prepare_data'
Please ensure all required libraries are installed and the data file path is correct.
