In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
import json
import warnings
warnings.filterwarnings('ignore')

In [28]:
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already present are reported as up to date rather than downloaded again
# (see the captured output below), so re-running this cell is cheap.
# 'punkt': tokenizer data — presumably what word_tokenize needs; newer NLTK
# releases may require 'punkt_tab' instead (TODO confirm for your version).
nltk.download('punkt')
# 'stopwords': word lists read through stopwords.words('english').
nltk.download('stopwords')
# 'wordnet': data for WordNetLemmatizer.
nltk.download('wordnet')
# NOTE(review): no cell visible in this notebook calls a POS tagger, so this
# package looks unused — confirm before removing.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\AJIT ASHWATH
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\AJIT ASHWATH
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\AJIT ASHWATH
[nltk_data]     R\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\AJIT ASHWATH R\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
class TextPreprocessor:
    """Turn raw text into a space-separated string of lemmatised tokens."""

    def __init__(self):
        # A set gives constant-time stop-word membership checks.
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess(self, text):
        """Clean a single piece of text.

        Steps, in order: coerce to ``str`` and lower-case, delete every
        character that is neither an ASCII letter nor whitespace, tokenize,
        drop stop words, lemmatise what remains.

        Args:
            text: Any object; it is passed through ``str()`` first, so
                non-string values (e.g. ``None``) are cleaned as their
                string representation.

        Returns:
            The surviving lemmas joined by single spaces ('' if none survive).
        """
        lowered = str(text).lower()
        letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)

        kept = []
        for word in word_tokenize(letters_only):
            if word in self.stop_words:
                continue
            kept.append(self.lemmatizer.lemmatize(word))
        return ' '.join(kept)

In [32]:
class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for BERT.

    Args:
        texts: Sequence of texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer-convertible labels, aligned with `texts`.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments
            used in ``__getitem__``.
        max_length: Every item is padded or truncated to this many tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as plain lists so __getitem__ always indexes by
        # position. A pandas Series would otherwise be indexed by *label*,
        # which raises KeyError (or returns the wrong row) once the index has
        # been shuffled, e.g. by train_test_split.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized item at position `idx`.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer's
            tensors, flattened to 1-D) and 'label' (scalar ``torch.long``).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it
            # away so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [34]:
class BERTSarcasmClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        model_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, dropout=0.3, model_name='bert-base-uncased', num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Take the head's input width from the loaded checkpoint instead of
        # hard-coding 768, so other BERT sizes work too.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token-id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            Output of the linear head applied to the dropped-out
            ``pooler_output`` (one row of ``num_classes`` logits per input).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.linear(self.dropout(pooled_output))

In [36]:
class SarcasmDetector:
    """Train and query two sarcasm classifiers: TF-IDF + random forest, and BERT.

    Attributes:
        preprocessor: TextPreprocessor applied to training data and queries.
        device: torch device; CUDA when available, otherwise CPU.
        tokenizer: 'bert-base-uncased' BertTokenizer.
        model: BERTSarcasmClassifier, or None until train_bert_model() runs.
        traditional_model: sklearn Pipeline, or None until
            train_traditional_model() runs.
    """

    # model_type values accepted by predict().
    _MODEL_TYPES = ('ensemble', 'traditional', 'bert')

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = None
        self.traditional_model = None
        print(f"Using device: {self.device}")

    @staticmethod
    def _read_json_lines(file_path):
        """Parse `file_path` as JSON Lines; return a DataFrame, or None if it is not JSON Lines."""
        records = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    return None
                # A line that parses to a list or scalar (e.g. a compact
                # one-line JSON array) is not a JSON Lines record; treating it
                # as one would build a frame without the expected columns.
                if not isinstance(record, dict):
                    return None
                records.append(record)
        return pd.DataFrame(records)

    def prepare_data(self, file_path):
        """Load headlines from a JSON file and add a 'processed_headline' column.

        The file is first read as JSON Lines (one object per line). If that
        does not fit, it is re-read as a single JSON document with
        ``pd.read_json``.

        Args:
            file_path: Path of the JSON / JSON Lines file.

        Returns:
            DataFrame with the file's columns plus 'processed_headline', or
            None when the fallback read fails (the error is printed).

        Raises:
            OSError: If the file cannot be opened for the first read.
            KeyError: If the loaded data has no 'headline' column.
        """
        data = self._read_json_lines(file_path)
        if data is None:
            try:
                # Second attempt: read as a single JSON document (e.g. array).
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = pd.read_json(f)
            except Exception as e2:
                print(f"Error reading JSON file: {e2}")
                return None

        # Preprocess headlines
        data['processed_headline'] = data['headline'].apply(self.preprocessor.preprocess)

        print(f"Loaded {len(data)} records")
        print("Data sample:")
        print(data.head())

        return data

    def train_traditional_model(self, X_train, y_train):
        """Fit a TF-IDF (max 5000 features) + 100-tree random-forest pipeline.

        The fitted pipeline is stored in ``self.traditional_model``.
        """
        self.traditional_model = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=5000)),
            ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
        ])
        self.traditional_model.fit(X_train, y_train)

    def evaluate_bert(self, val_loader):
        """Return the BERT model's accuracy over `val_loader` as a fraction in [0, 1].

        Args:
            val_loader: Iterable of batches with 'input_ids',
                'attention_mask' and 'label' tensors.

        Returns:
            correct / total, or 0.0 when the loader yields no samples.

        Raises:
            RuntimeError: If the BERT model has not been trained yet.
        """
        if self.model is None:
            raise RuntimeError("BERT model has not been trained; call train_bert_model() first.")

        self.model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                predicted = self.model(input_ids, attention_mask).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # An empty loader would otherwise raise ZeroDivisionError.
        if total == 0:
            return 0.0
        return correct / total

    def train_bert_model(self, train_loader, val_loader, epochs=3):
        """Fine-tune a fresh BERTSarcasmClassifier and store it in ``self.model``.

        Uses AdamW (lr=2e-5) with cross-entropy loss. Progress is printed
        every 10 batches, and a loss/accuracy summary after each epoch.

        Args:
            train_loader: Sized iterable of training batches (same keys as
                in evaluate_bert).
            val_loader: Iterable of validation batches, scored after every epoch.
            epochs: Number of passes over `train_loader`.
        """
        self.model = BERTSarcasmClassifier().to(self.device)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        criterion = nn.CrossEntropyLoss()
        num_batches = len(train_loader)

        for epoch in range(epochs):
            # evaluate_bert() switches to eval mode, so re-enable training
            # mode at the start of every epoch.
            self.model.train()
            total_loss = 0
            correct = 0
            total = 0

            print(f"\nEpoch {epoch + 1}/{epochs}")
            for batch_idx, batch in enumerate(train_loader):
                optimizer.zero_grad()

                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                outputs = self.model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                predicted = outputs.detach().argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                if (batch_idx + 1) % 10 == 0 and total:
                    print(f"Batch [{batch_idx + 1}/{num_batches}] "
                          f"Loss: {loss.item():.4f} "
                          f"Accuracy: {100 * correct/total:.2f}%")

            val_accuracy = self.evaluate_bert(val_loader)
            # Guard the averages so an empty loader reports zeros, not a crash.
            average_loss = total_loss / num_batches if num_batches else 0.0
            train_accuracy = correct / total if total else 0.0
            print(f'Epoch {epoch + 1} Summary:')
            print(f'Average Loss: {average_loss:.4f}')
            print(f'Training Accuracy: {100 * train_accuracy:.2f}%')
            print(f'Validation Accuracy: {100 * val_accuracy:.2f}%')

    def _predict_bert(self, processed_text):
        """Return the BERT model's predicted class index for already-preprocessed text."""
        self.model.eval()
        encoding = self.tokenizer.encode_plus(
            processed_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            logits = self.model(input_ids, attention_mask)
        return logits.argmax(dim=1).item()

    def predict(self, text, model_type='ensemble'):
        """Classify one text.

        Args:
            text: Raw text; it is preprocessed before prediction.
            model_type: 'traditional', 'bert' or 'ensemble'. 'ensemble'
                combines both models and falls back to whichever single
                model is trained when the other one is missing.

        Returns:
            The predicted label. For the ensemble with 0/1 labels this is 1
            when either model predicts 1.

        Raises:
            ValueError: If `model_type` is not one of the accepted values.
            RuntimeError: If no model required for `model_type` is trained.
        """
        if model_type not in self._MODEL_TYPES:
            raise ValueError(
                f"Unknown model_type {model_type!r}; expected one of {self._MODEL_TYPES}"
            )

        processed_text = self.preprocessor.preprocess(text)
        bert_ready = self.model is not None
        traditional_ready = self.traditional_model is not None

        if model_type == 'traditional' or (model_type == 'ensemble' and not bert_ready):
            if not traditional_ready:
                raise RuntimeError(
                    f"No trained model available for model_type={model_type!r}; "
                    "call train_traditional_model() or train_bert_model() first."
                )
            return self.traditional_model.predict([processed_text])[0]

        if not bert_ready:
            raise RuntimeError("BERT model has not been trained; call train_bert_model() first.")

        bert_pred = self._predict_bert(processed_text)
        # Mirror the BERT-missing fallback: the ensemble uses BERT alone when
        # the traditional model has not been trained.
        if model_type == 'bert' or not traditional_ready:
            return bert_pred

        # Ensemble prediction: with 0/1 labels the sum exceeds 0.5 as soon as
        # either model votes 1.
        traditional_pred = self.traditional_model.predict([processed_text])[0]
        return int((traditional_pred + bert_pred) > 0.5)

In [None]:
def main(data_path="C:\\Users\\AJIT ASHWATH R\\Downloads\\Sarcasm.json"):
    """Run the end-to-end demo: load data, train both models, print sample predictions.

    Args:
        data_path: Location of the headlines JSON file. Defaults to the path
            originally hard-coded here; pass your own path instead of editing
            the function.

    Any exception raised along the way is caught and reported (with its
    traceback) rather than propagated.
    """
    import traceback

    try:
        detector = SarcasmDetector()
        print("Loading and preparing data...")
        data = detector.prepare_data(data_path)

        if data is None:
            print("Failed to load data. Please check the file path and format.")
            return

        # The split and the random forest are seeded with 42; seed torch too
        # so BERT's head initialisation, dropout and batch shuffling are
        # repeatable across runs.
        torch.manual_seed(42)

        X_train, X_test, y_train, y_test = train_test_split(
            data['processed_headline'],
            data['is_sarcastic'],
            test_size=0.2,
            random_state=42
        )

        print("\nTraining traditional model...")
        detector.train_traditional_model(X_train, y_train)

        print("\nPreparing BERT datasets...")
        # .values hands the datasets plain arrays, so items are looked up by
        # position rather than by the shuffled pandas index.
        train_dataset = SarcasmDataset(X_train.values, y_train.values, detector.tokenizer)
        val_dataset = SarcasmDataset(X_test.values, y_test.values, detector.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32)

        print("\nTraining BERT model...")
        detector.train_bert_model(train_loader, val_loader)

        # Test predictions
        test_texts = [
            "Scientists cure cancer with one simple trick",
            "New study shows benefits of exercise",
            "Area man becomes expert in everything after reading single article"
        ]

        print("\nTesting predictions:")
        for text in test_texts:
            prediction = detector.predict(text, model_type='ensemble')
            print(f"\nText: {text}")
            print(f"Prediction: {'Sarcastic' if prediction == 1 else 'Not Sarcastic'}")

    except Exception as e:
        print(f"An error occurred: {e}")
        print("Please ensure all required libraries are installed and the data file path is correct.")
        # The generic hint above may not match the real cause; keep the
        # traceback so the failure can be diagnosed.
        traceback.print_exc()

# Run the full load/train/predict demo when this code is executed directly
# (the captured output below shows it also runs when executed as a notebook cell).
if __name__ == "__main__":
    main()

Using device: cpu
Loading and preparing data...
Loaded 26709 records
Data sample:
                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
2  https://local.theonion.com/mom-starting-to-fea...   
3  https://politics.theonion.com/boehner-just-wan...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   

                                            headline  is_sarcastic  \
0  former versace store clerk sues over secret 'b...             0   
1  the 'roseanne' revival catches up to our thorn...             0   
2  mom starting to fear son's web series closest ...             1   
3  boehner just wants wife to listen, not come up...             1   
4  j.k. rowling wishes snape happy birthday in th...             0   

                                  processed_headline  
0  former versace store clerk sue secret black co...  
1  roseanne revival catch thorny political mood b.