### Data Loading and initial understanding

In [24]:
import pandas as pd

# first upload the dataset json file...

# Path to dataset
path = ("/content/Sarcasm_Headlines_Dataset.json")

# Load dataset
df = pd.read_json(path, lines=True)

In [25]:
# Basic info
shape = df.shape
columns = df.columns.tolist()
missing = df.isnull().sum()
class_counts = df['is_sarcastic'].value_counts()
sarcastic_ratio = (class_counts[1] / len(df)) * 100

In [26]:
# Display some sample rows
df.head(5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [27]:
shape, columns, missing, class_counts, sarcastic_ratio

((26709, 3),
 ['article_link', 'headline', 'is_sarcastic'],
 article_link    0
 headline        0
 is_sarcastic    0
 dtype: int64,
 is_sarcastic
 0    14985
 1    11724
 Name: count, dtype: int64,
 np.float64(43.89531618555543))

### Text Preprocessing Plan for Sarcasm Detection

In [28]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Function for text cleaning
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text


df['clean_headline'] = df['headline'].apply(clean_text)

# Prepare data and labels
X = df['clean_headline'].values
y = df['is_sarcastic'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Padding
max_length = max(len(seq) for seq in train_sequences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

print(f"Training shape: {train_padded.shape}")
print(f"Testing shape: {test_padded.shape}")
print(f"Max sequence length: {max_length}")

Training shape: (21367, 39)
Testing shape: (5342, 39)
Max sequence length: 39


### Sarcasm Detection Model

In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

# Parameters
embedding_dim = 128
lstm_units = 64

# Build the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(lstm_units, return_sequences=False)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# Summary
model.summary()

# Train the model
history = model.fit(
    train_padded, y_train,
    validation_data=(test_padded, y_test),
    epochs=30,
    batch_size=64,
    verbose=1
)

# Evaluate on test set
test_loss, test_acc = model.evaluate(test_padded, y_test, verbose=0)
print(f"Test Accuracy: {test_acc:.4f}")




Epoch 1/30
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.7077 - loss: 0.5383 - val_accuracy: 0.8540 - val_loss: 0.3393
Epoch 2/30
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9154 - loss: 0.2300 - val_accuracy: 0.8613 - val_loss: 0.3248
Epoch 3/30
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9549 - loss: 0.1385 - val_accuracy: 0.8605 - val_loss: 0.3692
Epoch 4/30
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.9752 - loss: 0.0816 - val_accuracy: 0.8502 - val_loss: 0.5357
Epoch 5/30
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9854 - loss: 0.0480 - val_accuracy: 0.8469 - val_loss: 0.7054
Epoch 6/30
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9898 - loss: 0.0348 - val_accuracy: 0.8435 - val_loss: 0.7540
Epoch 7/30
[1m334/334

In [30]:
## saving the model...
import pickle
import json

# Save model
model.save("sarcasm_detector.h5")

# Save tokenizer
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Save max_length
with open("preprocessing.json", "w") as f:
    json.dump({"max_length": max_length, "vocab_size": vocab_size}, f)




### Testing

In [31]:
def predict_sarcasm(headline):
    cleaned = [clean_text(headline)]
    seq = tokenizer.texts_to_sequences(cleaned)
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    pred = model.predict(padded)
    return "Sarcastic" if pred[0][0] > 0.5 else "Not Sarcastic"

In [32]:
predict_sarcasm("police thinks he himself is thief")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step


'Sarcastic'