# Dependencies

In [23]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import pickle

# Data Preparation


In [16]:
# The dataset is stored as JSON Lines (one JSON record per line), hence lines=True.
df = pd.read_json(
    path_or_buf='Sarcasm_Headlines_Dataset_v2.json',
    lines=True,
)
# Preview the first five records.
df.head(5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [17]:
# Features: a single-column DataFrame of headline text. Target: the 0/1 sarcasm flag.
X = df.loc[:, ['headline']]
y = df.loc[:, 'is_sarcastic']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Model Creation


In [18]:
# Tokenization and padding
# The vocabulary is fitted on the training headlines only; words that were not seen
# during fitting are encoded with the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train['headline'])
word_index = tokenizer.word_index

# Turn each headline into a list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(X_train['headline'])
test_sequences = tokenizer.texts_to_sequences(X_test['headline'])

# The training width is set by the longest training headline (zeros appended at the end).
train_padded = pad_sequences(train_sequences, padding='post')

# Pad the test set to the SAME width as the training set. Without an explicit maxlen,
# pad_sequences would use the longest *test* headline instead, so the test matrix could
# have a different number of padded positions than the model was trained on and than
# `model_predict` uses at inference time (it pads to train_padded.shape[1]).
# Sequences longer than maxlen are truncated with pad_sequences' default setting,
# matching what `model_predict` does.
test_padded = pad_sequences(
    test_sequences,
    padding='post',
    maxlen=train_padded.shape[1],
)
assert test_padded.shape[1] == train_padded.shape[1], "train/test padded widths differ"

print(train_padded[0])
print(train_padded.shape)

[ 717  762 9891   12  136   11  583   58 3857 3858    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
(22895, 152)


In [19]:
# Width of each learned word vector.
EMBEDDING_DIM = 100

# One slot per token in the tokenizer's word index, plus one extra for the
# padding index.
VOCAB_SIZE = 1 + len(word_index)

# Clamp every token id into the range [0, VOCAB_SIZE - 1] so that no id can fall
# outside the embedding table.
train_padded = train_padded.clip(min=0, max=VOCAB_SIZE - 1)
test_padded = test_padded.clip(min=0, max=VOCAB_SIZE - 1)

In [20]:
# Build the classifier layer by layer:
#   token ids -> embedding vectors -> average over the sequence -> two small
#   ReLU layers -> a single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Model Training

In [21]:
# Configure training: binary cross-entropy loss, Adam with an explicit learning
# rate of 0.001, and accuracy reported each epoch.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 30 epochs in mini-batches of 64.
# NOTE(review): the held-out test split doubles as the validation set here, so the
# val_* numbers and the final test metrics come from the same rows.
history = model.fit(
    train_padded,
    y_train,
    batch_size=64,
    epochs=30,
    validation_data=(test_padded, y_test),
)

# Print the model summary
model.summary()

Epoch 1/30
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 66ms/step - accuracy: 0.5285 - loss: 0.6918 - val_accuracy: 0.5190 - val_loss: 0.6787
Epoch 2/30
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 38ms/step - accuracy: 0.5740 - loss: 0.6612 - val_accuracy: 0.8057 - val_loss: 0.4553
Epoch 3/30
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 38ms/step - accuracy: 0.7671 - loss: 0.4792 - val_accuracy: 0.8162 - val_loss: 0.4963
Epoch 4/30
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 38ms/step - accuracy: 0.8317 - loss: 0.3815 - val_accuracy: 0.8328 - val_loss: 0.4855
Epoch 5/30
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 37ms/step - accuracy: 0.8407 - loss: 0.3644 - val_accuracy: 0.8379 - val_loss: 0.5316
Epoch 6/30
[1m358/358[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 38ms/step - accuracy: 0.8724 - loss: 0.2977 - val_accuracy: 0.8422 - val_loss: 0.5207
Epoch 7/30
[1m3

# Model Testing

In [22]:
# Run the trained model over the padded test headlines.
predictions = model.predict(test_padded)

# Threshold the model outputs at 0.5: above -> 1, otherwise -> 0.
predicted_labels = np.where(predictions > 0.5, 1, 0)

# Compute both evaluation artefacts first, then print them.
report = classification_report(y_test, predicted_labels)
conf_matrix = confusion_matrix(y_test, predicted_labels)

print(report)
print("Confusion Matrix:", conf_matrix, sep="\n")

[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      2959
           1       0.83      0.85      0.84      2765

    accuracy                           0.85      5724
   macro avg       0.84      0.85      0.84      5724
weighted avg       0.85      0.85      0.85      5724

Confusion Matrix:
[[2490  469]
 [ 418 2347]]


# Deploying

In [24]:
# Persist the trained model to disk.
# `pickle` is already imported in the dependencies cell at the top of the notebook,
# so it is not imported again here.
#
# NOTE(review): only the model is written. `model_predict` also depends on the fitted
# `tokenizer` and on the training pad width, so anything loading 'model.pkl' outside
# this notebook needs those as well.
# NOTE(review): unpickling executes arbitrary code; only load 'model.pkl' from a
# trusted source. Keras' own `model.save(...)` format is worth considering instead.
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully!")

Model saved successfully!


In [34]:
def model_predict(text):
    """Print the model's raw output and a sarcasm verdict for one headline.

    The headline is encoded with the notebook-level `tokenizer`, padded at the
    end to the width of `train_padded`, and passed to the notebook-level `model`.

    Args:
        text: The headline string to classify.

    Returns:
        None. The raw prediction array and then either "Sarcastic" or
        "Not Sarcastic" are printed.
    """
    encoded = tokenizer.texts_to_sequences([text])
    width = train_padded.shape[1]
    model_input = pad_sequences(encoded, maxlen=width, padding='post')

    prediction = model.predict(model_input)
    print(prediction)

    # A single headline yields a single-element array, so the comparison
    # evaluates to one truth value.
    verdict = "Sarcastic" if (prediction > 0.5) else "Not Sarcastic"
    print(verdict)

In [37]:
# Try the classifier on two sample headlines, one after the other.
for text in (
    "elonmusk is resigning from openai board",
    "op snake handler leaves sinking huckabee campaign",
):
    model_predict(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[[0.20609061]]
Not Sarcastic
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[[0.9940084]]
Sarcastic
