In [8]:
import pandas as pd

# Load the first dataset
df1 = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)

# Load the second dataset
df2 = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)

# Concatenate both datasets into a single dataframe
df = pd.concat([df1, df2], ignore_index=True)

# Calculate the total number of headlines
total_headlines = len(df)

# Calculate the percentage of sarcastic and non-sarcastic headlines
num_sarcastic = df['is_sarcastic'].sum()
num_non_sarcastic = total_headlines - num_sarcastic
percentage_sarcastic = (num_sarcastic / total_headlines) * 100
percentage_non_sarcastic = (num_non_sarcastic / total_headlines) * 100

# Calculate the average headline length
df['headline_length'] = df['headline'].apply(lambda x: len(x.split()))
average_headline_length = df['headline_length'].mean()

# Find the minimum and maximum headline lengths
min_headline_length = df['headline_length'].min()
max_headline_length = df['headline_length'].max()

# Print the results
print("Total number of headlines:", total_headlines)
print("Percentage of sarcastic headlines:", percentage_sarcastic, "%")
print("Percentage of non-sarcastic headlines:", percentage_non_sarcastic, "%")
print("Average headline length:", average_headline_length, "words")
print("Minimum headline length:", min_headline_length, "words")
print("Maximum headline length:", max_headline_length, "words")


Total number of headlines: 55328
Percentage of sarcastic headlines: 45.83212839791787 %
Percentage of non-sarcastic headlines: 54.16787160208213 %
Average headline length: 9.951417004048583 words
Minimum headline length: 2 words
Maximum headline length: 151 words


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the datasets and combine them
data_v1 = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
data_v2 = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
data = pd.concat([data_v1, data_v2])

# Split the data into training and testing sets (80% training, 20% testing)
X = data['headline']
y = data['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Bag-of-Words representation of the headlines
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes classifier
clf = MultinomialNB(alpha=1.0)
clf.fit(X_train_bow, y_train)

# Make predictions on the test set with adjusted threshold
threshold = 0.5
y_pred_probs = clf.predict_proba(X_test_bow)
y_pred_adjusted = (y_pred_probs[:, 1] > threshold).astype(int)

# Evaluate the model with the adjusted threshold
accuracy = accuracy_score(y_test, y_pred_adjusted)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred_adjusted)
print("Classification Report:\n", report)

conf_matrix = confusion_matrix(y_test, y_pred_adjusted)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 0.8970721127778782
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90      5878
           1       0.90      0.88      0.89      5188

    accuracy                           0.90     11066
   macro avg       0.90      0.90      0.90     11066
weighted avg       0.90      0.90      0.90     11066

Confusion Matrix:
 [[5374  504]
 [ 635 4553]]
