In [16]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt

# --- Configuration ---------------------------------------------------------
# Tunable values are named here instead of being repeated inline.
DATA_PATH = '/content/drive/My Drive/Colab Notebooks/fake_or_real_news.csv'
SAMPLE_SIZE = 1000    # number of articles drawn from the full dataset
MAX_FEATURES = 1000   # vocabulary size cap for the TF-IDF vectorizer
SEED = 42             # shared by the sampling step and the train/test split
TEST_SIZE = 0.2       # fraction of the sample held out for evaluation
Z_95 = 1.96           # two-sided 95% critical value of the standard normal

# Mount Google Drive
drive.mount('/content/drive')

# Load your dataset
df = pd.read_csv(DATA_PATH)

# Keep only the columns used in this notebook and draw a reproducible sample.
# The sample size is capped at the dataset size so a smaller CSV does not
# make `sample` raise.
df = df[['title', 'text', 'label']].sample(n=min(SAMPLE_SIZE, len(df)), random_state=SEED)

# TF-IDF over the article body:
#   - English stopwords are dropped,
#   - tokens are runs of at least three ASCII letters,
#   - terms in fewer than 5 documents or in more than 70% of documents are ignored,
#   - unigrams and bigrams are both used, capped at MAX_FEATURES terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    min_df=5,
    max_df=0.7,
    ngram_range=(1, 2),
    max_features=MAX_FEATURES,
)
# NOTE(review): the vectorizer is fitted on the whole sample *before* the
# train/test split, so vocabulary selection and IDF weights see the test
# documents. Fitting on the training split only (e.g. via a Pipeline) would
# avoid this leakage, but it changes the reported numbers -- confirm before
# switching.
X = vectorizer.fit_transform(df['text'])
y = df['label']

print(vectorizer.get_feature_names_out()[:10])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=SEED)
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predicted)

# Normal-approximation (Wald) half-width for a proportion measured on the
# held-out set.
interval = Z_95 * np.sqrt((accuracy * (1 - accuracy)) / len(y_test))
# Convert to plain floats so the tuple prints the same on every NumPy version,
# and clamp to [0, 1] because an accuracy cannot fall outside that range.
ci_low = max(0.0, float(accuracy - interval))
ci_high = min(1.0, float(accuracy + interval))
print("Accuracy:", accuracy)
print("95% CI:", (ci_low, ci_high))

# Sanity check: confirm which columns survived the selection above.
print(df.columns)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['ability' 'able' 'abortion' 'access' 'according' 'account' 'accused'
 'act' 'action' 'actions']
Accuracy: 0.865
95% CI: (0.8176395291408541, 0.9123604708591458)
Index(['title', 'text', 'label'], dtype='object')


In [10]:
# Comparison model: TF-IDF on the article text with stopword filtering
# disabled. Requires `df` from the data-loading cell.
#
# Naming: the `_wo_stopwords` suffix and the printed labels mean "without
# stopword *removal*" -- stop_words=None keeps stopwords in the vocabulary.
#
# NOTE(review): apart from stop_words, this vectorizer also leaves
# token_pattern, min_df, max_df and ngram_range at their defaults, unlike the
# first vectorizer. The accuracy gap therefore reflects all of those
# differences together, not stopword handling alone -- confirm whether that is
# intended.
vectorizer_wo_stopwords = TfidfVectorizer(stop_words=None, max_features=1000)
X_wo_stopwords = vectorizer_wo_stopwords.fit_transform(df['text'])

y = df['label']
# Same test_size and random_state as the first split. Note that this reuses
# (and therefore overwrites) the names X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X_wo_stopwords, y, test_size=0.2, random_state=42)

clf_wo_stopwords = MultinomialNB()
clf_wo_stopwords.fit(X_train, y_train)
predicted_wo_stopwords = clf_wo_stopwords.predict(X_test)
accuracy_wo_stopwords = metrics.accuracy_score(y_test, predicted_wo_stopwords)
print("Accuracy without stopwords:", accuracy_wo_stopwords)

# Normal-approximation (Wald) half-width of the 95% interval on the test set.
interval_wo_stopwords = 1.96 * np.sqrt((accuracy_wo_stopwords * (1 - accuracy_wo_stopwords)) / len(y_test))
# Plain floats print identically on every NumPy version; the bounds are
# clamped to [0, 1] because an accuracy cannot fall outside that range.
ci_low_wo_stopwords = max(0.0, float(accuracy_wo_stopwords - interval_wo_stopwords))
ci_high_wo_stopwords = min(1.0, float(accuracy_wo_stopwords + interval_wo_stopwords))
print("95% CI without stopwords:", (ci_low_wo_stopwords, ci_high_wo_stopwords))


Accuracy without stopwords: 0.845
95% CI without stopwords: (0.7948425698425448, 0.8951574301574552)


First 10: ability, able, abortion, access, according, account, accused, act, action, actions

For the Naive Bayes classifier trained with stopwords removed, the accuracy was 0.865.

The 95% confidence interval for this classifier is: (81.8%, 91.2%)

With Stopwords Removed: Accuracy was 0.865.
Without Stopwords Removed: Accuracy dropped to 0.845.
The respective 95% confidence intervals were:

With Stopwords Removed: CI = (81.8%, 91.2%)
Without Stopwords Removed: CI = (79.5%, 89.5%)

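There is a difference in accuracy depending on whether stopwords are removed or not. Removing stopwords resulted in a slightly higher accuracy (0.865 vs. 0.845), which is consistent with this preprocessing step helping the model focus on more meaningful content when distinguishing real news from fake news. However, the two 95% confidence intervals overlap substantially, so a 2-percentage-point gap on a 200-article test set is not strong evidence of a real improvement. The two vectorizers also differ in more than the stopword setting (the first additionally uses a custom token pattern, `min_df`, `max_df`, and bigrams), so the difference cannot be attributed to stopword removal alone.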

In [17]:
# Extra Credit
# Same Naive Bayes setup, but trained on the headline (`title`) instead of the
# article body. Requires `df` from the data-loading cell.

# TF-IDF over titles with English stopwords removed, capped at 1000 terms.
vectorizer_title = TfidfVectorizer(stop_words='english', max_features=1000)
X_title = vectorizer_title.fit_transform(df['title'])
y_title = df['label']

# Same test_size and random_state as the text-based split; the `_title`
# suffix keeps these variables separate from the text-model ones.
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(X_title, y_title, test_size=0.2, random_state=42)

clf_title = MultinomialNB()
clf_title.fit(X_train_title, y_train_title)

predicted_title = clf_title.predict(X_test_title)
accuracy_title = metrics.accuracy_score(y_test_title, predicted_title)
print("Accuracy with titles:", accuracy_title)

# Normal-approximation (Wald) half-width of the 95% interval on the test set.
interval_title = 1.96 * np.sqrt((accuracy_title * (1 - accuracy_title)) / len(y_test_title))
# Plain floats print identically on every NumPy version; the bounds are
# clamped to [0, 1] because an accuracy cannot fall outside that range.
ci_low_title = max(0.0, float(accuracy_title - interval_title))
ci_high_title = min(1.0, float(accuracy_title + interval_title))
print("95% CI for titles:", (ci_low_title, ci_high_title))

Accuracy with titles: 0.71
95% CI for titles: (0.6471117880680329, 0.772888211931967)


Accuracy: 71%.

95% Confidence Interval: 64.71% to 77.29%

Comparison with Text Model:

Text Classifier achieved an accuracy of 86.5%.
Title Classifier achieved an accuracy of 71%.
There is a noticeable difference of 15.5 percentage points in accuracy, with the text classifier performing better.

Text Classifier 95% CI was (81.76%, 91.24%).
Title Classifier 95% CI was (64.71%, 77.29%).
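The text classifier showed both higher accuracy and a narrower confidence interval. Because both classifiers were evaluated on test sets of the same size, the narrower interval follows from the accuracy being further from 50% rather than indicating more stable performance in itself. Even so, the two intervals do not overlap, so the text classifier's advantage is unlikely to be due to chance.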