In [1]:
import pandas as pd
from numba import jit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the two source datasets (one CSV of true articles, one of fake ones)
true_articles = pd.read_csv('Datasets/True.csv')
false_articles = pd.read_csv('Datasets/Fake.csv')

# Tag every row with its label before merging: 1 = true article, 0 = fake article
true_articles['true'] = 1
false_articles['true'] = 0

# Stack the two frames into a single labelled dataset.
# Each CSV is read with its own default 0..n-1 index, so a plain concat would
# leave duplicate row labels in the result; ignore_index=True renumbers the
# rows so label-based access on combined_articles is unambiguous.
combined_articles = pd.concat([true_articles, false_articles], ignore_index=True)

In [3]:
# Normalise the free-text columns of the dataframe with cleantext
from cleantext import clean


def _normalise(raw):
    """Pass one string through cleantext's `clean` with every flag enabled, then lowercase it."""
    cleaned = clean(
        raw,
        clean_all=True,
        stopwords=True,
        lowercase=True,
        numbers=True,
        punct=True,
        stemming=True,
    )
    return cleaned.lower()


# Identical cleaning for both columns: the article body first, then the headline.
for column in ('text', 'title'):
    combined_articles[column] = combined_articles[column].apply(_normalise)

In [4]:
# Hold out 20% of the articles for evaluation; the fixed random_state keeps
# the shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    combined_articles['text'],  # input: the article body text
    combined_articles['true'],  # target: 1 = true article, 0 = fake article
    test_size=0.2,
    random_state=42,
)

In [5]:
# Build TF-IDF features from the article text, with max_features set to 10000
vectorizer = TfidfVectorizer(max_features=10_000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [6]:
# Train a Gaussian Naive Bayes classifier and predict labels for the test split
from sklearn.naive_bayes import GaussianNB

# Both vectorized matrices are converted with .toarray() before being handed to
# the model. NOTE(review): this materialises full dense arrays, which can be
# memory-hungry for a large corpus.
model = GaussianNB().fit(X_train_vectorized.toarray(), y_train)
y_pred = model.predict(X_test_vectorized.toarray())

In [7]:
# Report the accuracy of the predictions against the held-out test labels
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.9416481069042316


In [8]:
# Print precision, recall, and F1-score for the test predictions
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.92      0.94      4650
           1       0.92      0.96      0.94      4330

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980

