# Import Depencies

In [184]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


from sklearn.neighbors import KNeighborsClassifier

import pickle

import warnings
warnings.filterwarnings("ignore")

import scipy.sparse

In [185]:
df = pd.read_csv('WELFake_Dataset.csv').dropna().reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [186]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zroy1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Remove Stopwords/Filler words from Text

In [187]:
def clean_text(text):
    text = re.sub(r"[!@#$(),\n%^*?.'\:;~`0-9]", '', str(text)).lower()
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text, language='english', preserve_line=True)
    text = " ".join(word for word in word_tokens if not word.lower() in stop_words)
    return text

In [189]:
titles = []
texts = []
for title, text in df[["title", "text"]].to_numpy():
    titles.append(clean_text(title))
    texts.append(clean_text(text))

# Vectorize Text and Train Model

In [250]:
X_train_title, X_test_title, X_train_text, X_test_text, y_train, y_test = train_test_split(titles, texts, df["label"], test_size=0.2, random_state=42)

In [251]:
cv_titles = CountVectorizer(token_pattern=r'[^\s]+')
cv_texts = CountVectorizer(token_pattern=r'[^\s]+')
# Vectorize the title and text data separately
X_train_title = cv_titles.fit_transform(X_train_title)
X_train_text = cv_texts.fit_transform(X_train_text)

# Concatenate the vectorized representations
X_train_combined = scipy.sparse.hstack([X_train_title, X_train_text])

# Create and train the Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_combined, y_train)
# model = KNeighborsClassifier()
# model.fit(X_train_combined, y_train)

# Test Accuracy of Model

In [252]:
X_test_title = cv_titles.transform(X_test_title)
X_test_text = cv_texts.transform(X_test_text)
X_test_combined = scipy.sparse.hstack([X_test_title, X_test_text])

In [253]:
accuracy = model.score(X_test_combined, y_test)
accuracy

0.9361895443108751

# Use Model

In [254]:
def prediction(title, text, model, cv_titles, cv_texts):
    title = clean_text(title)
    text = clean_text(text)
    X_title = cv_titles.transform([title])
    X_text = cv_texts.transform([text])
    X = scipy.sparse.hstack([X_title, X_text])
    return model.predict_proba(X)

In [255]:
prediction("BREAKING NEWS: Tornado Warning", "There has been a tornado that killed 10000 people in Mississauga.", model, cv_titles, cv_texts)

array([[4.63869798e-04, 9.99536130e-01]])

# Save Model

In [222]:
pickle.dump(model, open("fake_news_kn.pickle", "wb"))
pickle.dump(cv_titles, open("titles_cv_kn.pickle", "wb"))
pickle.dump(cv_texts, open("texts_cv_kn.pickle", "wb"))