In [47]:
import numpy as np
import pandas as pd
import itertools

In [48]:
# Import testing data
df = pd.read_csv('news/news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [49]:
test_df = df.rename(columns={"Unnamed: 0": "id"})
test_df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [51]:
# Drop any null values
test_df = test_df.dropna(subset=['text', 'label'])

In [52]:
test_df.shape

(6335, 4)

In [34]:
test_df.dtypes

id        int64
title    object
text     object
label    object
dtype: object

In [53]:
x = test_df['text']
x.head()

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [54]:
y = test_df.label
y.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [55]:
from sklearn.model_selection import train_test_split

# Split Dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform train set, transform test set
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [67]:
#Accuracy was at 94% but it went down as we trained it too much! :(
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize a PassiveAggressiveClassifier
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

# Predict on the test set
y_pred = pac.predict(tfidf_test)


In [58]:
# Calculate accuracy
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.58%


In [59]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                    ('nbmodel', MultinomialNB())])

In [61]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [62]:
test_score = pipeline.score(X_test, y_test)
print(f'Accuracy: {round(test_score*100,2)}%')

Accuracy: 84.06%


In [64]:
predict = pipeline.predict(X_test)

In [68]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

        FAKE       0.97      0.71      0.82       638
        REAL       0.77      0.98      0.86       629

    accuracy                           0.84      1267
   macro avg       0.87      0.84      0.84      1267
weighted avg       0.87      0.84      0.84      1267



In [71]:
# # Build confusion matrix
confusion_matrix(y_test,y_pred, labels=['FAKE','REAL'])

#593 true positives, 585 true negatives, 45 false positives, & 44 false negatives

array([[588,  50],
       [ 42, 587]], dtype=int64)

In [70]:
# Save the model to Pickle
import pickle

# Dump the model
with open('fake_news_model.pickle','wb') as modelFile:
     pickle.dump(pac, modelFile)

In [10]:
# Testing the model using article from "The Onion" (news source)
prediction_text = "WINSTON-SALEM, NC—As a token of its appreciation to members of the public doing their part to fight Covid-19, leading doughnut purveyor Krispy Kreme announced Wednesday it would begin offering vaccinated customers a free ride on its glaze conveyor belt. “We know a lot of our customers stayed home this past year and missed visiting their local Krispy Kreme, so the least we can do is offer them a complimentary glazing once they get inoculated,” said CEO Michael J. Tattersfield, explaining that anyone who displays a qualifying Centers for Disease Control Vaccination Record Card would get to hop on the conveyor and receive a full-body coating of sugar, milk, and light corn syrup. “Spraying our patrons head-to-toe in a fresh, piping-hot layer of our trademark icing is our way of saying thank you to those who choose to get vaccinated. After everything our communities have been through during this pandemic, we think it’s important to stop and enjoy the things that make life worth living. Please note this offer is limited to one ride per customer per dose of an FDA-approved immunization.” Tattersfield went on to state that upon receiving their second dose of the Moderna or Pfizer vaccine, customers were welcome to come back for a complimentary dip in the deep fat fryer."

In [11]:
test_pred = pd.Series(prediction_text)
tfidf_vec_test = tfidf_vectorizer.transform(test_pred)
pac.predict(tfidf_vec_test)

array(['FAKE'], dtype='<U4')

In [9]:
# Test the model using df of scraped articles from Facebook
fb_news_df = pd.read_csv('csvs/combined_news_data.csv')
fb_news_df.head()

Unnamed: 0,Title,Article,Headline,Title_URL,Image,Facebook_likes,Facebook_comments,Facebook_shares,Comment_replies
0,CNN,Po Murray writes that the two mass killings in...,Opinion: What has to happen after the Colorado...,https://www.facebook.com/cnn/?__cft__[0]=AZVAV...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,102,98 Comments,6 Shares,1 Reply
1,CNN,Mississippi is one of three states that have e...,More states aim to offer Covid-19 vaccines to ...,https://www.facebook.com/cnn/?__cft__[0]=AZU2o...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,574,98 Comments,31 Shares,1 Reply
2,CNN,"In 2018, the city of Boulder, Colorado, passed...",Boulder banned assault weapons in 2018. A judg...,https://www.facebook.com/cnn/?__cft__[0]=AZVNa...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,1.1K,1.2K Comments,119 Shares,112 Replies
3,CNN,"""The life of a single parent is challenging to...",Opinion: My Covid life as a single mom: Like j...,https://www.facebook.com/cnn/?__cft__[0]=AZWhO...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,554,202 Comments,37 Shares,9 Replies
4,CNN,British people attempting to go on vacation wh...,"New UK law means $7,000 fines for vacations ab...",https://www.facebook.com/cnn/?__cft__[0]=AZXIu...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,912,390 Comments,83 Shares,View more comments


In [13]:
# Tested single article from df
fb_articles = pd.Series(fb_news_df["Article"][952])

tfidf_vector_test = tfidf_vectorizer.transform(fb_articles)
pac.predict(tfidf_vector_test)

array(['FAKE'], dtype='<U4')

In [10]:
# Loop through articles to categorize and append to new column
results = []
for index in range(len(fb_news_df)):
    fb_articles = pd.Series(fb_news_df["Article"][index])
    if fb_articles.dtype == 'float64': 
        results.append("NaN")
        continue
    
    tfidf_vector_test = tfidf_vectorizer.transform(fb_articles)
    results.append(pac.predict(tfidf_vector_test))

fb_news_df['results'] = results
fb_news_df.head()

Unnamed: 0,Title,Article,Headline,Title_URL,Image,Facebook_likes,Facebook_comments,Facebook_shares,Comment_replies,results
0,CNN,Po Murray writes that the two mass killings in...,Opinion: What has to happen after the Colorado...,https://www.facebook.com/cnn/?__cft__[0]=AZVAV...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,102,98 Comments,6 Shares,1 Reply,[REAL]
1,CNN,Mississippi is one of three states that have e...,More states aim to offer Covid-19 vaccines to ...,https://www.facebook.com/cnn/?__cft__[0]=AZU2o...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,574,98 Comments,31 Shares,1 Reply,[FAKE]
2,CNN,"In 2018, the city of Boulder, Colorado, passed...",Boulder banned assault weapons in 2018. A judg...,https://www.facebook.com/cnn/?__cft__[0]=AZVNa...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,1.1K,1.2K Comments,119 Shares,112 Replies,[FAKE]
3,CNN,"""The life of a single parent is challenging to...",Opinion: My Covid life as a single mom: Like j...,https://www.facebook.com/cnn/?__cft__[0]=AZWhO...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,554,202 Comments,37 Shares,9 Replies,[FAKE]
4,CNN,British people attempting to go on vacation wh...,"New UK law means $7,000 fines for vacations ab...",https://www.facebook.com/cnn/?__cft__[0]=AZXIu...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,912,390 Comments,83 Shares,View more comments,[FAKE]


In [11]:
print (type(fb_news_df.loc[0, 'results']))

<class 'numpy.ndarray'>


In [12]:
# Get rid of brackets and null "results" (null results are from NAN articles)
fb_news_df["results"] = pd.DataFrame([str(line).strip('[').strip(']').strip() for line in fb_news_df["results"]])
fb_one_df = fb_news_df.dropna(subset=['Article'])
fb_one_df

Unnamed: 0,Title,Article,Headline,Title_URL,Image,Facebook_likes,Facebook_comments,Facebook_shares,Comment_replies,results
0,CNN,Po Murray writes that the two mass killings in...,Opinion: What has to happen after the Colorado...,https://www.facebook.com/cnn/?__cft__[0]=AZVAV...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,102,98 Comments,6 Shares,1 Reply,'REAL'
1,CNN,Mississippi is one of three states that have e...,More states aim to offer Covid-19 vaccines to ...,https://www.facebook.com/cnn/?__cft__[0]=AZU2o...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,574,98 Comments,31 Shares,1 Reply,'FAKE'
2,CNN,"In 2018, the city of Boulder, Colorado, passed...",Boulder banned assault weapons in 2018. A judg...,https://www.facebook.com/cnn/?__cft__[0]=AZVNa...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,1.1K,1.2K Comments,119 Shares,112 Replies,'FAKE'
3,CNN,"""The life of a single parent is challenging to...",Opinion: My Covid life as a single mom: Like j...,https://www.facebook.com/cnn/?__cft__[0]=AZWhO...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,554,202 Comments,37 Shares,9 Replies,'FAKE'
4,CNN,British people attempting to go on vacation wh...,"New UK law means $7,000 fines for vacations ab...",https://www.facebook.com/cnn/?__cft__[0]=AZXIu...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,912,390 Comments,83 Shares,View more comments,'FAKE'
...,...,...,...,...,...,...,...,...,...,...
1039,Fox News,A 38-year-old military veteran tossed a smoke ...,Military veteran upset over stimulus check thr...,https://www.facebook.com/FoxNews/?__cft__[0]=A...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,13K,8.9K Comments,970 Shares,View previous comments,'FAKE'
1040,Fox News,BREAKING: The IRS may have decided to heed the...,IRS planning to delay tax filing deadline unti...,https://www.facebook.com/FoxNews/?__cft__[0]=A...,https://external-sjc3-1.xx.fbcdn.net/safe_imag...,1.1K,588 Comments,314 Shares,View previous comments,'FAKE'
1041,Fox News was live.,White House press secretary Jen Psaki holds a ...,,https://www.facebook.com/FoxNews/?__cft__[0]=A...,,5.8K,12K Comments,461 Shares,View 9 more replies,'REAL'
1042,Fox News was live.,Federal Reserve Chairman Jerome Powell holds a...,,https://www.facebook.com/FoxNews/?__cft__[0]=A...,,1.6K,2.9K Comments,275 Shares,7 Replies,'FAKE'


In [13]:
# Export to CSV
fb_one_df.to_csv("fb1_test_data.csv", index=True)

In [18]:
# Training the model using another df of articles scraped from Facebook
fb_news_df2 = pd.read_csv('csvs/combined_news_data2.csv', encoding='cp1252')
fb_news_df2.head()

Unnamed: 0,Title,Article,Headline,Title_URL,Image,Facebook_likes,Facebook_comments,Facebook_shares,Comment_replies
0,Conservative Post,"This program, and the $500 monthly checks it o...",Struggling White Families Not Allowed To Parte...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,180,11 Comments,29 Shares,View 40 more comments
1,Conservative Post,Biden appeared to simply give up on what he wa...,The First TV: Biden Lapses Into Nonsense Durin...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,220,7 Comments,391 Shares,View previous comments
2,Conservative Post,Right observation! Biden used notes extensivel...,Fox’s Baier: I Haven’t Seen Presidents Flip Th...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,128,43 Comments,2.6K Shares,View previous comments
3,Conservative Post,She has been charged with felony obstruction o...,Georgia Dem Arrested After Defying Police and ...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,364,64 Comments,43 Shares,View previous comments
4,Conservative Post,Let's go Georgia!,Georgia Governor Signs Into Law Sweeping Elect...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,845,58 Comments,1.1K Shares,View previous comments


In [15]:
fb_news_df2 = fb_news_df2.dropna(subset=['Article'])

In [16]:
fb_news_df2.dtypes

Title                object
Article              object
Headline             object
Title_URL            object
Image                object
Facebook_likes       object
Facebook_comments    object
Facebook_shares      object
Comment_replies      object
dtype: object

In [19]:
fb_news_df2.columns = fb_news_df2.columns.str.strip()

In [20]:
# Loop through articles to categorize and append to new column
result = []
for index in range(len(fb_news_df2)):
    fb_articles = pd.Series(fb_news_df2["Article"][index])
#     print(fb_articles)
    if fb_articles.dtype == 'float64': 
        result.append("NaN")
        continue
    
    tfidf_vector_test2 = tfidf_vectorizer.transform(fb_articles)
#     results = pac.predict(tfidf_vector_test2)
    result.append(pac.predict(tfidf_vector_test2))
    
# print(result)

fb_news_df2['results'] = result
fb_news_df2.head()

Unnamed: 0,Title,Article,Headline,Title_URL,Image,Facebook_likes,Facebook_comments,Facebook_shares,Comment_replies,results
0,Conservative Post,"This program, and the $500 monthly checks it o...",Struggling White Families Not Allowed To Parte...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,180,11 Comments,29 Shares,View 40 more comments,[FAKE]
1,Conservative Post,Biden appeared to simply give up on what he wa...,The First TV: Biden Lapses Into Nonsense Durin...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,220,7 Comments,391 Shares,View previous comments,[FAKE]
2,Conservative Post,Right observation! Biden used notes extensivel...,Fox’s Baier: I Haven’t Seen Presidents Flip Th...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,128,43 Comments,2.6K Shares,View previous comments,[FAKE]
3,Conservative Post,She has been charged with felony obstruction o...,Georgia Dem Arrested After Defying Police and ...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,364,64 Comments,43 Shares,View previous comments,[FAKE]
4,Conservative Post,Let's go Georgia!,Georgia Governor Signs Into Law Sweeping Elect...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,845,58 Comments,1.1K Shares,View previous comments,[REAL]


In [21]:
# Get rid of brackets and null "results" (null results are from NAN articles)
fb_news_df2["results"] = pd.DataFrame([str(line).strip('[').strip(']').strip() for line in fb_news_df2["results"]])
fb_two_df = fb_news_df2.dropna(subset=['Article'])
fb_two_df.head()

Unnamed: 0,Title,Article,Headline,Title_URL,Image,Facebook_likes,Facebook_comments,Facebook_shares,Comment_replies,results
0,Conservative Post,"This program, and the $500 monthly checks it o...",Struggling White Families Not Allowed To Parte...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,180,11 Comments,29 Shares,View 40 more comments,'FAKE'
1,Conservative Post,Biden appeared to simply give up on what he wa...,The First TV: Biden Lapses Into Nonsense Durin...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,220,7 Comments,391 Shares,View previous comments,'FAKE'
2,Conservative Post,Right observation! Biden used notes extensivel...,Fox’s Baier: I Haven’t Seen Presidents Flip Th...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,128,43 Comments,2.6K Shares,View previous comments,'FAKE'
3,Conservative Post,She has been charged with felony obstruction o...,Georgia Dem Arrested After Defying Police and ...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,364,64 Comments,43 Shares,View previous comments,'FAKE'
4,Conservative Post,Let's go Georgia!,Georgia Governor Signs Into Law Sweeping Elect...,https://www.facebook.com/heyconservativepost/?...,https://l.facebook.com/l.php?u=https%3A%2F%2Fc...,845,58 Comments,1.1K Shares,View previous comments,'REAL'


In [22]:
# Export to CSV
fb_two_df.to_csv("fb2_test_data.csv", index=True)