In [2]:
import os
import re
import unicodedata
from time import process_time, time

import en_core_web_sm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import statsmodels.api as sm
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn import linear_model
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


In [3]:
# Move into the project folder so the relative CSV path opened by the loading
# cell resolves. The folder can be overridden with the FAKE_NEWS_PROJECT_DIR
# environment variable; when that is unset the author's local path is used,
# so existing behaviour is unchanged.
PROJECT_DIR = os.environ.get(
    'FAKE_NEWS_PROJECT_DIR',
    'C:/Users/adamj/Documents/Github/Fake News NLP Project',
)
os.chdir(PROJECT_DIR)

In [5]:
# Read the shuffled fake/true news table into a DataFrame.
# The `with` block closes the file handle as soon as pandas has consumed it,
# so it is not left open for the lifetime of the kernel. `Fake_True_news`
# stays bound afterwards, but only as a closed handle.
with open('Fake_True_news_shuffled.csv', 'r', encoding='utf8') as Fake_True_news:
    Fake_True_news_csv = pd.read_csv(Fake_True_news)

### Inspect

In [6]:
# Preview the first 10 rows: raw title/text next to their pre-processed
# *_nlp counterparts, plus subject, date and the real/fake label.
Fake_True_news_csv.head(10)

Unnamed: 0,title,text,subject,date,real/fake,title_nlp,text_nlp
0,WHOA! 8 ACTUAL QUOTES FROM HILLARY That Prove ...,Hillary came out with a heavily edited TV ad y...,left-news,2017-12-31,1,WHOA ACTUAL QUOTES FROM HILLARY prove shes unf...,hillary came heavily edited TV ad yesterday ca...
1,PULITZER PRIZE WINNING AUTHOR TONI MORRISON: “...,The recipient of the Presidential Medal of Fre...,left-news,2017-12-31,1,PULITZER PRIZE WINNING AUTHOR TONI MORRISON I ...,recipient presidential medal freedom barack ob...
2,Prosecutors will not pursue Bridgegate charges...,(Reuters) - New Jersey prosecutors on Friday s...,politicsNews,2017-12-31,0,prosecutors not pursue bridgegate charges new ...,reuters new jersey prosecutors friday said wou...
3,"Pope to meet head of Myanmar army, Rohingya re...",VATICAN CITY (Reuters) - Pope Francis will mee...,worldnews,2017-12-31,0,pope meet head myanmar army rohingya refugees ...,VATICAN CITY reuters pope francis meet head my...
4,U.S. aerospace industry urges Trump to help Ex...,WASHINGTON (Reuters) - The chief executive of ...,politicsNews,2017-12-31,0,U S aerospace industry urges trump help ex im ...,WASHINGTON reuters chief executive U S aerospa...
5,DEMOCRATS CAUGHT Paying Halfway House Patients...,A lawless party whose end always justifies the...,politics,2017-12-31,1,DEMOCRATS CAUGHT paying halfway house patients...,A lawless party whose end always justifies mea...
6,U.S. defense elite rally behind Trump's unusua...,"SIMI VALLEY, California (Reuters) - Often fier...",politicsNews,2017-12-31,0,U S defense elite rally behind trumps unusual ...,SIMI VALLEY california reuters often fiercely ...
7,Vatican prepared in case of Barcelona-style at...,VATICAN CITY (Reuters) - It is perhaps only a...,worldnews,2017-12-31,0,vatican prepared case barcelona style attack s...,VATICAN CITY reuters perhaps matter time rome ...
8,BENGHAZI SPOKESLIAR SUSAN RICE TELLS CNN: ‘We ...,"Tell us Susan what s worse, Iran with a nuclea...",left-news,2017-12-31,1,BENGHAZI SPOKESLIAR SUSAN RICE TELLS CNN expec...,tell us susan worse iran nuclear weapon billio...
9,Trump disbands business councils after CEOs qu...,WASHINGTON/NEW YORK (Reuters) - President Dona...,politicsNews,2017-12-31,0,trump disbands business councils ceos quit pro...,WASHINGTON NEW YORK reuters president donald t...


In [7]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
Fake_True_news_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 7 columns):
title        44898 non-null object
text         44898 non-null object
subject      44898 non-null object
date         44898 non-null object
real/fake    44898 non-null int64
title_nlp    44898 non-null object
text_nlp     44267 non-null object
dtypes: int64(1), object(6)
memory usage: 2.4+ MB


In [9]:
# 'text_nlp' has fewer non-null entries than the other columns in the .info()
# summary, i.e. some articles ended up with no pre-processed text.
# Keep only the rows that still have it (dropna works row-wise by default).
Fake_True_news_csv = Fake_True_news_csv.dropna(subset=['text_nlp'])


In [10]:
# Re-check the summary: every column should now report the same non-null count.
Fake_True_news_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44267 entries, 0 to 44897
Data columns (total 7 columns):
title        44267 non-null object
text         44267 non-null object
subject      44267 non-null object
date         44267 non-null object
real/fake    44267 non-null int64
title_nlp    44267 non-null object
text_nlp     44267 non-null object
dtypes: int64(1), object(6)
memory usage: 2.7+ MB


### Build training and testing datasets

In [11]:
# Separate the label from the model input, then hold out a test split.

# target: the 'real/fake' column (integer label)
y = Fake_True_news_csv['real/fake']

# input: the pre-processed article body, one string per row
# (a single column, i.e. a pandas Series rather than a DataFrame)
X = Fake_True_news_csv['text_nlp']

# NOTE(review): every label-0 row in the 10-row preview carries the token
# 'reuters' in text_nlp. If that holds across the dataset it is a source
# marker a model can key on -- worth confirming before trusting the scores.

# 80/20 train/test split; fixed random_state so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=27,
)

### Feature engineering

In [12]:
%%time

# build BOW features on train news
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(X_train)


# build TFIDF features on train news
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(X_train)

Wall time: 1min 7s


In [13]:
#transform test news into features
# (transform only -- both vectorizers were fit on X_train, so the test matrices
# reuse the training vocabulary and the test text does not influence it)
cv_test_features = cv.transform(X_test)
tv_test_features = tv.transform(X_test)

In [14]:
# Report the (rows, vocabulary size) shape of each feature matrix; train and
# test matrices of the same vectorizer should agree on the second dimension.
for shape_label, train_matrix, test_matrix in (
    ('BOW model:> Train features shape:', cv_train_features, cv_test_features),
    ('TFIDF model:> Train features shape:', tv_train_features, tv_test_features),
):
    print(shape_label, train_matrix.shape, ' Test features shape:', test_matrix.shape)

BOW model:> Train features shape: (35413, 273556)  Test features shape: (8854, 273556)
TFIDF model:> Train features shape: (35413, 273556)  Test features shape: (8854, 273556)


## Model Training, Prediction and Performance Evaluation
### Try out Logistic Regression

In [15]:
%%time


# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
#(the following LogisticRegression parameters are all defaults)
lr = LogisticRegression(max_iter=500, random_state=27)

# train model
lr.fit(cv_train_features, y_train)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)



Wall time: 10.7 s


In [16]:
# Evaluate the BOW logistic regression on the held-out split.
# (classification_report / confusion_matrix are imported in the top import cell)

# display names for the two classes, in class-id order: 0 -> 'real', 1 -> 'fake'
labels = ['real', 'fake']

# per-class precision / recall / F1
print(classification_report(y_test, lr_bow_predictions))

# confusion matrix: rows are the true class, columns the predicted class.
# labels=[0, 1] pins the row/column order so it always lines up with `labels`
# and the matrix stays 2x2 even if one class were absent from the inputs.
pd.DataFrame(
    confusion_matrix(y_test, lr_bow_predictions, labels=[0, 1]),
    index=labels,
    columns=labels,
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4335
           1       1.00      1.00      1.00      4519

    accuracy                           1.00      8854
   macro avg       1.00      1.00      1.00      8854
weighted avg       1.00      1.00      1.00      8854



Unnamed: 0,real,fake
real,4327,8
fake,11,4508


In [17]:
%%time

# Logistic Regression model on TF-IDF features

# train model
lr.fit(tv_train_features, y_train)

# predict on test data
lr_tfidf_predictions = lr.predict(tv_test_features)

Wall time: 1.73 s


In [18]:
# Evaluate the TF-IDF logistic regression on the held-out split.

# display names for the two classes, in class-id order: 0 -> 'real', 1 -> 'fake'
labels = ['real', 'fake']

# per-class precision / recall / F1
print(classification_report(y_test, lr_tfidf_predictions))

# confusion matrix: rows are the true class, columns the predicted class.
# labels=[0, 1] pins the row/column order so it always lines up with `labels`.
pd.DataFrame(
    confusion_matrix(y_test, lr_tfidf_predictions, labels=[0, 1]),
    index=labels,
    columns=labels,
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4335
           1       1.00      0.99      0.99      4519

    accuracy                           0.99      8854
   macro avg       0.99      0.99      0.99      8854
weighted avg       0.99      0.99      0.99      8854



Unnamed: 0,real,fake
real,4321,14
fake,39,4480


### Try out Random Forest

In [19]:
%%time 

# Random Forest model on BOW features
from sklearn.ensemble import RandomForestClassifier

# instantiate model
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=27)

# train model
rf.fit(cv_train_features, y_train)

# predict on test data
rf_bow_predictions = rf.predict(cv_test_features)

Wall time: 33.4 s


In [20]:
# Evaluate the BOW random forest on the held-out split.

# display names for the two classes, in class-id order: 0 -> 'real', 1 -> 'fake'
labels = ['real', 'fake']

# per-class precision / recall / F1
print(classification_report(y_test, rf_bow_predictions))

# confusion matrix: rows are the true class, columns the predicted class.
# labels=[0, 1] pins the row/column order so it always lines up with `labels`.
pd.DataFrame(
    confusion_matrix(y_test, rf_bow_predictions, labels=[0, 1]),
    index=labels,
    columns=labels,
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4335
           1       1.00      0.99      0.99      4519

    accuracy                           0.99      8854
   macro avg       0.99      0.99      0.99      8854
weighted avg       0.99      0.99      0.99      8854



Unnamed: 0,real,fake
real,4325,10
fake,42,4477


In [21]:
%%time

# Random Forest model on TF-IDF features

# train model
rf.fit(tv_train_features, y_train)

# predict on test data
rf_tfidf_predictions = rf.predict(tv_test_features)

Wall time: 33.1 s


In [23]:
# Evaluate the TF-IDF random forest on the held-out split.

# display names for the two classes, in class-id order: 0 -> 'real', 1 -> 'fake'
labels = ['real', 'fake']

# per-class precision / recall / F1
print(classification_report(y_test, rf_tfidf_predictions))

# confusion matrix: rows are the true class, columns the predicted class.
# labels=[0, 1] pins the row/column order so it always lines up with `labels`.
pd.DataFrame(
    confusion_matrix(y_test, rf_tfidf_predictions, labels=[0, 1]),
    index=labels,
    columns=labels,
)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      4335
           1       1.00      0.99      1.00      4519

    accuracy                           1.00      8854
   macro avg       1.00      1.00      1.00      8854
weighted avg       1.00      1.00      1.00      8854



Unnamed: 0,real,fake
real,4324,11
fake,30,4489
