# NLP Fake News Prediction

In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt

In [4]:
df=pd.read_csv('news.csv', index_col=None)
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [5]:
# Remove column
dataset=df.drop("Unnamed: 0",axis=1)
dataset

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...
6330,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [8]:
# input and output split and train and testset split
y=dataset["label"]
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], y, test_size=0.30, random_state=53)
X_train,

(2576                                                     
 1539    Report Copyright Violation Do you think there ...
 5163    The election in 232 photos, 43 numbers and 131...
 2615    Email Ever wonder what’s on the mind of today’...
 4270    Wells Fargo is Rotting from the Top Down Wells...
                               ...                        
 662     —Debby Borza stood before a wall of photos of ...
 3261    Presumptive Republican nominee Donald Trump ha...
 5883    December's job growth numbers are in, and they...
 2933    In a wide-ranging discussion, Trump also said ...
 797     Top officials of the Cruz campaign are convinc...
 Name: text, Length: 4244, dtype: object,)

In [20]:
# CountVectorizer preprocessing
count_vectorizer = CountVectorizer(stop_words='english')
count_train_x = count_vectorizer.fit_transform(X_train)
print(count_train)
count_test_x = count_vectorizer.transform(X_test)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1119820 stored elements and shape (4244, 56922)>
  Coords	Values
  (1, 42470)	1
  (1, 12105)	1
  (1, 54177)	1
  (1, 50628)	1
  (1, 15924)	2
  (1, 44520)	2
  (1, 51896)	2
  (1, 35783)	4
  (1, 35256)	1
  (1, 21881)	1
  (1, 42534)	1
  (1, 8399)	1
  (1, 29531)	2
  (1, 15927)	2
  (1, 25686)	1
  (1, 49203)	2
  (1, 16814)	1
  (1, 36087)	1
  (1, 21568)	1
  (1, 25684)	1
  (1, 38823)	1
  (1, 47506)	1
  (1, 36831)	1
  (2, 16972)	1
  (2, 762)	1
  :	:
  (4243, 41435)	1
  (4243, 53607)	1
  (4243, 659)	1
  (4243, 38834)	1
  (4243, 19003)	1
  (4243, 11415)	1
  (4243, 7545)	1
  (4243, 22426)	1
  (4243, 54007)	1
  (4243, 7113)	1
  (4243, 4932)	1
  (4243, 39497)	1
  (4243, 50053)	1
  (4243, 38849)	1
  (4243, 20702)	1
  (4243, 42139)	1
  (4243, 17247)	1
  (4243, 50052)	1
  (4243, 55228)	1
  (4243, 29255)	1
  (4243, 49435)	1
  (4243, 11257)	1
  (4243, 52945)	1
  (4243, 20905)	1
  (4243, 7962)	1


In [21]:
len(count_vectorizer.get_feature_names_out())

56922

In [32]:
print(count_train_x.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [24]:
#model creation
clf = MultinomialNB()
clf.fit(count_train_x, y_train)

# Model validation
y_pred = clf.predict(count_test_x)
score = metrics.accuracy_score(y_test, y_pred)
print("accuracy:   %0.3f" % score)

cm = metrics.confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])
cm

accuracy:   0.893


array([[ 865,  143],
       [  80, 1003]])

In [28]:
from sklearn.metrics import classification_report
report=classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

        FAKE       0.92      0.86      0.89      1008
        REAL       0.88      0.93      0.90      1083

    accuracy                           0.89      2091
   macro avg       0.90      0.89      0.89      2091
weighted avg       0.89      0.89      0.89      2091



In [29]:
dataset["text"][0]



In [37]:
print(count_train_x[[0]])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 0 stored elements and shape (1, 56922)>


In [None]:
# deployment

In [41]:
X_train[[2]]

2    U.S. Secretary of State John F. Kerry said Mon...
Name: text, dtype: object

In [44]:
y_train[[2]]

2    REAL
Name: label, dtype: object

In [43]:
count_train_x[[2]]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (1, 56922)>

In [42]:
#model loaded and predicti
clf.predict(count_train_x[[2]])

array(['REAL'], dtype='<U4')