# Detect Fake News

In [92]:
# import library
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus.reader import wordnet
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [94]:
import warnings
warnings.filterwarnings("ignore")

In [55]:
# read data
df = pd.read_csv("news.csv")
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


### Preprocessing

In [56]:
df.rename(columns={"Unnamed: 0": "id"}, inplace=True)
df

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [57]:
# check id null
np.where(df['id'].isnull())

(array([], dtype=int64),)

In [58]:
# check title null
np.where(df['title'].isnull())

(array([], dtype=int64),)

In [59]:
# check text null
np.where(df['text'].isnull())

(array([], dtype=int64),)

In [60]:
# check lable null
np.where(df['label'].isnull())

(array([], dtype=int64),)

In [61]:
# check label type
np.unique(df['label'], return_counts=True)

(array(['FAKE', 'REAL'], dtype=object), array([3164, 3171], dtype=int64))

Everything is OK. Next, i will clean text to make my model more efficient.

In [62]:
# clean text
stop_w = stopwords.words('English')
stop_w[:10]
# A stop word is a commonly used word (such as “the”, “a”, “an”, “in” ...), we should filtered out before
# processing of natural language processing.

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [63]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [64]:
def tag(t):
    if t.startswith("N"):
        return wordnet.NOUN
    elif t.startswith("V"):
        return wordnet.VERB
    elif t.startswith("R"):
        return wordnet.ADV
    elif t.startswith("J"):
        return wordnet.ADJ
    else:
        return wordnet.NOUN

In [65]:
def remove_num(text, num_only=False):
    if not num_only:
        # just remove number like 1, 123123, 55325794, ...
        token = [w for w in text if not w.isdigit()]
    else:
        # remove all word which contains num like 1, 213, as89, 12th, 3rd ...
        token = [w for w in text if not any(map(str.isdigit, w))]
    return ' '.join(token)

In [66]:
def clean_text(text, num_only=False):
    # First of all, remove special character.
    # string.punctuation contains all special character like @ $ # * & ...
    translators = str.maketrans("", "", string.punctuation)
    text = text.translate(translators)
    
    # Next, i will lowercase character and remove number.
    words = word_tokenize(text.lower())
    tokens = word_tokenize(remove_num(words, False))
    
    # Then, remove stop word and word that length <= 1
    # Because, word has length <= 1 usually dont have meaning
    tokens = [w for w in tokens if w not in stop_w and len(w) > 1]
    
    # Finally, Process lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.pos_tag(tokens)
    tokens = [lemmatizer.lemmatize(w, pos=tag(wn)) for w, wn in tokens]
    
    return " ".join(tokens)

In [67]:
df['text'] = df['text'].apply(clean_text)
df.head(5)

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,daniel greenfield shillman journalism fellow f...,FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,u secretary state john kerry say monday stop p...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november lesson tonight...,FAKE
4,875,The Battle of New York: Why This Primary Matters,primary day new york frontrunners hillary clin...,REAL


### Feature engineering

1. Using Doc2Vec

In [70]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(df['text'].apply(lambda x: x.split(" ")))]

# Train a Doc2Vec model with out text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# Transform each documnet into a vector data
doc2vec_df = df["text"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df.head(5)

Unnamed: 0,0,1,2,3,4
0,2.109134,-2.335723,-0.908344,-1.834141,-3.542918
1,1.170258,0.52366,1.020423,-1.164976,-2.90197
2,-1.616024,-0.490358,-1.698216,-2.277802,-1.866062
3,1.213935,1.09331,0.836142,-1.584317,-2.744152
4,1.073639,1.758667,0.711466,-2.378876,-2.066014


In [73]:
doc2vec_df.columns = ['doc2vec_vector_' + str(x) for x in doc2vec_df.columns]
new_df = pd.concat([df, doc2vec_df], axis=1)
new_df

Unnamed: 0,id,title,text,label,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
0,8476,You Can Smell Hillary’s Fear,daniel greenfield shillman journalism fellow f...,FAKE,2.109134,-2.335723,-0.908344,-1.834141,-3.542918
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE,1.170258,0.523660,1.020423,-1.164976,-2.901970
2,3608,Kerry to go to Paris in gesture of sympathy,u secretary state john kerry say monday stop p...,REAL,-1.616024,-0.490358,-1.698216,-2.277802,-1.866062
3,10142,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november lesson tonight...,FAKE,1.213935,1.093310,0.836142,-1.584317,-2.744152
4,875,The Battle of New York: Why This Primary Matters,primary day new york frontrunners hillary clin...,REAL,1.073639,1.758667,0.711466,-2.378876,-2.066014
...,...,...,...,...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,state department tell republican national comm...,REAL,-0.593983,-1.499963,-3.060659,-1.646625,-4.594342
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,pb stand plutocratic pentagon post oct wikimed...,FAKE,-1.569178,-2.350283,-0.210426,-2.556691,-1.001033
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,antitrump protester tool oligarchy reform alw...,FAKE,-0.088734,-2.758209,-0.456095,-3.268192,-0.790693
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...",addis ababa ethiopia —president obama convene ...,REAL,-2.576141,-0.768642,-0.944718,-3.748509,-0.765507


2. Using TfIdfVectorizer

In [77]:
tfidf = TfidfVectorizer()

In [78]:
tfidf_res = tfidf.fit_transform(df['text']).toarray()
tfidf_res[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [82]:
tfidf_df = pd.DataFrame(tfidf_res, columns=tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.head(5)

Unnamed: 0,word_00017b2908ff9fa45188d243fd49aaeeb2dhrcofficecom,word_0004s,word_0005s,word_0006s,word_0007s,word_0008s,word_00addmouselistenerthis,word_00addmousemotionlistenerthis,word_00repaint,word_00setpreferredsizenew,...,word_حلب,word_عربي,word_عن,word_لم,word_ما,word_محاولات,word_من,word_هذا,word_والمرضى,word_ยงade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
tfidf_df.index = new_df.index
new_df = pd.concat([new_df, tfidf_df], axis=1)
new_df.head()

Unnamed: 0,id,title,text,label,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,word_00017b2908ff9fa45188d243fd49aaeeb2dhrcofficecom,...,word_حلب,word_عربي,word_عن,word_لم,word_ما,word_محاولات,word_من,word_هذا,word_والمرضى,word_ยงade
0,8476,You Can Smell Hillary’s Fear,daniel greenfield shillman journalism fellow f...,FAKE,2.109134,-2.335723,-0.908344,-1.834141,-3.542918,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,FAKE,1.170258,0.52366,1.020423,-1.164976,-2.90197,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3608,Kerry to go to Paris in gesture of sympathy,u secretary state john kerry say monday stop p...,REAL,-1.616024,-0.490358,-1.698216,-2.277802,-1.866062,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10142,Bernie supporters on Twitter erupt in anger ag...,kaydee king kaydeeking november lesson tonight...,FAKE,1.213935,1.09331,0.836142,-1.584317,-2.744152,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,875,The Battle of New York: Why This Primary Matters,primary day new york frontrunners hillary clin...,REAL,1.073639,1.758667,0.711466,-2.378876,-2.066014,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### NOW WE HAVE LIST OF FEATURES THAT WERE CONBINED OF 2 APPROACHES
+ Doc2Vec
+ TF-IDF

In [89]:
df_train = new_df.drop(['id', 'title', 'text', 'label'], axis=1)

In [90]:
df_train

Unnamed: 0,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,word_00017b2908ff9fa45188d243fd49aaeeb2dhrcofficecom,word_0004s,word_0005s,word_0006s,word_0007s,...,word_حلب,word_عربي,word_عن,word_لم,word_ما,word_محاولات,word_من,word_هذا,word_والمرضى,word_ยงade
0,2.109134,-2.335723,-0.908344,-1.834141,-3.542918,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.170258,0.523660,1.020423,-1.164976,-2.901970,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.616024,-0.490358,-1.698216,-2.277802,-1.866062,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.213935,1.093310,0.836142,-1.584317,-2.744152,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.073639,1.758667,0.711466,-2.378876,-2.066014,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6330,-0.593983,-1.499963,-3.060659,-1.646625,-4.594342,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6331,-1.569178,-2.350283,-0.210426,-2.556691,-1.001033,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6332,-0.088734,-2.758209,-0.456095,-3.268192,-0.790693,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6333,-2.576141,-0.768642,-0.944718,-3.748509,-0.765507,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(df_train, new_df['label'], test_size=0.3)

I will try with 3 algorithms to choose the best in this case.
+ PassiveAggressiveClassifier
+ LogisticRegression
+ RandomForestRegression


In [95]:
algo = [
    ('LGR', LogisticRegression()),
    ('RF', RandomForestClassifier()),
    ('PAC', PassiveAggressiveClassifier(max_iter=50))
]

for name, alg in algo:
    y_pred = alg.fit(X_train, y_train).predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print('{}: {}'.format(name, score))

LGR: 0.9042609153077328
RF: 0.9105733824302998
PAC: 0.9305628616517623


I got an accuracy of 93.05% with PAC. So, let’s chose it and print out a confusion matrix to gain insight into the number of false and true negatives and positives.

In [97]:
pac = PassiveAggressiveClassifier(max_iter=50)
y_pred = pac.fit(X_train, y_train).predict(X_test)


In [98]:
confusion_matrix(y_test, y_pred)

array([[937,  41],
       [ 92, 831]], dtype=int64)

So with this model, I has 937 true positives, 831 true negatives, 92 false positives, and 41 false negatives.