In [50]:
import pandas as pd
import numpy as np
import re  #checks for matching strings in a set of words (diminutive for regular expressions)
from nltk.corpus import stopwords #Filters non-impactful words from sentences
from nltk.stem.porter import PorterStemmer #Finds the root words in a set of words
from sklearn.feature_extraction.text import TfidfVectorizer #Transform strings into numerical values
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression #This model is used for classification for supervised models
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [2]:
FN_df = pd.read_csv('fake_news.csv')
FN_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
FN_df.shape

(20800, 5)

In [4]:
#Statistical data
#It only applies to numerical values that's why we don't see the other columns
FN_df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [5]:
#As we can see here there are some missing values in text columns
#We will choose to drop the columns because we can't apply statistical methods

In [6]:
#Drop all missing values
FN_df_2 = FN_df.dropna()

In [7]:
#data has been cleaned
FN_df_2.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [8]:
#Shape of the cleaned data
FN_df_2.shape

(18285, 5)

In [9]:
FN_df_2['label'].value_counts()

0    10361
1     7924
Name: label, dtype: int64

In [10]:
#Let's combine the two columns 'title' and 'author' into one column called 'author/title'
#The purpose of this is to apply the ML model on that column
#The 'text' column is massive and it will take forever to do the processing on it
FN_df_2['author/title'] = FN_df_2['author'] +' '+ FN_df_2['title']
FN_df_2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FN_df_2['author/title'] = FN_df_2['author'] +' '+ FN_df_2['title']


Unnamed: 0,id,title,author,text,label,author/title
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/riadanas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
#printign all stop words in english language
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
#Stemming reduces the word to its root (i.e: alive, lively => live)
port_stem = PorterStemmer()

In [19]:
#stemming function that reads regular expressions in the content
#Then transforms all letters into lower case
#Then the content will be split on lists
#then the function will stem the words that are not stopwords
#Finally, we joing all the stem words together
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [20]:
FN_df_2['author/title'] = FN_df_2['author/title'].apply(stemming)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FN_df_2['author/title'] = FN_df_2['author/title'].apply(stemming)


In [22]:
#We can see that the text have been stemmed properly
print(FN_df_2['author/title'])

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: author/title, Length: 18285, dtype: object


In [23]:
X = FN_df_2['author/title'] 
Y = FN_df_2['label']

In [24]:
print(X)
print(Y)

0        darrel lucu hous dem aid even see comey letter...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exercis ...
20799                            david swanson keep f aliv
Name: author/title, Length: 18285, dtype: object
0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 18285, dtype: int64


### in order to do machine learning on this data we need to convert text data into numerical values

In [28]:
#Term frequency and inverse document frequency function to transform strings into numerical values
#We only do it on 'X' because 'Y' is binary (0,1) so no need for conversion
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [29]:
print(X)

  (0, 14626)	0.2853880981846006
  (0, 12567)	0.25566372256502734
  (0, 8310)	0.3609049070394367
  (0, 8048)	0.29347549279156676
  (0, 7190)	0.24556189342497173
  (0, 6552)	0.21745594418933306
  (0, 4637)	0.23016077319140021
  (0, 3543)	0.2684494960336511
  (0, 3359)	0.3609049070394367
  (0, 2757)	0.2466340295002162
  (0, 2312)	0.3745612250433202
  (0, 247)	0.26982554594264346
  (1, 15663)	0.3053027963338981
  (1, 6377)	0.19285723710368197
  (1, 5140)	0.7119376870709988
  (1, 3328)	0.2623789770430963
  (1, 2619)	0.19368327535633711
  (1, 2066)	0.38191890436039194
  (1, 1764)	0.1509985164277699
  (1, 1391)	0.29617980713962144
  (2, 14560)	0.4180284001448272
  (2, 8973)	0.4948460479407663
  (2, 5579)	0.3490632212946542
  (2, 5031)	0.38709995799949964
  (2, 2895)	0.4581003415623782
  :	:
  (18282, 12239)	0.252743907968046
  (18282, 11515)	0.2748252773264482
  (18282, 11321)	0.24588400571511215
  (18282, 9605)	0.07665665104558947
  (18282, 8942)	0.1712955017712004
  (18282, 8879)	0.29296479

## Train / test split

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, stratify=Y, random_state=2)

In [33]:
#Model chosen: LogisticRegression which is a classification ML model
model = LogisticRegression()

In [40]:
#Let's do it first on the training set
training_FN = model.fit(X_train, Y_train)

In [38]:
X_train_predic = model.predict(X_train)
X_train_accuracy = accuracy_score(X_train_predic, Y_train)

In [39]:
print("The training accuracy is: ", X_train_accuracy * 100, "%")

The training accuracy is:  99.0824015556636 %


In [41]:
#Let's do it on the testing set
testing_FN = model.fit(X_test, Y_test)

In [43]:
X_test_predic = model.predict(X_test)
X_test_accuracy = accuracy_score(X_test_predic, Y_test)

In [44]:
print("The testing accuracy is: ", X_test_accuracy * 100, "%")

The testing accuracy is:  98.96118097320941 %


## Predictive system

We're going to create a predictive system and test a few values from the dataset and see if the model responds correctly. So after choosing randomly in the dataset, row=2 => 1 (fake news), row=5 => 0 (Real news), let's try them

In [48]:
X_new = X_test[5]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [49]:
print(Y_test[5])

0


The model responds very accurately.