# This project aims to build an AI model which would be able to distinguish genuine news from fake news. It uses Natural Language Processing (NLP) tools.

### Importing pandas package for data manupulation and handling (mainly tabular data)
### And creating the data frames for the two news data sets.

In [93]:
import pandas as pd

In [94]:
# Creating the data-frame objects
trueNews_df = pd.read_csv("Data/True_News.csv")
fakeNews_df = pd.read_csv("Data/Fake_News.csv")

### Data Description

In [95]:
# For true news data set
print("-----------------------------------------------------------------------")
print("~~A short overview of True_News dataset~~", end="\n\n")
print(trueNews_df.info(), end="\n\n")
print("True_News dataset:-")
trueNews_df

-----------------------------------------------------------------------
~~A short overview of True_News dataset~~

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None

True_News dataset:-


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [96]:
# For fake news data set
print("-----------------------------------------------------------------------")
print("~~A short overview of Fake_News dataset~~", end="\n\n")
print(fakeNews_df.info(), end="\n\n")
print("Fake_News dataset:-")
fakeNews_df

-----------------------------------------------------------------------
~~A short overview of Fake_News dataset~~

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None

Fake_News dataset:-


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


### Data Cleaning

In [97]:
# Remove all the rows containing NaN/missing values
trueNews_df = trueNews_df.dropna()
fakeNews_df = fakeNews_df.dropna()

In [98]:
# Here in these data sets we are mainly focused on the news text.
# So it is safe to drop/remove the other columns for faster and easier data
# handling and text processing.
print("Before dropping:-")
print("Columns of true news data set: ", trueNews_df.columns)
print("Columns of fake news data set: ", fakeNews_df.columns)

trueNews_df = trueNews_df.drop(['title', 'subject', 'date'], axis=1)
fakeNews_df = fakeNews_df.drop(['title', 'subject', 'date'], axis=1)

print("\nAfter dropping:-")
print("Columns of true news data set: ", trueNews_df.columns)
print("Columns of fake news data set: ", fakeNews_df.columns)
display(trueNews_df)
display(fakeNews_df)

Before dropping:-
Columns of true news data set:  Index(['title', 'text', 'subject', 'date'], dtype='object')
Columns of fake news data set:  Index(['title', 'text', 'subject', 'date'], dtype='object')

After dropping:-
Columns of true news data set:  Index(['text'], dtype='object')
Columns of fake news data set:  Index(['text'], dtype='object')


Unnamed: 0,text
0,WASHINGTON (Reuters) - The head of a conservat...
1,WASHINGTON (Reuters) - Transgender people will...
2,WASHINGTON (Reuters) - The special counsel inv...
3,WASHINGTON (Reuters) - Trump campaign adviser ...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...
...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...
21413,"LONDON (Reuters) - LexisNexis, a provider of l..."
21414,MINSK (Reuters) - In the shadow of disused Sov...
21415,MOSCOW (Reuters) - Vatican Secretary of State ...


Unnamed: 0,text
0,Donald Trump just couldn t wish all Americans ...
1,House Intelligence Committee Chairman Devin Nu...
2,"On Friday, it was revealed that former Milwauk..."
3,"On Christmas day, Donald Trump announced that ..."
4,Pope Francis used his annual Christmas Day mes...
...,...
23476,21st Century Wire says As 21WIRE reported earl...
23477,21st Century Wire says It s a familiar theme. ...
23478,Patrick Henningsen 21st Century WireRemember ...
23479,21st Century Wire says Al Jazeera America will...


In [99]:
# Now to distinguish the true news from the fake ones, we shall assign a simple
# label to both the data sets.
# label = 0 for fake news
# label = 1 for true news
# We shall add a new column called label which will hold the values - 0 or 1
trueNews_df["label"] = 1
print("The true news data set after addition of 'label' column:-")
display(trueNews_df)
fakeNews_df["label"] = 0
print("The fake news data set after addition of 'label' column:-")
display(fakeNews_df)

The true news data set after addition of 'label' column:-


Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
21412,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1
21413,"LONDON (Reuters) - LexisNexis, a provider of l...",1
21414,MINSK (Reuters) - In the shadow of disused Sov...,1
21415,MOSCOW (Reuters) - Vatican Secretary of State ...,1


The fake news data set after addition of 'label' column:-


Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0
...,...,...
23476,21st Century Wire says As 21WIRE reported earl...,0
23477,21st Century Wire says It s a familiar theme. ...,0
23478,Patrick Henningsen 21st Century WireRemember ...,0
23479,21st Century Wire says Al Jazeera America will...,0


In [100]:
# Now we will concatinate the two data sets into one so we can work on one data-
# set and train and test our AI model.
news_df = pd.concat([trueNews_df, fakeNews_df], axis=0)
news_df.reset_index(drop=True, inplace=True) # restting the indices of the new
                                             # data frame
print("The new data frame object is as follows:-")
news_df

The new data frame object is as follows:-


Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
44893,21st Century Wire says As 21WIRE reported earl...,0
44894,21st Century Wire says It s a familiar theme. ...,0
44895,Patrick Henningsen 21st Century WireRemember ...,0
44896,21st Century Wire says Al Jazeera America will...,0


### Now that we have created a clean data frame object to work on, we shall focus on building the AI model
### We shall fisrt import nltk to use the tools required to build this model

In [101]:
import nltk

### TEXT PRE-PROCESSING

#### TOKENISATION
##### dividing a large piece of continuous text into distinct units or tokens for easier and faster text processing

In [102]:
nltk.download('punkt') # downloading the resource
from nltk.tokenize import word_tokenize
news_df['text'] = news_df['text'].apply(word_tokenize)
news_df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vkfak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,label
0,"[WASHINGTON, (, Reuters, ), -, The, head, of, ...",1
1,"[WASHINGTON, (, Reuters, ), -, Transgender, pe...",1
2,"[WASHINGTON, (, Reuters, ), -, The, special, c...",1
3,"[WASHINGTON, (, Reuters, ), -, Trump, campaign...",1
4,"[SEATTLE/WASHINGTON, (, Reuters, ), -, Preside...",1
...,...,...
44893,"[21st, Century, Wire, says, As, 21WIRE, report...",0
44894,"[21st, Century, Wire, says, It, s, a, familiar...",0
44895,"[Patrick, Henningsen, 21st, Century, WireRemem...",0
44896,"[21st, Century, Wire, says, Al, Jazeera, Ameri...",0


#### LEMMATISATION
##### reducing different forms of a word to a core root word to reduce the text size to be handled and processed.

In [105]:
nltk.download('wordnet') # downloading the resources for lemmatisation
from nltk.stem import WordNetLemmatizer
# Initialising the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Defining a function returning the lemmatised list of words
def lemmatise(sentence):
    return ([lemmatizer.lemmatize(word) for word in sentence])

# Applying lemmatization to the text of the news_df
news_df['text'] = news_df['text'].apply(lemmatise)

news_df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vkfak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,text,label
0,"[WASHINGTON, (, Reuters, ), -, The, head, of, ...",1
1,"[WASHINGTON, (, Reuters, ), -, Transgender, pe...",1
2,"[WASHINGTON, (, Reuters, ), -, The, special, c...",1
3,"[WASHINGTON, (, Reuters, ), -, Trump, campaign...",1
4,"[SEATTLE/WASHINGTON, (, Reuters, ), -, Preside...",1
...,...,...
44893,"[21st, Century, Wire, say, As, 21WIRE, reporte...",0
44894,"[21st, Century, Wire, say, It, s, a, familiar,...",0
44895,"[Patrick, Henningsen, 21st, Century, WireRemem...",0
44896,"[21st, Century, Wire, say, Al, Jazeera, Americ...",0


#### STOPWORDS REMOVAL
##### commonly used words (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore so that the amount of words to be processed is less and only the most important and significant words are considered

In [106]:
nltk.download("stopwords") # downloading the resources
from nltk.corpus import stopwords
print("These are the stopwords in English:-", stopwords.words('english'),
      sep="\n")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vkfak\AppData\Roaming\nltk_data...


These are the stopwords in English:-
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', '

[nltk_data]   Unzipping corpora\stopwords.zip.


In [107]:
# Defining a function returning a list of words excluding the stopwords
def removeStopwords(listOfWords):
    return ([word for word in listOfWords
             if word not in stopwords.words('english')])

In [108]:
# Applying the stopwords removal function to the news_df text
news_df['text'] = news_df['text'].apply(removeStopwords)

news_df

Unnamed: 0,text,label
0,"[WASHINGTON, (, Reuters, ), -, The, head, cons...",1
1,"[WASHINGTON, (, Reuters, ), -, Transgender, pe...",1
2,"[WASHINGTON, (, Reuters, ), -, The, special, c...",1
3,"[WASHINGTON, (, Reuters, ), -, Trump, campaign...",1
4,"[SEATTLE/WASHINGTON, (, Reuters, ), -, Preside...",1
...,...,...
44893,"[21st, Century, Wire, say, As, 21WIRE, reporte...",0
44894,"[21st, Century, Wire, say, It, familiar, theme...",0
44895,"[Patrick, Henningsen, 21st, Century, WireRemem...",0
44896,"[21st, Century, Wire, say, Al, Jazeera, Americ...",0


In [109]:
# Joining the words back to form the text that the AI model will train and test
# on
news_df['text'] = news_df['text'].apply(' '.join)

### Splitting Up of Data into Training Set and Testing Set

In [137]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(news_df['text'],
                                                news_df['label'],
                                                test_size=0.25)

print("The training set:-")
print("xTrain:-")
display(xTrain) # contains 'text'
print("yTrain:-")
display(yTrain) # contains 'label'
print("-----------------------------------------------------------------------")
print("The testing set:-")
print("xTest:-")
display(xTest) # contains 'text'
print("yTest:-")
display(yTest) # contains 'label' - predicted value should match with this value

The training set:-
xTrain:-


8297     WASHINGTON ( Reuters ) - U.S. Democrat Hillary...
29164    Last night , Hillary Clinton Democratic nomina...
27915    Ivy League economist Guido Menzio ha reportedl...
2425     WASHINGTON ( Reuters ) - Reince Priebus , wa r...
22556    Hours gunman opened fire Republican Congressme...
                               ...                        
35713    Wow ! Bernie Hillary making hay Flint water cr...
16899    TBILISI ( Reuters ) - Georgian President Giorg...
35380                                                     
40040    The Washington Post got called BIG TIME fake s...
19743    MOSCOW ( Reuters ) - Russia China agreed North...
Name: text, Length: 33673, dtype: object

yTrain:-


8297     1
29164    0
27915    0
2425     1
22556    0
        ..
35713    0
16899    1
35380    0
40040    0
19743    1
Name: label, Length: 33673, dtype: int64

-----------------------------------------------------------------------
The testing set:-
xTest:-


34285    Can Stop Laughing ! ? ? Bernie # BasementDwell...
3107     ( This June 19 story ha refiled correct Muelle...
27485    Bill Gates richest man world . Unlike Donald T...
12868    TUNIS ( Reuters ) - More 90 mayor across Libya...
36944    It early endorse Hillary ? What Al Gore waitin...
                               ...                        
9595     WASHINGTON ( Reuters ) - Planned Parenthood fi...
10256    HAVANA ( Reuters ) - U.S. President Barack Oba...
36903    To watch black comedian net worth $ 70 million...
42243    Wow ! We fast becoming like Europe sexual assa...
9020     WASHINGTON ( Reuters ) - The U.S. federal prog...
Name: text, Length: 11225, dtype: object

yTest:-


34285    0
3107     1
27485    0
12868    1
36944    0
        ..
9595     1
10256    1
36903    0
42243    0
9020     1
Name: label, Length: 11225, dtype: int64

### VECTORISATION
#### used to convert textual data to numerical format; a matrix is created - each column represents a feature, each row represents an individual review

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer
nc_tfidf = TfidfVectorizer(max_df=0.7)

# Calculating the TF-IDF value for both the sets
tfidf_train = nc_tfidf.fit_transform(xTrain)
tfidf_test = nc_tfidf.transform(xTest)

# Displaying the TF-IDF matrices
print("TF-IDF matrix for training set:-", tfidf_train, sep="\n", end='\n\n')
print("TF-IDF matrix for testing set:-", tfidf_test, sep="\n")

TF-IDF matrix for training set:-
  (0, 46541)	0.03899108135712689
  (0, 39669)	0.03877572594079617
  (0, 101323)	0.06401962771615816
  (0, 58938)	0.044473808664721504
  (0, 98724)	0.026793338168286114
  (0, 46534)	0.029021563635860965
  (0, 35848)	0.03075234993278476
  (0, 66499)	0.07157837036717703
  (0, 65936)	0.06860471348096842
  (0, 79561)	0.05430305114889709
  (0, 91884)	0.03983801341675674
  (0, 102036)	0.019075716927786555
  (0, 24569)	0.08389641105885311
  (0, 50464)	0.038015688462627525
  (0, 76588)	0.09856857169031856
  (0, 21113)	0.07171240388869166
  (0, 18707)	0.04764889142908793
  (0, 39514)	0.045560885039175165
  (0, 62166)	0.07839541702541221
  (0, 22432)	0.1051235330957045
  (0, 32735)	0.07940877064347954
  (0, 7964)	0.03634181673721099
  (0, 66501)	0.05824807863695535
  (0, 92785)	0.02630955804171978
  (0, 97826)	0.05796222484027728
  :	:
  (33672, 58192)	0.0397249158511457
  (33672, 93013)	0.01780466803477611
  (33672, 44454)	0.02182617780068318
  (33672, 103001)	0.

### LOGISTIC REGRESSION
#### ML algorithm; used for the classification problems; a predictive analysis algorithm; based on the concept of probability

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [134]:
nc_model_lr = LogisticRegression(max_iter=900)
nc_model_lr.fit(tfidf_train, yTrain)
predictions_lr = nc_model_lr.predict(tfidf_test)
accuracy_lr = accuracy_score(yTest, predictions_lr)
print("The accuracy percentage of this AI model is ", accuracy_lr * 100, "%",
      sep="")

The accuracy percentage of this AI model is 98.72605790645879%


### PASSIVE-AGGRESSIVE CLASSIFIER
#### passive: if prediction is correct, keep the model; aggressive: if prediction is incorrect, change the model to correct it

In [135]:
from sklearn.linear_model import PassiveAggressiveClassifier

nc_model_pac = PassiveAggressiveClassifier(max_iter=50)
nc_model_pac.fit(tfidf_train, yTrain)

PassiveAggressiveClassifier(max_iter=50)

In [136]:
predictions_pac = nc_model_pac.predict(tfidf_test)
accuracy = accuracy_score(yTest, predictions_pac)
print("The accuracy percentage of this AI model is ", accuracy * 100, "%",
      sep="")

The accuracy percentage of this AI model is 99.40311804008908%
