In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import bigrams
from nltk.stem import PorterStemmer

import re
import datetime



[nltk_data] Downloading package stopwords to /Users/kilo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. Get data 
[data source](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset)

In [2]:
# Load the two source CSV files: Fake.csv into df_0 and True.csv into df_1.
df_0 = pd.read_csv("../data/Fake.csv")
df_1 = pd.read_csv("../data/True.csv")

### 2. EDA

<img src="../images/Screenshot 2021-05-17 at 16.24.54.png" width="300" height="50" />

In [3]:
df_0.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
df_1.tail()

Unnamed: 0,title,text,subject,date
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017"


In [5]:
# adding category 0 to fake news and category 1 to true news
df_0["category"] = 0
df_1["category"] = 1

In [6]:
# Stack the fake (category 0) and true (category 1) frames into one dataset.
# ignore_index=True renumbers the rows 0..n-1 during the concat, which replaces
# the reset_index() + drop(['index']) round trip and yields the same frame.
df = pd.concat([df_0, df_1], axis=0, ignore_index=True)
df

Unnamed: 0,title,text,subject,date,category
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [7]:
# Save the merged dataframe as CSV without the index column.
# The path has no placeholders, so a plain string literal is used instead of an f-string.
df.to_csv('../data/df_fakenews_merge.csv', index=False)

In [8]:
# quick overview of the new dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44898 non-null  object
 1   text      44898 non-null  object
 2   subject   44898 non-null  object
 3   date      44898 non-null  object
 4   category  44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [9]:
# Return the number of missing values in each column
df.isnull( ).sum( )

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [10]:
# Unique values in the "subject" column
df["subject"].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east', 'politicsNews', 'worldnews'], dtype=object)

In [11]:
# Statistical summary of the numerical columns in the dataset.
# It is not very informative here, because the only numerical column is the 0/1 "category" label.
df.describe()

Unnamed: 0,category
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [12]:
# getting number of dimensions as well as the size in each dimension
df.shape

(44898, 5)

### 3. Data Cleaning

<img src="../images/data_cleaning.jpeg" width="300" height="50" />

In [13]:
# Detect missing values for an array-like object.
df.isnull().sum()

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [14]:
# Drop every row that contains a NaN / missing value in any column.
# Rebinding the result (instead of inplace=True) keeps the cell chainable and
# avoids mutating a frame that earlier cells have already displayed.
df = df.dropna()

In [15]:
# analyzing duplicated values
df.duplicated().sum()

209

In [16]:
# drop duplicates
df = df.drop_duplicates()

In [17]:
# counting duplicated rows in "title" column
df["title"].duplicated().sum()

5960

In [18]:
df["text"].duplicated().sum()

6043

---------------

##### I want to see how many duplicated "titles" and "texts" belong to fake_news 

In [19]:
df_0["title"].duplicated().sum()

5578

In [20]:
df_0["text"].duplicated().sum()

6026

Most of the duplicated rows belong to fake news. Perhaps this reflects the need to repeat a message in order to reinforce it, or perhaps simply the limited imagination of those who invent lies.

-------

In [21]:
# drop duplicated rows on "text" column
df = df.drop_duplicates(subset=['text'])

In [22]:
# drop duplicated rows on "title" column
df = df.drop_duplicates(subset=['title'])

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38270 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     38270 non-null  object
 1   text      38270 non-null  object
 2   subject   38270 non-null  object
 3   date      38270 non-null  object
 4   category  38270 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.8+ MB


In [24]:
# Text-cleaning setup used by the cleaning function defined below; it is tried on a
# sample first and later applied to the whole dataframe.
# The corpus module is imported again here because the function defined below reuses
# the name `stopwords`; the re-import lets this cell run again after that definition.
from nltk.corpus import stopwords

ps = PorterStemmer()  # only referenced by the commented-out stemming step below
en_stops = stopwords.words('english')
# Extended stop list: the English list plus two extra words.
us_stops = stopwords.words('english') + ["us", "would"]
pattern = '[0-9]'  # any single digit
pattern2 = "[_]"  # an underscore
    
def stopwords(news):
    """Normalise one news string: lowercase it, drop stop words and strip noise.

    NOTE: this function reuses the name of the ``nltk.corpus.stopwords`` module,
    so once this definition has run, ``stopwords`` refers to the function. The
    name is kept because later cells call it through ``.apply(stopwords)``.

    Relies on the module-level ``us_stops``, ``en_stops``, ``pattern`` and
    ``pattern2`` defined just before this function.

    Parameters
    ----------
    news : str
        Raw title or article text.

    Returns
    -------
    str
        The cleaned text, tokens separated by spaces. Runs of several spaces
        can remain where a token was removed.
    """
    # Build sets once per call: membership tests are then constant-time instead
    # of scanning the whole stop list for every word.
    first_pass_stops = set(us_stops)
    second_pass_stops = set(en_stops)

    kept = [word for word in news.lower().split() if word not in first_pass_stops]
    news = " ".join(kept)
    news = re.sub(r'\$[^\s]+', 'dollar', news) # Changing any "$..." token to the word "dollar"
    news = re.sub(r'https?://\S+|www\.\S+', '', news) # Removing web links from the string
    news = re.sub(r'[^\w\s]+', "", news) # Removing every character that is neither word character nor whitespace
    news = re.sub(r" \d+", " ", news) # Removing digit runs that follow a space
    # Removing single-character tokens. The look-ahead leaves the following space
    # unconsumed, so consecutive single letters ("b c d") are all removed; the
    # previous pattern consumed that space and skipped every second letter.
    news = re.sub(r'(?:^| )\w(?= |$)', '', news)
    news = re.sub("reuters", "", news)
    tokens = news.split(" ")
    tokens = [re.sub(pattern, '', token) for token in tokens]
    tokens = [re.sub(pattern2, '', token) for token in tokens]
    #tokens = list(map(lambda x: ps.stem(x),tokens)) # Using PorterStemmer to get the standard version of some words (example: working= work, worked= work)
    # Second stop-word pass: earlier substitutions can leave new stop words behind.
    tokens = [token for token in tokens if token not in second_pass_stops]
    return " ".join(tokens)

In [25]:
en_stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [26]:
df['text'] = df['text'].apply(stopwords)
df['title'] = df['title'].apply(stopwords)
df

Unnamed: 0,title,text,subject,date,category
0,donald trump sends embarrassing new years eve ...,donald trump wish americans happy new year lea...,News,"December 31, 2017",0
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,"December 31, 2017",0
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,"December 30, 2017",0
3,trump obsessed even obamas name coded website ...,christmas day donald trump announced back work...,News,"December 29, 2017",0
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,"December 25, 2017",0
...,...,...,...,...,...
44892,north korea shipments syria chemical arms agen...,united nations two north korean shipments sy...,worldnews,"August 21, 2017",1
44894,lexisnexis withdrew two products chinese market,london lexisnexis provider legal regulatory ...,worldnews,"August 22, 2017",1
44895,minsk cultural hub becomes authorities,minsk shadow disused sovietera factories min...,worldnews,"August 22, 2017",1
44896,vatican upbeat possibility pope francis visiti...,moscow vatican secretary state cardinal piet...,worldnews,"August 22, 2017",1


#### Now I am going to work on the date column

In [27]:
# Some rows hold other information instead of a date in this column; keep only rows whose date string is shorter than 22 characters
df = df[df['date'].str.len() < 22]

In [28]:
# The date column is stored with object dtype; this helper converts it to datetime.
def date_time(dtime):
    """Convert ``dtime`` with ``pd.to_datetime`` and print a short report.

    Prints the dtype before and after the conversion, followed by the smallest
    and largest converted values, then returns the converted object.
    """
    print("object type: ", dtime.dtype)
    converted = pd.to_datetime(dtime)
    print("object type: ", converted.dtype)
    # Report the first and last date covered by the data.
    for label, stat in (("min time: ", converted.min), ("max time: ", converted.max)):
        print(label, stat())
    return converted
df['date'] = date_time(df['date'])

object type:  object
object type:  datetime64[ns]
min time:  2015-03-31 00:00:00
max time:  2018-02-19 00:00:00


In [29]:
def split_words(news):
    """Return the whitespace-separated tokens of ``news`` as a list."""
    return news.split()

In [30]:
df['text'] = df['text'].apply(split_words)
df['title'] = df['title'].apply(split_words)
df

Unnamed: 0,title,text,subject,date,category
0,"[donald, trump, sends, embarrassing, new, year...","[donald, trump, wish, americans, happy, new, y...",News,2017-12-31,0
1,"[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev...",News,2017-12-31,0
2,"[sheriff, david, clarke, becomes, internet, jo...","[friday, revealed, former, milwaukee, sheriff,...",News,2017-12-30,0
3,"[trump, obsessed, even, obamas, name, coded, w...","[christmas, day, donald, trump, announced, bac...",News,2017-12-29,0
4,"[pope, francis, called, donald, trump, christm...","[pope, francis, used, annual, christmas, day, ...",News,2017-12-25,0
...,...,...,...,...,...
44892,"[north, korea, shipments, syria, chemical, arm...","[united, nations, two, north, korean, shipment...",worldnews,2017-08-21,1
44894,"[lexisnexis, withdrew, two, products, chinese,...","[london, lexisnexis, provider, legal, regulato...",worldnews,2017-08-22,1
44895,"[minsk, cultural, hub, becomes, authorities]","[minsk, shadow, disused, sovietera, factories,...",worldnews,2017-08-22,1
44896,"[vatican, upbeat, possibility, pope, francis, ...","[moscow, vatican, secretary, state, cardinal, ...",worldnews,2017-08-22,1


In [31]:
def joining_list(news):
    """Join the items of ``news`` into one string, separated by single spaces.

    Each item is passed through ``str`` first, so non-string items are accepted.
    """
    return " ".join(map(str, news))

In [32]:
df['text'] = df['text'].apply(joining_list)
df['title'] = df['title'].apply(joining_list)
df

Unnamed: 0,title,text,subject,date,category
0,donald trump sends embarrassing new years eve ...,donald trump wish americans happy new year lea...,News,2017-12-31,0
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,2017-12-31,0
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,2017-12-30,0
3,trump obsessed even obamas name coded website ...,christmas day donald trump announced back work...,News,2017-12-29,0
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,2017-12-25,0
...,...,...,...,...,...
44892,north korea shipments syria chemical arms agen...,united nations two north korean shipments syri...,worldnews,2017-08-21,1
44894,lexisnexis withdrew two products chinese market,london lexisnexis provider legal regulatory bu...,worldnews,2017-08-22,1
44895,minsk cultural hub becomes authorities,minsk shadow disused sovietera factories minsk...,worldnews,2017-08-22,1
44896,vatican upbeat possibility pope francis visiti...,moscow vatican secretary state cardinal pietro...,worldnews,2017-08-22,1


In [33]:
# Number of whitespace-separated words per row, for the text and for the title,
# computed with the vectorised .str accessor.
df["length_text"] = df["text"].str.split().str.len()
df["length_title"] = df["title"].str.split().str.len()
df

Unnamed: 0,title,text,subject,date,category,length_text,length_title
0,donald trump sends embarrassing new years eve ...,donald trump wish americans happy new year lea...,News,2017-12-31,0,254,9
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,2017-12-31,0,178,8
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,2017-12-30,0,315,10
3,trump obsessed even obamas name coded website ...,christmas day donald trump announced back work...,News,2017-12-29,0,239,8
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,2017-12-25,0,208,7
...,...,...,...,...,...,...,...
44892,north korea shipments syria chemical arms agen...,united nations two north korean shipments syri...,worldnews,2017-08-21,1,275,10
44894,lexisnexis withdrew two products chinese market,london lexisnexis provider legal regulatory bu...,worldnews,2017-08-22,1,74,6
44895,minsk cultural hub becomes authorities,minsk shadow disused sovietera factories minsk...,worldnews,2017-08-22,1,168,5
44896,vatican upbeat possibility pope francis visiti...,moscow vatican secretary state cardinal pietro...,worldnews,2017-08-22,1,116,7


In [34]:
print(df.iloc[6,1])

donald trump spent good portion day golf club marking th day done since taking oath office must bad game trump lashed fbi deputy director andrew mccabe twitter following report saying mccabe plans retire months report follows mccabe testimony front congressional committees week well mounting criticism republicans regarding russia probeso naturally trump attacked mccabe lie fbi deputy director andrew mccabe man charge along leakin james comey phony hillary clinton investigation including illegally deleted emails given dollar wife campaign clinton puppets investigation trump tweetedhow fbi deputy director andrew mccabe man charge along leakin james comey phony hillary clinton investigation including illegally deleted emails given dollar wife campaign clinton puppets investigation donald trump realdonaldtrump december stop therefbi deputy director andrew mccabe racing clock retire full benefits days go donald trump realdonaldtrump december wow fbi lawyer james baker reassigned according f

In [35]:
df = df[df['text'].notnull()]
df

Unnamed: 0,title,text,subject,date,category,length_text,length_title
0,donald trump sends embarrassing new years eve ...,donald trump wish americans happy new year lea...,News,2017-12-31,0,254,9
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,2017-12-31,0,178,8
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,2017-12-30,0,315,10
3,trump obsessed even obamas name coded website ...,christmas day donald trump announced back work...,News,2017-12-29,0,239,8
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,2017-12-25,0,208,7
...,...,...,...,...,...,...,...
44892,north korea shipments syria chemical arms agen...,united nations two north korean shipments syri...,worldnews,2017-08-21,1,275,10
44894,lexisnexis withdrew two products chinese market,london lexisnexis provider legal regulatory bu...,worldnews,2017-08-22,1,74,6
44895,minsk cultural hub becomes authorities,minsk shadow disused sovietera factories minsk...,worldnews,2017-08-22,1,168,5
44896,vatican upbeat possibility pope francis visiti...,moscow vatican secretary state cardinal pietro...,worldnews,2017-08-22,1,116,7


In [36]:
# dropna() removes every row with a missing value in any column, which already
# covers the 'text' column, so a separate notnull() filter on 'text' is redundant.
# NOTE(review): a cleaned text that ended up as an empty string is not a missing
# value and is therefore kept — confirm whether such rows should be dropped too.
df = df.dropna()
df

Unnamed: 0,title,text,subject,date,category,length_text,length_title
0,donald trump sends embarrassing new years eve ...,donald trump wish americans happy new year lea...,News,2017-12-31,0,254,9
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,2017-12-31,0,178,8
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,2017-12-30,0,315,10
3,trump obsessed even obamas name coded website ...,christmas day donald trump announced back work...,News,2017-12-29,0,239,8
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,2017-12-25,0,208,7
...,...,...,...,...,...,...,...
44892,north korea shipments syria chemical arms agen...,united nations two north korean shipments syri...,worldnews,2017-08-21,1,275,10
44894,lexisnexis withdrew two products chinese market,london lexisnexis provider legal regulatory bu...,worldnews,2017-08-22,1,74,6
44895,minsk cultural hub becomes authorities,minsk shadow disused sovietera factories minsk...,worldnews,2017-08-22,1,168,5
44896,vatican upbeat possibility pope francis visiti...,moscow vatican secretary state cardinal pietro...,worldnews,2017-08-22,1,116,7


In [37]:
# Save the cleaned (unstemmed) dataframe as CSV without the index column.
# The path has no placeholders, so a plain string literal is used instead of an f-string.
df.to_csv('../data/df_fake_news_without_steemming.csv', index=False)