In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import bigrams
from nltk.stem import PorterStemmer

import re
import datetime



[nltk_data] Downloading package stopwords to /Users/kilo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### <span style="color:#003049">1. Get data </span>
[data source](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset) 

In [2]:
# Load the two source files: Fake.csv (fake news) into df_0 and True.csv (true news) into df_1
df_0 = pd.read_csv("../data/Fake.csv")
df_1 = pd.read_csv("../data/True.csv")

### <span style="color:#003049">2. EDA</span> 

<img src="../images/Screenshot 2021-05-17 at 16.24.54.png" width="300" height="50" />

In [3]:
# Preview of the fake-news frame
df_0

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [4]:
# Last rows of the true-news frame
df_1.tail()

Unnamed: 0,title,text,subject,date
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017"


In [5]:
# Label the two sources: 0 marks fake news (df_0), 1 marks true news (df_1)
df_0 = df_0.assign(category=0)
df_1 = df_1.assign(category=1)

In [6]:
# Stack fake and true news into a single frame; ignore_index=True renumbers
# the rows 0..n-1 instead of keeping each source's own index
df = pd.concat([df_0, df_1], axis=0, ignore_index=True)
df

Unnamed: 0,title,text,subject,date,category
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [7]:
# Persist the merged frame as CSV, without writing the row index
df.to_csv(path_or_buf='../data/df_fakenews_merge.csv', index=False)

In [8]:
# Quick overview of the new dataset: column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44898 non-null  object
 1   text      44898 non-null  object
 2   subject   44898 non-null  object
 3   date      44898 non-null  object
 4   category  44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [9]:
# Count of missing values per column
df.isna().sum()

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [10]:
# Distinct values that occur in the "subject" column (this returns the values themselves, not their count)
df["subject"].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east', 'politicsNews', 'worldnews'], dtype=object)

In [11]:
# Statistical summary of the numerical columns present in the dataset.
# "category" is the only numeric column and it is a 0/1 label, so most of these
# figures are not meaningful; the mean is the share of rows labelled 1 (true news).
df.describe()

Unnamed: 0,category
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [12]:
# Size of the merged frame as a (rows, columns) tuple
df.shape

(44898, 5)

### <span style="color:#003049">3. Data Cleaning</span> 

<img src="../images/data_cleaning.jpeg" width="300" height="50" />

In [13]:
# Count of missing values per column, checked again before cleaning starts
df.isnull().sum()

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [14]:
# Keep only the rows that have no NaN / missing value in any column
df = df.dropna()

In [15]:
# Number of rows that are exact duplicates (every column equal) of an earlier row
df.duplicated().sum()

209

In [16]:
# Dropping rows that are exact duplicates across all columns
df = df.drop_duplicates()

In [17]:
# Number of rows whose "title" repeats the title of an earlier row
df["title"].duplicated().sum()

5960

In [18]:
# Number of rows whose "text" repeats the text of an earlier row
df["text"].duplicated().sum()

6043

---------------

 ##### <span style="color:#003049">I would like to find out how many duplicated "titles" and "texts" are to be found in fake_news</span> 

In [19]:
# Repeated titles within the fake-news frame alone.
# df_0 is the frame as loaded (plus the category column): no duplicates have been removed from it.
df_0["title"].duplicated().sum()

5578

In [20]:
# Repeated texts within the fake-news frame alone.
# df_0 is the frame as loaded (plus the category column): no duplicates have been removed from it.
df_0["text"].duplicated().sum()

6026

###### <span style="color:#003049">Most of the duplicated titles and texts come from the fake-news set. This may be because a message is repeated in order to reinforce it, or perhaps because the inventors of these lies have little imagination. </span>


-------

In [21]:
# Dropping rows whose "text" duplicates an earlier row (the other columns are not compared)
df = df.drop_duplicates(subset=['text'])

In [22]:
# Dropping rows whose "title" duplicates an earlier row (the other columns are not compared)
df = df.drop_duplicates(subset=['title'])

In [23]:
# Row count and non-null counts after removing duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38270 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     38270 non-null  object
 1   text      38270 non-null  object
 2   subject   38270 non-null  object
 3   date      38270 non-null  object
 4   category  38270 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.8+ MB


### <span style="color:#003049">4. Preprocessing </span> 

<img src="../images/Python-data-preprocessing.png" width="300" height="50" />

#### <span style="color:#003049">Next steps: </span> 
<span style="color:#003049">
    <ul>
        <li><b>Removing stopwords before removing characters. I will add the word "us" to the list of stopwords so that after the removal of characters the only "us" that remains comes from the abbreviation U.S. (United States).</b></li>
        <li><b>Replacing dollar amounts (a $ sign followed by its value) with the word "dollar"</b></li>
        <li><b>Removing characters</b></li>
        <li><b>Removing digits</b></li>
        <li><b>Removing single letters</b></li>
        <li><b>Removing the word "reuters"</b></li>
        <li><b>Removing stopwords again, maintaining the word "us" this time </b></li>
        <li><b>Change date column to ordinal format</b></li>
        <li><b>Add columns length_title and length_text</b></li>       
    </ul>
</span> 

In [24]:
# Setting up the cleaning function to work on a sample and later on the whole dataframe.
# NOTE: the function defined below is called `stopwords`, so once this cell has run the
# name no longer refers to the nltk corpus reader. The corpus is therefore imported
# under an alias, which keeps this cell safe to re-run.
from nltk.corpus import stopwords as nltk_stopwords

ps = PorterStemmer()
en_stops = nltk_stopwords.words('english')
# First-pass list: the English stopwords plus "us" and "would". Dropping the pronoun
# "us" before punctuation is stripped means the only "us" left afterwards comes
# from the abbreviation "u.s.".
us_stops = nltk_stopwords.words('english')
us_stops.append("us")
us_stops.append("would")
pattern = '[0-9]'
pattern2 = "[_]"


def stopwords(news):
    """Normalise one news string into a space-separated sequence of stemmed tokens.

    Steps, in order:
      1. lowercase, split on whitespace and drop every token found in ``us_stops``;
      2. replace each ``$``-prefixed token with the word ``dollar``;
      3. remove web links (``http://``, ``https://`` or ``www.`` tokens);
      4. remove every character that is neither a word character nor whitespace;
      5. remove tokens that contain a digit;
      6. remove stand-alone single characters;
      7. remove the substring ``reuters``;
      8. remove underscores, stem each token with the Porter stemmer and drop the
         stems found in ``en_stops`` (which does not contain "us").

    Parameters
    ----------
    news : str
        Raw title or article text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, with no leading, trailing or
        repeated spaces; an empty string when nothing survives the cleaning.
    """
    # Sets make each membership test constant-time; they are rebuilt from the
    # module-level lists on every call so later edits to those lists are honoured.
    first_pass_stops = set(us_stops)
    second_pass_stops = set(en_stops)

    kept = [word for word in news.lower().split() if word not in first_pass_stops]
    news = " ".join(kept)
    news = re.sub(r'\$[^\s]+', 'dollar', news)  # Changing $ amounts to the word "dollar"
    news = re.sub(r'https?://\S+|www\.\S+', '', news)  # Removing web links
    news = re.sub(r'[^\w\s]+', "", news)  # Removing characters
    # Removes digits and words that have digits; no digit is left after this,
    # so no separate digit pass is needed.
    news = re.sub(r'\w*\d\w*', '', news)
    # Removing single letters. Lookarounds are used so that the neighbouring spaces
    # are not consumed; consuming them lets every other letter in a run of single
    # letters ("a b c") escape the match.
    news = re.sub(r'(?<!\S)\w(?!\S)', '', news)
    news = re.sub("reuters", "", news)

    # split() without an argument discards the empty tokens left behind by the
    # removals above, so the result never contains repeated spaces.
    without_underscores = (re.sub(pattern2, '', token) for token in news.split())
    # Using PorterStemmer to get the standard version of some words
    # (example: working = work, worked = work)
    stems = (ps.stem(token) for token in without_underscores if token)
    return " ".join(stem for stem in stems if stem not in second_pass_stops)

In [25]:
# Run the cleaning function over both free-text columns, then display the frame
for text_column in ('text', 'title'):
    df[text_column] = df[text_column].apply(stopwords)
df

Unnamed: 0,title,text,subject,date,category
0,donald trump send embarrass new year eve messa...,donald trump wish american happi new year leav...,News,"December 31, 2017",0
1,drunk brag trump staffer start russian collus ...,hous intellig committe chairman devin nune go ...,News,"December 31, 2017",0
2,sheriff david clark becom internet joke threat...,friday reveal former milwauke sheriff david cl...,News,"December 30, 2017",0
3,trump obsess even obama name code websit imag,christma day donald trump announc back work fo...,News,"December 29, 2017",0
4,pope franci call donald trump christma speech,pope franci use annual christma day messag reb...,News,"December 25, 2017",0
...,...,...,...,...,...
44892,north korea shipment syria chemic arm agenc in...,unit nation two north korean shipment syrian...,worldnews,"August 21, 2017",1
44894,lexisnexi withdrew two product chines market,london lexisnexi provid legal regulatori bus...,worldnews,"August 22, 2017",1
44895,minsk cultur hub becom author,minsk shadow disus sovietera factori minsk s...,worldnews,"August 22, 2017",1
44896,vatican upbeat possibl pope franci visit russia,moscow vatican secretari state cardin pietro...,worldnews,"August 22, 2017",1


#### <span style="color:#003049">Now I will work on the date column</span> 

In [26]:
# There are some rows whose "date" column holds other information instead of a date.
# They are deleted here; the filter assumes such entries are at least 22 characters
# long, i.e. longer than a genuine date string.
# .copy() makes the filtered result an independent frame, so the column assignments
# in the following cells do not write into a slice of the previous frame
# (the cause of pandas' SettingWithCopyWarning).
df = df[df['date'].str.len() < 22].copy()

In [27]:
def date_time(dtime):
    """Parse a Series of dates and return each one as an integer ordinal.

    Prints the dtype before and after parsing, followed by the earliest and
    the latest parsed value, then maps every element through ``toordinal()``.
    """
    print("object type: ", dtime.dtype)
    parsed = pd.to_datetime(dtime)
    # Each entry is evaluated only when it is about to be printed
    diagnostics = (
        ("object type: ", lambda: parsed.dtype),
        ("min time: ", parsed.min),
        ("max time: ", parsed.max),
    )
    for label, compute in diagnostics:
        print(label, compute())
    return parsed.apply(lambda stamp: stamp.toordinal())
# Replace the date strings with the integer ordinals returned by date_time
df['date'] = date_time(df['date'])

object type:  object
object type:  datetime64[ns]
min time:  2015-03-31 00:00:00
max time:  2018-02-19 00:00:00


In [28]:
# Number of whitespace-separated tokens in each cleaned text and title
for source_column, length_column in (("text", "length_text"), ("title", "length_title")):
    df[length_column] = df[source_column].map(lambda document: len(document.split()))
df

Unnamed: 0,title,text,subject,date,category,length_text,length_title
0,donald trump send embarrass new year eve messa...,donald trump wish american happi new year leav...,News,736694,0,245,9
1,drunk brag trump staffer start russian collus ...,hous intellig committe chairman devin nune go ...,News,736694,0,178,8
2,sheriff david clark becom internet joke threat...,friday reveal former milwauke sheriff david cl...,News,736693,0,302,10
3,trump obsess even obama name code websit imag,christma day donald trump announc back work fo...,News,736692,0,230,8
4,pope franci call donald trump christma speech,pope franci use annual christma day messag reb...,News,736688,0,208,7
...,...,...,...,...,...,...,...
44892,north korea shipment syria chemic arm agenc in...,unit nation two north korean shipment syrian...,worldnews,736562,1,274,10
44894,lexisnexi withdrew two product chines market,london lexisnexi provid legal regulatori bus...,worldnews,736563,1,73,6
44895,minsk cultur hub becom author,minsk shadow disus sovietera factori minsk s...,worldnews,736563,1,169,5
44896,vatican upbeat possibl pope franci visit russia,moscow vatican secretari state cardin pietro...,worldnews,736563,1,116,7


In [29]:
# Spot-check one cleaned article: row position 6, column position 1 (the "text" column)
print(df.iloc[6,1])

donald trump spent good portion day golf club mark  day done sinc take oath offic must bad game trump lash fbi deputi director andrew mccabe twitter follow report say mccabe plan retir month report follow mccabe testimoni front congression committe week well mount critic republican regard russia probeso natur trump attack mccabe lie fbi deputi director andrew mccabe man charg along leakin jame comey phoni hillari clinton investig includ  illeg delet email given dollar wife campaign clinton puppet investig trump tweetedhow fbi deputi director andrew mccabe man charg along leakin jame comey phoni hillari clinton investig includ  illeg delet email given dollar wife campaign clinton puppet investig donald trump realdonaldtrump decemb   stop therefbi deputi director andrew mccabe race clock retir full benefit  day go donald trump realdonaldtrump decemb   fbi lawyer jame baker reassign accord foxnew donald trump realdonaldtrump decemb   intel trump dispos get inform fox news mccabe spent car

### <span style="color:#003049">The data is ready to be processed.</span> 

In [30]:
# Persist the preprocessed frame as CSV, without writing the row index
df.to_csv(path_or_buf='../data/df_fake_real_news.csv', index=False)

Link to [fake and real news viz](fake_and_real_news_viz.ipynb)<br>

Link to [machine learning notebook](fake_and_real_news_machine_learning.ipynb)