In [36]:
import datetime
import re
from collections import Counter

import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import bigrams
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /Users/kilo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### <span style="color:#003049">1. Get data </span>
[data source](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset) 

In [38]:
# load the two source files into separate frames: df_0 from Fake.csv, df_1 from True.csv
df_0 = pd.read_csv("../data/Fake.csv")
df_1 = pd.read_csv("../data/True.csv")

### <span style="color:#003049">2. EDA</span> 

<img src="../images/Screenshot 2021-05-17 at 16.24.54.png" width="300" height="50" />

In [39]:
# first rows of df_0 (the frame loaded from Fake.csv)
df_0.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [40]:
# last rows of df_1 (the frame loaded from True.csv)
df_1.tail()

Unnamed: 0,title,text,subject,date
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017"


In [41]:
# label each source before merging: fake news -> category 0, true news -> category 1
df_0 = df_0.assign(category=0)
df_1 = df_1.assign(category=1)

In [42]:
# concatenating dataframes; ignore_index=True renumbers the rows 0..n-1 in one step,
# so no separate reset_index() / drop(['index']) round trip is needed
df = pd.concat([df_0, df_1], axis=0, ignore_index=True)
df

Unnamed: 0,title,text,subject,date,category
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [7]:
# saving dataframe as CSV (index=False: the row index is not written as a column)
df.to_csv('../data/fake_news_merge.csv', index=False)

In [8]:
# quick overview of the new dataset: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44898 non-null  object
 1   text      44898 non-null  object
 2   subject   44898 non-null  object
 3   date      44898 non-null  object
 4   category  44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [9]:
# number of missing values per column
df.isna().sum()

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [10]:
# Distinct values that appear in the "subject" column
df["subject"].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east', 'politicsNews', 'worldnews'], dtype=object)

In [11]:
# Statistical summary of the numerical columns in the dataset.
# Not very informative here: "category" is the only numeric column.
df.describe()

Unnamed: 0,category
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [12]:
# shape of the dataframe as (number of rows, number of columns)
df.shape

(44898, 5)

### <span style="color:#003049">3. Data Cleaning</span> 

<img src="../images/data_cleaning.jpeg" width="300" height="50" />

In [13]:
# removing every row that has a NaN / missing value in any column
# (reassigning instead of inplace=True keeps the step explicit and chainable)
df = df.dropna()

In [14]:
# counting rows that are exact duplicates of an earlier row (all columns equal)
df.duplicated().sum()

209

In [15]:
# drop rows that are exact duplicates of an earlier row
df = df.drop_duplicates()

In [16]:
# counting rows whose "title" repeats the title of an earlier row
df["title"].duplicated().sum()

5960

In [17]:
# counting rows whose "text" repeats the text of an earlier row
df["text"].duplicated().sum()

6043

In [18]:
# drop rows whose "text" repeats an earlier row (the first occurrence is kept)
df = df.drop_duplicates(subset=['text'])

In [19]:
# drop rows whose "title" repeats an earlier row (the first occurrence is kept)
df = df.drop_duplicates(subset=['title'])

In [20]:
# overview of the dataset after removing the duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38270 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     38270 non-null  object
 1   text      38270 non-null  object
 2   subject   38270 non-null  object
 3   date      38270 non-null  object
 4   category  38270 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.8+ MB


In [33]:
# preparing a small sample to develop on; it is faster than working with the whole dataframe.
# random_state pins the draw so that re-running the notebook yields the same 100 rows.
df_sample = df.sample(100, random_state=42)
df_sample

Unnamed: 0,title,text,subject,date,category
25307,Factbox: Trump on Twitter (September 7) - Shei...,The following statements were posted to the ve...,politicsNews,"September 8, 2017",1
17129,ULTIMATE DISRESPECT: VA Sent Mail Related To D...,Our current government can t be bothered with ...,Government News,"Aug 20, 2015",0
26314,Senate intel committee Democrat wants to speak...,WASHINGTON (Reuters) - The Senate intelligence...,politicsNews,"July 10, 2017",1
21003,UNHINGED LEFTIST Apologizes To “Refugees” Who ...,"Why stop there? Using liberal logic, shouldn t...",left-news,"Feb 13, 2016",0
7661,Fox News Humiliates Mitt Romney For Flip-Flop...,Mitt Romney looked like a defeated man too emb...,News,"March 6, 2016",0
...,...,...,...,...,...
40324,Iraqi forces complete Kirkuk province takeover...,"BAGHDAD/ERBIL, Iraq (Reuters) - Iraqi forces o...",worldnews,"October 20, 2017",1
10620,Candidate Handel’s Excellent Response to Alexa...,We should not allow our political differences...,politics,"Jun 14, 2017",0
42813,Instant View: UK's May calls for two-year tran...,LONDON (Reuters) - Prime Minister Theresa May ...,worldnews,"September 22, 2017",1
10836,Deputy Attorney General Doubles Down On Comey ...,Deputy Attorney General Rod Rosenstein doubled...,politics,"May 19, 2017",0


### <span style="color:#003049">4. Preprocessing </span> 

<img src="../images/Python-data-preprocessing.png" width="300" height="50" />

In [34]:
# Setting up the function to work on the sample and later work with the whole dataframe.
# The cleaning function below is itself named `stopwords` (the name is kept for the cells
# that call it), so the NLTK corpus is imported under an alias to keep the two apart.
from nltk.corpus import stopwords as nltk_stopwords

ps = PorterStemmer()
en_stops = nltk_stopwords.words('english')
us_stops = en_stops + ["us", "would"]  # extra words removed before the regex cleaning
pattern = '[0-9]'
pattern2 = "[_]"
# frozenset copies of the two lists: constant-time membership tests inside the function
_US_STOPS = frozenset(us_stops)
_EN_STOPS = frozenset(en_stops)


def stopwords(news):
    """Normalise one article text or title into a string of stemmed keywords.

    Steps, in order: lowercase; drop words listed in ``us_stops``; replace every
    ``$``-prefixed token with ``dollar``; strip URLs, punctuation, digits, isolated
    single characters, underscores and the substring ``reuters``; Porter-stem each
    remaining token; drop stems listed in ``en_stops``.

    Parameters
    ----------
    news : str
        Raw text of a title or an article body.

    Returns
    -------
    str
        The surviving stems joined by single spaces (``""`` if nothing survives).
    """
    news = " ".join(word for word in news.lower().split() if word not in _US_STOPS)
    news = re.sub(r'\$[^\s]+', 'dollar', news)  # a whole $-prefixed token becomes the word "dollar"
    news = re.sub(r'https?://\S+|www\.\S+', '', news)  # Removing links to web pages
    news = re.sub(r'[^\w\s]+', "", news)  # Removing everything that is not a word character or whitespace
    news = re.sub(r" \d+", " ", news)  # Removing digit runs that follow a space
    news = re.sub(r'(?:^| )\w(?:$| )', ' ', news)  # Removing isolated single characters
    news = re.sub("reuters", "", news)

    cleaned = []
    # split() without an argument skips the empty tokens left behind by the removals above,
    # so they cannot leak into the result as runs of spaces
    for token in news.split():
        token = re.sub(pattern2, '', re.sub(pattern, '', token))  # leftover digits, then underscores
        if not token:
            continue
        # PorterStemmer maps inflected forms to a common stem (example: working -> work, worked -> work)
        stem = ps.stem(token)
        if stem not in _EN_STOPS:
            cleaned.append(stem)
    return " ".join(cleaned)

In [35]:
# run the stopwords() cleaning over both free-text columns of the sample
for column in ('text', 'title'):
    df_sample[column] = df_sample[column].apply(stopwords)
df_sample

Unnamed: 0,title,text,subject,date,category
25307,factbox trump twitter septemb sheikh sabah h...,follow statement post verifi twitter account u...,politicsNews,"September 8, 2017",1
17129,ultim disrespect va sent mail relat disabl cla...,current govern bother resolv heinou act vetera...,Government News,"Aug 20, 2015",0
26314,senat intel committe democrat want speak trump...,washington senat intellig committe top democ...,politicsNews,"July 10, 2017",1
21003,unhing leftist apolog refuge gang rape,stop use liber logic year old girl rape refug...,left-news,"Feb 13, 2016",0
7661,fox news humili mitt romney flipflop donald tr...,mitt romney look like defeat man embarrass adm...,News,"March 6, 2016",0
...,...,...,...,...,...
40324,iraqi forc complet kirkuk provinc takeov clash...,baghdaderbil iraq iraqi forc friday took con...,worldnews,"October 20, 2017",1
10620,candid handel excel respons alexandria shooter...,allow polit differ escal violent attack must r...,politics,"Jun 14, 2017",0
42813,instant view uk may call twoyear transit brexit,london prime minist theresa may call friday ...,worldnews,"September 22, 2017",1
10836,deputi attorney gener doubl comey memo wrote b...,deputi attorney gener rod rosenstein doubl tod...,politics,"May 19, 2017",0


In [None]:
# some rows hold other information instead of a date in the "date" column; drop them by
# keeping only values shorter than 22 characters (length heuristic for "looks like a date").
# .copy() makes the filtered frame independent, so the column assignments in the following
# cells do not trigger pandas' SettingWithCopyWarning.
df_sample = df_sample[df_sample['date'].str.len() < 22].copy()

In [None]:
### <span style="color:#003049">Now I am going to work on the date column</span> 

In [None]:
# the date column is an object column; convert it to datetime and then to ordinal numbers
def date_time(dtime):
    """Convert a Series of date strings into integer day ordinals.

    Prints the dtype before and after parsing plus the earliest and latest date,
    then returns ``Timestamp.toordinal()`` of every element.

    Parameters
    ----------
    dtime : pandas.Series
        Date strings; different spellings may be mixed in the same Series
        (e.g. "December 31, 2017" and "Aug 20, 2015").

    Returns
    -------
    pandas.Series
        Ordinal day numbers, with the same index as ``dtime``.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a date.
    """
    print("object type: ",dtime.dtype)
    try:
        # pandas >= 2.0 applies the format inferred from the first element to all others
        # and raises on mismatches; format="mixed" restores per-element parsing
        parsed = pd.to_datetime(dtime, format="mixed")
    except (ValueError, TypeError):
        # older pandas does not know format="mixed" but parses element by element by default
        parsed = pd.to_datetime(dtime)
    print("object type: ",parsed.dtype) # check datatype again
    print("min time: ",parsed.min()) # check when data begin
    print("max time: ",parsed.max()) # check when data end
    return parsed.apply(lambda ts: ts.toordinal()) # datetime -> ordinal day number
# replace the "date" column of the sample with the values returned by date_time()
df_sample['date'] = date_time(df_sample['date'])

In [None]:
# inspect the sample after the date conversion
df_sample

In [None]:
# splitting the text and title columns into lists of words to make counting words easier

In [None]:
def split_words(news):
    """Split a string on runs of whitespace and return the resulting list of words."""
    return news.split()

In [None]:
# turn both free-text columns of the sample into lists of words
for column in ('text', 'title'):
    df_sample[column] = df_sample[column].apply(split_words)
df_sample

In [None]:
# to detect the most repeated keywords in df_sample 'text'.
# Counter (imported at the top of the notebook) counts the words; most_common() returns
# (word, count) pairs already sorted by descending count, and also copes with an empty sample.
words = [wrd for x in df_sample['text'] for wrd in x]
keywords_text = pd.DataFrame(Counter(words).most_common(), columns=['word', 'count'])
keywords_text.head(7)

In [None]:
# to detect the most repeated keywords in df_sample 'title'.
# Counter (imported at the top of the notebook) counts the words; most_common() returns
# (word, count) pairs already sorted by descending count, and also copes with an empty sample.
words = [wrd for x in df_sample['title'] for wrd in x]
keywords_title = pd.DataFrame(Counter(words).most_common(), columns=['word', 'count'])
keywords_title.head(7)

In [None]:
# first five rows of the sample in its current state
df_sample.head(5)

----------

Before joining the word lists back into strings, I want to find the most frequent two-word combinations (bigrams).

In [None]:
# Count two-word combinations article by article. Building the bigrams per article
# (instead of over one flattened list of all words) avoids counting spurious pairs made of
# the last word of one article and the first word of the next.
bigram_counts = Counter()
for tokens in df_sample["text"]:
    bigram_counts.update(bigrams(tokens))

# most_common() yields ((word_1, word_2), count) sorted by descending count;
# an empty sample simply produces an empty frame
bigrams_true_text = pd.DataFrame(
    [(' '.join(pair), count) for pair, count in bigram_counts.most_common()],
    columns=['two_word', 'count'],
)
bigrams_true_text

-------------

In [None]:
def joining_words(news):
    """Return the items of ``news`` as one space-separated string.

    Every item is converted with ``str`` first, so non-string items are accepted.
    """
    return " ".join(map(str, news))

In [None]:
# turn the word lists of both columns back into space-separated strings
for column in ('text', 'title'):
    df_sample[column] = df_sample[column].apply(joining_words)
df_sample

In [None]:
# number of whitespace-separated words in every text and in every title
for length_column, source_column in (("length_text", "text"), ("length_title", "title")):
    df_sample[length_column] = [len(entry.split()) for entry in df_sample[source_column]]
df_sample

Now let's perform the cleaning on the whole data set. Link to [final project | cleaning complete data](final_project_cleaning_complete_data.ipynb)