# Download fake news data

This notebook downloads the fake news data from Kaggle and performs an initial analysis.

In [181]:
%%capture

# download required packages
!pip install kaggle
!pip install zipfile

In [182]:
# import libraries
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import time
from bs4 import BeautifulSoup
nltk.download('stopwords', quiet=True)

True

In [183]:
# user parameters
data_path = 'data'

In [184]:
def download_kaggle_data(data_path, data_set, extract_zip=True,
                         dataset='benjaminperucco/udacitydata'):
    """Download one file of a Kaggle dataset into ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory the file is downloaded (and extracted) into.
    data_set : str
        Name of the file inside the Kaggle dataset, e.g. ``'Fake.csv'``.
    extract_zip : bool, default True
        If True, unpack ``<data_path>/<data_set>.zip`` into ``data_path`` and
        delete the archive afterwards.
    dataset : str, default 'benjaminperucco/udacitydata'
        Kaggle dataset slug the file is taken from.

    Notes
    -----
    The function assumes the Kaggle API stores the download as
    ``<data_set>.zip`` inside ``data_path``. An existing download is
    overwritten because ``force=True`` is passed to the API.
    """
    print('Download {} from kaggle.com...'.format(data_set), end='')
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_file(
        dataset=dataset,
        file_name=data_set,
        path=data_path,
        force=True
    )

    file_path = '{}/{}.zip'.format(data_path, data_set)

    if extract_zip:
        # The context manager closes the archive even if extraction fails.
        with ZipFile(file_path) as zf:
            zf.extractall(data_path)
        # Pure-Python removal: portable and safe for paths with spaces.
        os.remove(file_path)

    print('done')

In [185]:
# fetch both parts of the fake news corpus (fake and genuine articles)
for csv_name in ('Fake.csv', 'True.csv'):
    download_kaggle_data(data_path, csv_name)

Download Fake.csv from kaggle.com...done
Download True.csv from kaggle.com...done


In [186]:
def read_kaggle_data(data_path, data_set):
    """Load ``<data_path>/<data_set>`` as a DataFrame and delete the CSV file.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV file.
    data_set : str
        File name of the CSV, e.g. ``'Fake.csv'``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content. If the file does not exist, a small dummy
        frame with the columns ``dummy_col_1`` and ``dummy_col_2`` is returned
        and a hint to download the data again is printed.

    Notes
    -----
    The CSV file is removed from disk after it has been read, so the data
    has to be downloaded again before the next import.
    """
    file_path = '{}/{}'.format(data_path, data_set)

    if not os.path.exists(file_path):
        # just some test data so that the caller still receives a DataFrame
        empty_dict = {'dummy_col_1': [3, 2, 1, 0], 'dummy_col_2': ['a', 'b', 'c', 'd']}
        imported_data = pd.DataFrame.from_dict(empty_dict)
        print('{} does not exist. Please download again kaggle data.'.format(file_path))
        return imported_data

    print('Import {}...'.format(data_set), end='')
    imported_data = pd.read_csv(file_path)
    # Pure-Python removal: portable and safe for paths with spaces.
    os.remove(file_path)
    print('done')

    return imported_data

In [187]:
# load the downloaded CSV files into data frames (fake articles first, then genuine ones)
fake, true = (read_kaggle_data(data_path, csv_name) for csv_name in ('Fake.csv', 'True.csv'))

Import Fake.csv...done
Import True.csv...done


In [188]:
# add the target label as a new column before both frames are merged
fake['class'] = 1 # 1 = fake
true['class'] = 0 # 0 = true

In [189]:
fake.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [190]:
fake[['title', 'text', 'subject', 'date']].describe()

Unnamed: 0,title,text,subject,date
count,23481,23481.0,23481,23481
unique,17903,17455.0,6,1681
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017"
freq,6,626.0,9050,46


In [191]:
true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [192]:
true[['title', 'text', 'subject', 'date']].describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


We can see here that there are fewer unique values for title and text than there are rows. This means that some titles and texts occur several times. I will remove these duplicates once the text has been processed further. 

In [193]:
# merge data together; both frames are numbered from 0, so the combined frame is
# renumbered to keep row labels unique and identical to the row positions
original_document = pd.concat([true, fake], ignore_index=True)
true = fake = None # to save memory

In [194]:
original_document.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [195]:
original_document[['title', 'text', 'subject', 'date']].describe()

Unnamed: 0,title,text,subject,date
count,44898,44898.0,44898,44898
unique,38729,38646.0,8,2397
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017"
freq,14,627.0,11272,182


In [196]:
def detailed_investigation(index, df, number_strings=0):
    """Print class, title and text of the row at position ``index``.

    Parameters
    ----------
    index : int
        Positional row number (used with ``iloc``).
    df : pandas.DataFrame
        Frame with the columns ``class``, ``title`` and ``text``.
    number_strings : int, default 0
        If greater than zero, only the first ``number_strings`` characters
        of the text are printed; otherwise the full text is shown.
    """
    doc_class, headline, body = (
        df[column].iloc[index] for column in ('class', 'title', 'text')
    )
    shown = body[:number_strings] if number_strings > 0 else body

    print('Index {} - Class {} - {}\n{}\n'.format(index, doc_class, headline, shown))

In [197]:
detailed_investigation(11010, original_document)

Index 11010 - Class 0 - Obama highlights five priorities in meeting with Republican leaders
WASHINGTON (Reuters) - U.S. President Barack Obama highlighted five legislative priorities in a meeting on Tuesday with Republican congressional leaders, including the financial crisis in Puerto Rico and passage of the Trans-Pacific Partnership, the White House said. The White House said Obama also discussed the opioid  epidemic, cancer research and criminal justice reform with Republican House Speaker Paul Ryan and Senate Majority Leader Mitch McConnell. “The president is eager to see Congress take that action as soon as possible this year,” White House spokesman Josh Earnest said of the sweeping trade pact. 



In [198]:
select_from = 1
select_to = 2000

In [199]:
# print the rows select_from .. select_to - 1 for manual inspection,
# each text truncated to its first 800 characters
for i in range(select_from, select_to):
    detailed_investigation(i, original_document, 800)

Index 1 - Class 0 - U.S. military to accept transgender recruits on Monday: Pentagon
WASHINGTON (Reuters) - Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after President Donald Trump’s administration decided not to appeal rulings that blocked his transgender ban. Two federal appeals courts, one in Washington and one in Virginia, last week rejected the administration’s request to put on hold orders by lower court judges requiring the military to begin accepting transgender recruits on Jan. 1. A Justice Department official said the administration will not challenge those rulings. “The Department of Defense has announced that it will be releasing an independent study of these issues in the coming weeks. So rather than litigate this interim appeal before that oc

Index 2 - Class 0 - Senior U.S. Republican senator: 'Let Mr. Mueller do his job'
WASHINGTON (Reuters) - The speci

Index 1143 - Class 0 - U.S. House members ask EPA not to lower biofuels requirements
NEW YORK (Reuters) - A group of 22 members of the U.S. House of Representatives asked the Environmental Protection Agency in a letter on Thursday not to lower some requirements for mixing biofuels into the country’s fuel supply, but also not to let ethanol exports qualify for renewable fuel credits, according to a copy of the letter obtained by Reuters. The members of Congress are part of a bipartisan voting bloc dedicated to supporting the biofuels industry called the House Biofuels Caucus. They represent districts in states such as Iowa and Illinois where farmers grow corn for ethanol and other biofuels. They urged the agency to increase biomass-based biodiesel requirements and not to decrease the amount of advanced biofuels required to be added to the fuel supply.  “Our farmers and biof

Index 1144 - Class 0 - Putin says Trump hampered from delivering electoral promises
SOCHI, Russia (Reuters) - Rus

Index 1462 - Class 0 - Rich would benefit most from Trump tax cut plan: policy group
WASHINGTON (Reuters) - The wealthiest Americans would benefit the most from President Donald Trump’s proposed tax cuts while many upper middle-income people would face higher taxes, independent experts said on Friday in the first detailed analysis of the plan. A U.S. Senate panel took Trump’s proposal, announced on Wednesday, a step forward by unveiling a budget plan for the coming fiscal year that acknowledges lost revenues from tax cuts, while Trump pressed ahead with selling the plan to the public. A report from the non-profit Washington-based Tax Policy Center found that in 2018, about 12 percent of taxpayers would face a tax increase of roughly $1,800 on average.  That includes more than a third of taxpayers making between about $150,000 and $300,000, mainly because most itemized dedu

Index 1463 - Class 0 - Trump administration in spotlight as U.S. top court returns
WASHINGTON (Reuters) - The Tru

# Text processing

First, I will check what regular text processing does, such as removing HTML tags and converting to lower case, but without stemming. Title and text are combined for processing.

## Check indices

Check the following indices to verify that the text processing works reliably:

- 1112 (Trump twitter)
- 284 (Trump twitter)

In [200]:
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words of ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clear_text(text, do_stem):
    """Normalise raw article text into a space-separated string of tokens.

    Steps: strip HTML tags, convert to lower case, replace every character
    that is not a letter or digit with a space, drop English stop words and,
    if requested, reduce each remaining word to its Porter stem.

    Parameters
    ----------
    text : str
        Raw text (may contain HTML markup).
    do_stem : bool
        If True, apply Porter stemming to every remaining word.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    text = BeautifulSoup(text, 'html.parser').get_text() # remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()) # lower case, non-alphanumerics become spaces
    words = text.split() # split string into words

    # Look the stop words up once as a set instead of re-reading the
    # corpus list for every single word.
    stop_words = _stopword_set()
    words = [w for w in words if w not in stop_words] # remove stopwords

    if do_stem:
        stemmer = PorterStemmer() # one stemmer instance for all words
        words = [stemmer.stem(w) for w in words] # stem

    return ' '.join(words)

In [201]:
def remove_us(text):
    """Remove the abbreviation "U.S." (in any letter case) from ``text``.

    The dots are escaped so that only literal periods match. An unescaped
    ``.`` matches any character and would delete fragments of ordinary
    words, e.g. the "uest" in "question".
    """
    return re.sub(r'[uU]\.[sS]\.', '', text)

In [202]:
def remove_twitter_factbox(text):
    """Delete every occurrence of the headline "Factbox: Trump on Twitter"."""
    headline = re.compile('Factbox: Trump on Twitter')
    return headline.sub('', text)

In [203]:
def remove_twitter_intro(text):
    """Delete the span from "The following statements" to "@realDonaldTrump".

    The match is greedy and does not cross line breaks: within one line it
    extends to the last "@realDonaldTrump".
    """
    intro = re.compile('The following statements.*@realDonaldTrump')
    return intro.sub('', text)

In [204]:
def remove_reuters(text):
    """Remove the agency dateline, i.e. everything up to and including "(Reuters) -".

    The match is greedy and line-based: on each line, all text up to the last
    "(Reuters) -" is removed. The pattern is written as a raw string so the
    escaped parentheses are not treated as (invalid) string escape sequences.
    """
    return re.sub(r'.*\(Reuters\) -', '', text)

In [205]:
def remove_dates(text):
    """Remove dates written as "<word> <day>, <year>", e.g. "December 31, 2017".

    The day may have one or two digits, so "May 5, 2017" is removed as well;
    the year must have exactly four digits.
    """
    return re.sub(r'[a-zA-Z]+ [0-9]{1,2}, [0-9]{4}', '', text)

In [206]:
def remove_brackets(text):
    """Remove bracketed passages, including the brackets themselves.

    A passage starts at "(" or "[" and ends at the nearest following ")" or
    "]" on the same line. The pattern is written as a raw string so the
    escaped brackets are not treated as (invalid) string escape sequences.
    """
    return re.sub(r'[\(\[].*?[\)\]]', '', text)

In [207]:
def remove_source_link(text):
    """Delete the words "source link", whatever mix of upper and lower case is used."""
    marker = re.compile('[sS][oO][uU][rR][cC][eE] [lL][iI][nN][kK]')
    return marker.sub('', text)

In [208]:
def remove_links(text):
    """Remove URLs and bare domain names (with an optional path) from ``text``.

    A link is an optional ``http://`` or ``https://`` prefix, a lower-case
    host name, a dot, a 2-6 character suffix and an optional path made of
    ``/``, word characters, dots and dashes. The path class deliberately
    contains no space: with a space in it, the words following a link would
    be swallowed up to the next punctuation mark.
    """
    return re.sub(r'(https?://)?([\da-z.-]+)\.([a-z.]{2,6})([/\w.-]*)', '', text)

In [209]:
def text_processing(index, df, do_print=False, do_stem=True):
    """Clean title and text of the row at position ``index``.

    The agency dateline is stripped from the text, the title is prepended,
    the remaining removal helpers are applied in a fixed order and the
    result is normalised with ``clear_text``.

    Parameters
    ----------
    index : int
        Positional row number (used with ``iloc``).
    df : pandas.DataFrame
        Frame with the columns ``class``, ``title`` and ``text``.
    do_print : bool, default False
        If True, print the processed text and return nothing.
    do_stem : bool, default True
        Passed on to ``clear_text``.

    Returns
    -------
    str or None
        The processed text, or None when ``do_print`` is True.
    """
    doc_class = df['class'].iloc[index]
    headline = df['title'].iloc[index]
    # reuters part needs to be removed first before title is added
    body = remove_reuters(df['text'].iloc[index])

    text = '{} {}'.format(headline, body)
    cleaning_steps = (
        remove_twitter_intro,
        remove_twitter_factbox,
        remove_dates,
        remove_brackets,
        remove_links,
        remove_source_link,
        remove_us,
    )
    for step in cleaning_steps:
        text = step(text)
    text = clear_text(text, do_stem)

    if not do_print:
        return text
    print('Index {} - Class {} - {}\n'.format(index, doc_class, text))

In [210]:
# show one article before and after processing to inspect the effect of the cleaning steps
check_index = 23876
detailed_investigation(check_index, original_document)
text_processing(check_index, original_document, do_print=True)

Index 23876 - Class 1 -  Republicans Attack NASA For Trying To Save Earth Instead Of Putting Space First
Republicans don t seem to understand that Earth is the planet we live on.Donald Trump and his Republicans supporters in Congress are taking aim at any agency that studies climate change.Climate change has had a devastating impact on our planet. The ice sheets are melting as an unprecedented rate and sea level rise threatens to drown our coastlines, which includes Trump s Mar-a-Lago resort at Palm Beach, Florida.Furthermore, weather patterns have become more unpredictable and we are seeing more instances of major disastrous hurricanes, wildfires, droughts, etc Whether Republicans like it or not, humans have been the driving force behind climate change, and it will get even worse since Trump is giving oil, gas, coal and chemical companies the freedom to pollute more than ever before. Soon, our cities will be choked by smog and our waterways will be poisoned as global temperatures cont

# Save a clean dataset 

We also need to make sure to remove duplicates!

In [233]:
def restruct_text(df):
    """Add a ``proc_text`` column holding the processed text of every row.

    Rows are addressed by position, matching the positional lookup inside
    ``text_processing``. The result is therefore correct even if the index
    of ``df`` contains duplicate labels or does not start at zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns required by ``text_processing``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, extended in place by ``proc_text``.
    """
    nrow_data = len(df.index)
    print('{} entries to process'.format(nrow_data))
    start_time = time.time()
    processed = []
    for position in range(nrow_data):
        iteration = position + 1
        if iteration % 100 == 0:
            # extrapolate the total run time from the average time per row so far
            time_diff = time.time() - start_time
            total_time_estimated = nrow_data / iteration * time_diff
            time_remaining = total_time_estimated - time_diff
            print('Iterate {}/{} (time used: {:.0f}s, remaining time: {:.0f}s)'.
                  format(iteration, nrow_data, time_diff, time_remaining))

        processed.append(text_processing(position, df))

    # a single column assignment, aligned by position, replaces the row-wise writes
    df['proc_text'] = processed

    print('done')
    return df

In [234]:
processed_document = restruct_text(original_document)

600 entries to process
Iterate 100/600 (time used: 8s, remaining time: 39s)
Iterate 200/600 (time used: 14s, remaining time: 29s)
Iterate 300/600 (time used: 22s, remaining time: 22s)
Iterate 400/600 (time used: 29s, remaining time: 15s)
Iterate 500/600 (time used: 35s, remaining time: 7s)
Iterate 600/600 (time used: 43s, remaining time: 0s)
done


In [241]:
def save_document(df, data_path, data_set):
    """Write the de-duplicated ``class`` / ``proc_text`` columns to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``class`` and ``proc_text``; it is not modified.
    data_path : str
        Target directory.
    data_set : str
        File name of the CSV to write.

    The function prints how many rows were dropped as duplicates. For an
    empty input frame the share of duplicates is reported as 0 instead of
    raising a ZeroDivisionError.
    """
    # keep only class and proc_text
    df_save = df[['class', 'proc_text']].drop_duplicates()
    df_save.to_csv('{}/{}'.format(data_path, data_set), index=False)

    old_length = len(df.index)
    new_length = len(df_save.index)

    if old_length:
        number_duplicates = (old_length - new_length) / old_length * 100
    else:
        number_duplicates = 0.0
    print('{} saved, data changed from {} to {}, approximately {:.2f}% duplicates'.
          format(data_set, old_length, new_length, number_duplicates))

In [242]:
# write to the configured data directory instead of repeating its name as a literal
save_document(processed_document, data_path, 'clean_document.csv')

clean_document.csv saved, data changed from 600 to 598, approximately 0.33% duplicates
