# Download fake news data

This notebook downloads the fake news data from Kaggle, does some first analysis, cleans the text and saves a processed dataset.

In [117]:
%%capture

# download required packages
!pip install kaggle
!pip install zipfile

In [118]:
# import libraries
import random
import unittest
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import time
from bs4 import BeautifulSoup
nltk.download('stopwords', quiet=True)

True

In [119]:
# user parameters
data_path = 'data'

In [120]:
def download_kaggle_data(data_path, data_set, extract_zip=True,
                         dataset='benjaminperucco/udacitydata'):
    """Download one file of a Kaggle dataset into ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory the file is downloaded to and extracted into.
    data_set : str
        Name of the file inside the Kaggle dataset, e.g. ``'Fake.csv'``.
    extract_zip : bool, default True
        If True, unpack ``<data_path>/<data_set>.zip`` into ``data_path``
        and delete the archive afterwards.
    dataset : str, default 'benjaminperucco/udacitydata'
        Kaggle dataset identifier the file is taken from.

    Returns
    -------
    None
    """
    print('Download {} from kaggle.com...'.format(data_set), end='')
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_file(
        dataset=dataset,
        file_name=data_set,
        path=data_path,
        force=True
    )

    # NOTE(review): assumes the Kaggle client stores the download as
    # '<data_set>.zip' inside data_path - confirm for the client version in use
    file_path = '{}/{}.zip'.format(data_path, data_set)

    if extract_zip:
        # the context manager closes the archive even if extraction fails
        with ZipFile(file_path) as zf:
            zf.extractall(data_path)
        # pure-Python removal instead of a shell call, works on every platform
        os.remove(file_path)

    print('done')

In [121]:
# fetch both parts of the fake news data set from Kaggle
for file_name in ('Fake.csv', 'True.csv'):
    download_kaggle_data(data_path, file_name)

Download Fake.csv from kaggle.com...done
Download True.csv from kaggle.com...done


In [122]:
def read_kaggle_data(data_path, data_set, remove_file=True):
    """Load ``<data_path>/<data_set>`` as a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV file.
    data_set : str
        File name of the CSV file, e.g. ``'Fake.csv'``.
    remove_file : bool, default True
        If True the CSV file is deleted from disk after it has been read.
        A second call for the same file then falls into the "file does not
        exist" branch until the data is downloaded again.

    Returns
    -------
    pandas.DataFrame
        The content of the CSV file, or a small dummy frame (columns
        ``dummy_col_1`` and ``dummy_col_2``) when the file does not exist.
    """
    file_path = '{}/{}'.format(data_path, data_set)

    if not os.path.exists(file_path):
        dummy_data = {'dummy_col_1': [3, 2, 1, 0], 'dummy_col_2': ['a', 'b', 'c', 'd']} # just some test data
        print('{} does not exist. Please download again kaggle data.'.format(file_path))
        return pd.DataFrame.from_dict(dummy_data)

    print('Import {}...'.format(data_set), end='')
    imported_data = pd.read_csv(file_path)
    if remove_file:
        # pure-Python removal instead of a shell call, works on every platform
        os.remove(file_path)
    print('done')

    return imported_data

In [123]:
# load both downloaded CSV files into DataFrames
fake, true = [read_kaggle_data(data_path, file_name) for file_name in ('Fake.csv', 'True.csv')]

Import Fake.csv...done
Import True.csv...done


In [124]:
# label every article with its class: 1 = fake, 0 = true
for frame, label in ((fake, 1), (true, 0)):
    frame['class'] = label

In [125]:
fake.head()

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [126]:
fake[['title', 'text', 'subject', 'date']].describe()

Unnamed: 0,title,text,subject,date
count,23481,23481.0,23481,23481
unique,17903,17455.0,6,1681
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017"
freq,6,626.0,9050,46


In [127]:
true.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [128]:
true[['title', 'text', 'subject', 'date']].describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


We can see here that the number of unique values for title and text is lower than the total count. This means that the same titles and texts occur several times. I will remove these duplicates once the text has been processed further.

In [129]:
# merge both classes into one frame; ignore_index gives every row a unique
# index label instead of repeating the labels of the two source frames
original_document = pd.concat([true, fake], ignore_index=True)
true = fake = None # to save memory

In [130]:
original_document.head()

Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [131]:
original_document[['title', 'text', 'subject', 'date']].describe()

Unnamed: 0,title,text,subject,date
count,44898,44898.0,44898,44898
unique,38729,38646.0,8,2397
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017"
freq,14,627.0,11272,182


In [132]:
def unprossed_text_investigation(index, df, number_strings=0):
    """Print class, title and raw text of the row at position ``index``.

    Parameters
    ----------
    index : int
        Positional (``iloc``) row number inside ``df``.
    df : pandas.DataFrame
        Frame with the columns ``class``, ``title`` and ``text``.
    number_strings : int, default 0
        Maximum number of characters of the text to print; 0 prints the
        whole text.
    """
    assert number_strings >= 0, 'number_strings must be 0 or positive'

    doc_class, title, text = (df[column].iloc[index] for column in ('class', 'title', 'text'))
    shown_text = text[:number_strings] if number_strings > 0 else text

    print('INDEX {} - CLASS {} - {} {}\n'.format(index, doc_class, title, shown_text))

In [133]:
unprossed_text_investigation(11010, original_document)

INDEX 11010 - CLASS 0 - Obama highlights five priorities in meeting with Republican leaders WASHINGTON (Reuters) - U.S. President Barack Obama highlighted five legislative priorities in a meeting on Tuesday with Republican congressional leaders, including the financial crisis in Puerto Rico and passage of the Trans-Pacific Partnership, the White House said. The White House said Obama also discussed the opioid  epidemic, cancer research and criminal justice reform with Republican House Speaker Paul Ryan and Senate Majority Leader Mitch McConnell. “The president is eager to see Congress take that action as soon as possible this year,” White House spokesman Josh Earnest said of the sweeping trade pact. 



In [134]:
select_from = 1
select_to = 2000

In [135]:
# print title and the first 800 characters of the text for the row positions
# select_from .. select_to - 1 (range() excludes select_to)
for i in range(select_from, select_to):
    unprossed_text_investigation(i, original_document, 800)

INDEX 1 - CLASS 0 - U.S. military to accept transgender recruits on Monday: Pentagon WASHINGTON (Reuters) - Transgender people will be allowed for the first time to enlist in the U.S. military starting on Monday as ordered by federal courts, the Pentagon said on Friday, after President Donald Trump’s administration decided not to appeal rulings that blocked his transgender ban. Two federal appeals courts, one in Washington and one in Virginia, last week rejected the administration’s request to put on hold orders by lower court judges requiring the military to begin accepting transgender recruits on Jan. 1. A Justice Department official said the administration will not challenge those rulings. “The Department of Defense has announced that it will be releasing an independent study of these issues in the coming weeks. So rather than litigate this interim appeal before that oc

INDEX 2 - CLASS 0 - Senior U.S. Republican senator: 'Let Mr. Mueller do his job' WASHINGTON (Reuters) - The speci

INDEX 903 - CLASS 0 - Factbox: What is in Republican tax bill? Here is the framework (Reuters) - Republicans in the U.S. House of Representatives are due to release tax legislation on Thursday calling for slashing taxes on corporations, repealing some taxes paid primarily by the wealthy and adjusting other taxes on families and individuals.  As questions about the final shape of the bill swirl around Washington, the following is a look at the basic features of the plan released in September, though changes are expected: * Reduce the U.S. corporate income tax rate to 20 percent from a current statutory 35 percent. * Eliminate the corporate alternative minimum tax. * Move to a territorial tax system that no longer imposes the U.S. corporate tax on foreign profits of U.S. companies. * Require U.S. corporations to return assets held overseas at lowered one-time tax rates. * Es

INDEX 904 - CLASS 0 - Factbox: Trump tax plan stumbles on local tax deduction, 401(k) (Reuters) - The rollout of 

INDEX 1233 - CLASS 0 - U.S. stance on auto industry sows more doubt about NAFTA overhaul ARLINGTON, Va. (Reuters) - The Trump administration on Friday demanded that U.S.-made  content account for half the value of the cars and trucks sold under the North American Free Trade Agreement, raising further doubts about any potential deal to renew the pact. Three sources briefed on the protectionist U.S. proposal, which is in line with President Donald Trump’s goal of shrinking a trade deficit with Mexico and stemming the loss of U.S. manufacturing jobs, said it also seeks sharply higher North American automotive content overall. The proposal was made during contentious talks in Washington, in the fourth of seven planned rounds of negotiations to overhaul the treaty. Some Mexican sources denounced it as “absurd,” but Juan Carlos Baker, Mexico’s deputy economy minister, put a brave fa

INDEX 1234 - CLASS 0 - Exclusive: Trump administration reduces support for prisoner halfway houses WASHINGTON

# Text processing

First I will check what regular text processing does, such as removing HTML tags and converting to lower case, but with no stemming (note: `clear_text` below does apply Porter stemming). Title and text are put together for processing.

## Check indices

Check the following indices to verify that the text processing works reliably:

- 1112 (Trump twitter)
- 284 (Trump twitter)
- links (bit.ly/2jBh4LU) (bit.ly/2jpEXYR) bit.ly/2lnpKaq [1814 EST]

In [136]:
def clear_text(text):
    """Normalise raw text into a string of lower-case, stemmed tokens.

    The steps are: strip HTML tags, convert to lower case, replace every
    character outside ``[a-zA-Z0-9]`` by a space, split on whitespace, drop
    NLTK English stop words, apply the Porter stemmer and join the remaining
    tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text, may contain HTML markup.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = BeautifulSoup(text, 'html.parser').get_text() # remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()) # lower case, keep only letters and digits

    # load the stop words once per call (not once per word) and use a set for
    # constant-time membership tests
    stop_words = set(stopwords.words('english'))
    # one stemmer instance is reused for all words
    stemmer = PorterStemmer()

    words = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    return ' '.join(words)

In [137]:
def remove_us(text):
    """Remove the abbreviation 'U.S.' (any letter case) from ``text``.

    The dots are escaped so that only literal dots match; an unescaped
    pattern would also delete parts of ordinary words such as "Russia"
    or "discuss ". The leading word boundary keeps the match from starting
    in the middle of a word.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without the abbreviation; surrounding whitespace is kept.
    """
    return re.sub(r'\b[uU]\.[sS]\.', '', text)

In [138]:
def test_remove_us():
    """Check that remove_us strips 'U.S.' in upper, lower and mixed case."""
    sentence = 'The U.S. President or the u.s. President or even U.s. President'
    expected = 'The  President or the  President or even  President'
    actual = remove_us(sentence)
    assert actual == expected, 'test_remove_us: check of test 1 failed'
    print('all test_remove_us tests passed')

In [139]:
test_remove_us()

all test_remove_us tests passed


In [140]:
def remove_twitter_factbox(text):
    """Delete every occurrence of 'Factbox: Trump on Twitter' from ``text``."""
    headline_prefix = 'Factbox: Trump on Twitter'
    return re.sub(headline_prefix, '', text)

In [141]:
def remove_twitter_intro(text):
    """Delete the span from 'The following statements' up to '@realDonaldTrump'.

    The match is greedy, so on one line it extends to the last
    '@realDonaldTrump'; '.' does not match a newline here.
    """
    intro_pattern = 'The following statements.*@realDonaldTrump'
    return re.sub(intro_pattern, '', text)

In [142]:
def remove_reuters(text):
    """Remove a leading dateline such as 'WASHINGTON (Reuters) -' from ``text``.

    Everything on a line up to and including the last '(Reuters) -' is
    deleted ('.*' is greedy and does not cross newlines).

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without the dateline; text without the marker is returned
        unchanged.
    """
    # raw string: '\(' in a normal string literal is an invalid escape sequence
    return re.sub(r'.*\(Reuters\) -', '', text)

In [143]:
def remove_dates(text):
    """Delete '<word> <1-2 digits>[,] <4 digits>' spans such as 'March 5, 2017'.

    The first part of the pattern is any run of ASCII letters, so it is not
    restricted to month names. Dates without a four-digit year are kept.
    """
    date_pattern = '[a-zA-Z]+ [0-9]?[0-9],? [0-9][0-9][0-9][0-9]'
    return re.sub(date_pattern, '', text)

In [144]:
def test_remove_dates():
    """Check remove_dates on dates with and without comma and without a year."""
    stripped = 'Shall we see each other  again?'
    no_year = 'Shall we see each other January 17 again?'
    cases = [
        ('Shall we see each other March 5, 2017 again?', stripped),
        ('Shall we see each other April 15, 2017 again?', stripped),
        ('Shall we see each other May 5 2017 again?', stripped),
        (no_year, no_year),  # a date without a year must stay untouched
    ]
    for number, (raw, expected) in enumerate(cases, start=1):
        assert remove_dates(raw) == expected, 'test_remove_dates: check of test {} failed'.format(number)
    print('all test_remove_dates tests passed')

In [145]:
test_remove_dates()

all test_remove_dates tests passed


In [146]:
def test_remove_dates_abbreviated_months():
    """Check remove_dates on full and abbreviated month names.

    This check exercises remove_dates, not remove_brackets, so it carries a
    name that says so and no longer shares its definition name with the
    bracket test.
    """
    stripped = 'We will meet each other again  and then we will have fun'
    no_year = 'We will meet each other again Jan 17 and then we will have fun'
    cases = [
        ('We will meet each other again April 20, 1980 and then we will have fun', stripped),
        ('We will meet each other again Dec 2, 1980 and then we will have fun', stripped),
        (no_year, no_year),  # a date without a year must stay untouched
        ('We will meet each other again Jan 17 1980 and then we will have fun', stripped),
    ]
    for number, (raw, expected) in enumerate(cases, start=1):
        assert remove_dates(raw) == expected, \
            'test_remove_dates_abbreviated_months: check of test {} failed'.format(number)
    print('all test_remove_dates_abbreviated_months tests passed')


# Backward-compatible alias: this check was previously defined under the name
# test_remove_brackets, so existing calls under that name keep working.
test_remove_brackets = test_remove_dates_abbreviated_months

In [147]:
test_remove_brackets()

all test_remove_brackets tests passed


In [148]:
def remove_brackets(text):
    """Remove bracketed spans such as '[VIDEO]' or '(other stuff)' from ``text``.

    A span starts at '(' or '[' and ends at the nearest following ')' or ']'
    (non-greedy); the brackets themselves are removed as well. '.' does not
    match a newline, so a span has to open and close on the same line.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` without the bracketed spans.
    """
    # raw string: '\(' and '\[' in a normal string literal are invalid escape sequences
    return re.sub(r'[\(\[].*?[\)\]]', '', text)

In [149]:
def test_remove_brackets():
    """Check that remove_brackets strips square and round bracket spans."""
    sentence = '[VIDEO], [video], (other stuff), but not this'
    expected = ', , , but not this'
    actual = remove_brackets(sentence)
    assert actual == expected, 'test_remove_brackets: check of test 1 failed'
    print('all test_remove_brackets tests passed')

In [150]:
test_remove_brackets()

all test_remove_brackets tests passed


In [151]:
def remove_source_link(text):
    """Delete the phrase 'source link' in any mix of upper and lower case."""
    phrase_pattern = '[sS][oO][uU][rR][cC][eE] [lL][iI][nN][kK]'
    return re.sub(phrase_pattern, '', text)

In [152]:
def test_remove_source_link():
    """Check that remove_source_link strips the phrase in several spellings."""
    sentence = 'All kind of source link or SOURCE LINK or Source Link or just link'
    expected = 'All kind of  or  or  or just link'
    actual = remove_source_link(sentence)
    assert actual == expected, 'test_remove_source_link: check of test 1 failed'
    print('all test_remove_source_link tests passed')

In [153]:
test_remove_source_link()

all test_remove_source_link tests passed


In [154]:
def remove_links(text):
    """Delete whitespace-preceded tokens that contain a dot followed by more text.

    The pattern consumes one whitespace character, then any run of
    non-whitespace, non-dot characters, a dot, and at least one further
    non-whitespace character. This removes URLs such as ' bit.ly/2jBh4LU' or
    ' https://example.com/', but also any other token of that shape
    (e.g. ' U.S.'). A token at the very start of the text is not matched
    because no whitespace precedes it.
    """
    link_pattern = r'(?:\s)[^\s\.]*\.[^\s]+'
    return re.sub(link_pattern, '', text)

In [155]:
def test_remove_links():
    """Check remove_links on short URLs, full URLs and a URL with trailing slash."""
    cases = [
        (
            'This text contains some tiny urls such as (bit.ly/2jBh4LU) and another (bit.ly/2jpEXYR) or also '
            'normal URLs such as https://www.ubs.com/ch/de.html or http://www.ubs.com/ch/de.html',
            'This text contains some tiny urls such as and another or also normal URLs such as or',
        ),
        (
            'In another text, we use tiny urls without brackets like bit.ly/2jBh4LU and combining with longer '
            'URLs like https://stackoverflow.com/questions/9043820/regex-to-match-words-of-a-certain-length '
            'but it should not stop at the end but continue',
            'In another text, we use tiny urls without brackets like and combining with longer '
            'URLs like but it should not stop at the end but continue',
        ),
        (
            'But is it also a problem when there is a slash at the end of the URL: https://stackoverflow.com/ ?',
            'But is it also a problem when there is a slash at the end of the URL: ?',
        ),
    ]
    for number, (raw, expected) in enumerate(cases, start=1):
        assert remove_links(raw) == expected, 'test_remove_links: check of test {} failed'.format(number)
    print('all test_remove_links tests passed')

In [156]:
test_remove_links()

all test_remove_links tests passed


In [157]:
def text_processing(title, text):
    """Combine title and text and run the complete cleaning pipeline.

    The Reuters dateline is stripped from the body first, then the title is
    prepended and the remaining cleaning functions are applied in order.
    Returns the string produced by the last step, clear_text.
    """
    # reuters part needs to be removed before the title is added
    combined = '{} {}'.format(title, remove_reuters(text))

    cleaning_steps = (
        remove_twitter_intro,
        remove_twitter_factbox,
        remove_dates,
        remove_brackets,
        remove_links,
        remove_source_link,
        remove_us,
        clear_text,
    )
    for step in cleaning_steps:
        combined = step(combined)
    return combined

In [158]:
def prossed_text_investigation(index, df, number_strings=0):
    """Print class and processed title + text of the row at position ``index``.

    Parameters
    ----------
    index : int
        Positional (``iloc``) row number inside ``df``.
    df : pandas.DataFrame
        Frame with the columns ``class``, ``title`` and ``text``.
    number_strings : int, default 0
        Maximum number of characters of the processed text to print; 0 prints
        the whole processed text.
    """
    assert number_strings >= 0, 'number_strings must be 0 or positive'

    doc_class = df['class'].iloc[index]
    processed = text_processing(df['title'].iloc[index], df['text'].iloc[index])
    shown_text = processed[:number_strings] if number_strings > 0 else processed

    print('INDEX {} - CLASS {} - {}\n'.format(index, doc_class, shown_text))

In [159]:
check_index = 23876
unprossed_text_investigation(check_index, original_document)
prossed_text_investigation(check_index, original_document)

INDEX 23876 - CLASS 1 -  Republicans Attack NASA For Trying To Save Earth Instead Of Putting Space First Republicans don t seem to understand that Earth is the planet we live on.Donald Trump and his Republicans supporters in Congress are taking aim at any agency that studies climate change.Climate change has had a devastating impact on our planet. The ice sheets are melting as an unprecedented rate and sea level rise threatens to drown our coastlines, which includes Trump s Mar-a-Lago resort at Palm Beach, Florida.Furthermore, weather patterns have become more unpredictable and we are seeing more instances of major disastrous hurricanes, wildfires, droughts, etc Whether Republicans like it or not, humans have been the driving force behind climate change, and it will get even worse since Trump is giving oil, gas, coal and chemical companies the freedom to pollute more than ever before. Soon, our cities will be choked by smog and our waterways will be poisoned as global temperatures cont

# Save a clean dataset 

We also need to make sure to remove duplicates!

In [160]:
def restruct_text(df):
    """Add a ``processed_text`` column with the cleaned title + text of every row.

    The column is added to ``df`` itself (the frame is modified in place) and
    the same frame is returned. Progress with an estimate of the remaining
    time is printed every 100 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``title`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the additional column ``processed_text``.
    """
    nrow_data = len(df.index)
    print('{} entries to process'.format(nrow_data))
    start_time = time.time()
    processed = []

    # iterate over the column values directly: this works for any index,
    # whereas mixing index labels with positional .iloc[] only works when
    # the index happens to be 0..n-1
    for iteration, (title, text) in enumerate(zip(df['title'], df['text']), start=1):
        if iteration % 100 == 0:  # '==' compares values; 'is' compared object identity
            time_diff = time.time() - start_time
            total_time_estimated = nrow_data / iteration * time_diff
            time_remaining = total_time_estimated - time_diff
            print('Iterate {}/{} (time used: {:.0f}s, remaining time: {:.0f}s)'.
                  format(iteration, nrow_data, time_diff, time_remaining))

        processed.append(text_processing(title, text))

    # a single column assignment instead of one .loc write per row
    df['processed_text'] = processed

    print('done')
    return df

In [162]:
# shuffle the rows with a fixed seed so that both classes are mixed,
# then renumber the index from 0
random.seed(4)
nrow_data = len(original_document.index)
sample_index = random.sample(range(nrow_data), nrow_data)
original_document = original_document.iloc[sample_index].reset_index(drop=True)

In [163]:
# in order to program the feature engineering, select only the top 200 entries from the sampled dataset
sampled_document = original_document.iloc[:200].copy()

In [165]:
# start text processing
processed_document = restruct_text(sampled_document)

200 entries to process
Iterate 100/200 (time used: 6s, remaining time: 6s)
Iterate 200/200 (time used: 14s, remaining time: 0s)
done


In [166]:
def save_document(df, data_path, data_set):
    """Write the de-duplicated ``class`` / ``processed_text`` columns to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``class`` and ``processed_text``; it is not
        modified.
    data_path : str
        Directory the file is written to.
    data_set : str
        File name of the CSV file.

    Returns
    -------
    None
        A summary with the share of removed duplicate rows is printed.
    """
    # keep only class and processed_text
    df_save = df[['class', 'processed_text']].drop_duplicates()
    df_save.to_csv('{}/{}'.format(data_path, data_set), index=False)

    old_length = len(df.index)
    new_length = len(df_save.index)

    # an empty frame has no duplicates; this also avoids a division by zero
    if old_length:
        number_duplicates = (old_length - new_length) / old_length * 100
    else:
        number_duplicates = 0.0

    print('{} saved, data changed from {} to {}, approximately {:.2f}% duplicates'.
          format(data_set, old_length, new_length, number_duplicates))

In [167]:
# write to the configured data directory instead of a hard-coded 'data' literal
save_document(processed_document, data_path, 'clean_document.csv')

clean_document.csv saved, data changed from 200 to 200, approximately 0.00% duplicates
