# Download fake news data

This notebook downloads the fake news data from Kaggle and performs a first analysis.

In [None]:
%%capture

# download required packages
!pip install kaggle
!pip install zipfile

In [None]:
# import libraries
import random
import unittest
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import time
from bs4 import BeautifulSoup
nltk.download('stopwords', quiet=True)

In [None]:
# user parameters
# data_path: directory the Kaggle files are downloaded to and read from
data_path = 'data'

In [None]:
def download_kaggle_data(data_path, data_set, extract_zip=True):
    """Download one file of the Kaggle dataset 'benjaminperucco/udacitydata'.

    Parameters
    ----------
    data_path : str
        Directory handed to the Kaggle API as download target; the archive is
        also unpacked into this directory.
    data_set : str
        Name of the file inside the Kaggle dataset, e.g. 'Fake.csv'.
    extract_zip : bool, default True
        If True, the archive '<data_path>/<data_set>.zip' is unpacked into
        data_path and the archive is deleted afterwards.

    Notes
    -----
    KaggleApi.authenticate() is called before the download; presumably this
    requires Kaggle API credentials to be configured - confirm against the
    Kaggle API documentation.
    """
    print('Download {} from kaggle.com...'.format(data_set), end='')
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_file(
        dataset='benjaminperucco/udacitydata',
        file_name=data_set,
        path=data_path,
        force=True
    )

    # the downloaded archive is expected at <data_path>/<data_set>.zip
    file_path = '{}/{}.zip'.format(data_path, data_set)

    if extract_zip:
        # the context manager closes the archive even if extraction fails
        with ZipFile(file_path) as zf:
            zf.extractall(data_path)
        # pure-Python delete: works outside IPython and on every platform
        os.remove(file_path)

    print('done')

In [None]:
# fetch both files of the data set (fake and true articles) from Kaggle
for csv_name in ('Fake.csv', 'True.csv'):
    download_kaggle_data(data_path, csv_name)

In [None]:
def read_kaggle_data(data_path, data_set):
    """Read '<data_path>/<data_set>' into a DataFrame and delete the file.

    Parameters
    ----------
    data_path : str
        Directory that contains the CSV file.
    data_set : str
        File name of the CSV file, e.g. 'Fake.csv'.

    Returns
    -------
    pandas.DataFrame
        The content of the CSV file. If the file does not exist, a small
        dummy frame with the columns 'dummy_col_1' and 'dummy_col_2' is
        returned instead and a hint to download the data again is printed.

    Notes
    -----
    The CSV file is removed after a successful import, so calling this
    function a second time for the same file yields the dummy frame.
    """
    file_path = '{}/{}'.format(data_path, data_set)

    if os.path.exists(file_path):
        print('Import {}...'.format(data_set), end='')
        imported_data = pd.read_csv(file_path)
        # pure-Python delete: works outside IPython and on every platform
        os.remove(file_path)
        print('done')
    else:
        empty_dict = {'dummy_col_1': [3, 2, 1, 0], 'dummy_col_2': ['a', 'b', 'c', 'd']} # just some test data
        imported_data = pd.DataFrame.from_dict(empty_dict)
        print('{} does not exist. Please download again kaggle data.'.format(file_path))

    return imported_data

In [None]:
# load both CSV files into DataFrames (fake first, then true)
fake, true = [read_kaggle_data(data_path, csv_name) for csv_name in ('Fake.csv', 'True.csv')]

In [None]:
# label every article: 1 marks fake news, 0 marks true news
for frame, label in ((fake, 1), (true, 0)):
    frame['class'] = label

In [None]:
fake.head()

In [None]:
fake[['title', 'text', 'subject', 'date']].describe()

In [None]:
true.head()

In [None]:
true[['title', 'text', 'subject', 'date']].describe()

We can see here that there are fewer unique values for title and text than there are entries in total. This means that the same titles and texts occur several times. I will remove the duplicates once the text has been processed further. 

In [None]:
# stack the two labelled frames into one document; ignore_index gives every row
# a unique label instead of repeating the labels of both source frames
original_document = pd.concat([true, fake], ignore_index=True)
true = fake = None # drop the references to the source frames to save memory

In [None]:
original_document.head()

In [None]:
original_document[['title', 'text', 'subject', 'date']].describe()

In [None]:
def unprossed_text_investigation(index, df, number_strings=0):
    """Print class, title and raw text of the row at position ``index``.

    ``index`` is a positional (iloc) index into ``df``, which must provide the
    columns 'class', 'title' and 'text'. If ``number_strings`` is positive,
    only that many leading characters of the text are printed; 0 prints the
    whole text. A negative value fails the assertion.
    """
    assert number_strings >= 0, 'number_strings must be 0 or positive'

    doc_class, title, text = (df[column].iloc[index] for column in ('class', 'title', 'text'))
    shown_text = text[:number_strings] if number_strings > 0 else text

    print('INDEX {} - CLASS {} - {} {}\n'.format(index, doc_class, title, shown_text))

In [None]:
unprossed_text_investigation(11010, original_document)

In [None]:
# range of row positions printed by the loop below (select_to itself is excluded)
select_from = 1
select_to = 2000

In [None]:
# print class, title and the first 800 characters of the text of every selected row
for i in range(select_from, select_to):
    unprossed_text_investigation(i, original_document, 800)

# Text processing

First, I will check what regular text processing does, such as removing HTML tags and converting to lower case, but without stemming. The title and the text are concatenated before processing.

## Check indices

Check the following indices to verify that the text processing works reliably:

- 1112 (Trump twitter)
- 284 (Trump twitter)
- links (bit.ly/2jBh4LU) (bit.ly/2jpEXYR) bit.ly/2lnpKaq [1814 EST]

In [None]:
def clear_text(text):
    """Normalise a text: strip HTML, lower-case, drop stop words and stem.

    Steps, in order:
    1. HTML tags are removed with BeautifulSoup's 'html.parser'.
    2. The text is lower-cased and every character that is not a letter or a
       digit is replaced by a space.
    3. English NLTK stop words are removed.
    4. The remaining words are reduced to their Porter stem.

    Returns the processed words joined by single spaces.
    """
    text = BeautifulSoup(text, 'html.parser').get_text() # remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()) # lower case, non-alphanumerics become spaces

    # load the stop word list once per call instead of once per word;
    # a set makes every membership test a constant-time lookup
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer() # one stemmer instance serves all words

    words = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    return ' '.join(words)

In [None]:
def remove_us(text):
    """Delete the abbreviation 'U.S.' in any upper/lower-case spelling.

    The dots are escaped so that only literal dots match; an unescaped dot
    matches any character and would also cut pieces out of ordinary words
    such as 'Russia' or 'guests'.
    """
    return re.sub(r'[uU]\.[sS]\.', '', text)

In [None]:
def test_remove_us():
    """Check remove_us on a sentence with several spellings of 'U.S.'."""
    sample = 'The U.S. President or the u.s. President or even U.s. President'
    expected = 'The  President or the  President or even  President'
    assert remove_us(sample) == expected, 'test_remove_us: check of test 1 failed'
    print('all test_remove_us tests passed')

In [None]:
test_remove_us()

In [None]:
def remove_twitter_factbox(text):
    """Delete every occurrence of the headline 'Factbox: Trump on Twitter'."""
    factbox_headline = 'Factbox: Trump on Twitter'
    return re.sub(factbox_headline, '', text)

In [None]:
def remove_twitter_intro(text):
    """Delete the span from 'The following statements' up to '@realDonaldTrump'.

    The match is greedy, so it reaches to the last '@realDonaldTrump' on the
    same line; it never crosses a line break.
    """
    intro_pattern = re.compile('The following statements.*@realDonaldTrump')
    return intro_pattern.sub('', text)

In [None]:
def remove_reuters(text):
    """Delete everything on a line up to and including '(Reuters) -'.

    The leading '.*' is greedy and does not cross line breaks, so the text in
    front of the last '(Reuters) -' on a line is removed together with the
    marker itself.
    """
    # raw string: '\(' and '\)' are regex escapes, not Python string escapes
    return re.sub(r'.*\(Reuters\) -', '', text)

In [None]:
def remove_dates(text):
    """Delete dates of the form '<word> <day>[,] <4-digit year>'.

    Examples that are removed: 'March 5, 2017', 'Dec 2, 1980', 'May 5 2017'.
    A word followed by a day without a year ('January 17') is kept.
    """
    date_pattern = re.compile('[a-zA-Z]+ [0-9]?[0-9],? [0-9][0-9][0-9][0-9]')
    return date_pattern.sub('', text)

In [None]:
def test_remove_dates():
    """Check remove_dates with full month names, with and without a year."""
    without_date = 'Shall we see each other  again?'
    day_only = 'Shall we see each other January 17 again?'

    # dates with a four-digit year are removed, with or without the comma
    assert remove_dates('Shall we see each other March 5, 2017 again?') == without_date, 'test_remove_dates: check of test 1 failed'
    assert remove_dates('Shall we see each other April 15, 2017 again?') == without_date, 'test_remove_dates: check of test 2 failed'
    assert remove_dates('Shall we see each other May 5 2017 again?') == without_date, 'test_remove_dates: check of test 3 failed'
    # a day without a year is left untouched
    assert remove_dates(day_only) == day_only, 'test_remove_dates: check of test 4 failed'
    print('all test_remove_dates tests passed')

In [None]:
test_remove_dates()

In [None]:
def test_remove_brackets():
    text_1 = 'We will meet each other again April 20, 1980 and then we will have fun'
    result_text_1 = 'We will meet each other again  and then we will have fun'
    text_2 = 'We will meet each other again Dec 2, 1980 and then we will have fun'
    result_text_2 = 'We will meet each other again  and then we will have fun'
    text_3 = 'We will meet each other again Jan 17 and then we will have fun'
    result_text_3 = 'We will meet each other again Jan 17 and then we will have fun'
    text_4 = 'We will meet each other again Jan 17 1980 and then we will have fun'
    result_text_4 = 'We will meet each other again  and then we will have fun'
    assert remove_dates(text_1) == result_text_1, 'test_remove_brackets: check of test 1 failed'
    assert remove_dates(text_2) == result_text_2, 'test_remove_brackets: check of test 2 failed'
    assert remove_dates(text_3) == result_text_3, 'test_remove_brackets: check of test 3 failed'
    assert remove_dates(text_4) == result_text_4, 'test_remove_brackets: check of test 4 failed'
    print('all test_remove_brackets tests passed')

In [None]:
test_remove_brackets()

In [None]:
def remove_brackets(text):
    """Delete bracketed passages such as '[VIDEO]' or '(other stuff)'.

    A passage starts at '(' or '[' and ends at the nearest following ')' or
    ']' on the same line; the opening and closing characters are not required
    to be of the same kind.
    """
    # raw string: the backslashes are regex escapes, not Python string escapes
    return re.sub(r'[\(\[].*?[\)\]]', '', text)

In [None]:
def test_remove_brackets():
    """Check that remove_brackets drops square- and round-bracketed passages."""
    sample = '[VIDEO], [video], (other stuff), but not this'
    expected = ', , , but not this'
    assert remove_brackets(sample) == expected, 'test_remove_brackets: check of test 1 failed'
    print('all test_remove_brackets tests passed')

In [None]:
test_remove_brackets()

In [None]:
def remove_source_link(text):
    """Delete the phrase 'source link' in any mix of upper and lower case."""
    phrase_any_case = '[sS][oO][uU][rR][cC][eE] [lL][iI][nN][kK]'
    return re.sub(phrase_any_case, '', text)

In [None]:
def test_remove_source_link():
    """Check remove_source_link with lower, upper and title case spellings."""
    sample = 'All kind of source link or SOURCE LINK or Source Link or just link'
    expected = 'All kind of  or  or  or just link'
    assert remove_source_link(sample) == expected, 'test_remove_source_link: check of test 1 failed'
    print('all test_remove_source_link tests passed')

In [None]:
test_remove_source_link()

In [None]:
def remove_links(text):
    """Delete whitespace-preceded tokens that contain a dot followed by more text.

    This covers URLs such as 'https://www.ubs.com/ch/de.html' and short links
    such as '(bit.ly/2jBh4LU)'. The whitespace character in front of the token
    is removed together with it; a token at the very start of the text is kept
    because nothing precedes it.
    """
    link_pattern = re.compile('(?:\\s)[^\\s\\.]*\\.[^\\s]+')
    return link_pattern.sub('', text)

In [None]:
def test_remove_links():
    """Check remove_links with short links, full URLs and a trailing slash."""
    # short links in brackets and full URLs, the last one at the end of the text
    bracketed = ('This text contains some tiny urls such as (bit.ly/2jBh4LU) and another (bit.ly/2jpEXYR) or also '
                 'normal URLs such as https://www.ubs.com/ch/de.html or http://www.ubs.com/ch/de.html')
    bracketed_expected = 'This text contains some tiny urls such as and another or also normal URLs such as or'
    # links in the middle of the text; the words after them must survive
    embedded = ('In another text, we use tiny urls without brackets like bit.ly/2jBh4LU and combining with longer '
                'URLs like https://stackoverflow.com/questions/9043820/regex-to-match-words-of-a-certain-length '
                'but it should not stop at the end but continue')
    embedded_expected = ('In another text, we use tiny urls without brackets like and combining with longer '
                         'URLs like but it should not stop at the end but continue')
    # URL that ends with a slash
    trailing_slash = 'But is it also a problem when there is a slash at the end of the URL: https://stackoverflow.com/ ?'
    trailing_slash_expected = 'But is it also a problem when there is a slash at the end of the URL: ?'

    assert remove_links(bracketed) == bracketed_expected, 'test_remove_links: check of test 1 failed'
    assert remove_links(embedded) == embedded_expected, 'test_remove_links: check of test 2 failed'
    assert remove_links(trailing_slash) == trailing_slash_expected, 'test_remove_links: check of test 3 failed'
    print('all test_remove_links tests passed')

In [None]:
test_remove_links()

In [None]:
def text_processing(title, text):
    """Combine title and text of an article and run every cleaning step on it.

    The Reuters marker is stripped from the text alone, then the title is
    prepended and the remaining steps are applied one after the other to the
    combined string. Returns the fully processed string.
    """
    # the reuters part is removed before the title is added, because
    # remove_reuters deletes everything in front of the marker on that line
    combined = '{} {}'.format(title, remove_reuters(text))

    cleaning_steps = (
        remove_twitter_intro,
        remove_twitter_factbox,
        remove_dates,
        remove_brackets,
        remove_links,
        remove_source_link,
        remove_us,
        clear_text,
    )
    for step in cleaning_steps:
        combined = step(combined)
    return combined

In [None]:
def prossed_text_investigation(index, df, number_strings=0):
    """Print class and processed text of the row at position ``index``.

    Title and text of the row are run through text_processing first. If
    ``number_strings`` is positive, only that many leading characters of the
    processed text are printed; 0 prints everything. A negative value fails
    the assertion.
    """
    assert number_strings >= 0, 'number_strings must be 0 or positive' 

    doc_class, title, raw_text = (df[column].iloc[index] for column in ('class', 'title', 'text'))
    processed = text_processing(title, raw_text)
    shown_text = processed[:number_strings] if number_strings > 0 else processed

    print('INDEX {} - CLASS {} - {}\n'.format(index, doc_class, shown_text))

In [None]:
# print the same document before and after text processing to compare both versions
check_index = 23876
unprossed_text_investigation(check_index, original_document)
prossed_text_investigation(check_index, original_document)

# Save a clean dataset 

We also need to make sure to remove duplicates!

In [None]:
def restruct_text(df):
    """Add a 'processed_text' column with the cleaned title and text of every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'title' and 'text'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, now with the additional column
        'processed_text'.

    A progress line with the elapsed and the estimated remaining time is
    printed every 100 rows.
    """
    nrow_data = len(df.index)
    print('{} entries to process'.format(nrow_data))
    start_time = time.time()

    processed = []
    # iterate over the values themselves, so the result does not depend on
    # the index labels happening to equal the row positions
    for iteration, (title, text) in enumerate(zip(df['title'], df['text']), start=1):
        if iteration % 100 == 0:
            time_diff = time.time() - start_time
            # linear extrapolation from the rows handled so far
            total_time_estimated = nrow_data / iteration * time_diff
            time_remaining = total_time_estimated - time_diff
            print('Iterate {}/{} (time used: {:.0f}s, remaining time: {:.0f}s)'.
                  format(iteration, nrow_data, time_diff, time_remaining))

        processed.append(text_processing(title, text))

    # assign the whole column at once instead of one cell per iteration
    df['processed_text'] = processed

    print('done')
    return df

In [None]:
# shuffle the rows with a fixed seed so that both classes are mixed reproducibly
random.seed(4)
nrow_data = len(original_document.index)
sample_index = random.sample(range(nrow_data), nrow_data)
original_document = original_document.iloc[sample_index].reset_index(drop=True)

In [None]:
# work on the first 200 rows of the shuffled dataset only while developing the feature engineering
sampled_document = original_document.head(200).copy()

In [None]:
# start text processing
# (restruct_text adds the 'processed_text' column to sampled_document itself and
# returns that same frame, so both names refer to one object)
processed_document = restruct_text(sampled_document)

In [None]:
def save_document(df, data_path, data_set):
    """Write the de-duplicated columns 'class' and 'processed_text' to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'class' and 'processed_text'. It is
        not modified.
    data_path : str
        Directory the file is written to.
    data_set : str
        File name of the CSV file.

    Rows that are identical in both columns are stored only once. A summary
    with the row counts before and after and the share of duplicates is
    printed.
    """
    # keep only class and processed_text, each combination once
    df_save = df[['class', 'processed_text']].drop_duplicates()
    df_save.to_csv('{}/{}'.format(data_path, data_set), index=False)

    old_length = len(df.index)
    new_length = len(df_save.index)

    # an empty frame has no duplicates; this also avoids a division by zero
    if old_length:
        duplicates_percent = (old_length - new_length) / old_length * 100
    else:
        duplicates_percent = 0.0
    print('{} saved, data changed from {} to {}, approximately {:.2f}% duplicates'.
          format(data_set, old_length, new_length, duplicates_percent))

In [None]:
# write the clean dataset into the configured data directory
save_document(processed_document, data_path, 'clean_document.csv')