# Download fake news data

This notebook downloads the fake news data from kaggle and performs text processing.

## Download required packages

In [None]:
%%capture

!pip install kaggle
!pip install zipfile

## Import libraries

In [None]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')
import random
import unittest
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import time
from bs4 import BeautifulSoup
nltk.download('stopwords', quiet=True)

## Generate kaggle data

In [None]:
def download_kaggle_data(data_path, data_set, extract_zip):
    """
    Download one file of the kaggle dataset benjaminperucco/udacitydata.

    Args:
    - data_path (str): Directory the file is downloaded to.
    - data_set (str): Name of the file inside the kaggle dataset, e.g. Fake.csv.
    - extract_zip (bool): If True, <data_path>/<data_set>.zip is extracted into data_path
      and the archive is deleted afterwards.
    """
    print('Download {} from kaggle.com...'.format(data_set), end='')
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_file(
        dataset='benjaminperucco/udacitydata',
        file_name=data_set,
        path=data_path,
        force=True
    )

    file_path = '{}/{}.zip'.format(data_path, data_set)

    if extract_zip:
        # the context manager closes the archive even if the extraction fails
        with ZipFile(file_path) as zf:
            zf.extractall(data_path)
        # pure Python removal instead of a shell call, works on every platform
        os.remove(file_path)

    print('done')

In [None]:
def read_kaggle_data(data_path, data_set):
    """
    Read a downloaded csv file into a data frame and delete the file afterwards.

    If the file does not exist, a message is printed and a small data frame with
    dummy test data is returned instead.

    Args:
    - data_path (str): Directory that contains the csv file.
    - data_set (str): Name of the csv file.

    Returns:
    - imported_data (pandas.DataFrame): Content of the csv file or the dummy data.
    """
    file_path = '{}/{}'.format(data_path, data_set)

    if os.path.exists(file_path):
        print('Import {}...'.format(data_set), end='')
        imported_data = pd.read_csv(file_path)
        # the csv is only needed for the import, so it is removed again
        # (pure Python instead of a shell call, works on every platform)
        os.remove(file_path)
        print('done')
    else:
        dummy_data = {'dummy_col_1': [3, 2, 1, 0], 'dummy_col_2': ['a', 'b', 'c', 'd']} # just some test data
        imported_data = pd.DataFrame.from_dict(dummy_data)
        print('{} does not exist. Please download again kaggle data.'.format(file_path))

    return imported_data

In [None]:
def generate_kaggle_data(data_path, extract_zip=True):
    """
    Download the fake and true news files and merge them into one labelled corpus.

    Args:
    - data_path (str): Directory used for the download and the import.
    - extract_zip (bool): Passed on to download_kaggle_data().

    Returns:
    - corpus (pandas.DataFrame): True news (class 0) followed by fake news (class 1).
    """
    fake_file, true_file = 'Fake.csv', 'True.csv'

    # fetch both files first, then import them
    for file_name in (fake_file, true_file):
        download_kaggle_data(data_path, file_name, extract_zip)

    # label while importing: 1 = fake, 0 = true
    fake_news = read_kaggle_data(data_path, fake_file).assign(**{'class': 1})
    true_news = read_kaggle_data(data_path, true_file).assign(**{'class': 0})

    return pd.concat([true_news, fake_news])

In [None]:
# Download Fake.csv and True.csv into the relative folder 'data', label them and merge them into one corpus.
corpus = generate_kaggle_data('data')

## Statistics about corpus

We can see here that title and text have fewer unique values than counted values. This means that the same titles and texts occur several times. I will remove these duplicates once the text has been processed further. 

In [None]:
# Preview the first rows of the merged corpus.
corpus.head()

In [None]:
# Summary statistics of the four article columns (the class column is left out here).
corpus[['title', 'text', 'subject', 'date']].describe()

In [None]:
# Number of articles per class (1 = fake, 0 = true).
corpus['class'].value_counts()

## Text processing

Title and text are put together for processing. Several processing steps are performed in the following order:

- Remove everything that comes before (Reuters) in the text. Mostly, this is the city the article was reported from.
- Remove twitter intro. Truthful articles that list tweets from the U.S. President start with "The following statements..." and the intro ends with @realDonaldTrump. Everything in between is removed.
- Remove "Factbox: Trump on Twitter" intro.
- Remove everything that looks like a date, for example March 20, 1989, Mar 20, 1989 or just Mar 20 1989. Dates contain no information regarding classification and are therefore removed.
- Remove everything between brackets [] or (). Often links are between brackets, therefore we just remove them.
- Remove everything that looks like a link.
- Remove text with source link.
- Remove U.S.
- Further cleaning of text like removal of digits, html tags, stopwords and stemming is performed.

In [None]:
def clear_text(text):
    """
    Basic text processing:
    - HTML tag removal
    - Conversion to lower character
    - Replacement of every character that is not a letter a-z by a space
    - Split words between spaces
    - Stopwords removal
    - Stem words
    
    Args:
    - text (str): Text to be cleared.
    
    Returns:
    - text (str): Cleared text, words joined by a single space.
    """
    # build the stemmer and the stopword set once per call instead of once per word
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    text = BeautifulSoup(text, 'html.parser').get_text() # remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text.lower()) # lower case, keep letters only
    words = [stemmer.stem(w) for w in text.split() if w not in stop_words] # remove stopwords, then stem
    return ' '.join(words)

In [None]:
def remove_us(text):
    """
    Remove <<U.S.>> from text (each letter in lower or upper case).
    
    Args:
    - text (str): Text to be cleared.
    
    Returns:
    - text (str): Cleared text.
    """
    # the dots are escaped: an unescaped dot matches any character and would
    # also cut pieces out of ordinary words such as <<upset>> or <<outside>>
    text = re.sub(r'[uU]\.[sS]\.', '', text)
    return text

In [None]:
def test_remove_us():
    """
    Performs a test of function remove_us().
    """
    text_1 = 'The U.S. President or the u.s. President or even U.s. President'
    result_text_1 = 'The  President or the  President or even  President'
    assert remove_us(text_1) == result_text_1, 'test_remove_us: check of test 1 failed'
    print('all test_remove_us tests passed')

In [None]:
def remove_twitter_factbox(text):
    """
    Strip the phrase <<Factbox: Trump on Twitter>> wherever it occurs.

    Args:
    - text (str): Text to be cleared.

    Returns:
    - (str): Text without the factbox phrase.
    """
    factbox_pattern = 'Factbox: Trump on Twitter'
    return re.sub(factbox_pattern, '', text)

In [None]:
def remove_twitter_intro(text):
    """
    Strip the tweet intro that starts with <<The following statements>> and
    reaches up to the last <<@realDonaldTrump>> on the same line.

    Args:
    - text (str): Text to be cleared.

    Returns:
    - (str): Text without the intro.
    """
    intro_pattern = 'The following statements.*@realDonaldTrump'
    return re.sub(intro_pattern, '', text)

In [None]:
def remove_reuters(text):
    """
    Remove everything before and including <<(Reuters) ->> from text.

    The match is greedy, so within one line everything up to the last
    <<(Reuters) ->> is removed.
    
    Args:
    - text (str): Text to be cleared.
    
    Returns:
    - text (str): Cleared text.
    """
    # raw string: <<\(>> is not a valid escape sequence in a normal string literal
    text = re.sub(r'.*\(Reuters\) -', '', text)
    return text

In [None]:
def remove_dates(text):
    """
    Strip dates of the form <word> <1-2 digits>[,] <4 digits>, e.g. March 20, 1982 or Mar 20 1982.

    Args:
    - text (str): Text to be cleared.

    Returns:
    - (str): Text without such dates.
    """
    date_pattern = '[a-zA-Z]+ [0-9]?[0-9],? [0-9][0-9][0-9][0-9]'
    return re.sub(date_pattern, '', text)

In [None]:
def test_remove_dates():
    """
    Performs a test of function remove_dates().
    """
    text_1 = 'Shall we see each other March 5, 2017 again?'
    result_text_1 = 'Shall we see each other  again?'
    text_2 = 'Shall we see each other April 15, 2017 again?'
    result_text_2 = 'Shall we see each other  again?'
    text_3 = 'Shall we see each other May 5 2017 again?'
    result_text_3 = 'Shall we see each other  again?'
    text_4 = 'Shall we see each other January 17 again?'
    result_text_4 = 'Shall we see each other January 17 again?'
    assert remove_dates(text_1) == result_text_1, 'test_remove_dates: check of test 1 failed'
    assert remove_dates(text_2) == result_text_2, 'test_remove_dates: check of test 2 failed'
    assert remove_dates(text_3) == result_text_3, 'test_remove_dates: check of test 3 failed'
    assert remove_dates(text_4) == result_text_4, 'test_remove_dates: check of test 4 failed'
    print('all test_remove_dates tests passed')

In [None]:
def remove_brackets(text):   
    """
    Remove brackets and text between brackets like () or [].

    The match is non-greedy: it ends at the first closing bracket that follows
    an opening bracket on the same line.

    Args:
    - text (str): Text to be cleared.
    
    Returns:
    - text (str): Cleared text.
    """
    # raw string: <<\(>> and <<\[>> are not valid escape sequences in a normal string literal
    text = re.sub(r'[\(\[].*?[\)\]]', '', text)
    return text

In [None]:
def test_remove_brackets():
    """
    Performs a test of function remove_brackets().
    """
    text_1 = '[VIDEO], [video], (other stuff), but not this'
    result_text_1 = ', , , but not this'
    assert remove_brackets(text_1) == result_text_1, 'test_remove_brackets: check of test 1 failed'
    print('all test_remove_brackets tests passed')

In [None]:
def remove_source_link(text):
    """
    Strip the two words <<source link>> in any mix of lower and upper case letters.

    Args:
    - text (str): Text to be cleared.

    Returns:
    - (str): Text without the words source link.
    """
    source_link_pattern = '[sS][oO][uU][rR][cC][eE] [lL][iI][nN][kK]'
    return re.sub(source_link_pattern, '', text)

In [None]:
def test_remove_source_link():
    """
    Performs a test of function remove_source_link().
    """
    text_1 = 'All kind of source link or SOURCE LINK or Source Link or just link'
    result_text_1 = 'All kind of  or  or  or just link'  
    assert remove_source_link(text_1) == result_text_1, 'test_remove_source_link: check of test 1 failed'
    print('all test_remove_source_link tests passed')

In [None]:
def remove_links(text): 
    """
    Strip every token that follows a whitespace character and contains a dot
    followed by at least one more non-whitespace character. The leading
    whitespace character is removed together with the token.

    Args:
    - text (str): Text to be cleared.

    Returns:
    - (str): Text without such link-like tokens.
    """
    link_pattern = '(?:\\s)[^\\s\\.]*\\.[^\\s]+'
    return re.sub(link_pattern, '', text)

In [None]:
def test_remove_links():
    """
    Performs a test of function remove_links().
    """
    text_1 = 'This text contains some tiny urls such as (bit.ly/2jBh4LU) and another (bit.ly/2jpEXYR) or also ' + \
             'normal URLs such as https://www.ubs.com/ch/de.html or http://www.ubs.com/ch/de.html'
    result_text_1 = 'This text contains some tiny urls such as and another or also normal URLs such as or'  
    text_2 = 'In another text, we use tiny urls without brackets like bit.ly/2jBh4LU and combining with longer ' + \
             'URLs like https://stackoverflow.com/questions/9043820/regex-to-match-words-of-a-certain-length ' + \
             'but it should not stop at the end but continue'
    result_text_2 = 'In another text, we use tiny urls without brackets like and combining with longer ' + \
                    'URLs like but it should not stop at the end but continue'
    text_3 = 'But is it also a problem when there is a slash at the end of the URL: https://stackoverflow.com/ ?'
    result_text_3 = 'But is it also a problem when there is a slash at the end of the URL: ?'
    assert remove_links(text_1) == result_text_1, 'test_remove_links: check of test 1 failed'
    assert remove_links(text_2) == result_text_2, 'test_remove_links: check of test 2 failed'
    assert remove_links(text_3) == result_text_3, 'test_remove_links: check of test 3 failed'
    print('all test_remove_links tests passed')

In [None]:
def text_processing(title, text):
    """
    Run the complete cleaning pipeline on one news article.

    Args:
    - title (str): Title from news article.
    - text (str): Text from news article.

    Returns:
    - (str): Title and text joined by a space and passed through all cleaning steps.
    """
    # the reuters prefix is cut from the article text before the title is put in front
    combined = '{} {}'.format(title, remove_reuters(text))

    # the remaining steps run on title and text together, in exactly this order
    cleaning_steps = (
        remove_twitter_intro,
        remove_twitter_factbox,
        remove_dates,
        remove_brackets,
        remove_links,
        remove_source_link,
        remove_us,
        clear_text,
    )
    for step in cleaning_steps:
        combined = step(combined)
    return combined

In [None]:
def prossed_text_investigation(index, df, number_strings=0):
    """
    Print class and processed text of the article at row position index.

    Args:
    - index (int): Row position of the article in df.
    - df (pandas.DataFrame): Data with the columns class, title and text.
    - number_strings (int): Number of characters of the processed text to print, 0 prints everything.
    """
    assert number_strings >= 0, 'number_strings must be 0 or positive' 

    doc_class, title, raw_text = (df[column].iloc[index] for column in ('class', 'title', 'text'))
    processed = text_processing(title, raw_text)
    # a positive number_strings limits the printed text to that many characters
    shown = processed[:number_strings] if number_strings > 0 else processed

    print('INDEX {} - CLASS {} - {}\n'.format(index, doc_class, shown))

In [None]:
def unprossed_text_investigation(index, df, number_strings=0):
    """
    Print class, title and unprocessed text of the article at row position index.

    Args:
    - index (int): Row position of the article in df.
    - df (pandas.DataFrame): Data with the columns class, title and text.
    - number_strings (int): Number of characters of the text to print, 0 prints everything.
    """
    assert number_strings >= 0, 'number_strings must be 0 or positive'   

    doc_class, title, raw_text = (df[column].iloc[index] for column in ('class', 'title', 'text'))
    # a positive number_strings limits the printed text (not the title) to that many characters
    shown = raw_text[:number_strings] if number_strings > 0 else raw_text

    print('INDEX {} - CLASS {} - {} {}\n'.format(index, doc_class, title, shown))

## Unit test of text processing functions

In [None]:
def perform_unit_tests():
    """
    Run the tests of the text processing functions one after the other.
    """
    test_functions = (
        test_remove_us,
        test_remove_dates,
        test_remove_brackets,
        test_remove_source_link,
        test_remove_links,
    )
    for run_test in test_functions:
        run_test()

In [None]:
# Run the tests of the cleaning functions; a failing check raises an AssertionError.
perform_unit_tests()

## Investigate before and after text processing

In [None]:
def before_and_after_check(df, check_index):
    """
    Print one article first unprocessed and then processed.

    Args:
    - df (pandas.DataFrame): Data with the columns class, title and text.
    - check_index (int): Row position of the article to print.
    """
    for show_article in (unprossed_text_investigation, prossed_text_investigation):
        show_article(check_index, df)

In [None]:
# Print the article at row position 23876 of the corpus, first raw and then after text processing.
before_and_after_check(corpus, 23876)

## Save a clean dataset 

We also need to make sure to remove duplicates! Furthermore, the corpus is resampled (shuffled) here, so that the training, test and validation datasets can be extracted in a later step.

In [None]:
def restruct_text(df):
    """
    Process title and text of every row and store the result in the column processed_text.

    A progress line with the used and the estimated remaining time is printed
    after every 100 rows.

    Args:
    - df (pandas.DataFrame): Data with the columns title and text. The column
      processed_text is added to this data frame in place.

    Returns:
    - df (pandas.DataFrame): The same data frame with the column processed_text.
    """
    nrow_data = len(df.index)
    print('{} entries to process'.format(nrow_data))
    start_time = time.time()
    processed = []

    # iterate over the values row by row, so the result does not depend on the
    # index labels of df (they may be shuffled, duplicated or not start at 0)
    for iteration, (title, text) in enumerate(zip(df['title'], df['text']), start=1):
        if iteration % 100 == 0:
            time_diff = time.time() - start_time
            # linear extrapolation from the rows processed so far
            total_time_estimated = nrow_data / iteration * time_diff
            time_remaining = total_time_estimated - time_diff
            print('Iterate {}/{} (time used: {:.0f}s, remaining time: {:.0f}s)'.
                  format(iteration, nrow_data, time_diff, time_remaining))

        processed.append(text_processing(title, text))

    # one column assignment, values are matched to the rows by position
    df['processed_text'] = processed

    print('done')
    return df

In [None]:
def resample_corpus(df, set_seed):
    """
    Return the rows of df in a random order with a fresh index 0..n-1.

    Args:
    - df (pandas.DataFrame): Data to shuffle.
    - set_seed: Seed passed to random.seed(), which makes the order reproducible.

    Returns:
    - (pandas.DataFrame): Shuffled data.
    """
    random.seed(set_seed)
    row_count = len(df.index)
    # sampling all row positions without replacement gives a permutation
    shuffled_positions = random.sample(range(row_count), row_count)
    return df.iloc[shuffled_positions].reset_index(drop=True)

In [None]:
def save_corpus(df, data_path, data_set, set_seed=1, corpus_size=None):
    """
    Shuffle the corpus, process the texts and write class and processed text to a csv file.

    Args:
    - df (pandas.DataFrame): Corpus with the columns class, title and text.
    - data_path (str): Directory the csv file is written to.
    - data_set (str): Name of the csv file.
    - set_seed: Seed used for the shuffling.
    - corpus_size (int): Number of shuffled rows to keep, None keeps all rows.
    """
    print('resample corpus...' , end='')
    shuffled = resample_corpus(df, set_seed)
    print('done')

    # work on a copy, optionally limited to the first corpus_size rows
    if corpus_size is None:
        subset = shuffled.copy()
    else:
        subset = shuffled.iloc[:corpus_size].copy()

    processed = restruct_text(subset)

    # only class and processed text are exported, duplicated rows are dropped
    export = processed[['class', 'processed_text']].drop_duplicates()
    target_file = '{}/{}'.format(data_path, data_set)
    export.to_csv(target_file, index=False)

    print('{} saved'.format(data_set))

In [None]:
# Shuffle the corpus with seed 4, process the first 5000 shuffled articles and write them to data/corpus.csv.
save_corpus(corpus, 'data', 'corpus.csv', 4, 5000)