# Download fake news data

This notebook downloads the fake news data from Kaggle and performs a first exploratory analysis.

In [None]:
%%capture

# download required packages
!pip install kaggle
!pip install zipfile

In [None]:
# import libraries
from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
from bs4 import BeautifulSoup

In [None]:
# user parameters
data_path = 'data'

In [None]:
def download_kaggle_data(data_path, data_set, extract_zip=True,
                         dataset='clmentbisaillon/fake-and-real-news-dataset'):
    """Download one file of a kaggle dataset and optionally unpack it.

    Parameters
    ----------
    data_path : str
        Directory the file is downloaded to (and extracted into).
    data_set : str
        Name of the file inside the kaggle dataset, e.g. 'Fake.csv'.
    extract_zip : bool, default True
        If True, the archive '<data_path>/<data_set>.zip' is extracted into
        data_path and the archive is deleted afterwards.
    dataset : str, default 'clmentbisaillon/fake-and-real-news-dataset'
        Identifier of the kaggle dataset the file is taken from.

    Notes
    -----
    Calls KaggleApi.authenticate() before downloading and always
    re-downloads the file (force=True).
    """
    print('Download {} from kaggle.com...'.format(data_set), end='')
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_file(
        dataset=dataset,
        file_name=data_set,
        path=data_path,
        force=True
    )

    file_path = '{}/{}.zip'.format(data_path, data_set)

    if extract_zip:
        # the context manager closes the archive even if extraction fails
        with ZipFile(file_path) as zf:
            zf.extractall(data_path)
        # remove the archive with plain Python instead of a shell command
        os.remove(file_path)

    print('done')

In [None]:
# download fake news data
download_kaggle_data(data_path, 'Fake.csv')
download_kaggle_data(data_path, 'True.csv')

In [None]:
def read_kaggle_data(data_path, data_set, delete_after_read=True):
    """Read a downloaded csv file into a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the file.
    data_set : str
        File name, e.g. 'Fake.csv'.
    delete_after_read : bool, default True
        If True, the csv file is deleted once it has been read. Pass False
        to keep the file so that the cell can be re-run without downloading
        the data again.

    Returns
    -------
    pandas.DataFrame
        The content of the csv file. If the file does not exist, a small
        dummy frame with the columns 'dummy_col_1' and 'dummy_col_2' is
        returned and a hint is printed.
    """
    file_path = '{}/{}'.format(data_path, data_set)

    if not os.path.exists(file_path):
        empty_dict = {'dummy_col_1': [3, 2, 1, 0], 'dummy_col_2': ['a', 'b', 'c', 'd']} # just some test data
        imported_data = pd.DataFrame.from_dict(empty_dict)
        print('{} does not exist. Please download again kaggle data.'.format(file_path))
        return imported_data

    print('Import {}...'.format(data_set), end='')
    imported_data = pd.read_csv(file_path)
    if delete_after_read:
        # plain Python instead of a shell command
        os.remove(file_path)
    print('done')

    return imported_data

In [None]:
# import data
fake = read_kaggle_data(data_path, 'Fake.csv')
true = read_kaggle_data(data_path, 'True.csv')

In [None]:
# add class 
fake['class'] = 1 # 1 = fake
true['class'] = 0 # 0 = true

In [None]:
fake.head()

In [None]:
fake[['title', 'text', 'subject', 'date']].describe()

In [None]:
true.head()

In [None]:
true[['title', 'text', 'subject', 'date']].describe()

We can see here that title and text have fewer unique values than counted values. This means that the same titles and texts occur several times. I will remove the duplicated entries once the text has been processed further.

In [None]:
# stack the true and the fake articles into one frame
original_document = pd.concat(objs=[true, fake])
# drop the references to the separate frames to save memory
true = None
fake = None

In [None]:
original_document.head()

In [None]:
original_document[['title', 'text', 'subject', 'date']].describe()

In [None]:
def detailed_investigation(index, df, number_strings=0):
    """Print class, title and text of one document.

    Parameters
    ----------
    index : int
        Position of the document in df (used with .iloc).
    df : pandas.DataFrame
        Frame with the columns 'class', 'title' and 'text'.
    number_strings : int, default 0
        If greater than 0, only the first number_strings characters of the
        text are printed; otherwise the complete text is printed.
    """
    label, headline, body = (
        df[column].iloc[index] for column in ('class', 'title', 'text')
    )
    if number_strings > 0:
        body = body[:number_strings]

    print(f'Index {index} - Class {label} - {headline}\n{body}\n')

In [None]:
detailed_investigation(11010, original_document)

In [None]:
select_from = 1
select_to = 2000

In [None]:
for i in range(select_from, select_to):
    detailed_investigation(i, original_document, 800)

# Text processing

First I will check what regular text processing does, i.e. removal of HTML tags, conversion to lower case and removal of stopwords, but without stemming. Title and text are joined for processing.

## Check indices

- 1112 
- 284 

In [None]:
def standard_text_processing(index, df):
    """Print title and text of one document after standard cleaning.

    The title and the text are joined, HTML tags are removed, the string is
    lower-cased, every character that is not a letter or a digit is replaced
    by a blank and English stopwords are dropped. No stemming is applied.

    Parameters
    ----------
    index : int
        Position of the document in df (used with .iloc).
    df : pandas.DataFrame
        Frame with the columns 'title' and 'text'.
    """
    nltk.download('stopwords', quiet=True)
    # load the stopword list once instead of once per word; a set makes the
    # membership test cheap
    english_stopwords = set(stopwords.words('english'))

    df_title = df['title'].iloc[index]
    df_text = df['text'].iloc[index]
    text = '{} {}'.format(df_title, df_text)
    text = BeautifulSoup(text, 'html.parser').get_text() # remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()) # lower case, non-alphanumeric characters become blanks
    words = [w for w in text.split() if w not in english_stopwords] # split into words and remove stopwords
    text = ' '.join(words)

    print(text)

In [None]:
def remove_factbox(text):
    """Return text with every occurrence of 'Factbox: Trump on Twitter' removed."""
    factbox_pattern = re.compile('Factbox: Trump on Twitter')
    return factbox_pattern.sub('', text)

In [None]:
def remove_twitter_intro(text):
    """Return text without the 'The following statements ... @realDonaldTrump' intro.

    The match is greedy, so within one line everything up to the last
    '@realDonaldTrump' is removed; '.' does not cross line breaks.
    """
    intro_pattern = 'The following statements.*@realDonaldTrump'
    cleaned = re.sub(intro_pattern, '', text)
    return cleaned

In [None]:
def remove_reuters(text):
    """Return text without the leading '<location> (Reuters) -' dateline.

    On every line everything up to and including the last '(Reuters) -' is
    removed (the match is greedy and '.' does not cross line breaks).
    """
    # raw string: '\(' is an invalid escape sequence in a normal string literal
    text = re.sub(r'.*\(Reuters\) -', '', text)
    return text

In [None]:
def remove_dates(text):
    """Return text without dates written like 'December 5, 2017' or 'Dec 29, 2017'.

    A date is a word, a blank, a one- or two-digit day, a comma, a blank and
    a four-digit year.
    """
    # {1,2} also covers single-digit days, which a fixed two-digit day misses
    text = re.sub(r'[a-zA-Z]+ [0-9]{1,2}, [0-9]{4}', '', text)
    return text

In [None]:
def remove_brackets(text):
    """Return text without parenthesised or square-bracketed parts.

    Removes the shortest span that starts with '(' or '[' and ends with ')'
    or ']' on the same line, including the brackets themselves.
    """
    # raw string: '\(' and '\[' are invalid escape sequences in a normal string literal
    text = re.sub(r'[\(\[].*?[\)\]]', '', text)
    return text

In [None]:
def remove_links(text):
    """Return text without links such as 'https://t.co/abc' or 'pic.twitter.com/abc'.

    A link is an optional 'http://' / 'https://' scheme, a lower-case host
    name with a 2-6 character suffix and an optional path. The path stops at
    the first blank, so the words that follow a link are kept.
    """
    # raw string avoids invalid escape sequences; the path class deliberately
    # contains no blank, otherwise the text after a link is swallowed as well
    text = re.sub(r'(https?://)?([\da-z.-]+)\.([a-z.]{2,6})([/\w.-]*)', '', text)
    return text

In [None]:
def text_processing(index, df):
    """Print title and text of one document after the custom cleaning steps.

    The Reuters dateline is removed from the text first; then the title and
    the text are joined and the twitter intro, the factbox headline, dates,
    bracketed parts and links are removed in that order.

    Parameters
    ----------
    index : int
        Position of the document in df (used with .iloc).
    df : pandas.DataFrame
        Frame with the columns 'title' and 'text'.
    """
    headline = df['title'].iloc[index]
    body = remove_reuters(df['text'].iloc[index])
    cleaned = '{} {}'.format(headline, body)

    # the remaining steps work on the joined string and run in this order
    cleaning_steps = (
        remove_twitter_intro,
        remove_factbox,
        remove_dates,
        remove_brackets,
        remove_links,
    )
    for step in cleaning_steps:
        cleaned = step(cleaned)

    print(cleaned)

In [None]:
detailed_investigation(284, original_document)

In [None]:
# manual_text_processing is not defined in this notebook; text_processing is
# the cleaning function with the matching (index, df) signature
text_processing(284, original_document)