# Task 3: pre-processing pipeline
We have built a pre-processing pipeline, suitable for large files, from the data-processing functions of Task 1.1.

## pre-processing functions 

In [1]:
# Using NLTK's 'word_tokenize' to split text into tokens, then keeping only tokens that contain at least one ASCII letter
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string
def tokenize_data(data):
    """Tokenize a text and keep only the tokens that contain a letter.

    Args:
        data: The text to tokenize, passed straight to ``word_tokenize``.

    Returns:
        A list with, in their original order, the tokens that contain at
        least one ASCII letter. Tokens made up solely of punctuation, digits
        or non-ASCII characters are discarded.
    """
    kept_tokens = []
    for token in word_tokenize(data):
        # A single ASCII letter anywhere in the token is enough to keep it.
        if any(symbol in string.ascii_letters for symbol in token):
            kept_tokens.append(token)
    return kept_tokens


[nltk_data] Downloading package punkt to /home/zeyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# using NLTK's in-built collection of stopwords 
from nltk.corpus import stopwords
import os 
# Stopwords from nltk
stop_words_nltk = set(stopwords.words('english'))
# collecting more stopwords from website: http://members.unine.ch/jacques.savoy/clef/ given in lecture
stop_words_extra_path = os.path.join(os.getcwd(), "../stopwords_extra.txt")
# Read the extra list (one stopword per line) inside a context manager so the
# file handle is closed deterministically instead of being left to the
# garbage collector.
with open(stop_words_extra_path, "r") as stop_words_file:
    stop_words_extra = set(stop_words_file.read().split("\n"))
# Final stopword set used by stopwords_data: union of both sources.
stop_words = stop_words_nltk | stop_words_extra

# removing stopwords
def stopwords_data(data_list):
    """Remove stopwords and near-empty tokens from a token list.

    Args:
        data_list: Iterable of string tokens.

    Returns:
        A list, in the original order, of the tokens that are not in the
        module-level ``stop_words`` set and that contain at least two ASCII
        letters.
    """
    remaining_tokens = []
    for token in data_list:
        if token in stop_words:
            continue
        ascii_letter_count = sum(1 for symbol in token if symbol in string.ascii_letters)
        # Tokens with fewer than two ASCII letters are dropped as well.
        if ascii_letter_count > 1:
            remaining_tokens.append(token)
    return remaining_tokens


In [3]:
# Stemming
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# Initialize the stemmer used by stem_data (Snowball stemmer for English).
#stemmer = PorterStemmer()
stemmer = SnowballStemmer("english")
# Tokens that stem_data returns unchanged instead of stemming. The names match
# the tags inserted by regex_clean (<NUM>, <URL>, <EMAIL>, <DATE>) without the
# angle brackets -- presumably the tokenizer splits the brackets off into
# separate tokens that are then filtered out; TODO confirm.
placeholders = ['NUM', 'URL', 'EMAIL', 'DATE']
# stemming
def stem_data(data):
    """Stem every token except the reserved placeholder tokens.

    Args:
        data: Iterable of string tokens.

    Returns:
        A list of the same length and order in which each token has been
        replaced by ``stemmer.stem(token)``, except tokens listed in the
        module-level ``placeholders``, which are passed through unchanged.
    """
    stemmed_tokens = []
    for token in data:
        if token in placeholders:
            # Placeholder tokens are kept verbatim.
            stemmed_tokens.append(token)
        else:
            stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens



In [4]:
import re
# --- Patterns used by regex_clean, compiled once at import time -------------
# All patterns are written as raw strings so that regex escapes such as \d or
# \/ are not interpreted (and warned about) as Python string escapes.

# Zero or more characters that may appear inside an e-mail address or URL:
# anything except comma, pipe, double quote, space, tab, newline, single quote
# and square brackets.
_TOKEN_CHARS = r'''[^,|" \t\n'\]\[]*'''
# Top-level domains recognised in e-mail addresses and bare URLs.
_TLD = r'\.(com|org|edu|uk|net|gov)'

_UPPERCASE_RE = re.compile(r'[A-Z]')
_SPACES_RE = re.compile(r' {2,}')
_NEWLINES_RE = re.compile(r'\n+')
_TABS_RE = re.compile(r'\t+')
_EMAIL_RE = re.compile('(' + _TOKEN_CHARS + '@' + _TOKEN_CHARS + _TLD + ')')
_URL_TLD_RE = re.compile('(' + _TOKEN_CHARS + _TLD + _TOKEN_CHARS + ')')
_URL_HTTP_RE = re.compile(r'https?:\/\/' + _TOKEN_CHARS)

# Building blocks of the date pattern.
_DATE_SEP = r'[-/., ]'
_DATE_DAY = r'(?:0[1-9]|[1-2]\d|3[0-1])'
# Numeric month 01-12 (1[0-2] so that October, 10, is included) or a word of
# 3-9 lowercase letters standing in for a month name.
_DATE_MONTH = r'(?:0[1-9]|1[0-2]|[a-z]{3,9})'
_DATE_YEAR = r'\d{2,}'
# Explicit month names for dates written without a year. "febuary" is kept so
# that this common misspelling is still recognised.
_DATE_MONTH_NAME = (
    r'(?:jan|january|feb|february|febuary|mar|march|apr|april|may|jun|june|'
    r'jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)'
)
_DATE_RE = re.compile('|'.join([
    # day, month, year
    _DATE_DAY + _DATE_SEP + '{1,2}' + _DATE_MONTH + _DATE_SEP + '{1,2}' + _DATE_YEAR,
    # month, day, year
    _DATE_MONTH + _DATE_SEP + _DATE_DAY + _DATE_SEP + '{1,2}' + _DATE_YEAR,
    # year, month, day
    _DATE_YEAR + _DATE_SEP + _DATE_MONTH + _DATE_SEP + '{1,2}' + _DATE_DAY,
    # month name, day (no year)
    _DATE_MONTH_NAME + _DATE_SEP + '{1,2}' + _DATE_DAY,
]))

# Runs of digits, optionally joined by ':', ',' or '.' (e.g. 12:30, 1,000, 3.14).
_NUMBER_RE = re.compile(r'(?:\d:\d|\d,\d|\d\.\d|\d)+')


def regex_clean(text):
    """Normalise a raw text and replace e-mails, URLs, dates and numbers by tags.

    The steps are applied in this order, each on the result of the previous one:

    1. ASCII uppercase letters are lowercased (other characters are untouched).
    2. Runs of two or more spaces become one space; runs of newlines become one
       newline; runs of tabs become one tab.
    3. E-mail addresses ending in a known top-level domain become ``<EMAIL>``.
    4. URLs (anything containing a known top-level domain, or starting with
       ``http://`` / ``https://``) become ``<URL>``.
    5. Dates become ``<DATE>``.
    6. Remaining numbers become ``<NUM>``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # lowercase (ASCII letters only)
    cleaned_text = _UPPERCASE_RE.sub(lambda match: match.group(0).lower(), text)
    # whitespace, newlines and tabs
    cleaned_text = _SPACES_RE.sub(" ", cleaned_text)
    cleaned_text = _NEWLINES_RE.sub("\n", cleaned_text)
    cleaned_text = _TABS_RE.sub("\t", cleaned_text)
    # e-mails must be replaced before URLs, otherwise the domain part of an
    # address would be tagged as a URL.
    cleaned_text = _EMAIL_RE.sub("<EMAIL>", cleaned_text)
    # URLs: first by top-level domain, then any remaining http(s) links
    cleaned_text = _URL_TLD_RE.sub("<URL>", cleaned_text)
    cleaned_text = _URL_HTTP_RE.sub("<URL>", cleaned_text)
    # dates must be replaced before numbers, otherwise their digits would
    # already have been turned into <NUM>.
    cleaned_text = _DATE_RE.sub('<DATE>', cleaned_text)
    # numbers
    cleaned_text = _NUMBER_RE.sub('<NUM>', cleaned_text)
    return cleaned_text

## Pre-Processing pipeline

This function applies the preprocessing pipeline to a fake news .csv file, creating a preprocessed version of that .csv file, and returning vocabulary information for data exploration.

It processes the file in chunks of 100,000 rows at a time. This prevents the 3.4 GB file 995,000_rows.csv from crashing the program. Processing the entire file takes a while: around 6 hours.

The pipeline removes problematic rows (no content, content in Russian, no type, wrong type).

In [5]:
import pandas as pd
import os
from collections import Counter
import string
import matplotlib.pyplot as plt
import pickle 

def data_preprocessing(filepath):
    """Pre-process a fake-news CSV file in chunks and save vocabulary statistics.

    For an input ``<dir>/<base><ext>`` two files are written next to it:

    * ``<dir>/<base>_preprocessed<ext>``: the surviving rows, with ``content``
      replaced by the space-joined processed tokens. The file is written
      without a header row and without the index column.
    * ``<dir>/<base>_statistics.pickle``: five objects pickled one after the
      other, in this order: token counts after cleaning and tokenizing,
      token counts of fully processed non-reliable articles, token counts of
      fully processed reliable/political articles, number of rows dropped
      because of their type, number of rows dropped because of their content.

    Rows are dropped when ``content`` is missing, when ``type`` is missing or
    not one of the known labels, or when no tokens are left after processing.

    Args:
        filepath: Path of the CSV file to process. It must have ``content``
            and ``type`` columns.

    Returns:
        None.

    Raises:
        KeyError: If the CSV has no ``content`` or ``type`` column.
    """
    directory, filename = os.path.split(filepath)
    base, ext = os.path.splitext(filename)
    preprocessed_file_path = os.path.join(directory, f"{base}_preprocessed{ext}")
    statistics_file_path = os.path.join(directory, f"{base}_statistics.pickle")
    print("new cleaned dataset:", preprocessed_file_path)
    print("new statistics folder:", statistics_file_path)

    # Labels accepted in the 'type' column; rows with any other value are dropped.
    valid = ['fake', 'satire', 'bias', 'conspiracy', 'state', 'junksci', 'hate',
             'clickbait', 'unreliable', 'political', 'reliable']
    # Labels counted towards the "real" vocabulary; all other valid labels
    # count towards the "fake" vocabulary.
    real_types = ['reliable', 'political']
    chunk_size = 100000

    cleaned_vocab = Counter()
    processed_fake_vocab = Counter()
    processed_real_vocab = Counter()
    erronious_content = 0
    erronious_type = 0

    # newline='' because the handle is given to DataFrame.to_csv, which writes
    # its own line terminators.
    with open(preprocessed_file_path, 'w', encoding='utf-8', newline='') as output_file:
        for chunk in pd.read_csv(filepath, chunksize=chunk_size, low_memory=False):
            # Drop rows without content.
            missing_content = chunk['content'].isnull()
            erronious_content += int(missing_content.sum())
            chunk = chunk[~missing_content]

            # Drop rows whose type is missing or unknown. A missing type is
            # never in `valid`, so this single mask counts each bad row once.
            bad_type = ~chunk['type'].isin(valid)
            erronious_type += int(bad_type.sum())
            chunk = chunk[~bad_type]

            # Process the text and gather vocabulary information.
            tokens = chunk['content'].apply(regex_clean).apply(tokenize_data)
            cleaned_vocab.update(token for token_list in tokens for token in token_list)
            tokens = tokens.apply(stopwords_data).apply(stem_data)

            is_real = chunk['type'].isin(real_types)
            for token_list in tokens[is_real]:
                processed_real_vocab.update(token_list)
            for token_list in tokens[~is_real]:
                processed_fake_vocab.update(token_list)

            # Drop rows that have no tokens left after processing.
            is_empty = tokens.apply(len) == 0
            erronious_content += int(is_empty.sum())

            # Turn the token lists back into strings, tokens separated by one
            # space, and append the chunk to the output file.
            chunk = chunk[~is_empty].assign(content=tokens[~is_empty].apply(' '.join))
            chunk.to_csv(output_file, header=False, index=False)

    # Write the statistics file.
    with open(statistics_file_path, 'wb') as f:
        pickle.dump(cleaned_vocab, f)
        pickle.dump(processed_fake_vocab, f)
        pickle.dump(processed_real_vocab, f)
        pickle.dump(erronious_type, f)
        pickle.dump(erronious_content, f)
    return

### applying the pipeline to 995,000_rows

In [12]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Run the pipeline on the full dataset. This is slow: the pipeline description
# reports around 6 hours for this file.
file = "../995,000_rows.csv"
data_preprocessing(file)

new cleaned dataset: ../995,000_rows_preprocessed.csv
new statistics folder: ../995,000_rows_statistics.pickle


### applying the preprocessing pipeline to the BBC articles


The BBC articles were stored in a .txt file, so they are first converted to .csv and then preprocessed. The resulting file contains fewer columns than the other datasets, but it has everything our models need.

In [6]:
import pandas as pd

import ast

# Convert BBC_articles.txt (one Python dict literal per non-empty line, with
# "TEXT" and "HEADLINE" keys) into a CSV with the columns the pipeline needs.
bbc_records = []
with open('../BBC_articles.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        # NOTE(review): this used to be eval(line), which executes arbitrary
        # code found in the file. ast.literal_eval only accepts Python
        # literals and raises ValueError/SyntaxError on anything else.
        article = ast.literal_eval(line)
        bbc_records.append({
            "domain": "BBC.com",
            "type": "reliable",
            "content": article["TEXT"],
            "title": article["HEADLINE"],
        })
# Build the frame once from all records instead of appending row by row,
# which re-allocates the frame for every article.
BBC_df = pd.DataFrame(bbc_records, columns=["domain", "type", "content", "title"])
BBC_df.to_csv('../BBC_articles.csv', index=False)


In [7]:
# Pre-process the converted BBC articles with the same pipeline; the results
# are written next to the input as ../BBC_articles_preprocessed.csv and
# ../BBC_articles_statistics.pickle.
data_preprocessing('../BBC_articles.csv')

new cleaned dataset: ../BBC_articles_preprocessed.csv
new statistics folder: ../BBC_articles_statistics.pickle
