# Part 1: Data Processing

### Task 1.1: Importing and cleaning the a sample of the FakeNewsCorpus

In [2]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
from tabulate import tabulate

def clean_text(element):
    """cleans raw data using re.sub() to remove double newlines, space, and tabs. Also replace dates, emails, urls, and numbers"""
    text = str(element).lower()
    
    num_pattern = re.compile(r"(\d+)")
    date_pattern = re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b') #using the dd-mm-yyyy format
    email_pattern = re.compile(r'\b[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-z|A-Z]{2,}\b')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(date_pattern, "<DATE>", text)
    text = re.sub(email_pattern, "<EMAIL>", text)
    text = re.sub(url_pattern, "<URL>", text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(num_pattern, "<NUM>", text)
    return text


def unique_words(text: list, n: int):
    """returns a list of the n most used words in the list"""
    words = {}
    for word in text:
        if word in words.keys():
            words[word] += 1
        else:
            words[word] = 1
    
    #sorting the dictionary into a list of tuples by word count in descending order
    return (sorted(words.items(), key = lambda x:x[1], reverse = True)[0:n])

def remove_stopwords(unfiltered_text: list):
    """remove stopwords from list of strings"""
    filtered_words = []
    for word in unfiltered_text: 
        if word not in stopwords.words('english'): filtered_words.append(word)
    return filtered_words

def word_vocabulary_analasis(text: list):
    """print reduction rate of vocabulary of tokenzied data before and after removing stopwords and stemming"""
    
    unique_words = set(text)
    stemmed_words = set([PorterStemmer().stem(word) for word in unique_words])
    stop_words = remove_stopwords(unique_words)
    stop_stemmed_words = remove_stopwords(stemmed_words)
    
    len_unique_words = len(unique_words)
    len_stemmed_words = len(stemmed_words)
    len_stop_words = len(stop_words)
    len_stop_stemmed_words = len(stop_stemmed_words)
    
    #printing reduction rate table
    print(tabulate([['stemming', 100 * (len_unique_words - len_stemmed_words) / len_unique_words], 
                    ['removing stopwords', 100 * (len_unique_words - len_stop_words) / len_unique_words], 
                    ['stemming and removing stopwords', 100 * (len_unique_words - len_stop_stemmed_words) / len_unique_words]], 
                   headers=['data cleaning type', 'reduction rate of vocabularyin percent'], tablefmt='orgtbl'))

def stemming_data(text: str):
    """returns stemmed data where stopwords are removed from string"""
    unique_words = set(text.split())
    stemmed_words = set([PorterStemmer().stem(word) for word in unique_words])
    return remove_stopwords(stemmed_words)

Importing data

In [5]:
import os
import json

#importing the CVS file as a pandas Dataframe and saving a copy locally
if os.path.exists(os.getcwd() + "//news_data.csv"): 
    raw_data_fake_news = pd.read_csv(os.getcwd() + "//news_data.csv")
else:
    raw_data_fake_news = pd.read_csv("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv")
    with open(os.getcwd() + "//news_data.csv", "w") as file:
        raw_data_fake_news.to_csv(os.getcwd() + "//news_data.csv", index=False)

Cleaning and basic analasis

In [6]:
#keeping only the relevant collumns
raw_data_fake_news = raw_data_fake_news[["domain", "type", "url", "content", "title", "authors"]]

#cleaning raw text using clean_text function on all elements
tokenized_data_fake_news = raw_data_fake_news.map(clean_text)

#doing a word analasis and getting the stemmed data without stopwords
list_of_words_content = ["content"]
for text in tokenized_data_fake_news["content"].tolist():
    for word in text.split():
        list_of_words_content.append(word)

word_vocabulary_analasis(list_of_words_content)

#stemming and removing stopwords from the data
stemmed_data_fake_news = tokenized_data_fake_news

#printing the Dataframe
tokenized_data_fake_news

| data cleaning type              |   reduction rate of vocabularyin percent |
|---------------------------------+------------------------------------------|
| stemming                        |                                32.8514   |
| removing stopwords              |                                 0.795229 |
| stemming and removing stopwords |                                33.502    |


Unnamed: 0,domain,type,url,content,title,authors
0,awmcom,unreliable,URL,sometimes the power of christmas will make you...,church congregation brings gift to waitresses ...,ruth harris
1,beforeitsnewscom,fake,URL,awakening of <NUM> strands of dna reconnectin...,awakening of <NUM> strands of dna reconnectin...,zurich times
2,cnnnextcom,unreliable,URL,never hike alone a friday the <NUM>th fan film...,never hike alone a friday the <NUM>th fan fil...,
3,awmcom,unreliable,URL,when a rare shark was caught scientists were l...,elusive alien of the sea caught by scientist ...,alexander smith
4,bipartisanreportcom,clickbait,URL,donald trump has the unnerving ability to abil...,trumps genius poll is complete the results ha...,gloria christie
...,...,...,...,...,...,...
245,beforeitsnewscom,fake,URL,prison for rahm gods work and many others head...,prison for rahm gods work and many others,
246,beforeitsnewscom,fake,URL,<NUM> useful items for your tiny home headline...,<NUM> useful items for your tiny home,dimitry k
247,wwwnewsmaxcom,,URL,former cia director michael hayden said thursd...,michael hayden we should be frightened by trum...,todd beamon
248,wwwnewsmaxcom,,URL,antonio sabato jr says hollywoods liberal elit...,antonio sabato jr its oprah or bust for hollyw...,bill hoffmann


## Task 1.2

## Task 1.3

## Task 1.4

# Part 2: Simple Model

## Task 1:

## Task 2.1

## Task 2.2

## Task 2.3

# Part 3: Advanced Model

# Part 4: Evaluation

## Task 4.1

## Task 4.2

## Task 4.3

# Part 5: Conclusions