# Part 1: Data Processing

### Task 1.1: Importing and cleaning the a sample of the FakeNewsCorpus

In [13]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import requests
from tabulate import tabulate

def clean_text(text: str):
    """cleans raw data using re.sub() to remove double newlines, space, and tabs. Also replace dates, emails, urls, and numbers"""
    text = text.lower()
    
    num_pattern = re.compile(r"(\d+)")
    date_pattern = re.compile(r"((\d{2})-(\d{2})-(\d{4}))") #using the dd-mm-yyyy format
    email_pattern = re.compile(r"(([\w\-_.]*)(@\w+)(.com))")
    url_pattern = re.compile(r"((https:\/\/www\.)([a-zA-Z0-9]*)(\.com))")
    
    text = re.sub(" +", " ", text)
    text = re.sub("\t+", "\t", text)
    text = re.sub("\n+", "\n", text)
    text = re.sub(date_pattern, "<DATE>", text)
    text = re.sub(email_pattern, "<EMAIL>", text)
    text = re.sub(url_pattern, "<URL>", text)
    text = re.sub(num_pattern, "<NUM>", text)
    
    return text

def unique_words_plot(text: list, n: int):
    """returns a list of the n most used words in the list"""
    words = {}
    for word in text:
        if word in words.keys():
            words[word] += 1
        else:
            words[word] = 1
    
    #sorting the dictionary into a list of tuples by word count in descending order
    return (sorted(words.items(), key = lambda x:x[1], reverse = True)[0:n])

def word_vocabulary_analasis(text: list):
    """returns reduction rate of vocabulary of tokenzied data before and after removing stopwords and stemming"""
    
    def remove_stopwords(unfiltered_text: list):
        filtered_words = []
        for word in unfiltered_text: 
            if word not in stopwords.words('english'): filtered_words.append(word)
        return filtered_words
    
    unique_words = set(text)
    stemmed_words = set([PorterStemmer().stem(word) for word in unique_words])
    stop_words = remove_stopwords(unique_words)
    stop_stemmed_words = remove_stopwords(stemmed_words)
    
    len_unique_words = len(unique_words)
    len_stemmed_words = len(stemmed_words)
    len_stop_words = len(stop_words)
    len_stop_stemmed_words = len(stop_stemmed_words)
    
    #printing reduction rate table
    print(tabulate([['stemming', 100 * (len_unique_words - len_stemmed_words) / len_unique_words], 
                    ['removing stopwords', 100 * (len_unique_words - len_stop_words) / len_unique_words], 
                    ['stemming and removing stopwords', 100 * (len_unique_words - len_stop_stemmed_words) / len_unique_words]], 
                   headers=['data cleaning type', 'reduction rate of vocabularyin percent'], tablefmt='orgtbl'))
    
    return stop_stemmed_words

We import the data and run our functions. (Remember to run the functions first)

In [14]:
#importing the CVS file as raw text
raw_data_fake_news = requests.get("https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv").text

#cleaning raw text
cleaned_data_fake_news = clean_text(raw_data_fake_news)

#tokenizing raw text
tokenized_data_fake_news = nltk.word_tokenize(cleaned_data_fake_news, language="english", preserve_line=True)

#doing a word analasis and getting the stemmed data without stopwords
stemmed_data_fake_news = word_vocabulary_analasis(tokenized_data_fake_news)

| data type                       |   reduction rate of vocabulary |
|---------------------------------+--------------------------------|
| stemming                        |                      24.2999   |
| removing stopwords              |                       0.695249 |
| stemming and removing stopwords |                      24.889    |


## Task 1.2

## Task 1.3

## Task 1.4

# Part 2: Simple Model

## Task 1:

## Task 2.1

## Task 2.2

## Task 2.3

# Part 3: Advanced Model

# Part 4: Evaluation

## Task 4.1

## Task 4.2

## Task 4.3

# Part 5: Conclusions