# Part 1: Data Processing

In [17]:
import pandas as pd

raw_data_fake_news = pd.read_csv("data.csv", low_memory=False)

In [24]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from tabulate import tabulate

#compile stopwords and initialize stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def remove_stopwords(text: str):
    """remove stopwords from string"""
    return ' '.join([word for word in text.split() if word not in stop_words])

def stemming_data(text: str):
    """returns stemmed data where stopwords are removed from string"""
    return ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])

def word_stemming_stopwords(df: pd.DataFrame):
    """print reduction rate of vocabulary of tokenized data"""
    avg_unique_words = df.str.split().apply(set).apply(len).mean()

    df = df.apply(stemming_data) #stemming
    avg_stemmed_words = df.str.split().apply(set).apply(len).mean()
    
    df = df.apply(remove_stopwords) #removing stopwords
    avg_stopwords_removed = df.str.split().apply(set).apply(len).mean()
    
    #printing reduction rate table
    print(tabulate([
        ["Stemming", 100 * (avg_unique_words - avg_stemmed_words) / avg_unique_words],
        ["Removing Stopwords", 100 * (avg_unique_words - avg_stopwords_removed) / avg_unique_words]
    ], headers=["Data Cleaning Type", "Reduction Rate of Vocabulary (%)"], tablefmt='orgtbl'))
    
    print(avg_unique_words, avg_stemmed_words)


    return df

fake_news_labels = ["fake", "satire", "bias", "conspiracy", "junksci"]
reliable_news_labelse = ["reliable", "political", "clickbait"]

def categorize_article_type(article_type):
    if article_type in fake_news_labels:
        return "Fake News"
    elif article_type in reliable_news_labelse:
        return "Reliable News"
    else:
        return None

Cleaning and doing vocabulary analasis

In [19]:
#keeping only the relevant collumns
data_fake_news = raw_data_fake_news.copy()[
    ['domain', 'type', 'content', 'title', 'authors', 'meta_description', 'meta_keywords']].head(10000)

In [None]:
set(len(data_fake_news['content'].apply(remove_stopwords)))

In [21]:
#stemming and removing stopwords, while calculating the reduction in vocabulary
data_fake_news['content'] = word_stemming_stopwords(data_fake_news['content'])

| Data Cleaning Type   |   Reduction Rate of Vocabulary (%) |
|----------------------+------------------------------------|
| Stemming             |                            24.2647 |
| Removing Stopwords   |                            24.5852 |
236.01 178.7428
