In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
from tabulate import tabulate

# Patterns are compiled once at definition time rather than on every call.
_WHITESPACE_PATTERN = re.compile(r'\s+')
_DATE_PATTERN = re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b')  # e.g. dd-mm-yyyy
_EMAIL_PATTERN = re.compile(r'\b[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Matches either an already-inserted placeholder (captured) or one punctuation character.
_PLACEHOLDER_OR_PUNCTUATION = re.compile(r'(<(?:DATE|EMAIL|URL)>)|[^\w\s]')
_NUM_PATTERN = re.compile(r'\d+')


def clean_text(element):
    """Normalise one raw value into lower-case text with placeholder tokens.

    The value is converted with ``str()`` and lower-cased, every run of
    whitespace (newlines, tabs, spaces) is collapsed to a single space, and
    dates, e-mail addresses, URLs and numbers are replaced by ``<DATE>``,
    ``<EMAIL>``, ``<URL>`` and ``<NUM>``. All remaining punctuation is removed.

    Args:
        element: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    text = str(element).lower()

    text = _WHITESPACE_PATTERN.sub(' ', text)
    text = _DATE_PATTERN.sub("<DATE>", text)
    text = _EMAIL_PATTERN.sub("<EMAIL>", text)
    text = _URL_PATTERN.sub("<URL>", text)
    # Strip punctuation but keep the placeholders inserted above intact, so
    # they stay consistent with <NUM>. The text was lower-cased first, so an
    # upper-case placeholder can never come from the input itself.
    text = _PLACEHOLDER_OR_PUNCTUATION.sub(lambda match: match.group(1) or '', text)
    # Numbers are replaced after punctuation removal so that digit groups
    # joined by punctuation (e.g. "1,000") collapse into a single <NUM>.
    text = _NUM_PATTERN.sub("<NUM>", text)
    return text


def unique_words(text: list, n: int):
    """Return the ``n`` most frequent words of ``text`` as ``(word, count)`` tuples.

    The result is ordered by count, highest first; words with equal counts
    keep the order in which they first appeared in ``text``.
    """
    frequency = {}
    for token in text:
        frequency[token] = frequency.get(token, 0) + 1

    # list.sort is stable, so ties stay in first-seen order.
    ranked = list(frequency.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def remove_stopwords(unfiltered_text: list):
    """Return the words of ``unfiltered_text`` that are not English stopwords.

    Args:
        unfiltered_text: Iterable of word strings. Any iterable works, since
            it is only iterated once.

    Returns:
        A new list of the non-stopword entries, in iteration order.
    """
    # Fetch the stopword list once and keep it in a set, instead of calling
    # stopwords.words() and scanning the resulting list for every single word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in unfiltered_text if word not in english_stopwords]

def word_vocabulary_analasis(text: list):
    """Print how much stemming and stopword removal shrink the vocabulary of ``text``.

    The vocabulary is the set of distinct tokens in ``text``. A table is
    printed with the percentage reduction in vocabulary size after stemming,
    after removing stopwords, and after doing both.

    Args:
        text: Tokenized data, i.e. an iterable of word strings.

    Returns:
        None; the table is written to standard output.
    """
    vocabulary = set(text)
    stemmer = PorterStemmer()  # one instance reused for every word

    stemmed_vocabulary = {stemmer.stem(word) for word in vocabulary}
    filtered_vocabulary = remove_stopwords(vocabulary)
    # Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
    # forms, so stopwords the stemmer alters (e.g. "this" -> "thi") would
    # otherwise no longer match and would stay in the vocabulary.
    filtered_stemmed_vocabulary = {stemmer.stem(word) for word in filtered_vocabulary}

    vocabulary_size = len(vocabulary)

    def reduction_rate(reduced_size):
        # An empty vocabulary cannot be reduced; report 0 instead of dividing by zero.
        if vocabulary_size == 0:
            return 0.0
        return 100 * (vocabulary_size - reduced_size) / vocabulary_size

    # printing reduction rate table
    print(tabulate([['stemming', reduction_rate(len(stemmed_vocabulary))],
                    ['removing stopwords', reduction_rate(len(filtered_vocabulary))],
                    ['stemming and removing stopwords', reduction_rate(len(filtered_stemmed_vocabulary))]],
                   headers=['data cleaning type', 'reduction rate of vocabularyin percent'], tablefmt='orgtbl'))

def stemming_data(text: str):
    """Split ``text`` on whitespace, drop English stopwords and stem what remains.

    Args:
        text: A string of whitespace-separated tokens.

    Returns:
        A list with the stemmed form of every non-stopword token, in order.
    """
    stemmer = PorterStemmer()  # one instance reused for every token
    # Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
    # forms, so stopwords the stemmer alters (e.g. "this" -> "thi") would
    # otherwise no longer match and would slip through the filter.
    content_words = remove_stopwords(text.split())
    return [stemmer.stem(word) for word in content_words]

# Read only the first 10,000 rows instead of parsing the whole file and then
# discarding most of it. The dtype keys 0 and 1 presumably address the first
# two columns and force them to str -- TODO confirm against the CSV header.
raw_data_fake_news = pd.read_csv("995K_.csv", dtype={0: str, 1: str}, nrows=10000)

# DataFrame.map applies clean_text to every cell, so all columns become cleaned strings.
tokenized_data_fake_news = raw_data_fake_news.map(clean_text)

# Flatten the cleaned "content" column into one list of whitespace-separated tokens.
# The list starts empty: seeding it with the column name would add a spurious
# word "content" to the vocabulary being analysed.
list_of_words_content = []
for text in tokenized_data_fake_news["content"]:
    list_of_words_content.extend(text.split())

word_vocabulary_analasis(list_of_words_content)

# NOTE(review): this only aliases the cleaned frame; stemming_data() is never
# applied here -- confirm whether stemming was meant to happen at this point.
stemmed_data_fake_news = tokenized_data_fake_news

print(stemmed_data_fake_news)

# clean_text stringifies every cell, so missing values appear as the text "nan".
nan_rows = tokenized_data_fake_news[tokenized_data_fake_news["authors"] == "nan"]

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import pandas as pd
from tabulate import tabulate

# Patterns are compiled once at definition time rather than on every call.
_WHITESPACE_PATTERN = re.compile(r'\s+')
_DATE_PATTERN = re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b')  # e.g. dd-mm-yyyy
_EMAIL_PATTERN = re.compile(r'\b[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Matches either an already-inserted placeholder (captured) or one punctuation character.
_PLACEHOLDER_OR_PUNCTUATION = re.compile(r'(<(?:DATE|EMAIL|URL)>)|[^\w\s]')
_NUM_PATTERN = re.compile(r'\d+')


def clean_text(element):
    """Normalise one raw value into lower-case text with placeholder tokens.

    The value is converted with ``str()`` and lower-cased, every run of
    whitespace (newlines, tabs, spaces) is collapsed to a single space, and
    dates, e-mail addresses, URLs and numbers are replaced by ``<DATE>``,
    ``<EMAIL>``, ``<URL>`` and ``<NUM>``. All remaining punctuation is removed.

    Args:
        element: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string.
    """
    text = str(element).lower()

    text = _WHITESPACE_PATTERN.sub(' ', text)
    text = _DATE_PATTERN.sub("<DATE>", text)
    text = _EMAIL_PATTERN.sub("<EMAIL>", text)
    text = _URL_PATTERN.sub("<URL>", text)
    # Strip punctuation but keep the placeholders inserted above intact, so
    # they stay consistent with <NUM>. The text was lower-cased first, so an
    # upper-case placeholder can never come from the input itself.
    text = _PLACEHOLDER_OR_PUNCTUATION.sub(lambda match: match.group(1) or '', text)
    # Numbers are replaced after punctuation removal so that digit groups
    # joined by punctuation (e.g. "1,000") collapse into a single <NUM>.
    text = _NUM_PATTERN.sub("<NUM>", text)
    return text


def unique_words(text: list, n: int):
    """Return the ``n`` most frequent words of ``text`` as ``(word, count)`` tuples.

    The result is ordered by count, highest first; words with equal counts
    keep the order in which they first appeared in ``text``.
    """
    frequency = {}
    for token in text:
        frequency[token] = frequency.get(token, 0) + 1

    # list.sort is stable, so ties stay in first-seen order.
    ranked = list(frequency.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def remove_stopwords(unfiltered_text: list):
    """Return the words of ``unfiltered_text`` that are not English stopwords.

    Args:
        unfiltered_text: Iterable of word strings. Any iterable works, since
            it is only iterated once.

    Returns:
        A new list of the non-stopword entries, in iteration order.
    """
    # Fetch the stopword list once and keep it in a set, instead of calling
    # stopwords.words() and scanning the resulting list for every single word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in unfiltered_text if word not in english_stopwords]

def word_vocabulary_analasis(text: list):
    """Print how much stemming and stopword removal shrink the vocabulary of ``text``.

    The vocabulary is the set of distinct tokens in ``text``. A table is
    printed with the percentage reduction in vocabulary size after stemming,
    after removing stopwords, and after doing both.

    Args:
        text: Tokenized data, i.e. an iterable of word strings.

    Returns:
        None; the table is written to standard output.
    """
    vocabulary = set(text)
    stemmer = PorterStemmer()  # one instance reused for every word

    stemmed_vocabulary = {stemmer.stem(word) for word in vocabulary}
    filtered_vocabulary = remove_stopwords(vocabulary)
    # Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
    # forms, so stopwords the stemmer alters (e.g. "this" -> "thi") would
    # otherwise no longer match and would stay in the vocabulary.
    filtered_stemmed_vocabulary = {stemmer.stem(word) for word in filtered_vocabulary}

    vocabulary_size = len(vocabulary)

    def reduction_rate(reduced_size):
        # An empty vocabulary cannot be reduced; report 0 instead of dividing by zero.
        if vocabulary_size == 0:
            return 0.0
        return 100 * (vocabulary_size - reduced_size) / vocabulary_size

    # printing reduction rate table
    print(tabulate([['stemming', reduction_rate(len(stemmed_vocabulary))],
                    ['removing stopwords', reduction_rate(len(filtered_vocabulary))],
                    ['stemming and removing stopwords', reduction_rate(len(filtered_stemmed_vocabulary))]],
                   headers=['data cleaning type', 'reduction rate of vocabularyin percent'], tablefmt='orgtbl'))

def stemming_data(text: str):
    """Split ``text`` on whitespace, drop English stopwords and stem what remains.

    Args:
        text: A string of whitespace-separated tokens.

    Returns:
        A list with the stemmed form of every non-stopword token, in order.
    """
    stemmer = PorterStemmer()  # one instance reused for every token
    # Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
    # forms, so stopwords the stemmer alters (e.g. "this" -> "thi") would
    # otherwise no longer match and would slip through the filter.
    content_words = remove_stopwords(text.split())
    return [stemmer.stem(word) for word in content_words]

In [None]:
# Load the fake-news CSV. The dtype keys 0 and 1 presumably address the first
# two columns and force them to str -- TODO confirm against the CSV header.
raw_data_fake_news = pd.read_csv(
    "995K_.csv",
    dtype={0: str, 1: str},
)

In [None]:
# DataFrame.map applies clean_text to every cell, so all columns become cleaned strings.
tokenized_data_fake_news = raw_data_fake_news.map(clean_text)

# Collect every token of the cleaned "content" column for the vocabulary analysis.
# The list starts empty: seeding it with the column name would add a spurious
# word "content" to the vocabulary being analysed.
list_of_words_content = []
for text in tokenized_data_fake_news["content"]:
    list_of_words_content.extend(text.split())

word_vocabulary_analasis(list_of_words_content)

# NOTE(review): this only aliases the cleaned frame; stemming_data() is never
# applied here -- confirm whether stemming was meant to happen at this point.
stemmed_data_fake_news = tokenized_data_fake_news

# A bare expression is only displayed when it is the last statement of a
# notebook cell, so print the DataFrame explicitly.
print(tokenized_data_fake_news)

# clean_text stringifies every cell, so missing values appear as the text "nan".
nan_rows = tokenized_data_fake_news[tokenized_data_fake_news["authors"] == "nan"]

: 