In [1]:
import pandas as pd
import re
from cleantext import clean

def preprocess_text(text):
    cleaned_text = clean(text,
                         fix_unicode=True,
                         to_ascii=True,
                         lower=True,
                         no_line_breaks=True,
                         no_urls=True,
                         no_emails=True,
                         no_phone_numbers=True,
                         no_numbers=True,
                         no_digits=True,
                         no_currency_symbols=True,
                         no_punct=False,  
                         replace_with_url="<URL>",
                         replace_with_email="<EMAIL>",
                         replace_with_phone_number="<PHONE>",
                         replace_with_number="<NUM>",
                         replace_with_digit="<NUM>",
                         replace_with_currency_symbol="<CUR>",
                         lang="en")
    return cleaned_text

df = pd.read_csv("995K_subset.csv", low_memory=False)

df['cleaned_content'] = df['content'].apply(preprocess_text)

# Test
raw_text = "I received an e-mail from andrea@dentisk.dk regarding my dentist appointment on 2024-02-22 at 03:00 PM with cancellation link https://andreathedentist.com."
cleaned_text = preprocess_text(raw_text)
print("Cleaned Text:")
print(cleaned_text)

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


KeyboardInterrupt: 

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

nltk.download('punkt')
nltk.download('stopwords')

# Tokenize the text
tokens = word_tokenize(cleaned_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Compute vocabulary size before removing stopwords
vocab_before = set(tokens)
vocab_size_before = len(vocab_before)

# Compute vocabulary size after removing stopwords
vocab_after = set(filtered_tokens)
vocab_size_after = len(vocab_after)

# Compute reduction rate of vocabulary size after removing stopwords
reduction_rate_stopwords = (vocab_size_before - vocab_size_after) / vocab_size_before

# Stemming
porter = PorterStemmer()
stemmed_tokens = [porter.stem(word) for word in filtered_tokens]

# Compute vocabulary size before stemming
vocab_size_before_stemming = len(vocab_after)

# Compute vocabulary size after stemming
vocab_after_stemming = set(stemmed_tokens)
vocab_size_after_stemming = len(vocab_after_stemming)

# Compute reduction rate of vocabulary size after stemming
reduction_rate_stemming = (vocab_size_before_stemming - vocab_size_after_stemming) / vocab_size_before_stemming

# Print
print("Size of vocabulary before removing stopwords:", vocab_size_before)
print("Size of vocabulary after removing stopwords:", vocab_size_after)
print("Reduction rate of vocabulary size after removing stopwords: {:.2f}%".format(reduction_rate_stopwords * 100))

print("\nSize of vocabulary before stemming:", vocab_size_before_stemming)
print("Size of vocabulary after stemming:", vocab_size_after_stemming)
print("Reduction rate of vocabulary size after stemming: {:.2f}%".format(reduction_rate_stemming * 100))

Size of vocabulary before removing stopwords: 23
Size of vocabulary after removing stopwords: 16
Reduction rate of vocabulary size after removing stopwords: 30.43%

Size of vocabulary before stemming: 16
Size of vocabulary after stemming: 16
Reduction rate of vocabulary size after stemming: 0.00%


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreazeuthenheidam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andreazeuthenheidam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
