In [13]:
import pandas as pd
import string
from nltk.corpus import stopwords
import regex as re
from collections import Counter
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import spacy

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../../src/nlp/raw_textual_df.csv')
# Missing text must become an empty string, not the literal word "nan":
# str(NaN) == 'nan' would otherwise travel through the whole cleaning
# pipeline as a real token and slip past any later empty-text filtering.
df['text'] = df['text'].fillna('').astype(str)
df.head()

Unnamed: 0,author,id,text,type,date
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,2025-08-20T03:31:07+00:00
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,2025-08-19T19:41:42+00:00
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,2025-08-19T14:22:03+00:00
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,2025-08-19T16:11:58+00:00
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,2025-08-19T16:18:59+00:00


In [3]:
# Convert text to lower case; the result becomes the working `clean_text` column
lowered = df['text'].str.lower()
df['clean_text'] = lowered
df.head()

Unnamed: 0,author,id,text,type,date,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,2025-08-20T03:31:07+00:00,the nirvana fallacy is when people dismiss a r...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,2025-08-19T19:41:42+00:00,the term fascism is used in modern politics to...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,2025-08-19T14:22:03+00:00,"epstein files are reported to be shared, start..."
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,2025-08-19T16:11:58+00:00,"for discussion, here're the basic components o..."
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,2025-08-19T16:18:59+00:00,"[since its creation in 2003, the department of..."


In [4]:
# Removing URLs and HTML tags
def remove_urls(text):
    """Return ``text`` with ``http://``, ``https://`` and ``www.`` links deleted.

    A link is matched up to (not including) the next whitespace character.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    without_links = re.sub(url_pattern, '', text)
    return without_links

def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted (shortest match).

    The match does not cross line breaks, since ``.`` excludes newlines here.
    """
    tag_pattern = r'<.*?>'
    without_tags = re.sub(tag_pattern, '', text)
    return without_tags


In [5]:
# Strip HTML-style tags first, then links, from the working text column.
without_tags = df['clean_text'].apply(remove_html)
df['clean_text'] = without_tags.apply(remove_urls)

df.head()

Unnamed: 0,author,id,text,type,date,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,2025-08-20T03:31:07+00:00,the nirvana fallacy is when people dismiss a r...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,2025-08-19T19:41:42+00:00,the term fascism is used in modern politics to...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,2025-08-19T14:22:03+00:00,"epstein files are reported to be shared, start..."
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,2025-08-19T16:11:58+00:00,"for discussion, here're the basic components o..."
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,2025-08-19T16:18:59+00:00,"[since its creation in 2003, the department of..."


In [6]:
# Removing punctuation
def remove_punctation(text):
    """Replace each character of ``string.punctuation`` in ``text`` with a space.

    Replacing (rather than deleting) keeps the text length unchanged and
    splits tokens that were joined by punctuation.
    """
    to_space = dict.fromkeys(string.punctuation, ' ')
    return text.translate(str.maketrans(to_space))

In [7]:
# Run the punctuation cleaner over every row of the working text column.
no_punctuation = df['clean_text'].apply(remove_punctation)
df['clean_text'] = no_punctuation
df.head()

Unnamed: 0,author,id,text,type,date,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,2025-08-20T03:31:07+00:00,the nirvana fallacy is when people dismiss a r...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,2025-08-19T19:41:42+00:00,the term fascism is used in modern politics to...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,2025-08-19T14:22:03+00:00,epstein files are reported to be shared start...
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,2025-08-19T16:11:58+00:00,for discussion here re the basic components o...
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,2025-08-19T16:18:59+00:00,since its creation in 2003 the department of...


In [8]:
# Removing stopwords
# NLTK's English stopword list, held as a set for fast membership checks.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in STOPWORDS.

    Surviving tokens are re-joined with single spaces. The comparison is
    exact, so tokens are only dropped if they match a stopword as written.
    """
    kept = []
    for token in text.split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

In [9]:
# Drop stopwords from every row of the working text column.
no_stopwords = df['clean_text'].apply(remove_stopwords)
df['clean_text'] = no_stopwords
df.head()

Unnamed: 0,author,id,text,type,date,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,2025-08-20T03:31:07+00:00,nirvana fallacy people dismiss real option isn...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,2025-08-19T19:41:42+00:00,term fascism used modern politics qualify diff...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,2025-08-19T14:22:03+00:00,epstein files reported shared starting friday ...
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,2025-08-19T16:11:58+00:00,discussion basic components social democracy t...
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,2025-08-19T16:18:59+00:00,since creation 2003 department homeland securi...


In [10]:
# Corpus-wide token frequencies over the cleaned text.
word_count = Counter()
for text in df['clean_text']:
    word_count.update(text.split())

# How many words to take from each end of the frequency ranking.
N_FREQUENT_WORDS = 3
N_RARE_WORDS = 10

FREQUENT_WORDS = {word for word, _ in word_count.most_common(N_FREQUENT_WORDS)}
# A reversed slice stops *before* its end index, so the extra -1 is needed to
# get exactly N_RARE_WORDS entries (the previous `[:-10:-1]` returned only 9).
RARE_WORDS = {word for word, _ in word_count.most_common()[:-N_RARE_WORDS - 1:-1]}

def remove_words(text, words_list):
    """Return ``text`` without the whitespace-separated tokens in ``words_list``.

    ``words_list`` may be any container supporting ``in``; the remaining
    tokens are re-joined with single spaces.
    """
    survivors = filter(lambda token: token not in words_list, text.split())
    return " ".join(survivors)

# Removing frequent and non-informative words (currently disabled)
# df['clean_text']=df['clean_text'].apply(lambda x: remove_words(x, FREQUENT_WORDS))
# Removing rare words
without_rare = df['clean_text'].apply(remove_words, args=(RARE_WORDS,))
df['clean_text'] = without_rare


In [11]:
# Removing special chars
def remove_special_chars(text):
    """Keep only ASCII letters and digits, separated by single spaces.

    Every character outside ``[a-zA-Z0-9]`` (including accented letters and
    typographic quotes/dashes) is turned into a space, runs of whitespace are
    collapsed to one space, and leading/trailing whitespace is removed.

    Stripping matters: without it, a text made only of special characters
    would come back as ``' '`` rather than ``''`` and so would not be
    recognised as empty by an equality check against ``''``.
    """
    # Raw strings: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # replacing multiple space chars with just one
    return text.strip()

# Run the special-character cleaner over every row of the working text column.
alnum_only = df['clean_text'].apply(remove_special_chars)
df['clean_text'] = alnum_only
df.head()

Unnamed: 0,author,id,text,type,date,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,2025-08-20T03:31:07+00:00,nirvana fallacy people dismiss real option isn...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,2025-08-19T19:41:42+00:00,term fascism used modern politics qualify diff...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,2025-08-19T14:22:03+00:00,epstein files reported shared starting friday ...
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,2025-08-19T16:11:58+00:00,discussion basic components social democracy t...
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,2025-08-19T16:18:59+00:00,since creation 2003 department homeland securi...


In [14]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
# First letter of the tag returned by pos_tag -> WordNet part-of-speech constant.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}

def lemmatize_words(text):
    """Return the space-joined lemmas of the content tokens in ``text``.

    The text is lower-cased and tokenized; stopwords and tokens of two
    characters or fewer are dropped. Each remaining token is POS-tagged and
    lemmatized with the matching WordNet part of speech, falling back to
    NOUN when the tag's first letter is not in ``wordnet_map``.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if len(token) > 2 and token not in stop_words:
            kept_tokens.append(token)

    lemmas = []
    for word, tag in pos_tag(kept_tokens):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return " ".join(lemmas)

df["lemmatized_text"] = df["clean_text"].apply(lemmatize_words)

##### Saving final dataframe

In [15]:
# Build the output frame without in-place mutation so this cell can be re-run:
# errors='ignore' keeps the drop from raising KeyError once 'text' is gone.
df = (
    df.drop(columns=['text'], errors='ignore')
      .dropna(subset=['clean_text'])
      # Treat whitespace-only text as empty too, not just the exact '' string.
      .loc[lambda frame: frame['clean_text'].str.strip() != '']
      .reset_index(drop=True)
)
# Written to the notebook's working directory; the index is not saved.
df.to_csv('cleaned_dataset.csv', sep=',', encoding='utf-8', index=False)