In [1]:
import pandas as pd
import string
from nltk.corpus import stopwords
import regex as re

In [2]:
df = pd.read_csv('../../src/nlp/raw_textual_df.csv')
df['text'] = df['text'].apply(lambda x: str(x))
df.head()

Unnamed: 0,author,id,text,type
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post


In [3]:
# Conver text to lower case
df['clean_text'] = df['text'].str.lower()
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,the nirvana fallacy is when people dismiss a r...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,the term fascism is used in modern politics to...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,"epstein files are reported to be shared, start..."
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,"for discussion, here're the basic components o..."
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,"[since its creation in 2003, the department of..."


In [4]:
# Removing URLs and HTML tags
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    return re.sub(r'<.*?>', '', text)


In [5]:
df['clean_text'] = df['clean_text'].apply(lambda x : remove_html(x))
df['clean_text'] = df['clean_text'].apply(lambda x : remove_urls(x))

df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,the nirvana fallacy is when people dismiss a r...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,the term fascism is used in modern politics to...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,"epstein files are reported to be shared, start..."
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,"for discussion, here're the basic components o..."
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,"[since its creation in 2003, the department of..."


In [6]:
# Removing punctutation
def remove_punctation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))

In [7]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctation(x))
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,the nirvana fallacy is when people dismiss a r...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,the term fascism is used in modern politics to...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,epstein files are reported to be shared start...
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,for discussion here re the basic components o...
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,since its creation in 2003 the department of...


In [8]:
# Removing stopwords
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [9]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,nirvana fallacy people dismiss real option isn...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,term fascism used modern politics qualify diff...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,epstein files reported shared starting friday ...
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,discussion basic components social democracy t...
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,since creation 2003 department homeland securi...


In [10]:
# Removing special chars
def remove_special_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text) # replacing multiple space char with just one char
    return text

df['clean_text'] = df['clean_text'].apply(lambda x: remove_special_chars(x))
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,nirvana fallacy people dismiss real option isn...
1,KingGhidorah1225,1musrfu,The term fascism is used in modern politics to...,post,term fascism used modern politics qualify diff...
2,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,epstein files reported shared starting friday ...
3,Rong_Liu,1mumwdb,"For discussion, here're the basic components o...",post,discussion basic components social democracy t...
4,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,since creation 2003 department homeland securi...


##### Saving final dataframe

In [11]:
df.drop(columns=['text'], axis=1, inplace=True)
df = df.dropna(subset=['clean_text'])  
df = df[df['clean_text'] != ''] 
df.reset_index(drop=True, inplace=True)
df.to_csv('cleaned_dataset.csv', sep=',', encoding='utf-8', index=False)