In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import regex as re
import csv

In [2]:
df = pd.read_csv('../../src/nlp/raw_textual_df.csv')
# Missing text must become '' rather than the literal string 'nan' (which is
# what str(NaN) yields); otherwise the empty-text filter applied before saving
# can never remove those rows.
df['text'] = df['text'].fillna('').astype(str)
df.head()

Unnamed: 0,author,id,text,type
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post
1,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post
2,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post
3,PsychLegalMind,1mu59zz,The White House Zelensky meeting followed by E...,post
4,Virtual-Orchid3065,1mt26yb,Do Democrats care more about image than action...,post


In [3]:
# Convert text to lower case
df = df.assign(clean_text=df['text'].str.lower())
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,the nirvana fallacy is when people dismiss a r...
1,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,"epstein files are reported to be shared, start..."
2,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,"[since its creation in 2003, the department of..."
3,PsychLegalMind,1mu59zz,The White House Zelensky meeting followed by E...,post,the white house zelensky meeting followed by e...
4,Virtual-Orchid3065,1mt26yb,Do Democrats care more about image than action...,post,do democrats care more about image than action...


In [4]:
# Removing URLs and HTML tags
def remove_urls(text):
    """Return `text` with URLs deleted.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``; it is replaced by the empty string,
    so the whitespace around it is left as it was.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_html(text):
    """Return `text` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed individually and the text
    between tags is kept. No flags are set, so ``.`` does not cross a newline
    and a tag split over two lines is left in place.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)


In [5]:
# Strip HTML tags first, then URLs; the functions are passed directly to apply.
df['clean_text'] = df['clean_text'].apply(remove_html).apply(remove_urls)

df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,the nirvana fallacy is when people dismiss a r...
1,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,"epstein files are reported to be shared, start..."
2,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,"[since its creation in 2003, the department of..."
3,PsychLegalMind,1mu59zz,The White House Zelensky meeting followed by E...,post,the white house zelensky meeting followed by e...
4,Virtual-Orchid3065,1mt26yb,Do Democrats care more about image than action...,post,do democrats care more about image than action...


In [6]:
# Removing punctuation
def remove_punctation(text):
    """Return `text` with each character of ``string.punctuation`` replaced by a space.

    Replacement is one-for-one, so the length of the string is unchanged and
    runs of punctuation become runs of spaces.
    """
    space_for_mark = {ord(mark): ' ' for mark in string.punctuation}
    return text.translate(space_for_mark)

In [7]:
# The function is passed directly; wrapping it in a lambda adds nothing.
df['clean_text'] = df['clean_text'].apply(remove_punctation)
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,the nirvana fallacy is when people dismiss a r...
1,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,epstein files are reported to be shared start...
2,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,since its creation in 2003 the department of...
3,PsychLegalMind,1mu59zz,The White House Zelensky meeting followed by E...,post,the white house zelensky meeting followed by e...
4,Virtual-Orchid3065,1mt26yb,Do Democrats care more about image than action...,post,do democrats care more about image than action...


In [8]:
# Removing stopwords

# English stopword list from NLTK, held as a set for fast membership tests.
# If the corpus has not been downloaded yet, stopwords.words() raises
# LookupError; fetch it once and retry so the notebook runs from a fresh setup.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text, stop_words=None):
    """Return `text` with stopwords removed.

    The text is split on whitespace, tokens found in the stopword collection
    are dropped, and the survivors are re-joined with single spaces (so runs of
    whitespace are collapsed and leading/trailing whitespace disappears).
    Matching is exact and case-sensitive.

    Parameters
    ----------
    text : str
        Text to filter.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to the module-level ``STOPWORDS`` set, which
        preserves the original single-argument behaviour.

    Returns
    -------
    str
        The filtered text; ``''`` if nothing survives.
    """
    if stop_words is None:
        stop_words = STOPWORDS
    return " ".join(word for word in text.split() if word not in stop_words)

In [9]:
# The function is passed directly; wrapping it in a lambda adds nothing.
df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,nirvana fallacy people dismiss real option isn...
1,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,epstein files reported shared starting friday ...
2,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,since creation 2003 department homeland securi...
3,PsychLegalMind,1mu59zz,The White House Zelensky meeting followed by E...,post,white house zelensky meeting followed eu meeti...
4,Virtual-Orchid3065,1mt26yb,Do Democrats care more about image than action...,post,democrats care image action jan 6 california c...


In [10]:
# Removing frequent and rare words
# from collections import Counter

# word_count = Counter()

# for text in df['clean_text']:
#     for word in text.split():
#         word_count[word] += 1

# FREQUENT_WORDS = set(word for (word, word_count) in word_count.most_common(10))
# RARE_WORDS = set(word for (word, word_count) in word_count.most_common()[:-100:-1])

# FREQUENT_WORDS, RARE_WORDS

In [11]:
# def remove_frequent_words(text, n_most_common = 10):
#     return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

# def remove_rare_words(text, n_most_common = 10):
#     return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [12]:
# df['clean_text'] = df['clean_text'].apply(lambda x: remove_frequent_words(x))
# df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))

# df.head()

In [13]:
# Removing special chars
def remove_special_chars(text):
    """Return `text` reduced to ASCII letters, digits and single spaces.

    Every character outside ``a-z``, ``A-Z`` and ``0-9`` (including accented
    letters and emoji) is replaced by a space, runs of whitespace are collapsed
    to one space, and leading/trailing whitespace is stripped. Stripping means
    text made up only of special characters comes back as ``''`` rather than
    ``' '``, so an empty-string check can detect it.
    """
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace into one space
    return text.strip()

# The function is passed directly; wrapping it in a lambda adds nothing.
df['clean_text'] = df['clean_text'].apply(remove_special_chars)
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Raichu4u,1mv3z5n,The Nirvana fallacy is when people dismiss a r...,post,nirvana fallacy people dismiss real option isn...
1,Potato_Cat93,1mujuir,"Epstein files are reported to be shared, start...",post,epstein files reported shared starting friday ...
2,Candle-Jolly,1mun37m,"[Since its creation in 2003, the Department of...",post,since creation 2003 department homeland securi...
3,PsychLegalMind,1mu59zz,The White House Zelensky meeting followed by E...,post,white house zelensky meeting followed eu meeti...
4,Virtual-Orchid3065,1mt26yb,Do Democrats care more about image than action...,post,democrats care image action jan 6 california c...


##### Stemming

In [14]:
# One stemmer instance shared by every call to stem_words.
ps = PorterStemmer()

def stem_words(text):
    """Return `text` with each whitespace-separated token replaced by its Porter stem.

    Tokens are re-joined with single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [15]:
# df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
# df.head()

##### Lemmatization and POS Tagging

In [16]:
lemmatizer = WordNetLemmatizer()
# Maps the first character of a POS tag to the matching WordNet part of speech.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}

def lemmatize_words(text):
    """Return `text` with each whitespace-separated token lemmatized.

    Tokens are POS-tagged with ``pos_tag``; the first character of each tag
    selects the WordNet part of speech passed to the lemmatizer. Tags whose
    first character is not in ``wordnet_map`` fall back to NOUN. Lemmas are
    re-joined with single spaces.
    """
    tagged_tokens = pos_tag(text.split())
    lemmas = []
    for token, tag in tagged_tokens:
        # Default to NOUN when the tag's first character has no WordNet mapping.
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

In [17]:
# df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
# df.head()

##### Saving final dataframe

In [18]:
# Drop the raw text; errors='ignore' keeps this cell re-runnable after 'text'
# has already been removed by an earlier execution.
df = df.drop(columns=['text'], errors='ignore')
df = df.dropna(subset=['clean_text'])
# Treat whitespace-only strings as empty too: cleaning can leave a lone space
# behind, which a plain != '' comparison would let through.
df = df[df['clean_text'].str.strip() != '']
df = df.reset_index(drop=True)
df.to_csv('cleaned_dataset.csv', sep=',', encoding='utf-8', index=False)