In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import regex as re
import csv

In [2]:
df = pd.read_csv('../../src/nlp/raw_textual_df.csv')
df['text'] = df['text'].apply(lambda x: str(x))
df.head()

Unnamed: 0,author,id,text,type
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post


In [3]:
# Conver text to lower case
df['clean_text'] = df['text'].str.lower()
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post,people talk all the time about the structural ...
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post,if it were discovered that a sitting president...
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post,trump announced new weapons for ukraine on mon...
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post,illinois is considering a new bill (hb 3458) t...
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post,in recent days some of maga's more outspoken i...


In [4]:
# Removing URLs and HTML tags
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    return re.sub(r'<.*?>', '', text)


In [5]:
df['clean_text'] = df['clean_text'].apply(lambda x : remove_html(x))
df['clean_text'] = df['clean_text'].apply(lambda x : remove_urls(x))

df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post,people talk all the time about the structural ...
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post,if it were discovered that a sitting president...
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post,trump announced new weapons for ukraine on mon...
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post,illinois is considering a new bill (hb 3458) t...
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post,in recent days some of maga's more outspoken i...


In [6]:
# Removing punctutation
def remove_punctation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))

In [7]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctation(x))
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post,people talk all the time about the structural ...
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post,if it were discovered that a sitting president...
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post,trump announced new weapons for ukraine on mon...
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post,illinois is considering a new bill hb 3458 t...
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post,in recent days some of maga s more outspoken i...


In [8]:
# Removing stopwords

STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [9]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post,people talk time structural issues 3rd parties...
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post,discovered sitting president committed exact c...
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post,trump announced new weapons ukraine monday thr...
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post,illinois considering new bill hb 3458 would le...
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post,recent days maga outspoken influencers rushing...


In [10]:
# Removing frequent and rare words
from collections import Counter

word_count = Counter()

for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1

FREQUENT_WORDS = set(word for (word, word_count) in word_count.most_common(10))
RARE_WORDS = set(word for (word, word_count) in word_count.most_common()[:-100:-1])

FREQUENT_WORDS, RARE_WORDS

({'also',
  'even',
  'get',
  'like',
  'one',
  'people',
  'think',
  'time',
  'want',
  'would'},
 {'2071',
  '38hrs',
  'actuallg',
  'alloromantic',
  'analyis',
  'anecdotle',
  'appreciators',
  'aromantics',
  'autobody',
  'bewilders',
  'blowoff',
  'breakfast‚Äîbut',
  'caligraphy',
  'caricaturize',
  'commerical',
  'contemporaryart',
  'contexting',
  'coomunity',
  'cringiness',
  'damiya',
  'deffend',
  'degradable',
  'demigogues',
  'department‚Äîdidn‚Äôt',
  'desentisation',
  'despit',
  'dioxins',
  'dispositional',
  'education‚Äôs',
  'elimimate',
  'elimimating',
  'embarassability',
  'essentialy',
  'estatic',
  'ethics‚Äù',
  'ethnocentrists',
  'everywhen',
  'ewww‚Ä¶',
  'eydryan',
  'fell‚Äù',
  'filburn',
  'fsrming',
  'gto',
  'hadtohurt',
  'ikarus',
  'impotant',
  'inolve',
  'instructor‚Äîalso',
  'inturn',
  'knausgaard',
  'kookoo',
  'lain',
  'lamasinlava',
  'limation',
  'lyceum',
  'maaaaan',
  'metalwork',
  'ority',
  'painter‚Äôs',
  'p

In [11]:
def remove_frequent_words(text, n_most_common = 10):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

def remove_rare_words(text, n_most_common = 10):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [12]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_frequent_words(x))
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))

df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post,talk structural issues 3rd parties face us pol...
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post,discovered sitting president committed exact c...
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post,trump announced new weapons ukraine monday thr...
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post,illinois considering new bill hb 3458 let some...
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post,recent days maga outspoken influencers rushing...


In [13]:
# Removing special chars
def remove_special_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text) # replacing multiple space char with just one char
    return text

df['clean_text'] = df['clean_text'].apply(lambda x: remove_special_chars(x))
df.head()

Unnamed: 0,author,id,text,type,clean_text
0,Coteup,t3_1lzuccl,People talk all the time about the structural ...,post,talk structural issues 3rd parties face us pol...
1,_alpinisto,t3_1lzy15f,If it were discovered that a sitting president...,post,discovered sitting president committed exact c...
2,PsychLegalMind,t3_1lzrb15,Trump announced new weapons for Ukraine on Mon...,post,trump announced new weapons ukraine monday thr...
3,jaytee319,t3_1lyiioj,Illinois is considering a new bill (HB 3458) t...,post,illinois considering new bill hb 3458 let some...
4,the_original_Retro,t3_1lys6tq,In recent days some of MAGA's more outspoken i...,post,recent days maga outspoken influencers rushing...


##### Stemming

In [14]:
ps = PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [15]:
# df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
# df.head()

##### Lemmatiazation and POS Tagging

In [16]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V" : wordnet.VERB, "J" : wordnet.ADJ, "R" : wordnet.ADV}

def lemmatize_words(text):
    # find POS tags
    pos_text = pos_tag(text.split())
    # the get() function associates a deafult value, in this case NOUN, to a word if its pos is not in wordnet map
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text]) 

In [17]:
# df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
# df.head()

##### Saving final dataframe

In [18]:
df.drop(columns=['text'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv('cleaned_dataset.csv', sep=',', encoding='utf-8', index=False)