In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import regex as re
import csv

In [2]:
# Load the raw dataset and coerce every entry of the 'text' column to a Python str.
df = pd.read_csv('../../src/nlp/raw_textual_df.csv')
df['text'] = df['text'].apply(str)
df.head()

Unnamed: 0,author,text
0,Coteup,People talk all the time about the structural ...
1,_alpinisto,If it were discovered that a sitting president...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...


In [3]:
# Convert text to lower case; the result is stored in a new 'clean_text' column
df = df.assign(clean_text=df['text'].str.lower())
df.head()

Unnamed: 0,author,text,clean_text
0,Coteup,People talk all the time about the structural ...,people talk all the time about the structural ...
1,_alpinisto,If it were discovered that a sitting president...,if it were discovered that a sitting president...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons for ukraine on mon...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois is considering a new bill (hb 3458) t...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,in recent days some of maga's more outspoken i...


In [4]:
# Removing URLs and HTML tags
def remove_urls(text):
    """Return `text` with every http://, https:// or www. link deleted.

    A link is matched up to (not including) the next whitespace character
    and is replaced by the empty string, so surrounding spaces are kept.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    without_urls = re.sub(url_pattern, '', text)
    return without_urls

def remove_html(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy, so each tag is removed individually; a `<`
    with no later `>` on the same line is left alone.
    """
    tag_pattern = r'<.*?>'
    without_tags = re.sub(tag_pattern, '', text)
    return without_tags


In [5]:
# Strip HTML tags first, then URLs, from the working text column.
df['clean_text'] = df['clean_text'].apply(remove_html).apply(remove_urls)

df.head()

Unnamed: 0,author,text,clean_text
0,Coteup,People talk all the time about the structural ...,people talk all the time about the structural ...
1,_alpinisto,If it were discovered that a sitting president...,if it were discovered that a sitting president...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons for ukraine on mon...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois is considering a new bill (hb 3458) t...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,in recent days some of maga's more outspoken i...


In [6]:
# Removing punctuation
def remove_punctation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Characters are removed outright (not replaced by a space), so tokens
    separated only by punctuation are joined: "a/b" becomes "ab".
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [7]:
# Delete punctuation characters from every row of the working text column.
df['clean_text'] = df['clean_text'].apply(remove_punctation)
df.head()

Unnamed: 0,author,text,clean_text
0,Coteup,People talk all the time about the structural ...,people talk all the time about the structural ...
1,_alpinisto,If it were discovered that a sitting president...,if it were discovered that a sitting president...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons for ukraine on mon...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois is considering a new bill hb 3458 tha...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,in recent days some of magas more outspoken in...


In [8]:
# Removing stopwords

# English stop-word list from NLTK, held as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in STOPWORDS.

    The surviving tokens are re-joined with single spaces.
    """
    # NOTE(review): this runs after punctuation has been stripped, so any
    # stop-word that contains an apostrophe presumably can no longer match
    # its stripped form (e.g. "dont") -- confirm against the NLTK list.
    kept_tokens = []
    for token in text.split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [9]:
# Filter stop-words out of every row of the working text column.
df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df.head()

Unnamed: 0,author,text,clean_text
0,Coteup,People talk all the time about the structural ...,people talk time structural issues 3rd parties...
1,_alpinisto,If it were discovered that a sitting president...,discovered sitting president committed exact c...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons ukraine monday thr...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois considering new bill hb 3458 would le...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,recent days magas outspoken influencers rushin...


In [10]:
# Removing frequent and rare words
from collections import Counter

# Sizes of the two vocabularies removed in the next step.
N_FREQUENT_WORDS = 10  # the ten most common tokens
N_RARE_WORDS = 99      # the tail slice [:-100:-1] selects 99 tokens, not 100

# Count every whitespace-separated token across the whole corpus.
word_count = Counter()
for text in df['clean_text']:
    word_count.update(text.split())

# most_common() returns (token, count) pairs sorted by descending count.
# The count is unpacked into `_` so it no longer shadows the `word_count`
# Counter inside the comprehension.
FREQUENT_WORDS = {word for word, _ in word_count.most_common(N_FREQUENT_WORDS)}
# Walk the ranking backwards from its last entry to collect the rarest tokens.
RARE_WORDS = {word for word, _ in word_count.most_common()[:-N_RARE_WORDS - 1:-1]}

FREQUENT_WORDS, RARE_WORDS

({'also',
  'dont',
  'even',
  'get',
  'like',
  'one',
  'people',
  'think',
  'want',
  'would'},
 {'1223',
  '1645',
  '1673',
  '3480',
  '38hrs',
  '8910hr',
  'actuallg',
  'adjust”',
  'aggressiveand',
  'agressivebut',
  'analyis',
  'anecdotle',
  'antisentiments',
  'antistances',
  'appreciators',
  'autoerotic',
  'breakfast—but',
  'businessesorganizations',
  'caligraphy',
  'clickbaitish',
  'commerical',
  'contexting',
  'createexpresscommunicate',
  'damiya',
  'deffend',
  'demigogues',
  'dioxins',
  'doctorengineeretc',
  'elimimate',
  'elimimating',
  'essentialy',
  'ethnocentrists',
  'evergreater',
  'filburn',
  'fsrming',
  'furtherfaster',
  'gto',
  'guessnow',
  'hamasisrael',
  'hamasnow',
  'headfeathers',
  'headshoulders',
  'highstaffing',
  'ikarus',
  'industriesmost',
  'inolve',
  'israellet',
  'knausgaard',
  'lain',
  'leagueslower',
  'limation',
  'loanspayments',
  'monthsquarters',
  'mutaart',
  'noncommand',
  'noveltyart',
  'ority',

In [11]:
def remove_frequent_words(text, n_most_common = 10):
    """Return `text` without the tokens contained in the global FREQUENT_WORDS set.

    `n_most_common` is accepted but not used: the set of words to drop is
    whatever FREQUENT_WORDS holds when the function is called.
    """
    kept_tokens = filter(lambda token: token not in FREQUENT_WORDS, text.split())
    return " ".join(kept_tokens)

def remove_rare_words(text, n_most_common = 10):
    """Return `text` without the tokens contained in the global RARE_WORDS set.

    `n_most_common` is accepted but not used: the set of words to drop is
    whatever RARE_WORDS holds when the function is called.
    """
    kept_tokens = filter(lambda token: token not in RARE_WORDS, text.split())
    return " ".join(kept_tokens)

In [12]:
# Drop the most frequent tokens first, then the rarest ones.
df['clean_text'] = (
    df['clean_text']
    .apply(remove_frequent_words)
    .apply(remove_rare_words)
)

df.head()

Unnamed: 0,author,text,clean_text
0,Coteup,People talk all the time about the structural ...,talk time structural issues 3rd parties face u...
1,_alpinisto,If it were discovered that a sitting president...,discovered sitting president committed exact c...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons ukraine monday thr...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois considering new bill hb 3458 let some...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,recent days magas outspoken influencers rushin...


In [13]:
# Removing special chars
def remove_special_chars(text):
    """Replace every non-alphanumeric character with a space and collapse whitespace.

    Any character outside a-z, A-Z and 0-9 becomes a space; each run of
    whitespace is then reduced to a single space. Leading or trailing
    spaces produced this way are not trimmed.
    """
    # Raw strings keep the regex escapes intact; a plain '\s+' literal is an
    # invalid string escape and triggers a warning on current Python versions.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # replacing multiple space chars with just one
    return text

# Replace remaining non-alphanumeric characters in every row of the working text column.
df['clean_text'] = df['clean_text'].apply(remove_special_chars)
df.head()

Unnamed: 0,author,text,clean_text
0,Coteup,People talk all the time about the structural ...,talk time structural issues 3rd parties face u...
1,_alpinisto,If it were discovered that a sitting president...,discovered sitting president committed exact c...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons ukraine monday thr...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois considering new bill hb 3458 let some...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,recent days magas outspoken influencers rushin...


##### Stemming

In [14]:
# One shared stemmer instance, reused for every row.
ps = PorterStemmer()

def stem_words(text):
    """Return `text` with each whitespace-separated token passed through `ps.stem`.

    The stemmed tokens are re-joined with single spaces.
    """
    stems = map(ps.stem, text.split())
    return " ".join(stems)

In [15]:
# Store the stemmed variant in its own column; 'clean_text' is left as is.
df['stemmed_text'] = df['clean_text'].apply(stem_words)
df.head()

Unnamed: 0,author,text,clean_text,stemmed_text
0,Coteup,People talk all the time about the structural ...,talk time structural issues 3rd parties face u...,talk time structur issu 3rd parti face us poli...
1,_alpinisto,If it were discovered that a sitting president...,discovered sitting president committed exact c...,discov sit presid commit exact crime nixon for...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons ukraine monday thr...,trump announc new weapon ukrain monday threate...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois considering new bill hb 3458 let some...,illinoi consid new bill hb 3458 let someon avo...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,recent days magas outspoken influencers rushin...,recent day maga outspoken influenc rush call t...


##### Lemmatization and POS Tagging

In [16]:
lemmatizer = WordNetLemmatizer()

# Maps the first letter of a POS tag to the matching WordNet part of speech.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}

def lemmatize_words(text):
    """Return `text` with each whitespace-separated token lemmatized.

    Tokens are tagged with `pos_tag`; the first character of each tag is
    looked up in `wordnet_map` to pick the part of speech handed to the
    lemmatizer. The lemmas are re-joined with single spaces.
    """
    lemmas = []
    for token, tag in pos_tag(text.split()):
        # Tags whose first letter is not in wordnet_map fall back to NOUN.
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

In [17]:
# Store the lemmatized variant in its own column; 'clean_text' is left as is.
df['lemmatized_text'] = df['clean_text'].apply(lemmatize_words)
df.head()

Unnamed: 0,author,text,clean_text,stemmed_text,lemmatized_text
0,Coteup,People talk all the time about the structural ...,talk time structural issues 3rd parties face u...,talk time structur issu 3rd parti face us poli...,talk time structural issue 3rd party face u po...
1,_alpinisto,If it were discovered that a sitting president...,discovered sitting president committed exact c...,discov sit presid commit exact crime nixon for...,discover sit president commit exact crime nixo...
2,PsychLegalMind,Trump announced new weapons for Ukraine on Mon...,trump announced new weapons ukraine monday thr...,trump announc new weapon ukrain monday threate...,trump announce new weapon ukraine monday threa...
3,jaytee319,Illinois is considering a new bill (HB 3458) t...,illinois considering new bill hb 3458 let some...,illinoi consid new bill hb 3458 let someon avo...,illinois consider new bill hb 3458 let someone...
4,the_original_Retro,In recent days some of MAGA's more outspoken i...,recent days magas outspoken influencers rushin...,recent day maga outspoken influenc rush call t...,recent day magas outspoken influencers rush ca...


##### Saving final dataframe

In [21]:
# DataFrame.drop returns a new frame rather than modifying `df`, so its result
# has to be used: write the frame without the raw 'text' column straight to
# disk. `df` itself is left untouched, which keeps this cell safe to re-run.
df.drop(columns=['text']).to_csv('cleaned_dataset.csv', sep=',', encoding='utf-8')