In [55]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
import regex as re
import csv

In [56]:
df = pd.read_csv('../../src/nlp/raw_textual_df.csv')
df['text'] = df['text'].apply(lambda x: str(x))
df.head()

Unnamed: 0,author,text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w..."
1,---Spartacus---,The end goal is to produce a stupid population...
2,---____--__-_-_-___-,This is a gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months..."
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good"


In [57]:
# Conver text to lower case
df['clean_text'] = df['text'].str.lower()
df.head()

Unnamed: 0,author,text,clean_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...","agreed. deleted.||""i'm sorry you took it the w..."
1,---Spartacus---,The end goal is to produce a stupid population...,the end goal is to produce a stupid population...
2,---____--__-_-_-___-,This is a gross understatement lol,this is a gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...","oh man, i'll let you know here in a few months..."
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good","i lock myself in the bathroom, so you’re good"


In [58]:
# Removing URLs and HTML tags
def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    return re.sub(r'<.*?>', '', text)


In [59]:
df['clean_text'] = df['clean_text'].apply(lambda x : remove_html(x))
df['clean_text'] = df['clean_text'].apply(lambda x : remove_urls(x))

df.head()

Unnamed: 0,author,text,clean_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...","agreed. deleted.||""i'm sorry you took it the w..."
1,---Spartacus---,The end goal is to produce a stupid population...,the end goal is to produce a stupid population...
2,---____--__-_-_-___-,This is a gross understatement lol,this is a gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...","oh man, i'll let you know here in a few months..."
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good","i lock myself in the bathroom, so you’re good"


In [60]:
# Removing punctutation
def remove_punctation(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans(punctuations, ' ' * len(punctuations)))

In [61]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctation(x))
df.head()

Unnamed: 0,author,text,clean_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...",agreed deleted i m sorry you took it the w...
1,---Spartacus---,The end goal is to produce a stupid population...,the end goal is to produce a stupid population...
2,---____--__-_-_-___-,This is a gross understatement lol,this is a gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...",oh man i ll let you know here in a few months...
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good",i lock myself in the bathroom so you’re good


In [62]:
# Removing stopwords

STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [63]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,author,text,clean_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...",agreed deleted sorry took wrong way huge step ...
1,---Spartacus---,The end goal is to produce a stupid population...,end goal produce stupid population reliably vo...
2,---____--__-_-_-___-,This is a gross understatement lol,gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...",oh man let know months first lot time afraid e...
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good",lock bathroom you’re good


In [64]:
# Removing frequent and rare words
from collections import Counter

word_count = Counter()

for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1

FREQUENT_WORDS = set(word for (word, word_count) in word_count.most_common(10))
RARE_WORDS = set(word for (word, word_count) in word_count.most_common()[:-100:-1])

FREQUENT_WORDS, RARE_WORDS

({'also',
  'even',
  'get',
  'like',
  'one',
  'people',
  'think',
  'time',
  'want',
  'would'},
 {'074',
  '101st',
  '1to1',
  '54yo',
  'allmen',
  'andbyet',
  'angry”',
  'animals—raids',
  'anothony',
  'antoinnette',
  'apparenty',
  'arduously',
  'around—for',
  'aubgrabe',
  'ayatem',
  'b99',
  'bucees',
  'cassandras',
  'chiwetel',
  'clude',
  'compressing',
  'comtact',
  'converter',
  'democracytm',
  'denim”',
  'devoutoy',
  'ejiofor',
  'engeged',
  'esle',
  'establishec',
  'fabulists',
  'fisg',
  'freking',
  'gauranteee',
  'getcha',
  'girlfriend—his',
  'gods”',
  'going—now',
  'gruff',
  'gucci”',
  'hostilize',
  'hustlers',
  'individuals…',
  'jdea',
  'linemen',
  'machetes',
  'megapixel',
  'milken',
  'milkin',
  'mother……if',
  'mouldering',
  'nonshort',
  'nouse',
  'ocv',
  'otzi',
  'pacem',
  'parents…my',
  'people—jake',
  'perpetuator',
  'perseveres',
  'phaidon',
  'plumet',
  'priviliege',
  'quartes',
  'queenout',
  'redouble',
  

In [65]:
def remove_frequent_words(text, n_most_common = 10):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

def remove_rare_words(text, n_most_common = 10):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [66]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_frequent_words(x))
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))

df.head()

Unnamed: 0,author,text,clean_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...",agreed deleted sorry took wrong way huge step ...
1,---Spartacus---,The end goal is to produce a stupid population...,end goal produce stupid population reliably vo...
2,---____--__-_-_-___-,This is a gross understatement lol,gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...",oh man let know months first lot afraid ever k...
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good",lock bathroom you’re good


In [67]:
# Removing special chars
def remove_special_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text) # replacing multiple space char with just one char
    return text

df['clean_text'] = df['clean_text'].apply(lambda x: remove_special_chars(x))
df.head()

Unnamed: 0,author,text,clean_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...",agreed deleted sorry took wrong way huge step ...
1,---Spartacus---,The end goal is to produce a stupid population...,end goal produce stupid population reliably vo...
2,---____--__-_-_-___-,This is a gross understatement lol,gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...",oh man let know months first lot afraid ever k...
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good",lock bathroom you re good


##### Stemming

In [68]:
ps = PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [69]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,author,text,clean_text,stemmed_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...",agreed deleted sorry took wrong way huge step ...,agre delet sorri took wrong way huge step ai a...
1,---Spartacus---,The end goal is to produce a stupid population...,end goal produce stupid population reliably vo...,end goal produc stupid popul reliabl vote equa...
2,---____--__-_-_-___-,This is a gross understatement lol,gross understatement lol,gross understat lol
3,---chewie--,"Oh man, I'll let you know here in a few months...",oh man let know months first lot afraid ever k...,oh man let know month first lot afraid ever ki...
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good",lock bathroom you re good,lock bathroom you re good


##### Lemmatiazation and POS Tagging

In [70]:
lemmatizer = WordNetLemmatizer()
wordnet_map = {"N": wordnet.NOUN, "V" : wordnet.VERB, "J" : wordnet.ADJ, "R" : wordnet.ADV}

def lemmatize_words(text):
    # find POS tags
    pos_text = pos_tag(text.split())
    # the get() function associates a deafult value, in this case NOUN, to a word if its pos is not in wordnet map
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text]) 

In [71]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,author,text,clean_text,stemmed_text,lemmatized_text
0,---AI---,"Agreed. Deleted.||""I'm sorry you took it the w...",agreed deleted sorry took wrong way huge step ...,agre delet sorri took wrong way huge step ai a...,agree deleted sorry take wrong way huge step a...
1,---Spartacus---,The end goal is to produce a stupid population...,end goal produce stupid population reliably vo...,end goal produc stupid popul reliabl vote equa...,end goal produce stupid population reliably vo...
2,---____--__-_-_-___-,This is a gross understatement lol,gross understatement lol,gross understat lol,gross understatement lol
3,---chewie--,"Oh man, I'll let you know here in a few months...",oh man let know months first lot afraid ever k...,oh man let know month first lot afraid ever ki...,oh man let know month first lot afraid ever ki...
4,---why-so-serious---,"I lock myself in the bathroom, so you’re good",lock bathroom you re good,lock bathroom you re good,lock bathroom you re good


##### Saving final dataframe

In [72]:
df.drop(columns=['text'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv('cleaned_dataset.csv', sep=',', encoding='utf-8', index=False)