# Text Preprocessing

In [24]:
# Helper libraries
import warnings
from bs4 import BeautifulSoup

# Scientific and visual libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from emot import UNICODE_EMOJI, EMOTICONS_EMO
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Local Modules
from youtube_analysis.edautils import get_dataset_info, plot_stem

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Various settings
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_colwidth", 40)
pd.set_option("display.precision", 4)
pd.set_option("display.max_columns", None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
from youtube_analysis.paths import INTERIM_DATA_DIR, TRANSFORMED_DATA_DIR

In [5]:
corpus = pd.read_pickle(INTERIM_DATA_DIR / "corpus.pkl")

### Removing URLs and HTML Tags

In [2]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [6]:
corpus["cleaned_text"] = corpus.text.apply(lambda t: remove_html_tags(t))

In [7]:
corpus.head()

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,Here are the timestamps. Please chec...
1,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,😳The Plants are Fake…?
2,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,Responding to your pushback on the p...
3,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...","1:44:35 ""...brutal facts about the ..."
4,Oskar Gudnason,2023-11-02 21:47:54+00:00,2023-11-03 12:18:47+00:00,0,Don´t you forget that the ape comes ...,Don´t you forget that the ape comes ...


### Converting Emojis and Emoticons

In [11]:
def translate_emojis(text):
    tokenized_text = word_tokenize(text)
    for emo in UNICODE_EMOJI:
        if emo in tokenized_text:
            emo_index = tokenized_text.index(emo)
            tokenized_text[emo_index] = UNICODE_EMOJI[emo].replace(":","").replace("_", " ")
    return " ".join(tokenized_text)


def translate_emoticons(text):
    new_text = text
    for ticon in EMOTICONS_EMO:
        if ticon in new_text:
            new_text = new_text.replace(ticon, EMOTICONS_EMO[ticon])
    return new_text

In [12]:
corpus["cleaned_text"] = corpus.cleaned_text.apply(lambda t: translate_emojis(t))
corpus["cleaned_text"] = corpus.cleaned_text.apply(lambda t: translate_emoticons(t))

In [13]:
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,Here are the timestamps . Please che...
1,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,😳The Plants are Fake… ?
2,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,Responding to your pushback on the p...
3,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",1:44Happy face smiley5 `` ... brutal...
4,Oskar Gudnason,2023-11-02 21:47:54+00:00,2023-11-03 12:18:47+00:00,0,Don´t you forget that the ape comes ...,Don´t you forget that the ape comes ...
5,Boggo,2023-11-01 09:06:38+00:00,2023-11-01 09:06:38+00:00,0,Does this host not bother to speak c...,Does this host not bother to speak c...
6,Imperious,2023-10-31 20:03:41+00:00,2023-10-31 20:03:41+00:00,0,"This is dumb, the only answer is God...","This is dumb , the only answer is Go..."
7,Elizabeth,2023-10-30 14:01:05+00:00,2023-10-30 14:01:05+00:00,0,Annnd they ...... 😅 Awesomeness!,Annnd they ...... grinning face with...
8,Indie Guvenc,2023-10-30 09:39:49+00:00,2023-10-30 09:46:12+00:00,0,"@<a href=""https://www.youtube.com/wa...",@ 53:59 Lex is completely dominated ...
9,Gary Rowe,2023-10-27 06:41:52+00:00,2023-10-27 06:41:52+00:00,0,Sorry bro too much do do bird things...,Sorry bro too much do do bird things...


In [None]:
# Not all emojis have been translated, thus we need to
# finalize the task by removing non alphabetic symbols.

### Removing Non Alphabetic Symbols

In [15]:
def filter_text_noise(text):
    next_text = text
    for e in next_text:
        if not e.isalpha():
            next_text = next_text.replace(e, " ")
    return next_text

In [16]:
corpus["cleaned_text"] = corpus.cleaned_text.apply(lambda t: filter_text_noise(t))

In [17]:
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,Here are the timestamps Please che...
1,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,The Plants are Fake
2,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,Responding to your pushback on the p...
3,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",Happy face smiley brutal...
4,Oskar Gudnason,2023-11-02 21:47:54+00:00,2023-11-03 12:18:47+00:00,0,Don´t you forget that the ape comes ...,Don t you forget that the ape comes ...
5,Boggo,2023-11-01 09:06:38+00:00,2023-11-01 09:06:38+00:00,0,Does this host not bother to speak c...,Does this host not bother to speak c...
6,Imperious,2023-10-31 20:03:41+00:00,2023-10-31 20:03:41+00:00,0,"This is dumb, the only answer is God...",This is dumb the only answer is Go...
7,Elizabeth,2023-10-30 14:01:05+00:00,2023-10-30 14:01:05+00:00,0,Annnd they ...... 😅 Awesomeness!,Annnd they grinning face with...
8,Indie Guvenc,2023-10-30 09:39:49+00:00,2023-10-30 09:46:12+00:00,0,"@<a href=""https://www.youtube.com/wa...",Lex is completely dominated ...
9,Gary Rowe,2023-10-27 06:41:52+00:00,2023-10-27 06:41:52+00:00,0,Sorry bro too much do do bird things...,Sorry bro too much do do bird things...


### Removing Capitalization

In [18]:
corpus["cleaned_text"] = corpus.cleaned_text.str.lower()

In [19]:
corpus.head(15)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...
1,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,the plants are fake
2,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,responding to your pushback on the p...
3,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",happy face smiley brutal...
4,Oskar Gudnason,2023-11-02 21:47:54+00:00,2023-11-03 12:18:47+00:00,0,Don´t you forget that the ape comes ...,don t you forget that the ape comes ...
5,Boggo,2023-11-01 09:06:38+00:00,2023-11-01 09:06:38+00:00,0,Does this host not bother to speak c...,does this host not bother to speak c...
6,Imperious,2023-10-31 20:03:41+00:00,2023-10-31 20:03:41+00:00,0,"This is dumb, the only answer is God...",this is dumb the only answer is go...
7,Elizabeth,2023-10-30 14:01:05+00:00,2023-10-30 14:01:05+00:00,0,Annnd they ...... 😅 Awesomeness!,annnd they grinning face with...
8,Indie Guvenc,2023-10-30 09:39:49+00:00,2023-10-30 09:46:12+00:00,0,"@<a href=""https://www.youtube.com/wa...",lex is completely dominated ...
9,Gary Rowe,2023-10-27 06:41:52+00:00,2023-10-27 06:41:52+00:00,0,Sorry bro too much do do bird things...,sorry bro too much do do bird things...


### Filtering Stopwords

In [21]:
stop_words = set(stopwords.words("english"))

def filter_stopwords(text):
    stop_words = set(stopwords.words("english"))
    tokenized_text = word_tokenize(text)
    filtered_text = []
    for token in tokenized_text:
        if token not in stop_words:
            filtered_text.append(token)
    return " ".join(filtered_text)

In [22]:
corpus["filtered_text"] = corpus.cleaned_text.apply(lambda t: filter_stopwords(t))

In [23]:
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text,filtered_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...,timestamps please check sponsors sup...
1,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,the plants are fake,plants fake
2,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,responding to your pushback on the p...,responding pushback pushback circa i...
3,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",happy face smiley brutal...,happy face smiley brutal facts plane...
4,Oskar Gudnason,2023-11-02 21:47:54+00:00,2023-11-03 12:18:47+00:00,0,Don´t you forget that the ape comes ...,don t you forget that the ape comes ...,forget ape comes also earth returns ...
5,Boggo,2023-11-01 09:06:38+00:00,2023-11-01 09:06:38+00:00,0,Does this host not bother to speak c...,does this host not bother to speak c...,host bother speak clearly baked grea...
6,Imperious,2023-10-31 20:03:41+00:00,2023-10-31 20:03:41+00:00,0,"This is dumb, the only answer is God...",this is dumb the only answer is go...,dumb answer god none questions answe...
7,Elizabeth,2023-10-30 14:01:05+00:00,2023-10-30 14:01:05+00:00,0,Annnd they ...... 😅 Awesomeness!,annnd they grinning face with...,annnd grinning face sweat awesomeness
8,Indie Guvenc,2023-10-30 09:39:49+00:00,2023-10-30 09:46:12+00:00,0,"@<a href=""https://www.youtube.com/wa...",lex is completely dominated ...,lex completely dominated gushing rn
9,Gary Rowe,2023-10-27 06:41:52+00:00,2023-10-27 06:41:52+00:00,0,Sorry bro too much do do bird things...,sorry bro too much do do bird things...,sorry bro much bird things could n hang


### Lemmatizing Comments

In [25]:
lemmatizer = WordNetLemmatizer()

In [26]:
def lemmatize_text(text):
    tokenized_text = word_tokenize(text)
    return " ".join([lemmatizer.lemmatize(w) for w in tokenized_text])

In [27]:
corpus["lemmatized_text"] = corpus.filtered_text.apply(lambda t: lemmatize_text(t))

In [28]:
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text,filtered_text,lemmatized_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...,timestamps please check sponsors sup...,timestamps please check sponsor supp...
1,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,the plants are fake,plants fake,plant fake
2,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,responding to your pushback on the p...,responding pushback pushback circa i...,responding pushback pushback circa i...
3,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",happy face smiley brutal...,happy face smiley brutal facts plane...,happy face smiley brutal fact planet...
4,Oskar Gudnason,2023-11-02 21:47:54+00:00,2023-11-03 12:18:47+00:00,0,Don´t you forget that the ape comes ...,don t you forget that the ape comes ...,forget ape comes also earth returns ...,forget ape come also earth return ea...
5,Boggo,2023-11-01 09:06:38+00:00,2023-11-01 09:06:38+00:00,0,Does this host not bother to speak c...,does this host not bother to speak c...,host bother speak clearly baked grea...,host bother speak clearly baked grea...
6,Imperious,2023-10-31 20:03:41+00:00,2023-10-31 20:03:41+00:00,0,"This is dumb, the only answer is God...",this is dumb the only answer is go...,dumb answer god none questions answe...,dumb answer god none question answer...
7,Elizabeth,2023-10-30 14:01:05+00:00,2023-10-30 14:01:05+00:00,0,Annnd they ...... 😅 Awesomeness!,annnd they grinning face with...,annnd grinning face sweat awesomeness,annnd grinning face sweat awesomeness
8,Indie Guvenc,2023-10-30 09:39:49+00:00,2023-10-30 09:46:12+00:00,0,"@<a href=""https://www.youtube.com/wa...",lex is completely dominated ...,lex completely dominated gushing rn,lex completely dominated gushing rn
9,Gary Rowe,2023-10-27 06:41:52+00:00,2023-10-27 06:41:52+00:00,0,Sorry bro too much do do bird things...,sorry bro too much do do bird things...,sorry bro much bird things could n hang,sorry bro much bird thing could n hang


In [30]:
corpus.to_pickle(TRANSFORMED_DATA_DIR / "processed_corpus.pkl")