# Text Preprocessing

In [1]:
# Helper libraries
import warnings
from bs4 import BeautifulSoup

# Scientific and visual libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from emot import UNICODE_EMOJI, EMOTICONS_EMO
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Various settings
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices from pandas / nltk.
warnings.filterwarnings("ignore")
# pandas display options used by the DataFrame previews below.
pd.set_option("display.max_rows", 120)      # show up to 120 rows before truncating
pd.set_option("display.max_colwidth", 40)   # long comment texts are cut to 40 chars ("...")
pd.set_option("display.precision", 4)       # decimals shown for floats
pd.set_option("display.max_columns", None)  # never hide columns

Let's import useful paths:

In [2]:
from youtube_analysis.paths import INTERIM_DATA_DIR, TRANSFORMED_DATA_DIR

We need to reload the interim dataset:

In [3]:
# Load the interim corpus; the steps below read its `text` column.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
corpus = pd.read_pickle(INTERIM_DATA_DIR / "corpus.pkl")

Text is unstructured data: it is easy for humans to interpret, but not for computers. This difference calls for very different processing and analysis strategies, and mistakes easily creep in when we rely on human intuition. To use a text corpus as a source of information-rich features, we must process it carefully.

For that we will define special functions dedicated to basic text normalization. Let's dig into the steps.


### Removing URLs and HTML Tags

In [4]:
def remove_html_tags(text):
    """Return the textual content of ``text`` with HTML markup stripped.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    text is extracted with ``get_text()``.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

In [5]:
# Strip HTML markup from every raw comment into a new `cleaned_text` column.
corpus["cleaned_text"] = corpus["text"].apply(remove_html_tags)
corpus.head()

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,Here are the timestamps. Please chec...
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,Protein research is a major new brea...
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,This is a good one.
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,A very interesting conversation unti...
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,This chick is always in Bilderberg g...


### Converting Emojis and Emoticons

In [6]:
def translate_emojis(text):
    """Replace emoji tokens in ``text`` with their textual description.

    The text is split with ``word_tokenize``. Every token that is a key of
    ``UNICODE_EMOJI`` is replaced by its mapped description, with the colons
    removed and underscores turned into spaces. All occurrences are
    translated, not only the first one.

    Tokens are re-joined with single spaces, so the returned string follows
    the tokenizer's splitting (e.g. punctuation becomes separate tokens).
    An emoji that the tokenizer leaves attached to other characters is not a
    dictionary key and is therefore left unchanged.
    """
    translated_tokens = []
    for token in word_tokenize(text):
        # Direct dictionary lookup per token: handles repeated emojis and
        # avoids scanning the whole emoji table for every comment.
        if token in UNICODE_EMOJI:
            token = UNICODE_EMOJI[token].replace(":", "").replace("_", " ")
        translated_tokens.append(token)
    return " ".join(translated_tokens)


def translate_emoticons(text):
    """Replace text emoticons in ``text`` with their textual description.

    Every key of ``EMOTICONS_EMO`` found as a substring of the text is
    replaced by its mapped description.

    Two details matter for the result:

    * Longer emoticons are replaced before shorter ones, so an emoticon that
      contains a shorter one is translated as a whole instead of leaving
      stray characters behind.
    * The description is surrounded by spaces so it does not get glued to the
      neighbouring characters. The extra whitespace is harmless for the
      later steps, which re-tokenize the text.
    """
    new_text = text
    # sorted() is stable, so emoticons of equal length keep dictionary order.
    for ticon in sorted(EMOTICONS_EMO, key=len, reverse=True):
        if ticon in new_text:
            new_text = new_text.replace(ticon, " " + EMOTICONS_EMO[ticon] + " ")
    return new_text

In [7]:
# Translate emojis first, then text emoticons, on the HTML-free text.
corpus["cleaned_text"] = (
    corpus["cleaned_text"]
    .apply(translate_emojis)
    .apply(translate_emoticons)
)

In [8]:
# Preview the first rows to check the emoji / emoticon translation.
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,Here are the timestamps . Please che...
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,Protein research is a major new brea...
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,This is a good one .
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,A very interesting conversation unti...
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,This chick is always in Bilderberg g...
5,WakeUpnThinkClearly,2023-11-17 18:51:44+00:00,2023-11-17 18:51:44+00:00,0,Now I feel really infinitely ignoran...,Now I feel really infinitely ignoran...
6,Mike Huff,2023-11-17 01:54:59+00:00,2023-11-17 01:56:52+00:00,0,I kept checking the time in hopes th...,I kept checking the time in hopes th...
7,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,😳The Plants are Fake… ?
8,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,Responding to your pushback on the p...
9,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",1:44Happy face smiley5 `` ... brutal...


Not all emojis have been translated (row 7 above still contains 😳), so we finish the job by removing all non-alphabetic symbols.

### Removing Non-Alphabetic Symbols

In [9]:
def filter_text_noise(text):
    """Replace every non-alphabetic character of ``text`` with a space.

    A character is kept when ``str.isalpha()`` is true for it; anything else
    (digits, punctuation, whitespace, emojis, ...) becomes a single space.
    The result has the same length as the input.

    The string is built in one pass rather than calling ``str.replace`` once
    per offending character, which rescanned the whole text every time.
    """
    return "".join(char if char.isalpha() else " " for char in text)

In [10]:
# Blank out everything that is not a letter, then preview the result.
corpus["cleaned_text"] = corpus["cleaned_text"].apply(filter_text_noise)
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,Here are the timestamps Please che...
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,Protein research is a major new brea...
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,This is a good one
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,A very interesting conversation unti...
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,This chick is always in Bilderberg g...
5,WakeUpnThinkClearly,2023-11-17 18:51:44+00:00,2023-11-17 18:51:44+00:00,0,Now I feel really infinitely ignoran...,Now I feel really infinitely ignoran...
6,Mike Huff,2023-11-17 01:54:59+00:00,2023-11-17 01:56:52+00:00,0,I kept checking the time in hopes th...,I kept checking the time in hopes th...
7,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,The Plants are Fake
8,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,Responding to your pushback on the p...
9,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",Happy face smiley brutal...


### Removing Capitalization

In [11]:
# Lowercase the cleaned text using the vectorized pandas string accessor.
corpus["cleaned_text"] = corpus.cleaned_text.str.lower()

In [12]:
# Preview the first rows to check the lowercased text.
corpus.head(15)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,protein research is a major new brea...
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,this is a good one
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,a very interesting conversation unti...
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,this chick is always in bilderberg g...
5,WakeUpnThinkClearly,2023-11-17 18:51:44+00:00,2023-11-17 18:51:44+00:00,0,Now I feel really infinitely ignoran...,now i feel really infinitely ignoran...
6,Mike Huff,2023-11-17 01:54:59+00:00,2023-11-17 01:56:52+00:00,0,I kept checking the time in hopes th...,i kept checking the time in hopes th...
7,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,the plants are fake
8,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,responding to your pushback on the p...
9,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",happy face smiley brutal...


### Filtering Stopwords

In [13]:
# Build the English stop-word set once; it is reused by every call below.
stop_words = set(stopwords.words("english"))

def filter_stopwords(text):
    """Remove English stop words from ``text``.

    The text is split with ``word_tokenize``; tokens that appear in the
    module-level ``stop_words`` set are dropped and the remaining tokens are
    joined with single spaces. Matching is exact, so the text is expected to
    be lowercased already.

    The stop-word set is deliberately not rebuilt inside the function, since
    reloading it for every row is wasted work.
    """
    return " ".join(
        token for token in word_tokenize(text) if token not in stop_words
    )

In [14]:
# Store the stop-word-free text in its own column, then preview the result.
corpus["filtered_text"] = corpus["cleaned_text"].apply(filter_stopwords)
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text,filtered_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...,timestamps please check sponsors sup...
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,protein research is a major new brea...,protein research major new breakthrough
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,this is a good one,good one
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,a very interesting conversation unti...,interesting conversation end answer ...
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,this chick is always in bilderberg g...,chick always bilderberg group meetin...
5,WakeUpnThinkClearly,2023-11-17 18:51:44+00:00,2023-11-17 18:51:44+00:00,0,Now I feel really infinitely ignoran...,now i feel really infinitely ignoran...,feel really infinitely ignorant hope...
6,Mike Huff,2023-11-17 01:54:59+00:00,2023-11-17 01:56:52+00:00,0,I kept checking the time in hopes th...,i kept checking the time in hopes th...,kept checking time hopes entered sta...
7,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,the plants are fake,plants fake
8,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,responding to your pushback on the p...,responding pushback pushback circa i...
9,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",happy face smiley brutal...,happy face smiley brutal facts plane...


### Lemmatizing Comments

In [15]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every token of ``text``.

    The text is split with ``word_tokenize``, each token is passed through
    the shared ``lemmatizer`` with its default settings, and the lemmas are
    joined back together with single spaces.
    """
    lemmas = []
    for word in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

In [16]:
# Lemmatize the stop-word-free text into its own column, then preview it.
corpus["lemmatized_text"] = corpus["filtered_text"].apply(lemmatize_text)
corpus.head(7)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text,filtered_text,lemmatized_text
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...,timestamps please check sponsors sup...,timestamps please check sponsor supp...
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,protein research is a major new brea...,protein research major new breakthrough,protein research major new breakthrough
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,this is a good one,good one,good one
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,a very interesting conversation unti...,interesting conversation end answer ...,interesting conversation end answer ...
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,this chick is always in bilderberg g...,chick always bilderberg group meetin...,chick always bilderberg group meetin...
5,WakeUpnThinkClearly,2023-11-17 18:51:44+00:00,2023-11-17 18:51:44+00:00,0,Now I feel really infinitely ignoran...,now i feel really infinitely ignoran...,feel really infinitely ignorant hope...,feel really infinitely ignorant hope...
6,Mike Huff,2023-11-17 01:54:59+00:00,2023-11-17 01:56:52+00:00,0,I kept checking the time in hopes th...,i kept checking the time in hopes th...,kept checking time hopes entered sta...,kept checking time hope entered stat...


Now we have a cleaned text corpus. Let's save it to the `TRANSFORMED_DATA_DIR` directory:

In [17]:
# Persist the corpus, including the cleaned / filtered / lemmatized columns
# added above.
corpus.to_pickle(TRANSFORMED_DATA_DIR / "processed_corpus.pkl")