# Constructing Additional Features

In [1]:
# Helper libraries
import warnings

# Scientific and visual libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from emot import UNICODE_EMOJI, EMOTICONS_EMO
from nltk.tokenize import word_tokenize

# Local Modules
from youtube_analysis.paths import TRANSFORMED_DATA_DIR

# Reload imported modules before each cell runs so edits to local modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Various settings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings from pandas/nltk — confirm that is intended.
warnings.filterwarnings("ignore")
# pandas display options: row cap, column text width, float precision, and
# no cap on the number of displayed columns.
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_colwidth", 40)
pd.set_option("display.precision", 4)
pd.set_option("display.max_columns", None)

In text analysis, it’s really easy to extract new features from unstructured data. We could inspect any characteristic to do that: does the comment contain foreign words, does the user write short comments, did the viewer update their comment? For now we will only create two simple features that flag people who use emoticons and emojis in their comments.

The code for that looks like this:

In [2]:
def has_emojis(text, emojis=None):
    """Return 1 if ``text`` contains at least one known emoji, else 0.

    Matching is done by substring on the raw text rather than on tokens:
    word tokenizers generally keep an emoji attached to its neighbours
    (``"good😀"``, ``"😂😂😂"``) as a single token, so a token-membership
    test silently misses those comments.

    Parameters
    ----------
    text : str
        The comment to inspect. Any non-string value (``None``, ``NaN``
        from a missing cell, ...) is treated as containing no emoji.
    emojis : iterable of str, optional
        Emoji strings to look for. Defaults to the keys of
        ``UNICODE_EMOJI``.

    Returns
    -------
    int
        ``1`` when an emoji is found, ``0`` otherwise.
    """
    # Missing values in a pandas text column arrive as float NaN / None.
    if not isinstance(text, str):
        return 0
    if emojis is None:
        emojis = UNICODE_EMOJI
    return int(any(emo in text for emo in emojis))


def has_emoticons(text, emoticons=None):
    """Return 1 if ``text`` contains at least one known emoticon, else 0.

    An emoticon counts as present when it occurs anywhere in ``text`` as a
    substring.

    Parameters
    ----------
    text : str
        The comment to inspect. Any non-string value (``None``, ``NaN``
        from a missing cell, ...) is treated as containing no emoticon.
    emoticons : iterable of str, optional
        Emoticon strings to look for. Defaults to the keys of
        ``EMOTICONS_EMO``.

    Returns
    -------
    int
        ``1`` when an emoticon is found, ``0`` otherwise.
    """
    # Missing values in a pandas text column arrive as float NaN / None.
    if not isinstance(text, str):
        return 0
    if emoticons is None:
        emoticons = EMOTICONS_EMO
    # NOTE(review): plain substring matching may also fire on non-emoticon
    # text (e.g. a ":/" inside "http://", if that emoticon is in the
    # lookup) — confirm whether such hits should be filtered out.
    return int(any(ticon in text for ticon in emoticons))

We apply these functions to the original text data:

In [3]:
# Read the sentiment corpus from the transformed-data directory.
# NOTE(review): unpickling can execute arbitrary code; only load files
# generated by this project.
sentiment_corpus_path = TRANSFORMED_DATA_DIR / "sentiment_corpus.pkl"
corpus = pd.read_pickle(sentiment_corpus_path)

In [4]:
# Add one 0/1 indicator column per feature, computed from the raw comment text.
corpus["has_emojis"] = corpus["text"].apply(has_emojis)
corpus["has_emoticons"] = corpus["text"].apply(has_emoticons)

In [5]:
# Preview the first rows, including the has_emojis / has_emoticons columns.
corpus.head()

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text,filtered_text,lemmatized_text,sent_class,sent_score,has_emojis,has_emoticons
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...,timestamps please check sponsors sup...,timestamps please check sponsor supp...,positive,0.9922,0,1
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,protein research is a major new brea...,protein research major new breakthrough,protein research major new breakthrough,neutral,0.0,0,0
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,this is a good one,good one,good one,positive,0.4404,0,0
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,a very interesting conversation unti...,interesting conversation end answer ...,interesting conversation end answer ...,negative,-0.896,0,0
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,this chick is always in bilderberg g...,chick always bilderberg group meetin...,chick always bilderberg group meetin...,neutral,0.0,0,0


We will save it into a new file:

In [6]:
# Write the corpus, now carrying the extra feature columns, to a separate
# pickle file in the transformed-data directory.
improved_corpus_path = TRANSFORMED_DATA_DIR / "improved_sentiment_corpus.pkl"
corpus.to_pickle(improved_corpus_path)