# Comments Sentiment Analysis

In [1]:
# Helper libraries
import warnings

# Scientific and visual libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Local Modules
from youtube_analysis.paths import TRANSFORMED_DATA_DIR
from youtube_analysis.cleanutils import translate_emojis

%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Notebook-wide settings
warnings.filterwarnings("ignore")  # keep warning noise out of the cell outputs

# pandas display options, set through the attribute-style options API
pd.options.display.max_rows = 120
pd.options.display.max_colwidth = 40
pd.options.display.precision = 4
pd.options.display.max_columns = None  # None: show every column

We will use an unsupervised method to extract sentiment, because labeling all comments by hand would be time-consuming and challenging. For that, we can use a lexicon- and rule-based method: VADER, which is included in the NLTK library. The results may not be as accurate as we would like, since VADER's valence dictionary is not context-specific, but it is usually good enough for most social media content.

Let's load our preprocessed corpus:

In [2]:
# NOTE(review): unpickling can execute arbitrary code, so only load a file that
# this project's own pipeline produced; never point this at an untrusted pickle.
corpus = pd.read_pickle(TRANSFORMED_DATA_DIR / "processed_corpus.pkl")

### Unsupervised Sentiment Extraction

Let’s create two functions: one for categorizing, and one for scoring.

In [3]:
# Single shared analyzer used by get_sent_label and get_sent_score below.
# NOTE(review): presumably needs the NLTK "vader_lexicon" resource to be
# available (nltk.download("vader_lexicon")) — confirm on a fresh environment.
sent_analyzer = SentimentIntensityAnalyzer()


def get_sent_label(text="", score=None, *, pos_threshold=0.4, neg_threshold=-0.1):
    """Map a text, or a precomputed VADER compound score, to a sentiment label.

    Parameters
    ----------
    text : str, optional
        Text to score with the module-level ``sent_analyzer``. Only used when
        ``score`` is None.
    score : float, optional
        Precomputed compound score. When given (including ``0.0``), ``text``
        is ignored and the analyzer is not called.
    pos_threshold : float, default 0.4
        Scores strictly above this value are labelled "positive".
    neg_threshold : float, default -0.1
        Scores at or below this value are labelled "negative".

    Returns
    -------
    str
        "positive", "neutral" or "negative".
    """
    # Test against None explicitly: a compound score of exactly 0.0 is a valid
    # value, and a truthiness check would discard it and re-score `text`.
    if score is None:
        score = sent_analyzer.polarity_scores(text)["compound"]
    if score > pos_threshold:
        return "positive"
    # Reaching here means the score is not above pos_threshold, so only the
    # lower bound of the neutral band is left to check.
    if score > neg_threshold:
        return "neutral"
    return "negative"


def get_sent_score(text):
    """Return the VADER compound polarity score for ``text``.

    The text is scored with the module-level ``sent_analyzer`` and only the
    "compound" entry of the resulting scores is kept.
    """
    polarity = sent_analyzer.polarity_scores(text)
    return polarity["compound"]

Now we can apply the functions to the corpus:

In [4]:
# Score each comment once and derive the label from that score, instead of
# running VADER twice per comment (once for the label, once for the score).
sent_scores = corpus.cleaned_text.apply(get_sent_score)

# Assign the label first so the column order stays sent_class, sent_score.
corpus["sent_class"] = sent_scores.apply(lambda s: get_sent_label(score=s))
corpus["sent_score"] = sent_scores

In [5]:
# Preview the first 10 rows, now including the sent_class and sent_score columns.
corpus.head(10)

Unnamed: 0,author,published_at,updated_at,likes,text,cleaned_text,filtered_text,lemmatized_text,sent_class,sent_score
0,Lex Fridman,2022-12-29 17:34:04+00:00,2022-12-29 17:34:04+00:00,194,Here are the timestamps. Please chec...,here are the timestamps please che...,timestamps please check sponsors sup...,timestamps please check sponsor supp...,positive,0.9922
1,John Dickinson,2023-11-19 10:59:46+00:00,2023-11-19 10:59:46+00:00,0,Protein research is a major new brea...,protein research is a major new brea...,protein research major new breakthrough,protein research major new breakthrough,neutral,0.0
2,John Dickinson,2023-11-19 10:50:38+00:00,2023-11-19 10:50:38+00:00,0,This is a good one.,this is a good one,good one,good one,positive,0.4404
3,john g henderson,2023-11-18 03:47:08+00:00,2023-11-18 03:49:31+00:00,0,A very interesting conversation unti...,a very interesting conversation unti...,interesting conversation end answer ...,interesting conversation end answer ...,negative,-0.896
4,arife dickerson,2023-11-17 20:49:24+00:00,2023-11-17 20:49:24+00:00,0,This chick is always in Bilderberg g...,this chick is always in bilderberg g...,chick always bilderberg group meetin...,chick always bilderberg group meetin...,neutral,0.0
5,WakeUpnThinkClearly,2023-11-17 18:51:44+00:00,2023-11-17 18:51:44+00:00,0,Now I feel really infinitely ignoran...,now i feel really infinitely ignoran...,feel really infinitely ignorant hope...,feel really infinitely ignorant hope...,negative,-0.3976
6,Mike Huff,2023-11-17 01:54:59+00:00,2023-11-17 01:56:52+00:00,0,I kept checking the time in hopes th...,i kept checking the time in hopes th...,kept checking time hopes entered sta...,kept checking time hope entered stat...,negative,-0.4767
7,Steve C,2023-11-10 22:16:58+00:00,2023-11-10 22:16:58+00:00,0,😳The Plants are Fake…?,the plants are fake,plants fake,plant fake,negative,-0.4767
8,Brigid Mary Prain,2023-11-09 09:51:33+00:00,2023-11-09 09:51:33+00:00,0,Responding to your pushback on the p...,responding to your pushback on the p...,responding pushback pushback circa i...,responding pushback pushback circa i...,positive,0.5859
9,Jack Reacher,2023-11-04 09:30:38+00:00,2023-11-04 09:30:38+00:00,0,"<a href=""https://www.youtube.com/wat...",happy face smiley brutal...,happy face smiley brutal facts plane...,happy face smiley brutal fact planet...,positive,0.6705


Contrary to what is sometimes said about VADER, the NLTK implementation used here cannot classify emojis directly, so we first translate them into words:

In [6]:
# A raw emoji character on its own ends up in the "neutral" band.
get_sent_label("😍")

'neutral'

In [7]:
# The same emoji, passed through translate_emojis first, is labelled "positive".
get_sent_label(translate_emojis("😍"))

'positive'

Now we can save this new dataset into a different file:

In [8]:
# Persist the corpus, including the two sentiment columns added above, in the
# same directory the processed corpus was loaded from.
corpus.to_pickle(TRANSFORMED_DATA_DIR / "sentiment_corpus.pkl")