In [1]:
!pip install spacy gensim pyLDAvis
!python -m spacy download en_core_web_sm
!pip install transformers torch

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 393.8 kB/s eta 0:00:33
     --------------------------------------- 0.1/12.8 MB 435.7 kB/s eta 0:00:30
     --------------------------------------- 0.1/12.8 MB 708.1 kB/s eta 0:00:18
      --------------------------------------- 0.2/12.8 MB 1.0 MB/s eta 0:00:13
     - -------------------------------------- 0.4/12.8 MB 1.4 MB/s eta 0:00:09
     - -------------------------------------- 0.6/12.8 MB 1.7 MB/s eta 0:00:07
     -- ------------------------------------- 0.9/12.8 MB 2.4 MB/s eta 0:00:05
     --- ------------------------------------ 1.2/12.8 MB 3.0 MB/s eta 0:00:04
     ---- -----------------------------------

In [2]:
pip install --upgrade typing_extensions

Note: you may need to restart the kernel to use updated packages.


In [55]:
#import all tools needed
import pandas as pd #work w/tables
import re #clean text 
import nltk #natural language processing 
import spacy #nlp to process text
import gensim #topic modeling library (LDA)
import pyLDAvis #visalize topic models
import pyLDAvis.gensim_models #^
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax #model scores to readable probabilities
from gensim import corpora #create word-to-id dict
from sklearn.feature_extraction.text import TfidfVectorizer #handle stop words
from nltk.corpus import stopwords #common word list
from nltk.stem import WordNetLemmatizer #reduce words to base
from collections import defaultdict, Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
nltk.download('stopwords')  #download the list of common stopwords
nltk.download('wordnet')  #download the tool that helps us simplify words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thety\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thety\AppData\Roaming\nltk_data...


True

## Three of the most popular posts (most commented) regarding reasons for quitting

In [30]:
# blind comments from post 1
# https://www.teamblind.com/post/Why-did-you-quit-your-last-job-8WJinT7p

df = pd.read_csv("Blind_Comments_Post_1.csv")
# Dropping URL column automatically created
df.drop(columns=['web-scraper-order', 'web-scraper-start-url'], inplace = True)
df.head()

Unnamed: 0,Date,Company,Comment,Likes
0,"Nov 12, 2019",Amazon,Bad manager. Bad manager. Bad manager.\n\nWith...,67.0
1,"Nov 12, 2019",Facebook,"You don't quit the company, you quit your mana...",41.0
2,"Nov 12, 2019",Google,For 61% increase in compensation and 21% incre...,37.0
3,"Nov 12, 2019",Airbnb,"I was getting extremely stressed out, and soug...",33.0
4,"Nov 12, 2019",Microsoft,Bad manager - didn't believe one manager could...,35.0


In [7]:
# blind comments from post 2
# https://www.teamblind.com/post/Why-did-you-quit-your-last-job-oxLGsaPt

df2 = pd.read_csv("Blind_Comments_Post_2.csv")
# Dropping URL column automatically created
df2.drop(columns=['web-scraper-order', 'web-scraper-start-url'], inplace = True)
df2.head()

Unnamed: 0,Date,Company,Comment,Likes
0,"May 13, 2018",Amazon,Lowest caliber of people and ceiling of ambiti...,8.0
1,"May 13, 2018",Oath,I didn’t think I need a reason to quit Oath,13.0
2,"May 13, 2018",Facebook,I quit Google shortly after I got my green car...,9.0
3,"May 13, 2018",Facebook,Quit previous role because of the enticement o...,6.0
4,"May 13, 2018",Snapchat,Burn out,5.0


In [8]:
# blind comments from post 3
# https://www.teamblind.com/post/why-do-you-quit-ee80oepK

df3 = pd.read_csv("Blind_Comments_Post_3.csv")
# Dropping URL column automatically created
df3.drop(columns=['web-scraper-order', 'web-scraper-start-url'], inplace = True)
df3.head()

Unnamed: 0,Date,Company,Comment,Likes
0,"Feb 19, 2020",Target,Manager said my work didnt warrant a promotion...,115.0
1,"Feb 19, 2020",Ultimate Software,"Can we add ""bad manager"" to the list?\n\nMy TC...",76.0
2,"Feb 19, 2020",Zymergen,I quit my last company and joined Zymergen bec...,24.0
3,"Feb 19, 2020",Amazon,I grew up in a poor family; where I couldn't f...,25.0
4,"Feb 19, 2020",LinkedIn,I know a lot of people are not switching becau...,31.0


### Combining data from comment threads above

In [19]:
full_blind_quit_comments = pd.concat([df, df2, df3], ignore_index=True)
full_blind_quit_comments

Unnamed: 0,Date,Company,Comment,Likes
0,"Nov 12, 2019",Amazon,Bad manager. Bad manager. Bad manager.\n\nWith...,67.0
1,"Nov 12, 2019",Facebook,"You don't quit the company, you quit your mana...",41.0
2,"Nov 12, 2019",Google,For 61% increase in compensation and 21% incre...,37.0
3,"Nov 12, 2019",Airbnb,"I was getting extremely stressed out, and soug...",33.0
4,"Nov 12, 2019",Microsoft,Bad manager - didn't believe one manager could...,35.0
...,...,...,...,...
423,"Feb 19, 2020",Deloitte,Mo’ TC!,
424,"Feb 19, 2020",,TC is lagging due to management thinking they ...,
425,"Feb 19, 2020",Lyft,A clean start?,
426,"Feb 19, 2020",Credit Karma,Boredom,


In [11]:
full_blind_quit_comments.to_csv('full_negative_comments.csv', index=False)

In [25]:
# preprocessing

full_blind_quit_comments['Likes'] = full_blind_quit_comments['Likes'].fillna(0)  # replace NaN likes with 0

#lowercase, remove punctuation/numbers
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)            #keep letters and spaces
    return text

def tokenize_no_stopword_removal(text):
    words = text.split()
    #keep stopwords for bigrams
    return ' '.join([word for word in words if len(word) > 2])

#apply both steps
full_blind_quit_comments['cleaned_text'] = full_blind_quit_comments['Comment'].apply(clean_text)
full_blind_quit_comments['processed_text'] = full_blind_quit_comments['cleaned_text'].apply(tokenize_no_stopword_removal)

## Exploratory Data Analysis - Using steps from Adriana's Reddit Analysis

In [23]:
#set up TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    stop_words="english",     #remove common boring words
    ngram_range=(2,2),       #include bigrams
)

#fit TF-IDF to cleaned Blind comments
tfidf_matrix = vectorizer.fit_transform(full_blind_quit_comments['cleaned_text'])

#convert to df
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

#get top 30 average TF-IDF terms
top_scores = tfidf_df.mean().sort_values(ascending=False).head(30).reset_index()
top_scores.columns = ['term', 'average_tfidf']

print("\nTop 30 TF-IDF Terms:\n")
print(top_scores.to_string(index=False))


Top 30 TF-IDF Terms:

                term  average_tfidf
         bad manager       0.027875
      bad management       0.010535
            bad boss       0.007725
         increase tc       0.006854
         tc increase       0.006765
           got fired       0.006201
        shit manager       0.004992
         manager bad       0.004623
           make poll       0.004474
              low tc       0.004194
        pay increase       0.003913
           got bored       0.003829
growth opportunities       0.003609
    previous company       0.003599
         current job       0.003470
        job security       0.003327
       quit previous       0.003277
     poor leadership       0.003196
           tc growth       0.003178
        bad managers       0.003007
      shitty manager       0.002937
         wanted work       0.002922
     poor management       0.002887
      direct manager       0.002878
      leave managers       0.002877
  growth opportunity       0.002647
     

In [40]:
import spacy
nlp = spacy.load("en_core_web_sm")

def clean_for_lda(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc 
            if token.pos_ in ["NOUN", "ADJ", "VERB"]
            and not token.is_stop
            and len(token) > 2]

#apply to cleaned txt
full_blind_quit_comments["lda_tokens"] = full_blind_quit_comments["cleaned_text"].apply(clean_for_lda)
full_blind_quit_comments["lda_tokens"].head()

#create dict that assigns an ID to every unique word (used by LDA)
dictionary = corpora.Dictionary(full_blind_quit_comments["lda_tokens"])

#filter out rare words (appear in <10 docs) and too common words (>50% of docs)
dictionary.filter_extremes(no_below=10, no_above=0.50)

#converts each doc into bag-of-words format (list of (word_id, count) pairs)
#input format LDA needs
corpus = [dictionary.doc2bow(text) for text in full_blind_quit_comments["lda_tokens"]]

#train LDA model
lda_model = gensim.models.LdaModel(
    corpus=corpus,             #bag-of-words data
    id2word=dictionary,        #word-to-ID mapping
    num_topics=6,              #how many topics to find
    random_state=15,           #for reproducibility 
    passes=10,                 #number of training passes (more = better, slower)
    alpha='auto'               #automatically tune topic distribution sparsity
)

#view discovered topics 
for i, topic in lda_model.show_topics(num_words=10, formatted=True):
    print(f"\n Topic {i+1}: {topic}")


 Topic 1: 0.061*"culture" + 0.059*"leadership" + 0.053*"people" + 0.052*"care" + 0.048*"toxic" + 0.047*"company" + 0.044*"know" + 0.042*"good" + 0.037*"team" + 0.036*"promotion"

 Topic 2: 0.182*"growth" + 0.120*"opportunity" + 0.110*"low" + 0.060*"lack" + 0.057*"compensation" + 0.048*"manager" + 0.046*"good" + 0.043*"go" + 0.033*"career" + 0.029*"culture"

 Topic 3: 0.098*"work" + 0.089*"company" + 0.051*"get" + 0.049*"manager" + 0.046*"quit" + 0.036*"new" + 0.035*"time" + 0.034*"job" + 0.033*"year" + 0.027*"increase"

 Topic 4: 0.113*"management" + 0.077*"bad" + 0.075*"boss" + 0.067*"work" + 0.062*"want" + 0.042*"big" + 0.041*"well" + 0.040*"product" + 0.037*"toxic" + 0.035*"wlb"

 Topic 5: 0.221*"job" + 0.069*"money" + 0.058*"leave" + 0.051*"current" + 0.046*"salary" + 0.042*"role" + 0.035*"high" + 0.034*"new" + 0.030*"pay" + 0.029*"culture"

 Topic 6: 0.357*"manager" + 0.186*"bad" + 0.075*"leave" + 0.038*"company" + 0.037*"team" + 0.034*"people" + 0.023*"shit" + 0.022*"change" + 0

In [47]:
# pyLDAvis.enable_notebook()

# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# pyLDAvis.display(vis)

In [67]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

vader = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\thety\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [73]:
#apply VADER sentiment to each cleaned comment 
full_blind_quit_comments["sentiment_score"] = full_blind_quit_comments["cleaned_text"].apply(lambda text: vader.polarity_scores(text)["compound"])

#preview df
sentiment_preview = full_blind_quit_comments[["Comment", "sentiment_score"]].copy()

#round for neatness
sentiment_preview["sentiment_score"] = sentiment_preview["sentiment_score"].round(3)

#sort by most negative sentiment
sentiment_preview = sentiment_preview.sort_values(by="sentiment_score").reset_index(drop=True)

def label_sentiment(score):
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"

full_blind_quit_comments["vader_sentiment"] = full_blind_quit_comments["sentiment_score"].apply(label_sentiment)

#display table w/sentiment score & whether it's pos, neut, or neg.
sentiment_table = full_blind_quit_comments[["Comment", "sentiment_score", "vader_sentiment"]].copy()
sentiment_table["sentiment_score"] = sentiment_table["sentiment_score"].round(3)
sentiment_table = sentiment_table.sort_values(by="sentiment_score").reset_index(drop=True)
display(sentiment_table.head(20))

Unnamed: 0,Comment,sentiment_score,vader_sentiment
0,Don't know where to start:\n- Worst CTO\n- 0 T...,-0.958,Negative
1,Bad boss. He was demeaning and would yell at m...,-0.949,Negative
2,"Culture changing, wanting to learn new things ...",-0.934,Negative
3,- Bad manager (VP of Product)\n- mediocre Java...,-0.926,Negative
4,I’m heavily considering leaving.\n\nMy manager...,-0.922,Negative
5,"Because things were an absolute mess, the proj...",-0.91,Negative
6,Bad manager. Bad manager. Bad manager.\n\nWith...,-0.898,Negative
7,Imagine working for a CEO and board who openly...,-0.898,Negative
8,Microsoft got rid of my team. I got an officia...,-0.896,Negative
9,Left Oracle because:\n- terrible manager\n- no...,-0.896,Negative


In [75]:
#count the # of comments in each sentiment category
sentiment_counts = full_blind_quit_comments["vader_sentiment"].value_counts().reset_index()
sentiment_counts.columns = ["Sentiment", "Count"]

#calculate percentage
total_comments = sentiment_counts["Count"].sum()
sentiment_counts["Percent"] = (sentiment_counts["Count"] / total_comments * 100).round(1)

#display
sentiment_counts = sentiment_counts.sort_values(by="Sentiment")  # Optional alphabetical sort
display(sentiment_counts)

Unnamed: 0,Sentiment,Count,Percent
0,Negative,205,47.9
2,Neutral,80,18.7
1,Positive,143,33.4


### Exact same issue as Adriana with the Reddit Comments (just wanted to double check) - Seems BERT is the optimal route

In [86]:
#run BERT Sentiment on comments
#load the model & tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

#function to get sentiment label
def get_bert_sentiment(text):
    #512 tokens max or it breaks
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = softmax(logits.numpy()[0])
    
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[probs.argmax()]


#apply to df
full_blind_quit_comments["bert_sentiment"] = full_blind_quit_comments["Comment"].apply(get_bert_sentiment)

In [87]:
# Results

full_blind_quit_comments[["Comment", "bert_sentiment"]].head(10)

Unnamed: 0,Comment,bert_sentiment
0,Bad manager. Bad manager. Bad manager.\n\nWith...,Negative
1,"You don't quit the company, you quit your mana...",Negative
2,For 61% increase in compensation and 21% incre...,Neutral
3,"I was getting extremely stressed out, and soug...",Neutral
4,Bad manager - didn't believe one manager could...,Negative
5,1. Bad manager\n2. Bad manager\n3. Bad manager...,Negative
6,Who blocks stack overflow?,Neutral
7,"In Soviet Russia, job quits you.",Negative
8,Toxicity and back stabbing.,Negative
9,Bro culture and a douche of a CEO. Sued his as...,Negative


In [90]:
bert_counts = full_blind_quit_comments["bert_sentiment"].value_counts().reset_index()
bert_counts.columns = ["Sentiment", "Count"]
bert_counts["Percent"] = (bert_counts["Count"] / bert_counts["Count"].sum() * 100).round(1)

display(bert_counts)

Unnamed: 0,Sentiment,Count,Percent
0,Negative,256,59.8
1,Neutral,127,29.7
2,Positive,45,10.5


In [94]:
#get counts
vader_counts = full_blind_quit_comments["vader_sentiment"].value_counts().sort_index()
bert_counts = full_blind_quit_comments["bert_sentiment"].value_counts().sort_index()

#get total for each model
total_vader = vader_counts.sum()
total_bert = bert_counts.sum()

#combine into one df
sentiment_totals = pd.DataFrame({
    "VADER Count": vader_counts,
    "BERT Count": bert_counts
}).fillna(0).astype(int)

#add % columns
sentiment_totals["VADER %"] = (sentiment_totals["VADER Count"] / total_vader * 100).round(1)
sentiment_totals["BERT %"] = (sentiment_totals["BERT Count"] / total_bert * 100).round(1)

#reset index so Sentiment is a column
sentiment_totals = sentiment_totals.reset_index().rename(columns={"index": "Sentiment"})

#display!
display(sentiment_totals)

Unnamed: 0,Sentiment,VADER Count,BERT Count,VADER %,BERT %
0,Negative,205,256,47.9,59.8
1,Neutral,80,127,18.7,29.7
2,Positive,143,45,33.4,10.5


## Conversely, comments left for why people have not left their current roles

In [13]:
# blind comments from post 4 -- Specifically asking why have people NOT quit their current jobs
# https://www.teamblind.com/post/Why-havent-you-quit-tLFL18Ch
df_positive_comments = pd.read_csv("Blind_Post_4_Positive_Comments.csv")
# Dropping URL column automatically created
df_positive_comments.drop(columns=['web-scraper-order', 'web-scraper-start-url'], inplace = True)
df_positive_comments.head()

Unnamed: 0,Date,Company,Comment,Likes
0,"Dec 13, 2021",,Not enough time to practice Leetcode and syste...,341.0
1,"Dec 13, 2021",Google,Not enough time to practice all these things b...,90.0
2,"Dec 13, 2021",Amazon,I got a big pay bump from going from NCG salar...,51.0
3,"Dec 14, 2021",Better.com,My incredible CEO and work culture 😃 😉 😜 😱 😢,119.0
4,"Dec 13, 2021",EMC,"I like the product I'm working on, decent wlb,...",87.0


Clearly there will be sarcasm in these comments, that will hopefully be identified with BERT analysis.

In [96]:
# preprocessing

df_positive_comments['Likes'] = df_positive_comments['Likes'].fillna(0)  # replace NaN likes with 0

#lowercase, remove punctuation/numbers
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)            #keep letters and spaces
    return text

def tokenize_no_stopword_removal(text):
    words = text.split()
    #keep stopwords for bigrams
    return ' '.join([word for word in words if len(word) > 2])

#apply both steps
df_positive_comments['cleaned_text'] = df_positive_comments['Comment'].apply(clean_text)
df_positive_comments['processed_text'] = df_positive_comments['cleaned_text'].apply(tokenize_no_stopword_removal)

In [98]:
#set up TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    stop_words="english",     #remove common boring words
    ngram_range=(2,2),       #include bigrams
)

#fit TF-IDF to cleaned Blind comments
tfidf_matrix = vectorizer.fit_transform(df_positive_comments['cleaned_text'])

#convert to df
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

#get top 30 average TF-IDF terms
top_scores = tfidf_df.mean().sort_values(ascending=False).head(30).reset_index()
top_scores.columns = ['term', 'average_tfidf']

print("\nTop 30 TF-IDF Terms:\n")
print(top_scores.to_string(index=False))


Top 30 TF-IDF Terms:

                 term  average_tfidf
          visa issues       0.019608
           green card       0.011575
        time leetcode       0.010803
   stock appreciation       0.008373
         jumping ship       0.007756
        time practice       0.006652
avoiding homelessness       0.006536
    waiting liquidity       0.006536
        stock vesting       0.006536
      prepared taking       0.006536
          jumped june       0.006536
   stockholm syndrome       0.006536
            like team       0.006536
    immigration stuff       0.006536
    offering nowadays       0.006536
   immigration issues       0.006536
        good question       0.006536
          waiting ipo       0.006536
             got jump       0.006536
    waiting greencard       0.006536
      company talking       0.006536
    visa restrictions       0.006536
           time study       0.006536
       portfolio isnt       0.006536
  potential promotion       0.006536
             ju

In [100]:
def clean_for_lda(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc 
            if token.pos_ in ["NOUN", "ADJ", "VERB"]
            and not token.is_stop
            and len(token) > 2]

#apply to cleaned txt
df_positive_comments["lda_tokens"] = df_positive_comments["cleaned_text"].apply(clean_for_lda)
df_positive_comments["lda_tokens"].head()

#create dict that assigns an ID to every unique word (used by LDA)
dictionary = corpora.Dictionary(df_positive_comments["lda_tokens"])

#filter out rare words (appear in <10 docs) and too common words (>50% of docs)
dictionary.filter_extremes(no_below=10, no_above=0.50)

#converts each doc into bag-of-words format (list of (word_id, count) pairs)
#input format LDA needs
corpus = [dictionary.doc2bow(text) for text in df_positive_comments["lda_tokens"]]

#train LDA model
lda_model = gensim.models.LdaModel(
    corpus=corpus,             #bag-of-words data
    id2word=dictionary,        #word-to-ID mapping
    num_topics=6,              #how many topics to find
    random_state=15,           #for reproducibility 
    passes=10,                 #number of training passes (more = better, slower)
    alpha='auto'               #automatically tune topic distribution sparsity
)

#view discovered topics 
for i, topic in lda_model.show_topics(num_words=10, formatted=True):
    print(f"\n Topic {i+1}: {topic}")


 Topic 1: 0.692*"work" + 0.070*"want" + 0.050*"pay" + 0.047*"company" + 0.038*"stay" + 0.032*"year" + 0.024*"time" + 0.015*"wait" + 0.007*"jump" + 0.006*"get"

 Topic 2: 0.240*"time" + 0.225*"interview" + 0.180*"stock" + 0.163*"leetcode" + 0.100*"want" + 0.044*"pay" + 0.022*"job" + 0.005*"work" + 0.005*"company" + 0.004*"stay"

 Topic 3: 0.389*"jump" + 0.362*"wait" + 0.172*"pay" + 0.029*"time" + 0.007*"get" + 0.006*"job" + 0.005*"leetcode" + 0.004*"interview" + 0.004*"year" + 0.004*"company"

 Topic 4: 0.328*"stay" + 0.138*"year" + 0.108*"jump" + 0.108*"company" + 0.079*"job" + 0.074*"leetcode" + 0.071*"work" + 0.046*"interview" + 0.018*"time" + 0.006*"get"

 Topic 5: 0.438*"year" + 0.199*"company" + 0.125*"stock" + 0.104*"job" + 0.066*"pay" + 0.030*"time" + 0.009*"work" + 0.006*"jump" + 0.005*"get" + 0.004*"want"

 Topic 6: 0.495*"get" + 0.120*"time" + 0.113*"want" + 0.110*"job" + 0.086*"year" + 0.023*"stay" + 0.008*"stock" + 0.008*"pay" + 0.006*"jump" + 0.006*"leetcode"


In [102]:
# pyLDAvis.enable_notebook()

# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
# pyLDAvis.display(vis)

In [110]:
#apply VADER sentiment to each cleaned comment 
df_positive_comments["sentiment_score"] = df_positive_comments["cleaned_text"].apply(lambda text: vader.polarity_scores(text)["compound"])

#preview df
sentiment_preview = df_positive_comments[["Comment", "sentiment_score"]].copy()

#round for neatness
sentiment_preview["sentiment_score"] = sentiment_preview["sentiment_score"].round(3)

#sort by most negative sentiment
sentiment_preview = sentiment_preview.sort_values(by="sentiment_score", ascending=False).reset_index(drop=True)

def label_sentiment(score):
    if score >= 0.05:
        return "Positive"
    elif score <= -0.05:
        return "Negative"
    else:
        return "Neutral"

df_positive_comments["vader_sentiment"] = df_positive_comments["sentiment_score"].apply(label_sentiment)

#display table w/sentiment score & whether it's pos, neut, or neg.
sentiment_table = df_positive_comments[["Comment", "sentiment_score", "vader_sentiment"]].copy()
sentiment_table["sentiment_score"] = sentiment_table["sentiment_score"].round(3)
sentiment_table = sentiment_table.sort_values(by="sentiment_score", ascending=False).reset_index(drop=True)
display(sentiment_table.head(20))

Unnamed: 0,Comment,sentiment_score,vader_sentiment
0,I'm not worried about missing out. If I switc...,0.995,Positive
1,Your poll needs more answer options!\n\nI woul...,0.932,Positive
2,I don't want to relocate outside of LA and dea...,0.89,Positive
3,Im going to just stick with Dev Ops. It pays a...,0.882,Positive
4,"Culture. After a certain point (TC or Age), yo...",0.881,Positive
5,"Easy boss, not much pressure or work, promise ...",0.84,Positive
6,Not everything in life is about money… Many of...,0.815,Positive
7,My WLB and the work I do I like. A big plus is...,0.796,Positive
8,In a good position to learn and grow in this t...,0.788,Positive
9,Stock appreciation is terrific,0.751,Positive


In [112]:
#count the # of comments in each sentiment category
sentiment_counts = df_positive_comments["vader_sentiment"].value_counts().reset_index()
sentiment_counts.columns = ["Sentiment", "Count"]

#calculate percentage
total_comments = sentiment_counts["Count"].sum()
sentiment_counts["Percent"] = (sentiment_counts["Count"] / total_comments * 100).round(1)

#display
sentiment_counts = sentiment_counts.sort_values(by="Sentiment")  # Optional alphabetical sort
display(sentiment_counts)

Unnamed: 0,Sentiment,Count,Percent
2,Negative,46,30.1
1,Neutral,51,33.3
0,Positive,56,36.6


## Bert application

In [115]:
#run BERT Sentiment on comments
#load the model & tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

#function to get sentiment label
def get_bert_sentiment(text):
    #512 tokens max or it breaks
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = softmax(logits.numpy()[0])
    
    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[probs.argmax()]


#apply to df
df_positive_comments["bert_sentiment"] = df_positive_comments["Comment"].apply(get_bert_sentiment)

In [117]:
df_positive_comments[["Comment", "bert_sentiment"]].head(10)

Unnamed: 0,Comment,bert_sentiment
0,Not enough time to practice Leetcode and syste...,Negative
1,Not enough time to practice all these things b...,Negative
2,I got a big pay bump from going from NCG salar...,Positive
3,My incredible CEO and work culture 😃 😉 😜 😱 😢,Positive
4,"I like the product I'm working on, decent wlb,...",Positive
5,"Looking at the poll results, I think it's enco...",Positive
6,Fear of the unknown and/or laziness is almost ...,Neutral
7,takes me too long to brush up LC,Negative
8,Imposter syndrome. Feels like I’m already und...,Negative
9,working 80 hour weeks and too tired to LC afte...,Negative


In [119]:
bert_counts = df_positive_comments["bert_sentiment"].value_counts().reset_index()
bert_counts.columns = ["Sentiment", "Count"]
bert_counts["Percent"] = (bert_counts["Count"] / bert_counts["Count"].sum() * 100).round(1)

display(bert_counts)

Unnamed: 0,Sentiment,Count,Percent
0,Neutral,70,45.8
1,Negative,52,34.0
2,Positive,31,20.3


In [121]:
#get counts
vader_counts = df_positive_comments["vader_sentiment"].value_counts().sort_index()
bert_counts = df_positive_comments["bert_sentiment"].value_counts().sort_index()

#get total for each model
total_vader = vader_counts.sum()
total_bert = bert_counts.sum()

#combine into one df
sentiment_totals = pd.DataFrame({
    "VADER Count": vader_counts,
    "BERT Count": bert_counts
}).fillna(0).astype(int)

#add % columns
sentiment_totals["VADER %"] = (sentiment_totals["VADER Count"] / total_vader * 100).round(1)
sentiment_totals["BERT %"] = (sentiment_totals["BERT Count"] / total_bert * 100).round(1)

#reset index so Sentiment is a column
sentiment_totals = sentiment_totals.reset_index().rename(columns={"index": "Sentiment"})

#display!
display(sentiment_totals)

Unnamed: 0,Sentiment,VADER Count,BERT Count,VADER %,BERT %
0,Negative,46,52,30.1,34.0
1,Neutral,51,70,33.3,45.8
2,Positive,56,31,36.6,20.3
