In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import json
from collections import Counter

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim.models.ldamodel import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import swifter
from tqdm import tqdm
tqdm.pandas()

# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('omw-1.4')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')

path_to_folder = "C:/Users/phku0/Quant_Project"

### clean, tokenize & lemmatize

In [None]:
df = pd.read_parquet(path_to_folder+"/data_raw/glassd_review_internal_data_2024-03-19.parquet")

In [None]:
# drop advice column (58% missing)
df.drop(['updateDateTime', 'other_data'], axis=1, inplace=True)

df = df.dropna(subset=['summary'])
df = df.fillna('')

# Precompile the regular expressions
newline_re = re.compile(r'\r?\n+')
quote_re = re.compile(r"’")
dots_re = re.compile(r'\.{2,}')
backslash_re = re.compile(r'\\')
whitespace_re = re.compile(r'\s+')

def cleanText(text):
  text = newline_re.sub('. ', text) # same as text = re.sub(r'\r?\n+', '. ', text) etc
  text = quote_re.sub("'", text)
  text = dots_re.sub('.', text)
  text = backslash_re.sub('', text)
  text = whitespace_re.sub(' ', text)
  text = text.strip().lower()
  return text

df[['summary', 'pros', 'cons', 'advice']] = df[['summary', 'pros', 'cons', 'advice']].swifter.applymap(cleanText)

Pandas Apply:   0%|          | 0/17282496 [00:00<?, ?it/s]

In [None]:
# Initialization
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True) # suitable for informal text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuation_set = set(string.punctuation)

POS_TAG_MAP = {'j': wordnet.ADJ, 'n': wordnet.NOUN, 'v': wordnet.VERB, 'r': wordnet.ADV}

# Map POS tag for lemmatization
def get_POS(word):
  tag = pos_tag([word])[0][1][0].lower()
  return POS_TAG_MAP.get(tag, wordnet.NOUN)

def tokenize(text):
  tokens = tokenizer.tokenize(text)

  filtered_tokens = [
    word for word in tokens
    if word not in stop_words
    and word not in punctuation_set
    and len(word) > 1
  ]

  if not filtered_tokens:
    return tuple()

  pos_tags = pos_tag(filtered_tokens)

  lemmatized_tokens = [
    lemmatizer.lemmatize(word, get_POS(word))
    for word, tag in pos_tags
  ]
  
  return tuple(lemmatized_tokens)

df[['summary_token', 'pros_token', 'cons_token', 'advice_token']] = df[['summary', 'pros', 'cons', 'advice']].swifter.applymap(tokenize)

Pandas Apply:   0%|          | 0/17282496 [00:00<?, ?it/s]

In [None]:
df.to_parquet(path_to_folder+'/data_NLP_checkpoints/2b.new_NLP_cleaned_tokenized.parquet')

### aggregate

In [2]:
df = pd.read_parquet(path_to_folder+'/data_NLP_checkpoints/2b.new_NLP_cleaned_tokenized.parquet')

In [3]:
df['summary_length'] = df['summary'].str.len()
df['pros_length'] = df['pros'].str.len()
df['cons_length'] = df['cons'].str.len()
df['advice_length'] = df['advice'].str.len()

df['summary_token_count'] = df['summary_token'].apply(len)
df['pros_token_count'] = df['pros_token'].apply(len)
df['cons_token_count'] = df['cons_token'].apply(len)
df['advice_token_count'] = df['advice_token'].apply(len)

In [4]:
df.drop(columns=['summary', 'pros', 'cons', 'advice'], inplace=True)

In [5]:
df_sentiment = pd.read_parquet(path_to_folder+'/data_NLP_checkpoints/10.sentiment_roberta.parquet')
df = pd.merge(df, df_sentiment, on='reviewId', how='inner')

In [6]:
df_sentiment_advice = pd.read_parquet(path_to_folder+'/data_NLP_checkpoints/10b.sentiment_roberta_advice_col.parquet')
df = pd.merge(df, df_sentiment_advice, on='reviewId', how='inner')

In [7]:
df_sentiment_LLAMA = pd.read_parquet(path_to_folder+'/data_NLP_checkpoints/13.LLM_asba_output.parquet')
df_sentiment_LLAMA.rename(columns={
    'innovative': 'innovative_llama',
    'integrity': 'integrity_llama',
    'quality': 'quality_llama',
    'respect': 'respect_llama',
    'teamwork': 'teamwork_llama'
}, inplace=True)

df = pd.merge(df, df_sentiment_LLAMA, on='reviewId', how='inner')

In [8]:
df.head(2)

Unnamed: 0,reviewId,summary_token,pros_token,cons_token,advice_token,summary_length,pros_length,cons_length,advice_length,summary_token_count,...,advice_token_count,summary_sentiment,pros_sentiment,cons_sentiment,advice_sentiment,innovative_llama,integrity_llama,quality_llama,respect_llama,teamwork_llama
0,49,"[great, internship, experience]","[people, helpful, create, product, impact, wor...","[really, think, there's, downside]","[great, company, hope, everybody, keep, great,...",30,145,46,95,3,...,10,0.958085,0.951453,0.0,0.988652,,,0.99,,0.5
1,59,"[microsoft, stop, treat, people, important, he...","[lot, smart, folk, wide, variety, thing, good,...","[unless, partner, company, treat, like, commod...","[excellence, people, management, can't, come, ...",85,185,158,389,7,...,38,-0.830964,0.956963,0.0,0.0,,,0.5,,0.5


In [None]:
df.columns

Index(['reviewId', 'summary_token', 'pros_token', 'cons_token', 'advice_token',
       'summary_length', 'pros_length', 'cons_length', 'advice_length',
       'summary_token_count', 'pros_token_count', 'cons_token_count',
       'advice_token_count', 'summary_sentiment', 'pros_sentiment',
       'cons_sentiment', 'advice_sentiment', 'innovative_llama',
       'integrity_llama', 'quality_llama', 'respect_llama', 'teamwork_llama'],
      dtype='object')

In [None]:
df.to_parquet(path_to_folder+'/data_NLP_checkpoints/prod_NLP_cleaned_by_review.parquet')

### LDA

In [9]:
df_import = pd.read_parquet(path_to_folder+'/data_NLP_checkpoints/prod_NLP_cleaned_by_review.parquet')

In [None]:
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import matplotlib.pyplot as plt

df = df_import.copy()

dictionary = corpora.Dictionary(df["summary_token"])
corpus = [dictionary.doc2bow(tokens) for tokens in df["summary_token"]]

def compute_coherence(corpus, dictionary, num_topics):
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=15, random_state=42)
    coherence_model = CoherenceModel(model=lda_model, texts=df["summary_token"], dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

topic_range = range(3, 15)  # Test number of topics 
coherence_scores = []

for num_topics in topic_range:
    coherence = compute_coherence(corpus, dictionary, num_topics)
    coherence_scores.append(coherence)
    print(f"Num Topics: {num_topics}, Coherence Score: {coherence}")

plt.plot(topic_range, coherence_scores)
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Coherence Score vs Number of Topics')
plt.show()

In [None]:
df = df_import.copy()

dictionary = corpora.Dictionary(df["summary_token"])
corpus = [dictionary.doc2bow(tokens) for tokens in df["summary_token"]]

num_topics = 5  # to be tuned
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=15, random_state=42)

print("Topics generated by LDA:")
for idx, topic in lda_model.print_topics(num_words=20):
  print(f"Topic {idx}: {topic}")

df["topic_distribution"] = [lda_model.get_document_topics(bow) for bow in corpus]

vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

Topics generated by LDA:
Topic 0: 0.057*"management" + 0.043*"team" + 0.042*"benefit" + 0.032*"leadership" + 0.028*"opportunity" + 0.026*"poor" + 0.024*"growth" + 0.020*"lot" + 0.019*"toxic" + 0.017*"culture" + 0.015*"engineer" + 0.015*"product" + 0.014*"worth" + 0.013*"tech" + 0.011*"need" + 0.011*"lack" + 0.010*"make" + 0.010*"much" + 0.008*"fast" + 0.008*"senior"
Topic 1: 0.206*"great" + 0.140*"work" + 0.080*"company" + 0.076*"place" + 0.037*"culture" + 0.030*"pay" + 0.029*"people" + 0.028*"decent" + 0.024*"environment" + 0.016*"life" + 0.016*"balance" + 0.016*"ok" + 0.014*"overall" + 0.013*"salary" + 0.013*"excellent" + 0.012*"amaze" + 0.011*"okay" + 0.010*"well" + 0.008*"low" + 0.007*"fun"
Topic 2: 0.124*"job" + 0.062*"review" + 0.030*"manager" + 0.028*"time" + 0.023*"love" + 0.022*"sale" + 0.017*"service" + 0.016*"first" + 0.015*"go" + 0.014*"support" + 0.013*"customer" + 0.013*"year" + 0.012*"like" + 0.010*"associate" + 0.009*"level" + 0.008*"easy" + 0.008*"role" + 0.008*"depend

### next

In [None]:
# def evaluate_tokens(row):
#   positive_tokens = []
#   neutral_tokens = []
#   negative_tokens = []

#   if row['pros_sentiment'] >= 0.7:
#     positive_tokens.extend(row['pros_token'])
#   if row['summary_sentiment'] >= 0.7:
#     positive_tokens.extend(row['summary_token'])

#   if row['pros_sentiment'] >= -0.25 and row['pros_sentiment'] <= 0.25:
#     neutral_tokens.extend(row['pros_token'])
#   if row['cons_sentiment'] >= -0.25 and row['cons_sentiment'] <= 0.25:
#     neutral_tokens.extend(row['cons_token'])
#   if row['summary_sentiment'] >= -0.25 and row['summary_sentiment'] <= 0.25:
#     neutral_tokens.extend(row['summary_token'])

#   if row['cons_sentiment'] <= -0.7:
#     negative_tokens.extend(row['cons_token'])
#   if row['summary_sentiment'] <= -0.7:
#     negative_tokens.extend(row['summary_token'])

#   return positive_tokens, neutral_tokens, negative_tokens

# df[['positive_tokens', 'neutral_tokens', 'negative_tokens']] = df[['summary_token', 'pros_token', 'cons_token', 'summary_sentiment', 'pros_sentiment', 'cons_sentiment']].swifter.apply(lambda row: pd.Series(evaluate_tokens(row)), axis=1)

Pandas Apply:   0%|          | 0/4320624 [00:00<?, ?it/s]

In [None]:
all_positive_tokens = [token for tokens_list in df['positive_tokens'] for token in tokens_list]
positive_tokens_counter = Counter(all_positive_tokens)
positive_tokens_df = pd.DataFrame(positive_tokens_counter.items(), columns=['positive_token', 'Frequency'])
positive_tokens_df = positive_tokens_df.sort_values(by='Frequency', ascending=False)

all_negative_tokens = [token for tokens_list in df['negative_tokens'] for token in tokens_list]
negative_tokens_counter = Counter(all_negative_tokens)
negative_tokens_df = pd.DataFrame(negative_tokens_counter.items(), columns=['negative_token', 'Frequency'])
negative_tokens_df = negative_tokens_df.sort_values(by='Frequency', ascending=False)

all_neutral_tokens = [token for tokens_list in df['neutral_tokens'] for token in tokens_list]
neutral_tokens_counter = Counter(all_neutral_tokens)
neutral_tokens_df = pd.DataFrame(neutral_tokens_counter.items(), columns=['neutral_token', 'Frequency'])
neutral_tokens_df = neutral_tokens_df.sort_values(by='Frequency', ascending=False)

In [None]:
work_employment = [
    'work', 'job', 'career', 'company', 'employee', 'co-worker', 
    'coworker', 'colleague', 'staff', 'management', 'manager', 
    'leadership', 'business', 'industry'
]

pay_benefits = [
    'salary', 'pay', 'bonus', 'benefit', 'discount', 'perk', 'health',
    'food'
]

work_environment_culture = [
    'environment', 'culture', 'atmosphere', 'office', 'place', 'fun', 
    'friendly', 'supportive', 'family', 'team', 'coworkers', 'colleague', 
    'people', 'everyone'
]

time_scheduling = [
    'hour', 'time', 'day', 'schedule', 'year', 'flexible', 'fast'
]

growth_development = [
    'opportunity', 'learn', 'training', 'growth', 'grow', 'new', 
    'experience', 'development', 'project', 'start', 'move'
]

work_life_balance = [
    'balance', 'life', 'home', 'flexible', 'care',
    'hour', 'time', 'day', 'schedule', 'year', 'flexible', 'fast'
]

work_quality_performance = [
    'excellent', 'decent', 'strong', 'hard', 'best', 'amaze', 'awesome', 
    'support', 'supportive', 'smart', 'fast',
]

customer_sales = [
    'customer', 'sale', 'product', 'value'
]

technology_innovation = ['technology']