In [None]:
import random
import pandas as pd
# Load the raw reviews. "date" is parsed to datetime on read because the topic
# cell further down calls the datetime-only `.replace(day=1)` on every value;
# left as plain strings that call raises a TypeError.
df = pd.read_csv("review_data.csv", parse_dates=["date"])

In [None]:
# Keep only reviews dated 2018-01-01 or later.
# `.copy()` detaches the result from the unfiltered frame, so the later
# `df["date"] = ...` assignment writes to an independent frame instead of
# triggering pandas' SettingWithCopyWarning.
df = df.loc[df["date"] >= "2018-01-01"].copy()

In [None]:
# use bertopic to create topics and show the trend of topics over time
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer    

In [None]:
# Fit BERTopic on the reviews of a single company and chart its topics over time.
company = "Google"

from umap import UMAP
# A fixed random_state keeps the UMAP projection the same between runs.
umap_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False, random_state=42)
vectorizer_model = CountVectorizer(stop_words="english")
# NOTE(review): `diversity` appears to be accepted only by older BERTopic
# releases - confirm against the installed version before upgrading.
topic_model = BERTopic(vectorizer_model=vectorizer_model, umap_model=umap_model, nr_topics="auto", diversity=0.2)

# Select the company's reviews once so documents and timestamps are guaranteed
# to come from the same rows in the same order.
company_reviews = df[df["company"] == company]

# One document per review: pros and cons merged. A missing side is replaced by
# an empty string, otherwise the concatenation yields NaN instead of text.
docs = (company_reviews["pros"].fillna("") + ". " + company_reviews["cons"].fillna("")).tolist()
topics, probs = topic_model.fit_transform(docs)

# Bucket every review into its calendar month (first day of the month).
# pd.to_datetime makes this work whether "date" was loaded as text or as
# datetime, and the shared `df` is left untouched.
timestamps = (
    pd.to_datetime(company_reviews["date"])
    .dt.to_period("M")
    .dt.to_timestamp()
    .to_list()
)

topics_over_time = topic_model.topics_over_time(docs, timestamps)
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)


In [None]:
# Make sure every chosen topic has a row for every timestamp, so each timestamp
# contributes the same number of values to the stacked bar charts below.
freq_df = topic_model.get_topic_freq()
# Drop topic -1 (BERTopic's outlier bucket per its documentation).
freq_df = freq_df.loc[freq_df.Topic != -1, :]
# Take the first ten topics in get_topic_freq() order, then sort them by id.
chosen_topics = sorted(freq_df.Topic.to_list()[:10])
# Remove all rows whose Topic isn't in chosen_topics.
topics_over_time = topics_over_time[topics_over_time["Topic"].isin(chosen_topics)]

# Collect the (timestamp, topic) pairs that already exist, then build all
# missing rows in one go. They are created with Frequency 0 directly; the
# previous "Count" key only added a spurious column to the frame.
observed_pairs = set(zip(topics_over_time["Timestamp"], topics_over_time["Topic"]))
# drop_duplicates().to_list() yields the same scalar type as iterating the
# column, so the membership test below compares like with like.
seen_timestamps = topics_over_time["Timestamp"].drop_duplicates().to_list()
missing_rows = [
    {"Timestamp": timestamp, "Topic": topic, "Frequency": 0}
    for topic in chosen_topics
    for timestamp in seen_timestamps
    if (timestamp, topic) not in observed_pairs
]
if missing_rows:
    # A single concat replaces the removed DataFrame.append and avoids
    # re-copying the frame once per added row.
    topics_over_time = pd.concat(
        [topics_over_time, pd.DataFrame(missing_rows)], ignore_index=True
    )
else:
    topics_over_time = topics_over_time.reset_index(drop=True)
# Safeguard: treat any remaining missing frequency as 0.
topics_over_time["Frequency"] = topics_over_time["Frequency"].fillna(0)


In [None]:
# Add "relative_frequency": each row's Frequency divided by the total Frequency
# of its Timestamp. transform("sum") returns the per-timestamp totals aligned
# to the original rows, so the division and the column assignment cannot
# misalign (groupby().apply() may return a group-keyed index instead).
timestamp_totals = topics_over_time.groupby("Timestamp")["Frequency"].transform("sum")
topics_over_time["relative_frequency"] = topics_over_time["Frequency"] / timestamp_totals

In [None]:
# importing package
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
 

# Stacked bar chart of each topic's relative frequency per timestamp.

# Manually assigned labels for the plotted topics, in ascending topic-id order.
topic_labels = ["Company Culture", "Company Employee", "Teamwork", "Internship experience", "Engineering culture", "Perks and work life balance", "Promotions and work life", "Work life balance and growth", "Positive Aspects", "Campus and cantine"]

# One row per timestamp, one column per topic (ascending topic id). A topic
# with no row for a timestamp is plotted as 0.
relative_by_topic = (
    topics_over_time
    .pivot(index="Timestamp", columns="Topic", values="relative_frequency")
    .sort_index()
    .fillna(0)
)
if relative_by_topic.shape[1] != len(topic_labels):
    raise ValueError(
        "expected {} topics to label but found {}".format(
            len(topic_labels), relative_by_topic.shape[1]
        )
    )
relative_by_topic.columns = topic_labels
# Keep only the first 10 characters of each timestamp's string form as the tick label.
relative_by_topic.insert(0, "Time", [str(timestamp)[:10] for timestamp in relative_by_topic.index])

# Style and rcParams must be set before the figure is created; set afterwards
# they only influence figures drawn later.
plt.style.use("bmh")
mpl.rcParams['font.family'] = 'STIXGeneral'
mpl.rcParams.update({
    'legend.frameon': True,
    'legend.facecolor': (1.0, 1.0, 1.0),
    'legend.framealpha': 1.0,
    'legend.edgecolor': 'white',
})

# A dedicated, wide figure; the plot frame gets its own name so the reviews
# frame `df` is not overwritten.
fig, ax = plt.subplots(figsize=(20, 10))
relative_by_topic.plot(x='Time', kind='bar', stacked=True, title='', width=0.7, ax=ax)

ax.set_facecolor("white")
ax.legend(loc='lower right', fontsize=18)

# Keep only every second x tick.
ax.set_xticks(ax.get_xticks()[::2])
ax.tick_params(axis="x", labelsize=20)
ax.tick_params(axis="y", labelsize=15)
ax.set_ylabel("Relative Frequency", fontsize=20)
ax.set_xlabel("Time", fontsize=20)

fig.savefig("relativefrequencytopicgoogle.png", dpi=300, bbox_inches="tight")

In [None]:
# Stacked bar chart of each topic's absolute frequency per timestamp.

# Manually assigned labels for the plotted topics, in ascending topic-id order.
topic_labels = ["Company Culture", "Company Employee", "Teamwork", "Internship experience", "Engineering culture", "Perks and work life balance", "Promotions and work life", "Work life balance and growth", "Positive Aspects", "Campus and cantine"]

# One row per timestamp, one column per topic (ascending topic id). A topic
# with no row for a timestamp is plotted as 0.
frequency_by_topic = (
    topics_over_time
    .pivot(index="Timestamp", columns="Topic", values="Frequency")
    .sort_index()
    .fillna(0)
)
if frequency_by_topic.shape[1] != len(topic_labels):
    raise ValueError(
        "expected {} topics to label but found {}".format(
            len(topic_labels), frequency_by_topic.shape[1]
        )
    )
frequency_by_topic.columns = topic_labels
# Keep only the first 10 characters of each timestamp's string form as the tick label.
frequency_by_topic.insert(0, "Time", [str(timestamp)[:10] for timestamp in frequency_by_topic.index])

# Style and rcParams must be set before the figure is created; set afterwards
# they only influence figures drawn later.
plt.style.use("bmh")
mpl.rcParams['font.family'] = 'STIXGeneral'
mpl.rcParams.update({
    'legend.frameon': True,
    'legend.facecolor': (1.0, 1.0, 1.0),
    'legend.framealpha': 1.0,
    'legend.edgecolor': 'white',
})

# A dedicated, wide figure; the plot frame gets its own name so the reviews
# frame `df` is not overwritten.
fig, ax = plt.subplots(figsize=(20, 10))
frequency_by_topic.plot(x='Time', kind='bar', stacked=True, title='', width=0.7, ax=ax)

ax.set_facecolor("white")
ax.legend(loc='upper left', fontsize=20)

# Keep only every second x tick.
ax.set_xticks(ax.get_xticks()[::2])
ax.tick_params(axis="x", labelsize=20)
ax.tick_params(axis="y", labelsize=15)
ax.set_ylabel("Frequency", fontsize=20)
ax.set_xlabel("Time", fontsize=20)

fig.savefig("frequencytopicgoogle.png", dpi=300, bbox_inches="tight")

In [None]:
# Collect the five top keywords of every plotted topic as one
# comma-separated string per topic.
keywords = []

# Iterate over the topics that are actually plotted (chosen_topics) rather than
# hard-coded ids 0..9, so the keywords stay aligned with the chart columns and
# the loop cannot request a topic id that does not exist.
for topic_id in chosen_topics:
    # get_topic() yields (word, score) pairs; only the words are kept.
    top_words = [word for word, _score in topic_model.get_topic(topic_id)[:5]]
    keywords.append(", ".join(top_words))

#print them for manual inspection
keywords

# These topic labels were derived manually from the keywords above
###########################################
# Company culture
# Company benefits and growth opportunities
# Company amenities
# Internship experience
# Engineering culture
# Salary and hours
# Experience and opportunities to learn
# Cooperation and teamwork
# Work life balance