In [507]:
## imports

import numpy as np
import pandas as pd

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os
root = os.path.expanduser('~')

company_index='5'
company_name='Amazon'
# img_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/img/companies/'+f'{company_index}_{company_name}'
img_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/img/companies/LDA_vis'
img_flag = False

if not os.path.exists(img_path):
    os.makedirs(img_path)

In [508]:
## making dataset

def get_px_data(company_index=5, company_name='Amazon', uni=False, bi=False, tri=False):
    data_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/data/companies/'+f'{company_index}_{company_name}'+'/output_data/px_data/pros.csv'
    df = pd.read_csv(data_path)
#     print(df.head())
    df_data = pd.DataFrame()
    df_data['Review_Text'] = df['Review_Text']
    if uni and bi and tri:
        df_data['Px_Texts'] = df['Unigrams'].fillna('') + ' ' + df['Bigrams'].fillna('') + ' ' + df['Trigrams'].fillna('')
    elif uni:
        df_data['Px_Texts'] = df['Unigrams'].dropna()
    elif bi:
        df_data['Px_Texts'] = df['Bigrams'].dropna()
    elif tri:
        df_data['Px_Texts'] = df['Trigrams'].dropna()
    
    return df_data
    
sentences = get_px_data(company_index=company_index, company_name=company_name, 
                        uni=True, bi=True, tri=True)

# print(len(sentences))
# sentences[:5]
print(sentences.shape[0])
sentences.head()





14687


Unnamed: 0,Review_Text,Px_Texts
0,3 Days in a row off,day
1,"Benefits, There many different shifts, work ha...",advanc benefit differ half hard holiday major ...
2,"Free drinks, paid time, overtime",drink free overtim paid time free_drink paid_t...
3,"On your own, flexible, can keep job even worki...",flexibl month time flexibl_time
4,Great benefits,benefit


In [509]:

def merge_data(company_list=[5], uni=False, bi=False, tri=False):
    sentences = []
    df_company_list = pd.read_csv( root + '/Desktop/workspace/indeed/Job-Satisfaction/data/scraper_data/review_site.csv')
    sentences = pd.DataFrame(columns = ['Review_Text', 'Px_Texts'])
    for company_index in company_list:
        company_name = df_company_list.iloc[company_index - 1]['Company_Name']
        px_sents = get_px_data(company_index=company_index, company_name=company_name, 
                        uni=uni, bi=bi, tri=tri)
        if (px_sents.shape[0]) < 500:
            sentences = pd.concat([sentences, px_sents], ignore_index=True)
        else:
            sentences = pd.concat([sentences, px_sents.sample(n=500, random_state=42)], ignore_index=True)
    return sentences

# sentences = merge_data(company_list=[5],
#                         uni=True, bi=True, tri=True)
sentences = merge_data(company_list=range(1, 51),
                        uni=True, bi=True, tri=True)
# sentences = merge_data(company_list=[5, 6, 7, 8, 9, 10],
#                         uni=True, bi=True, tri=True)
# print(len(sentences))
sentences.head()

Unnamed: 0,Review_Text,Px_Texts
0,"Good bonus, nice coworkers, good breaks, organ...",bonu break cowork ethic nice organ nice_cowork
1,fun stocking merchandise making customer happy,custom fun happi merchandis stock
2,"incentives and bonuses, good pay",bonus incent pay
3,"Helping the customers, feeling accomplished at...",accomplish custom day feel help help_custom
4,great place to work,


In [510]:
## tokenizing

sent_list = sentences[['Px_Texts']].values

data_px = []
for sent in sent_list:
    data_px.append(sent[0].split())

sentences['Tokenized_Texts'] = data_px
sentences.head()

Unnamed: 0,Review_Text,Px_Texts,Tokenized_Texts
0,"Good bonus, nice coworkers, good breaks, organ...",bonu break cowork ethic nice organ nice_cowork,"[bonu, break, cowork, ethic, nice, organ, nice..."
1,fun stocking merchandise making customer happy,custom fun happi merchandis stock,"[custom, fun, happi, merchandis, stock]"
2,"incentives and bonuses, good pay",bonus incent pay,"[bonus, incent, pay]"
3,"Helping the customers, feeling accomplished at...",accomplish custom day feel help help_custom,"[accomplish, custom, day, feel, help, help_cus..."
4,great place to work,,[]


In [511]:
sentences.shape

(23456, 3)

In [512]:
# Create Dictionary
id2word = corpora.Dictionary(data_px) ##dictionary
print(len(id2word))
print(id2word)

# Create Corpus
texts = data_px 

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]  ##corpus

# View
# print(corpus[:1])
print(corpus[:5])

5027
Dictionary(5027 unique tokens: ['bonu', 'break', 'cowork', 'ethic', 'nice']...)
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)], [(12, 1), (13, 1), (14, 1)], [(7, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)], []]


In [513]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:4]]

[[('bonu', 1),
  ('break', 1),
  ('cowork', 1),
  ('ethic', 1),
  ('nice', 1),
  ('nice_cowork', 1),
  ('organ', 1)],
 [('custom', 1), ('fun', 1), ('happi', 1), ('merchandis', 1), ('stock', 1)],
 [('bonus', 1), ('incent', 1), ('pay', 1)],
 [('custom', 1),
  ('accomplish', 1),
  ('day', 1),
  ('feel', 1),
  ('help', 1),
  ('help_custom', 1)]]

In [514]:
# %%time
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = root + '/Downloads/mallet-2.0.8/bin/mallet' # update this path
%time ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word, iterations=1000, random_seed=42)

CPU times: user 284 ms, sys: 19.4 ms, total: 304 ms
Wall time: 44.3 s


In [515]:
# Compute Coherence Score
%time coherence_model_lda = CoherenceModel(model=ldamallet, texts=data_px, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

CPU times: user 3.98 ms, sys: 8.2 ms, total: 12.2 ms
Wall time: 11.8 ms

Coherence Score:  0.3682101428123341


In [516]:
##mallet model to lda model conversion

optimal_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

In [517]:
optimal_model

<gensim.models.ldamodel.LdaModel at 0x1a27e61a58>

In [518]:
# %%time
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word, mds='tsne')
vis
# pyLDAvis.show(vis)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [519]:
from collections import OrderedDict
def get_doc_topic_dist(model=optimal_model, corpus=corpus, kwords=False):
    
    '''
    LDA transformation, for each doc only returns topics with non-zero weight
    This function makes a matrix transformation of docs in the topic space.
    '''
    top_dist =[]
    keys = []

    for d in corpus:
        tmp = {i:0 for i in range(num_topics)}
        tmp.update(dict(model[d]))
        vals = list(OrderedDict(tmp).values())
        top_dist += [np.array(vals)]
        if kwords:
            keys += [np.array(vals).argmax()]

    return np.array(top_dist), keys

top_dist, keys = get_doc_topic_dist(optimal_model, corpus, kwords=True)
print(top_dist[:5])
print(keys[:5])

[[0.0877193  0.0877193  0.10526316 0.0877193  0.1273858  0.0877193
  0.118213   0.1052784  0.0877193  0.10526316]
 [0.10543551 0.09090909 0.10943355 0.10879259 0.09093013 0.10909091
  0.09123377 0.09090909 0.09090909 0.11235627]
 [0.09436871 0.09433962 0.09433962 0.10035131 0.09433962 0.10716677
  0.13175543 0.09465966 0.09433962 0.09433962]
 [0.14206542 0.09117401 0.09111996 0.09092538 0.09092492 0.09090909
  0.09090909 0.09090909 0.09090909 0.13015396]
 [0.1        0.1        0.1        0.1        0.1        0.1
  0.1        0.1        0.1        0.1       ]]
[4, 9, 6, 0, 0]


In [520]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

explore_topic(optimal_model, 0, 3)

discount             0.097
schedul              0.088
employe              0.086


['discount', 'schedul', 'employe']

In [521]:
topic_summaries = []
num_topics = 10
# print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
#     print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(optimal_model,topic_number=i, topn=3, output=False )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print
    
print(topic_summaries)

# for t in topic_labels:
#     print(" ".join(t))
topic_label = [" ".join(t) for t in topic_summaries]
print(topic_label)

[['discount', 'schedul', 'employe'], ['free', 'lunch', 'free_lunch'], ['learn', 'lot', 'life'], ['hour', 'pay', 'flexibl'], ['environ', 'time', 'break'], ['benefit', 'pay', 'pay_benefit'], ['team', 'cowork', 'benefit'], ['manag', 'opportun', 'benefit'], ['benefit', 'home', 'balanc'], ['peopl', 'nice', 'day']]
['discount schedul employe', 'free lunch free_lunch', 'learn lot life', 'hour pay flexibl', 'environ time break', 'benefit pay pay_benefit', 'team cowork benefit', 'manag opportun benefit', 'benefit home balanc', 'peopl nice day']


In [522]:
top_labels = dict()

for i in range(10):
    top_labels[i] = topic_label[i]
    
print(top_labels)

{0: 'discount schedul employe', 1: 'free lunch free_lunch', 2: 'learn lot life', 3: 'hour pay flexibl', 4: 'environ time break', 5: 'benefit pay pay_benefit', 6: 'team cowork benefit', 7: 'manag opportun benefit', 8: 'benefit home balanc', 9: 'peopl nice day'}


In [523]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer='word',       
#                              min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True                   # convert all words to lowercase
#                              token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

# display(sentences[['Px_Texts']].values.tolist())
sents = [s[0] for s in sentences[['Px_Texts']].values.tolist()]
# display(sents)
# display(sentences[['Px_Texts']].values)
data_vectorized = vectorizer.fit_transform(sents).toarray() ##dtm
print(data_vectorized[:5])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [524]:
top_dist, lda_keys= get_doc_topic_dist(optimal_model, corpus, True)
features = vectorizer.get_feature_names()

In [525]:
threshold = 0.12
_idx = np.amax(top_dist, axis=1) > threshold  # idx of doc that above the threshold
X_topic = top_dist[_idx]

In [526]:
dtm = data_vectorized

In [527]:
p_df = pd.DataFrame()
p_df['tokenz'] = np.asarray(data_px)[_idx]
p_df.head()

Unnamed: 0,tokenz
0,"[bonu, break, cowork, ethic, nice, organ, nice..."
1,"[bonus, incent, pay]"
2,"[accomplish, custom, day, feel, help, help_cus..."
3,"[hour, lunch, hour_lunch]"
4,"[christma, discount, employ, extra, holiday, p..."


In [528]:
display(dtm)
print(p_df.shape)
print(len(X_topic))

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(10403, 1)
10403


In [481]:
sentences[['Review_Text']].values

array([['Good bonus, nice coworkers, good breaks, organized work ethic'],
       ['fun stocking merchandise making customer happy'],
       ['incentives and bonuses, good pay'],
       ...,
       ['Good Benefits. Decent Salary. Strong Leadership.'],
       ['Working your own hours and the people you meet!'],
       ['You somewhat make your own schedule, do not have to report to the office daily, you can work in your pajamas, you can be selective with your clients, some advantages on tax write offs.']],
      dtype=object)

In [482]:
top_ws = []
for n in range(len(dtm)):
    inds = np.int0(np.argsort(dtm[n])[::-1][:4])
    tmp = [features[i] for i in inds]    
    top_ws += [' '.join(tmp)]


p_df['Text_Rep'] = pd.DataFrame(np.asarray(top_ws)[_idx])
p_df['clusters'] = pd.DataFrame(np.asarray(lda_keys)[_idx])
p_df['clusters'].fillna(11, inplace=True)
# p_df['Review_Text'] = pd.DataFrame(sentences[['Review_Text']].values[_idx])
p_df['Review_Text'] = pd.DataFrame(sentences[['Review_Text']].values[_idx])


colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

cluster_colors = dict()
for c in range(10): 
    cluster_colors[c] = colormap[c]
    
print(cluster_colors)
# cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 
#                   7:'maroon', 8:'crimson', 9:'black', 10:'gray', 11: '#8c564b'}

p_df['colors'] = p_df['clusters'].apply(lambda l: cluster_colors[l])





{0: '#1f77b4', 1: '#aec7e8', 2: '#ff7f0e', 3: '#ffbb78', 4: '#2ca02c', 5: '#98df8a', 6: '#d62728', 7: '#ff9896', 8: '#9467bd', 9: '#c5b0d5'}


In [483]:
p_df.shape

(21701, 5)

In [484]:
len(top_dist)
print(top_ws)

['nice_cowork ethic organ bonu', 'merchandis happi stock fun', 'incent bonus pay zero', 'accomplish help_custom feel help', 'zero evryon eve event', 'hour_lunch hour lunch zero', 'thanksgiv percent christma extra', 'paycheck zero evp_wonder evalu', 'littl supervis benefit_discount discount', 'break zero evryon event', 'day break zero evryon', 'break_lunch_break break_lunch reason lunch_break', 'associ zero evryon eve', 'potenti advanc zero ethnic', 'custom evryon eve event', 'benefit zero evp_wonder evalu', 'zero evryon eve event', 'listen receiv hour_lunch hour', 'zero evryon eve event', 'pay_hour time hour pay', 'job enjoy fun evp_wonder', 'train zero evp europ', 'sam discount_free discount free', 'peopl zero evp_wonder eve', 'decent_pay decent pay ethnic', 'oppertun lot europ eve', 'bonus discount evalu event', 'day pay zero evp', 'excel zero evryon eve', 'offer hour zero eve', 'time zero evp europ', 'ok pay evp_wonder eve', 'zero evryon eve event', 'walmart purchas product discount

In [485]:
# threshold = 0.11
# _idx = np.amax(top_dist, axis=1) > threshold  # idx of doc that above the threshold
# X_topic = top_dist[_idx]

In [486]:
# print(X_topic)
# print(len(X_topic))

In [487]:
top_dist = X_topic

In [488]:
from sklearn.manifold import TSNE
# tsne = TSNE(n_components=2)
tsne = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')

X_tsne = tsne.fit_transform(top_dist)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 21701 samples in 0.014s...
[t-SNE] Computed neighbors for 21701 samples in 2.521s...
[t-SNE] Computed conditional probabilities for sample 1000 / 21701
[t-SNE] Computed conditional probabilities for sample 2000 / 21701
[t-SNE] Computed conditional probabilities for sample 3000 / 21701
[t-SNE] Computed conditional probabilities for sample 4000 / 21701
[t-SNE] Computed conditional probabilities for sample 5000 / 21701
[t-SNE] Computed conditional probabilities for sample 6000 / 21701
[t-SNE] Computed conditional probabilities for sample 7000 / 21701
[t-SNE] Computed conditional probabilities for sample 8000 / 21701
[t-SNE] Computed conditional probabilities for sample 9000 / 21701
[t-SNE] Computed conditional probabilities for sample 10000 / 21701
[t-SNE] Computed conditional probabilities for sample 11000 / 21701
[t-SNE] Computed conditional probabilities for sample 12000 / 21701
[t-SNE] Computed conditional probabilities for sam

In [489]:
p_df['X_tsne'] =X_tsne[:, 0]
p_df['Y_tsne'] =X_tsne[:, 1]

In [490]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

In [491]:
source = ColumnDataSource(dict(
    x=p_df['X_tsne'],
    y=p_df['Y_tsne'],
    color=p_df['colors'],
    label=p_df['clusters'].apply(lambda l: top_labels[l]),
#     msize= p_df['marker_size'],
    topic_key= p_df['clusters'],
    content = p_df['tokenz'],
    review_text = p_df['Review_Text']
))

In [493]:
title = 'T-SNE visualization of topics'

plot_lda = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source,
                 color='color', alpha=0.8)#'msize', )

# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "KeyWords: @content, ReviewText: @review_text - Topic: @topic_key "}
plot_lda.legend.location = "top_left"

show(plot_lda)

#save the plot
img_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/img/companies/clustering'
save(plot_lda, img_path+'/cluster2.html')

'/Users/bishalsainju/Desktop/workspace/indeed/Job-Satisfaction/img/companies/clustering/cluster2.html'