In [42]:
# Run in terminal or command prompt
# python3 -m spacy download en

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

#Gensim
from gensim.models import CoherenceModel

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os
root = os.path.expanduser('~')

company_index='5'
company_name='Amazon'
img_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/img/companies/'+f'{company_index}_{company_name}'

if not os.path.exists(img_path):
    os.makedirs(img_path)

In [37]:
## making dataset

def get_px_data(company_index=5, company_name='Amazon', uni=False, bi=False, tri=False):
    data_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/data/companies/'+f'{company_index}_{company_name}'+'/output_data/px_data/pros.csv'
    df = pd.read_csv(data_path)
    print(df.head())
    if uni and bi and tri:
        df_data = df['Unigrams'].fillna('') + ' ' + df['Bigrams'].fillna('') + ' ' + df['Trigrams'].fillna('')
    elif uni:
        df_data = df['Unigrams'].dropna()
    elif bi:
        df_data = df['Bigrams'].dropna()
    elif tri:
        df_data = df['Trigrams'].dropna()
    
    return df_data.tolist()
    
def merge_data(company_list=[5], uni=False, bi=False, tri=False):
    sentences = []
    df_company_list = pd.read_csv( root + '/Desktop/workspace/indeed/Job-Satisfaction/data/scraper_data/review_site.csv')
    sentences = []
    for company_index in company_list:
        company_name = df_company_list.iloc[company_index - 1]['Company_Name']
        sentences.extend(get_px_data(company_index=company_index, company_name=company_name, 
                        uni=uni, bi=bi, tri=tri))
    return sentences

sentences = merge_data(company_list=[5, 6, 7],
                        uni=True, bi=True, tri=True)
# sentences = merge_data(company_list=range(1, 51),
#                         uni=True, bi=True, tri=True)
print(len(sentences))
sentences[:5]

   Unnamed: 0                                        Review_Text  \
0           0                                           Good pay   
1           1  The only good part for me was it was close to ...   
2           2               All ways an relxing work evironment.   
3           3  Flexible scheduling around your life and your ...   
4           4                                           Benefits   

                      Unigrams          Bigrams Trigrams  
0                          pay              NaN      NaN  
1                   close hous              NaN      NaN  
2                          NaN              NaN      NaN  
3  cowork flexibl life schedul  flexibl_schedul      NaN  
4                      benefit              NaN      NaN  
   Unnamed: 0                               Review_Text             Unigrams  \
0           0                                  benefits              benefit   
1           1  Egonomics and Safety were very important        egonom safeti 

   Unnamed: 0                                        Review_Text  \
0           0               Great people, made pretty good money   
1           1  Good morning meetings addressing Safety and da...   
2           2                                           Benefits   
3           3                                      Money is good   
4           4                                        good raises   

                                Unigrams Bigrams  \
0                     money peopl pretti     NaN   
1  address daili expect meet morn safeti     NaN   
2                                benefit     NaN   
3                                  money     NaN   
4                                   rais     NaN   

                                            Trigrams  
0                                 peopl_pretti_money  
1  address_safeti_daili meet_address_safeti morn_...  
2                                                NaN  
3                                                NaN  
4  

   Unnamed: 0                                        Review_Text  \
0           0                           free drinks and sandwich   
1           1             Benefits, Interesting Work, Co Workers   
2           2                                  Excellent bonuses   
3           3  Lunches is like long, but have to take care of...   
4           4  Decent pay, good insurance benefits, good people.   

                                    Unigrams  \
0                        drink free sandwich   
1                          benefit co worker   
2                                bonus excel   
3  build care custmer help insid lunch there   
4             benefit decent insur pay peopl   

                                            Bigrams  \
0                         drink_sandwich free_drink   
1                              benefit_co co_worker   
2                                       excel_bonus   
3    care_custmer help_insid insid_build lunch_care   
4  benefit_peopl decent_pay

   Unnamed: 0                                        Review_Text  \
0           0            Great people & pedigree for your resume   
1           1  Co-workers, benefits, experience gained, worki...   
2           2                 old timers have a lot of knowledge   
3           3  stable, good work life balance, and very suppo...   
4           4                                            Support   

                           Unigrams         Bigrams Trigrams  
0                       peopl resum             NaN      NaN  
1  benefit cowork experi gain remot  cowork_benefit      NaN  
2                      knowledg lot             NaN      NaN  
3   balanc life remot stabl support     life_balanc      NaN  
4                           support             NaN      NaN  
   Unnamed: 0                                        Review_Text  \
0           0                    Some nice people, good location   
1           1                                              Hours   
2        

   Unnamed: 0                            Review_Text               Unigrams  \
0           0  Supportive culture and good mentoring  cultur mentor support   
1           1                       Please see above                  pleas   
2           2                               Training                  train   
3           3               posiive work environment         environ posiiv   
4           4                            Nice people             nice peopl   

      Bigrams               Trigrams  
0         NaN  support_cultur_mentor  
1         NaN                    NaN  
2         NaN                    NaN  
3         NaN                    NaN  
4  nice_peopl                    NaN  
349847


['pay  ',
 'close hous  ',
 '  ',
 'cowork flexibl life schedul flexibl_schedul ',
 'benefit  ']

In [38]:
data_px = sentences

In [39]:
vectorizer = TfidfVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_px)
print(data_vectorized)

  (0, 691)	1.0
  (1, 179)	0.597370084186877
  (1, 484)	0.8019656990910294
  (3, 245)	0.30141453836556753
  (3, 397)	0.5922868997469584
  (3, 551)	0.409435666570386
  (3, 842)	0.625066347990413
  (4, 79)	1.0
  (5, 263)	0.7564494378939898
  (5, 328)	0.654052175219888
  (6, 416)	0.2728093443040306
  (6, 576)	0.6162155035550861
  (6, 652)	0.499402956494321
  (6, 886)	0.4435919721573907
  (6, 964)	0.3157156380685073
  (7, 112)	0.30330937022559024
  (7, 465)	0.39521146178139016
  (7, 509)	0.4141696412664891
  (7, 578)	0.26865461113687694
  (7, 911)	0.4438572412459378
  (7, 1023)	0.5577546808154992
  (8, 256)	0.44794099206554133
  (8, 290)	0.3942060492456101
  (8, 470)	0.5168578585267343
  (8, 564)	0.30085794995859977
  :	:
  (349832, 578)	0.6828851387453391
  (349833, 949)	0.6258109490683443
  (349833, 365)	0.7799747790962077
  (349834, 422)	0.6729140714854994
  (349834, 140)	0.7397206583547656
  (349836, 79)	0.34209207509717654
  (349836, 215)	0.9396664366442529
  (349837, 109)	1.0
  (34983

In [40]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense == 0).sum()/data_dense.size)*100, "%")

Sparsicity:  99.787754406948 %


In [41]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_topics=8,               # Number of topics
                                      max_iter=100,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=8, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [33]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_vectorized, dictionary=id2word, coherence='u_mass')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -305992.13210978365
Perplexity:  747.3711736467765
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 100,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': -1,
 'n_topics': 8,
 'perp_tol': 0.1,
 'random_state': 42,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [35]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
