In [1]:
# Run in terminal or command prompt
# python3 -m spacy download en

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

#Gensim
from gensim.models import CoherenceModel

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import os
root = os.path.expanduser('~')

company_index='5'
company_name='Amazon'
img_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/img/companies/'+f'{company_index}_{company_name}'

if not os.path.exists(img_path):
    os.makedirs(img_path)

In [2]:
## making dataset

def get_px_data(company_index=5, company_name='Amazon', uni=False, bi=False, tri=False):
    data_path = root + '/Desktop/workspace/indeed/Job-Satisfaction/data/companies/'+f'{company_index}_{company_name}'+'/output_data/px_data/pros.csv'
    df = pd.read_csv(data_path)
    print(df.head())
    if uni and bi and tri:
        df_data = df['Unigrams'].fillna('') + ' ' + df['Bigrams'].fillna('') + ' ' + df['Trigrams'].fillna('')
    elif uni:
        df_data = df['Unigrams'].dropna()
    elif bi:
        df_data = df['Bigrams'].dropna()
    elif tri:
        df_data = df['Trigrams'].dropna()
    
    return df_data.tolist()
    
def merge_data(company_list=[5], uni=False, bi=False, tri=False):
    sentences = []
    df_company_list = pd.read_csv( root + '/Desktop/workspace/indeed/Job-Satisfaction/data/scraper_data/review_site.csv')
    sentences = []
    for company_index in company_list:
        company_name = df_company_list.iloc[company_index - 1]['Company_Name']
        sentences.extend(get_px_data(company_index=company_index, company_name=company_name, 
                        uni=uni, bi=bi, tri=tri))
    return sentences

sentences = merge_data(company_list=[5, 6, 7, 8, 9, 10],
                        uni=True, bi=True, tri=True)
# sentences = merge_data(company_list=range(1, 51),
#                         uni=True, bi=True, tri=True)
print(len(sentences))
sentences[:5]

   Unnamed: 0                                        Review_Text  \
0           0                                3 Days in a row off   
1           1  Benefits, There many different shifts, work ha...   
2           2                   Free drinks, paid time, overtime   
3           3  On your own, flexible, can keep job even worki...   
4           4                                     Great benefits   

                                            Unigrams               Bigrams  \
0                                                day                   NaN   
1  advanc benefit differ half hard holiday major ...                   NaN   
2                       drink free overtim paid time  free_drink paid_time   
3                                 flexibl month time          flexibl_time   
4                                            benefit                   NaN   

  Trigrams  
0      NaN  
1      NaN  
2      NaN  
3      NaN  
4      NaN  
   Unnamed: 0                               

['day  ',
 'advanc benefit differ half hard holiday major shift time  ',
 'drink free overtim paid time free_drink paid_time ',
 'flexibl month time flexibl_time ',
 'benefit  ']

In [3]:
data_px = sentences

In [4]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
                             # max_features=50000,             # max number of uniq words
                            )

data_vectorized = vectorizer.fit_transform(data_px)
print(data_vectorized)

  (0, 148)	1
  (1, 557)	1
  (1, 491)	1
  (1, 322)	1
  (1, 263)	1
  (1, 257)	1
  (1, 254)	1
  (1, 158)	1
  (1, 41)	1
  (1, 11)	1
  (2, 379)	2
  (2, 376)	1
  (2, 229)	2
  (2, 170)	2
  (2, 557)	2
  (3, 346)	1
  (3, 223)	2
  (3, 557)	2
  (4, 41)	1
  (5, 469)	1
  (6, 355)	2
  (6, 56)	2
  (8, 423)	1
  (8, 258)	2
  (8, 17)	1
  :	:
  (53235, 385)	1
  (53236, 41)	1
  (53237, 564)	1
  (53237, 523)	1
  (53237, 41)	1
  (53238, 583)	1
  (53238, 469)	1
  (53239, 319)	2
  (53239, 229)	2
  (53240, 544)	2
  (53240, 561)	1
  (53240, 41)	2
  (53241, 333)	1
  (53242, 482)	1
  (53242, 466)	1
  (53242, 324)	1
  (53242, 266)	1
  (53242, 477)	1
  (53242, 223)	1
  (53243, 527)	1
  (53243, 544)	1
  (53243, 324)	1
  (53244, 324)	1
  (53245, 392)	2
  (53245, 41)	2


In [5]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense == 0).sum()/data_dense.size)*100, "%")

Sparsicity:  99.64511973307748 %


In [7]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_topics=8,               # Number of topics
                                      max_iter=100,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=42,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=100, mean_change_tol=0.001,
             n_components=10, n_jobs=-1, n_topics=8, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)


In [8]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=lda_model, texts=data_vectorized, dictionary=id2word, coherence='u_mass')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\nCoherence Score: ', coherence_lda)

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -786081.6060611812
Perplexity:  113.4880234995769
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 100,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': -1,
 'n_topics': 8,
 'perp_tol': 0.1,
 'random_state': 42,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [13]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, sort_topics=False)
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [14]:
pyLDAvis.save_html(panel, 'a.html')