# Topic modeling of job reviews
## Andrew Hall <br><sup>August 28, 2022 <br> Initial script for final project of Metis NLP short course</sup>


In [1]:
import pandas as pd

business_names = ['Adobe', 'Airbnb', 'Amazon', 'Apple', 'Atlassian', 'Bloomberg',
                 'Bytedance', 'Cisco', 'Coinbase', 'Deloitte', 'Goldman-Sachs', 'Google',
                 'IBM', 'Intel', 'Intuit', 'Meta', 'Microsoft', 'Netflix', 'Oracle',
                 'Salesforce', 'SAP-Labs', 'Stripe', 'Twitter', 'Uber', 'Walmart']

#initialize a dictionary
data = {}

#read in each company data with key being name of company
for name in business_names: 
    data[name] = pd.read_csv('data/'+name+'/'+name+'-data.csv')

In [2]:
full_data = []
cols = ['Rating','Description', 'Pros', 'Cons', 'Company']
for company in data:
    subset_data = data[company]
    subset_data['Company'] = company
    full_data.append(subset_data[cols])

full_data = pd.concat(full_data)
print("The number of documents is: ", full_data.shape[0])

The number of documents is:  43803


In [3]:
full_data_long = pd.melt(full_data, id_vars = ['Company', 'Rating'], value_vars = ['Description', 'Pros', 'Cons'], var_name = 'Prompt', value_name = 'Output')
print("The number of individual document components is: ", full_data_long.shape[0])

The number of individual document components is:  131409


In [4]:
sum([len(str(d).split(' ')) for d in full_data_long.Output]) > 100000.

True

In [5]:
print("The aggregated number of non-unique terms is: ", sum([len(str(d).split(' ')) for d in full_data_long.Output]))

The aggregated number of non-unique terms is:  1903424


In [6]:
# example call of the data for the first ten entries for Adobe
for d in full_data_long.Output[:10]:
    print(d)

A decent tier 2 company 
Good Company...terrible middle managers
Great place to work
Not a place for work life balance, full of politics.
Work life balance is good
Great benefits and very good wlb
First Impressions 
Gr8 WLB, Management heavy with no direction
Disappointing 
Adobe is amazing. Your managers may not be.


# Create initial vectorized output

In [7]:
full_data.Description = [str(row) for row in full_data.Description]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Create doc-term matrix initially using CountVectorizer
vec = CountVectorizer(stop_words = "english")
doc_term = vec.fit_transform(full_data.Description)

doc_term.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
type(doc_term)

scipy.sparse.csr.csr_matrix

In [10]:
doc_term.shape

(43803, 6711)

In [11]:
pd.DataFrame(doc_term.toarray(), columns = vec.get_feature_names_out())

Unnamed: 0,10,100,1000,100mph,10x,10yrs,11,110,12,12127,...,yoy,ypu,yr,yrs,yymv,zero,zone,zones,zrh,豆腐渣工程
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43798,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43799,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43800,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
43801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# -------------------------------

# Topic modeling start

In [12]:
#!pip install pyLDAvis

In [13]:
#!pip install gensim

In [14]:
# pyLDAvis
import pyLDAvis as pyLDA

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

  from imp import reload
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


**Change to term-doc matrix for LDA input**

In [15]:
data_description = full_data_long[full_data_long.Prompt == "Description"]

In [2]:
#term_doc = doc_term.transpose
#term_doc = vec.fit_transform(full_data.Description).transpose()
term_doc = vec.fit_transform(data_description['Output'].values.astype(str)).transpose()
#x = v.fit_transform(df['Review'].values.astype('U'))

NameError: name 'vec' is not defined

In [17]:
term_doc

<6711x43803 sparse matrix of type '<class 'numpy.int64'>'
	with 165771 stored elements in Compressed Sparse Column format>

In [18]:
pd.DataFrame(term_doc.toarray(), vec.get_feature_names_out())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43793,43794,43795,43796,43797,43798,43799,43800,43801,43802
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100mph,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10x,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zero,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zones,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zrh,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Convert to gensim
We need to convert our sparse `scipy` matrix to a `gensim`-friendly object called a Corpus:

In [19]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(term_doc)

#### Map matrix rows to words (tokens)
We need to save a mapping (dict) of row id to word (token) for later use by gensim:

In [20]:
id2word = dict((v, k) for k, v in vec.vocabulary_.items())

Construct initial LDA model

In [21]:
import gensim
# Create lda model (equivalent to "fit" in sklearn)
#lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)

2022-09-01 00:27:15,305 : INFO : using symmetric alpha at 0.3333333333333333
2022-09-01 00:27:15,310 : INFO : using symmetric eta at 0.3333333333333333
2022-09-01 00:27:15,315 : INFO : using serial LDA version on this node
2022-09-01 00:27:15,321 : INFO : running online (multi-pass) LDA training, 3 topics, 5 passes over the supplied corpus of 43803 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2022-09-01 00:27:15,327 : INFO : PROGRESS: pass 0, at document #2000/43803
2022-09-01 00:27:15,996 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:15,998 : INFO : topic #0 (0.333): 0.103*"good" + 0.084*"great" + 0.066*"company" + 0.039*"culture" + 0.036*"wlb" + 0.026*"management" + 0.025*"work" + 0.020*"people" + 0.020*"place" + 0.012*"career"
2022-09-01 00:27:15,999 : INFO : topic #1 (0.333): 0.136*"work" + 0.113*"great" + 0.091*"life" + 0.091*"

2022-09-01 00:27:18,380 : INFO : topic #2 (0.333): 0.080*"great" + 0.076*"wlb" + 0.051*"growth" + 0.049*"career" + 0.036*"good" + 0.028*"bad" + 0.023*"place" + 0.019*"culture" + 0.019*"opportunities" + 0.016*"pay"
2022-09-01 00:27:18,380 : INFO : topic diff=0.328923, rho=0.333333
2022-09-01 00:27:18,721 : INFO : -5.945 per-word bound, 61.6 perplexity estimate based on a held-out corpus of 2000 documents with 7910 words
2022-09-01 00:27:18,722 : INFO : PROGRESS: pass 0, at document #20000/43803
2022-09-01 00:27:18,963 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:18,964 : INFO : topic #0 (0.333): 0.111*"good" + 0.076*"company" + 0.051*"great" + 0.031*"culture" + 0.028*"people" + 0.020*"wlb" + 0.019*"management" + 0.017*"overall" + 0.015*"tech" + 0.010*"learning"
2022-09-01 00:27:18,964 : INFO : topic #1 (0.333): 0.182*"work" + 0.115*"life" + 0.114*"balance" + 0.108*"great" + 0.048*"place" + 0.034*"good" + 0.030*"team" + 0.017*"learn" + 0.0

2022-09-01 00:27:20,836 : INFO : topic diff=0.222080, rho=0.235702
2022-09-01 00:27:20,840 : INFO : PROGRESS: pass 0, at document #38000/43803
2022-09-01 00:27:21,050 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:21,051 : INFO : topic #0 (0.333): 0.095*"company" + 0.094*"good" + 0.059*"great" + 0.045*"culture" + 0.027*"people" + 0.024*"overall" + 0.024*"management" + 0.011*"tech" + 0.009*"better" + 0.008*"best"
2022-09-01 00:27:21,052 : INFO : topic #1 (0.333): 0.211*"work" + 0.145*"balance" + 0.144*"life" + 0.140*"great" + 0.040*"place" + 0.038*"good" + 0.028*"team" + 0.012*"depends" + 0.008*"learn" + 0.005*"manager"
2022-09-01 00:27:21,053 : INFO : topic #2 (0.333): 0.110*"wlb" + 0.101*"great" + 0.056*"good" + 0.047*"growth" + 0.040*"career" + 0.022*"compensation" + 0.021*"bad" + 0.021*"benefits" + 0.021*"pay" + 0.020*"place"
2022-09-01 00:27:21,053 : INFO : topic diff=0.241826, rho=0.229416
2022-09-01 00:27:21,382 : INFO : -5.398 per-w

2022-09-01 00:27:23,274 : INFO : topic diff=0.212326, rho=0.204544
2022-09-01 00:27:23,278 : INFO : PROGRESS: pass 1, at document #12000/43803
2022-09-01 00:27:23,476 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:23,478 : INFO : topic #0 (0.333): 0.073*"company" + 0.069*"good" + 0.040*"great" + 0.035*"culture" + 0.025*"management" + 0.023*"people" + 0.017*"overall" + 0.015*"experience" + 0.014*"amazon" + 0.012*"tech"
2022-09-01 00:27:23,478 : INFO : topic #1 (0.333): 0.178*"work" + 0.100*"life" + 0.097*"great" + 0.096*"balance" + 0.058*"team" + 0.055*"place" + 0.037*"good" + 0.033*"learn" + 0.021*"depends" + 0.014*"dependent"
2022-09-01 00:27:23,479 : INFO : topic #2 (0.333): 0.069*"wlb" + 0.066*"great" + 0.060*"good" + 0.050*"growth" + 0.044*"career" + 0.041*"bad" + 0.023*"place" + 0.022*"opportunities" + 0.019*"learning" + 0.017*"culture"
2022-09-01 00:27:23,479 : INFO : topic diff=0.216393, rho=0.204544
2022-09-01 00:27:23,484 : INFO :

2022-09-01 00:27:25,392 : INFO : PROGRESS: pass 1, at document #30000/43803
2022-09-01 00:27:25,593 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:25,594 : INFO : topic #0 (0.333): 0.090*"company" + 0.068*"good" + 0.051*"great" + 0.038*"culture" + 0.032*"people" + 0.025*"overall" + 0.022*"management" + 0.015*"tech" + 0.013*"best" + 0.009*"smart"
2022-09-01 00:27:25,595 : INFO : topic #1 (0.333): 0.207*"work" + 0.136*"great" + 0.133*"life" + 0.133*"balance" + 0.052*"place" + 0.039*"good" + 0.030*"team" + 0.014*"learn" + 0.012*"depends" + 0.006*"dependent"
2022-09-01 00:27:25,595 : INFO : topic #2 (0.333): 0.098*"wlb" + 0.094*"great" + 0.069*"good" + 0.049*"growth" + 0.044*"career" + 0.025*"bad" + 0.023*"benefits" + 0.021*"pay" + 0.021*"place" + 0.021*"compensation"
2022-09-01 00:27:25,596 : INFO : topic diff=0.206063, rho=0.204544
2022-09-01 00:27:25,600 : INFO : PROGRESS: pass 1, at document #32000/43803
2022-09-01 00:27:25,796 : INFO : me

2022-09-01 00:27:27,677 : INFO : topic diff=0.192641, rho=0.200395
2022-09-01 00:27:27,681 : INFO : PROGRESS: pass 2, at document #4000/43803
2022-09-01 00:27:27,861 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:27,863 : INFO : topic #0 (0.333): 0.085*"company" + 0.060*"good" + 0.049*"great" + 0.042*"culture" + 0.030*"management" + 0.026*"people" + 0.020*"overall" + 0.016*"tech" + 0.013*"leadership" + 0.011*"experience"
2022-09-01 00:27:27,863 : INFO : topic #1 (0.333): 0.206*"work" + 0.130*"life" + 0.129*"balance" + 0.128*"great" + 0.051*"place" + 0.042*"good" + 0.041*"team" + 0.019*"learn" + 0.017*"depends" + 0.008*"dependent"
2022-09-01 00:27:27,864 : INFO : topic #2 (0.333): 0.089*"wlb" + 0.083*"great" + 0.074*"good" + 0.051*"growth" + 0.042*"career" + 0.029*"bad" + 0.021*"pay" + 0.021*"place" + 0.020*"compensation" + 0.018*"opportunities"
2022-09-01 00:27:27,864 : INFO : topic diff=0.204567, rho=0.200395
2022-09-01 00:27:27,868 : INF

2022-09-01 00:27:29,651 : INFO : topic diff=0.221550, rho=0.200395
2022-09-01 00:27:29,655 : INFO : PROGRESS: pass 2, at document #22000/43803
2022-09-01 00:27:29,843 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:29,845 : INFO : topic #0 (0.333): 0.083*"company" + 0.056*"good" + 0.048*"great" + 0.037*"culture" + 0.029*"people" + 0.021*"overall" + 0.021*"management" + 0.018*"tech" + 0.014*"best" + 0.010*"experience"
2022-09-01 00:27:29,845 : INFO : topic #1 (0.333): 0.201*"work" + 0.131*"great" + 0.125*"life" + 0.124*"balance" + 0.058*"place" + 0.038*"good" + 0.035*"team" + 0.019*"learn" + 0.014*"depends" + 0.008*"dependent"
2022-09-01 00:27:29,845 : INFO : topic #2 (0.333): 0.095*"wlb" + 0.084*"great" + 0.074*"good" + 0.046*"growth" + 0.045*"career" + 0.028*"bad" + 0.022*"place" + 0.020*"pay" + 0.018*"compensation" + 0.017*"culture"
2022-09-01 00:27:29,846 : INFO : topic diff=0.171080, rho=0.200395
2022-09-01 00:27:29,850 : INFO : PROGRES

2022-09-01 00:27:31,649 : INFO : PROGRESS: pass 2, at document #40000/43803
2022-09-01 00:27:31,829 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:31,830 : INFO : topic #0 (0.333): 0.097*"company" + 0.056*"good" + 0.054*"great" + 0.048*"culture" + 0.027*"people" + 0.023*"management" + 0.022*"overall" + 0.015*"best" + 0.014*"tech" + 0.008*"better"
2022-09-01 00:27:31,831 : INFO : topic #1 (0.333): 0.222*"work" + 0.150*"balance" + 0.150*"life" + 0.148*"great" + 0.046*"place" + 0.041*"good" + 0.026*"team" + 0.011*"depends" + 0.009*"learn" + 0.005*"manager"
2022-09-01 00:27:31,831 : INFO : topic #2 (0.333): 0.111*"wlb" + 0.097*"great" + 0.077*"good" + 0.046*"growth" + 0.039*"career" + 0.024*"compensation" + 0.024*"pay" + 0.023*"bad" + 0.019*"benefits" + 0.019*"place"
2022-09-01 00:27:31,831 : INFO : topic diff=0.186209, rho=0.200395
2022-09-01 00:27:31,835 : INFO : PROGRESS: pass 2, at document #42000/43803
2022-09-01 00:27:32,014 : INFO : mer

2022-09-01 00:27:33,544 : INFO : PROGRESS: pass 3, at document #14000/43803
2022-09-01 00:27:33,731 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:33,733 : INFO : topic #0 (0.333): 0.078*"company" + 0.050*"good" + 0.042*"great" + 0.041*"culture" + 0.025*"management" + 0.023*"people" + 0.018*"overall" + 0.014*"tech" + 0.013*"experience" + 0.011*"amazon"
2022-09-01 00:27:33,733 : INFO : topic #1 (0.333): 0.192*"work" + 0.113*"great" + 0.109*"life" + 0.105*"balance" + 0.058*"place" + 0.054*"team" + 0.039*"good" + 0.029*"learn" + 0.020*"depends" + 0.012*"dependent"
2022-09-01 00:27:33,734 : INFO : topic #2 (0.333): 0.078*"wlb" + 0.077*"good" + 0.071*"great" + 0.048*"growth" + 0.042*"career" + 0.039*"bad" + 0.020*"opportunities" + 0.020*"place" + 0.020*"compensation" + 0.019*"learning"
2022-09-01 00:27:33,734 : INFO : topic diff=0.189562, rho=0.196489
2022-09-01 00:27:33,738 : INFO : PROGRESS: pass 3, at document #16000/43803
2022-09-01 00:27:3

2022-09-01 00:27:35,707 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:35,708 : INFO : topic #0 (0.333): 0.090*"company" + 0.054*"good" + 0.049*"great" + 0.042*"culture" + 0.030*"people" + 0.026*"overall" + 0.021*"management" + 0.018*"best" + 0.015*"tech" + 0.009*"big"
2022-09-01 00:27:35,709 : INFO : topic #1 (0.333): 0.213*"work" + 0.146*"great" + 0.139*"life" + 0.138*"balance" + 0.053*"place" + 0.040*"good" + 0.031*"team" + 0.013*"learn" + 0.013*"depends" + 0.006*"dependent"
2022-09-01 00:27:35,709 : INFO : topic #2 (0.333): 0.107*"wlb" + 0.097*"great" + 0.081*"good" + 0.046*"growth" + 0.042*"career" + 0.025*"pay" + 0.024*"bad" + 0.023*"compensation" + 0.022*"benefits" + 0.019*"place"
2022-09-01 00:27:35,709 : INFO : topic diff=0.179223, rho=0.196489
2022-09-01 00:27:35,713 : INFO : PROGRESS: pass 3, at document #34000/43803
2022-09-01 00:27:35,894 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 0

2022-09-01 00:27:37,702 : INFO : PROGRESS: pass 4, at document #6000/43803
2022-09-01 00:27:37,873 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:37,875 : INFO : topic #0 (0.333): 0.078*"company" + 0.052*"good" + 0.042*"great" + 0.039*"culture" + 0.028*"management" + 0.024*"people" + 0.018*"overall" + 0.015*"tech" + 0.013*"leadership" + 0.012*"best"
2022-09-01 00:27:37,875 : INFO : topic #1 (0.333): 0.203*"work" + 0.125*"life" + 0.124*"great" + 0.123*"balance" + 0.056*"place" + 0.047*"team" + 0.042*"good" + 0.025*"learn" + 0.018*"depends" + 0.009*"dependent"
2022-09-01 00:27:37,876 : INFO : topic #2 (0.333): 0.085*"wlb" + 0.078*"good" + 0.077*"great" + 0.051*"growth" + 0.043*"career" + 0.032*"bad" + 0.021*"pay" + 0.020*"place" + 0.020*"compensation" + 0.020*"opportunities"
2022-09-01 00:27:37,876 : INFO : topic diff=0.199144, rho=0.192802
2022-09-01 00:27:37,880 : INFO : PROGRESS: pass 4, at document #8000/43803
2022-09-01 00:27:38,056 : I

2022-09-01 00:27:39,606 : INFO : PROGRESS: pass 4, at document #24000/43803
2022-09-01 00:27:39,788 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:39,790 : INFO : topic #0 (0.333): 0.084*"company" + 0.051*"good" + 0.045*"great" + 0.037*"culture" + 0.031*"people" + 0.024*"overall" + 0.020*"best" + 0.019*"management" + 0.017*"tech" + 0.010*"big"
2022-09-01 00:27:39,790 : INFO : topic #1 (0.333): 0.206*"work" + 0.143*"great" + 0.129*"life" + 0.127*"balance" + 0.061*"place" + 0.038*"good" + 0.032*"team" + 0.018*"learn" + 0.013*"depends" + 0.009*"google"
2022-09-01 00:27:39,791 : INFO : topic #2 (0.333): 0.101*"wlb" + 0.093*"great" + 0.076*"good" + 0.050*"growth" + 0.046*"career" + 0.026*"bad" + 0.020*"place" + 0.019*"pay" + 0.017*"compensation" + 0.017*"slow"
2022-09-01 00:27:39,791 : INFO : topic diff=0.171455, rho=0.192802
2022-09-01 00:27:39,795 : INFO : PROGRESS: pass 4, at document #26000/43803
2022-09-01 00:27:39,972 : INFO : merging cha

2022-09-01 00:27:41,722 : INFO : merging changes from 2000 documents into a model of 43803 documents
2022-09-01 00:27:41,724 : INFO : topic #0 (0.333): 0.090*"company" + 0.054*"good" + 0.046*"great" + 0.045*"culture" + 0.028*"people" + 0.026*"management" + 0.022*"overall" + 0.015*"best" + 0.013*"tech" + 0.011*"leadership"
2022-09-01 00:27:41,724 : INFO : topic #1 (0.333): 0.222*"work" + 0.151*"great" + 0.146*"balance" + 0.146*"life" + 0.050*"place" + 0.042*"good" + 0.027*"team" + 0.011*"depends" + 0.010*"learn" + 0.005*"manager"
2022-09-01 00:27:41,725 : INFO : topic #2 (0.333): 0.107*"wlb" + 0.097*"great" + 0.079*"good" + 0.046*"growth" + 0.039*"career" + 0.024*"bad" + 0.023*"pay" + 0.022*"compensation" + 0.018*"benefits" + 0.018*"place"
2022-09-01 00:27:41,725 : INFO : topic diff=0.198122, rho=0.192802
2022-09-01 00:27:41,956 : INFO : -5.741 per-word bound, 53.5 perplexity estimate based on a held-out corpus of 1803 documents with 6872 words
2022-09-01 00:27:41,956 : INFO : PROGRESS:

Here are the 10 most important words for each of the 3 topics we found:

In [23]:
#lda.print_topics()
lda_model.print_topics(num_words = 5)

2022-09-01 00:27:49,438 : INFO : topic #0 (0.333): 0.089*"company" + 0.054*"good" + 0.042*"great" + 0.041*"culture" + 0.030*"management"
2022-09-01 00:27:49,441 : INFO : topic #1 (0.333): 0.222*"work" + 0.145*"life" + 0.145*"balance" + 0.143*"great" + 0.049*"place"
2022-09-01 00:27:49,443 : INFO : topic #2 (0.333): 0.103*"wlb" + 0.089*"great" + 0.082*"good" + 0.045*"growth" + 0.036*"career"


[(0,
  '0.089*"company" + 0.054*"good" + 0.042*"great" + 0.041*"culture" + 0.030*"management"'),
 (1,
  '0.222*"work" + 0.145*"life" + 0.145*"balance" + 0.143*"great" + 0.049*"place"'),
 (2,
  '0.103*"wlb" + 0.089*"great" + 0.082*"good" + 0.045*"growth" + 0.036*"career"')]

#### Topic Space
If we want to map our documents to the topic space we need to actually use the LdaModel transformer that we created above, like so:

In [23]:
# Transform the docs from the word space to the topic space (like "transform" in sklearn)
lda_corpus = lda_model[corpus]
lda_corpus

<gensim.interfaces.TransformedCorpus at 0x7fb26192b490>

In [24]:
# Store the documents' topic vectors in a list for inspection
lda_docs = [doc for doc in lda_corpus]

Now we can take a look at the document vectors in the topic space, which are measures of the component of each document along each topic. Thus, at most a document vector can have num_topics=5 nonzero components in the topic space, and most have far fewer.

In [25]:
# Check out the document vectors in the topic space for the first 5 documents
lda_docs[0:5]

[[(0, 0.33121967), (1, 0.5834196), (2, 0.0853607)],
 [(0, 0.2583689), (1, 0.6860675), (2, 0.055563603)],
 [(0, 0.08347626), (1, 0.08605927), (2, 0.8304644)],
 [(0, 0.22352344), (1, 0.05555768), (2, 0.7209189)],
 [(0, 0.2666584), (1, 0.06666693), (2, 0.66667473)]]

In [222]:
#import gensim
#import pyLDAvis.gensim_models
#import pyLDAvis.gensim_models
#from pyl import gensim
#Creating Topic Distance Visualization 
#pyLDAvis.enable_notebook()
#p = pyLDAvis.gensim.prepare(optimal_model, corpus, id2word)
#pyl.gensim.prepare(lda, lda_corpus, id2word)
#import pyLDAvis.gensim
#Creating Topic Distance Visualization 
#pyLDA.enable_notebook()
#pyLDA.prepare(lda, corpus, id2word)

TypeError: prepare() missing 2 required positional arguments: 'vocab' and 'term_frequency'

In [26]:
import pyLDAvis
import pyLDAvis.gensim_models as gensim_vis

In [27]:
vec.vocabulary_.items()
word2id = dict((k, v) for k, v in vec.vocabulary_.items())
d = corpora.Dictionary()
d.id2token = id2word
d.token2id = word2id

In [28]:
#gensim_vis.pyLDAvis.prepare(lda, lda_corpus, id2word)
pyLDAvis.enable_notebook()
visualization = gensim_vis.prepare(lda_model, corpus = lda_corpus, dictionary = d)
visualization

  default_term_info = default_term_info.sort_values(


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


________________________________

______________________
### Future output to try

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_matrix = cosine_similarity(doc_term)

### OUTPUT THE CONTENTS OF THE MATRIX
cosine_matrix

In [None]:
# include bigrams in term list
vec = CountVectorizer(stop_words = "english", ngram_range = (1,2))

# try TFIDF approach
vec_tfidf = TfidfVectorizer()
doc_term_tfidf = vec_tfidf.fit_transform(full_data_long).toarray()
pd.DataFrame(doc_term_tfidf, columns = vec_tfidf.get_feature_names_out()

In [None]:
# spaCy application
#!python -m spacy download en_core_web_sm #this is the language model we are using--alternatives exist
import spacy
nlp = spacy.load('en_core_web_sm')

# use .pipe() to iterate over full corpora
df['spacy_doc'] = list(nlp.pipe(df.Text)) #update these terms so it matches current problem then save as new column
df[['spacy_doc', 'Type']].head()

# extract top adjectives for pros and cons reviews
pos_reviews = df[df.Type == 'pos']
pos_adj = [token.text.lower() for doc in 
						pos_reviews.spacy_doc 
						for token in doc if 
						token.pos_ == 'ADJ'] #pos here is "part of speech" not positive

neg_reviews = df[df.Type == 'neg']
neg_adj = [token.text.lower() for doc in 
						neg_reviews.spacy_doc 
						for token in doc if 
                            token.pos_ == 'ADJ'] #pos here is "part of speech" not positive

# find adjective modifiers of some noun 
noun_str = 'neighborhood'
adj_modifiers = []
top_adj_mod = []

for doc in df.spacy_doc: 
    for token in doc:
        if token.text == noun_str:
            for child in token.children:
                if child.dep_ == 'amod':
                    adj_modifiers.append(child.text.lower())

top_adj_mod = Counter(adj_modifiers).most_common(10)


In [None]:
# scattertext application
# note: might need to take a random sample of dataset as ours might be too large
# refer to scatter text demo, and exercise 4 in fancy nlp for hep
import scattertext as st
import pandas as pd

df = pd.read_csv('example.csv')

corpus = st.CorpusFromPandas(df,
						category_col = 'category',
						text_col = 'text',
						nlp = ...).build()

# following should be starting place if needing to random sample 
df.sample(5, random_state=10)

