### Imports and Loading the Data

In [144]:
import numpy as np

from gensim.models import LdaModel

# Used to tokenize the text; i.e. create a dictionary mapping words to integers. The dictionary can be used to create a term-document matrix.
from gensim.corpora import Dictionary

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

import spacy

from textacy import extract

In [18]:
# For topic visualizations 
import pyLDAvis.gensim_models as gensim_vis
import pyLDAvis
# For enabling HTML widget in Jupyter notebook
from pyLDAvis import enable_notebook

enable_notebook(local=True)

In [19]:
def create_list_from_csv(path):
    corpus = []
    with open(path, 'r') as file:
        lines = file.readlines()
        for line in lines:
            columns = line.split(',')   
            # Columns 1 and 2 contain the company name and the job title, both guaranteed to not include commas, and both separated by a comma. 
            # We are not analyzing this information, so we can safely discard the first two columns.
            # The third "column" contains the job description, but it may contain commas, so we use ",".join() to concatenate all the columns after the second one.
            # csv.reader()'s quotechar parameter does not seem to work for whatever reason, and this just seemed faster. 
            description = ",".join(columns[2:]).strip('"')      # strip('"') to remove leading and trailing quotes
            corpus.append(description)

    return corpus

In [20]:
corpus = create_list_from_csv('jobs.csv') 

In [21]:
print(len(corpus))
print(len(corpus[-1]))
print(corpus[-1][:200])

98
1812
 "What You'll Do Analyze, design, develop, test, review, document and troubleshoot data pipeline / ELT solutions against multiple structured and unstructured data sources. Support our team of analysts


In [22]:
sum = 0
doc_lengths = []
for doc in corpus:
    sum += len(doc.split())
    print("Number of words:", len(doc.split()))
    doc_lengths.append(len(doc.split()))
    
print(f"Total number of words in the corpus: {sum}")
print("Mean:", round(np.mean(doc_lengths),2))
print("Standard deviation:", round(np.std(doc_lengths),2))

Number of words: 178
Number of words: 45
Number of words: 180
Number of words: 180
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 329
Number of words: 198
Number of words: 142
Number of words: 134
Number of words: 134
Number of words: 180
Number of words: 291
Number of words: 291
Number of words: 134
Number of words: 134
Number of words: 230
Number of words: 552
Number of words: 391
Number of words: 118
Number of words: 275
Number of words: 293
Number of words: 355
Number of words: 307
Number of words: 366
Number of words: 209
Number of words: 666
Number of words: 191
Number of words: 448
Number of words: 139
Number of words: 244
Number of words: 216
Number of words: 142
Number of words: 51
Number of words: 109
Number of words: 172
Number of words: 116
Number of words: 151
Number of words: 115
Number of words: 136
Number of words: 271
Number of words: 177
Number of words: 165
Number of words

### Cleaning the corpus

In [194]:
def clean_with_spacy(doc, lemmatize = False):

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])   # Disable the parser and named entity recognition since we only need the tokenization, lemmatization, and POS tagging
    # Add custom stop words that don't add anything to each topic.
    nlp.Defaults.stop_words |= {"experience", "preferred", "skill", "yelp", "strong", "work", "solutions", "drive", "insights", "use", "needs", "responsibilities", "do", "particularly", "related", "leak", "radio", }
    
    spacy_doc = nlp(doc.lower())

    ngrams = [
        ngram.text.replace(" ", "_")    # ngrams are separated by spaces, so we replace them with underscores
        for ngram in extract.ngrams(spacy_doc, n = 2, min_freq = 4, filter_punct = True, filter_nums = True, exclude_pos=["ORG", "DATE", "X"]) 
        if not ngram.text.__contains__("=") 
            and not ngram.text.__contains__("@") 
            and not ngram.text.__contains__("$")
    ]
    
    allowed_pos_tags = ["NOUN", "ADJ", "VERB", "ADV"]
    tokens = []

    if lemmatize:
        # Remove stopwords, punctuation, and numeric tokens
        tokens = [
            token.lemma_ 
            for token in spacy_doc 
            if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
                and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters     
                and not len(token.text) == 1                                                            # Remove single character words
                and token.pos_ in allowed_pos_tags                                                      # Keep only words that are nouns, adjectives, verbs, and adverbs
        ]          

        tokens = [token.replace("datum", "data") for token in tokens]                                   # Replace "datum" (lemma of data) with "data" for clarity
    
    else:
        tokens = [
            token.text 
            for token in spacy_doc 
            if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
                and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters     
                and not len(token.text) == 1                                                            # Remove single character words
                and token.pos_ in allowed_pos_tags                                                      # Keep only words that are nouns, adjectives, verbs, and adverbs
        ]                                                                                   
    
    return tokens + ngrams

Creating non-lemmatized corpus 

In [195]:
corpus_with_bigrams = [clean_with_spacy(doc) for doc in corpus]
sum = 0
for doc in corpus_with_bigrams:
    sum += len(doc)

print(f"Total number of words in the cleaned corpus: {sum}")

Total number of words in the cleaned corpus: 14290


In [196]:
print(corpus_with_bigrams[0])

['data', 'collection', 'cleaning', 'assist', 'collection', 'organization', 'data', 'data', 'cleaning', 'preprocessing', 'activities', 'ensure', 'data', 'accuracy', 'data', 'analysis', 'utilize', 'statistical', 'methods', 'tools', 'assist', 'analysis', 'datasets', 'gsp', 'team', 'members', 'identify', 'trends', 'patterns', 'data', 'data', 'visualization', 'support', 'creation', 'visualizations', 'reports', 'communicating', 'data', 'findings', 'collaboration', 'collaborate', 'team', 'members', 'understand', 'data', 'requirements', 'provide', 'support', 'delivering', 'analytical', 'learn', 'experienced', 'team', 'members', 'actively', 'seek', 'guidance', 'generation', 'learn', 'summarize', 'communicatedatainsights', 'clear', 'understandable', 'learning', 'actively', 'participate', 'training', 'development', 'opportunities', 'enhance', 'skills', 'job', 'qualifications', 'bachelor', 'degree', 'relevant', 'field', 'mathematics', 'computer', 'science', 'equivalent', 'basic', 'understanding', 

In [197]:
dictionary = Dictionary(corpus_with_bigrams)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus_with_bigrams]
print(doc_term_matrix[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 4), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 11), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 3), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 3), (65, 1), (66, 1), (67, 2), (68, 4), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1)]


Creating lemmatized corpus

In [198]:
lem_corpus_with_bigrams = [clean_with_spacy(doc, lemmatize=True) for doc in corpus]
sum = 0
for doc in lem_corpus_with_bigrams:
    sum += len(doc)

print(f"Total number of words in the cleaned corpus: {sum}")

Total number of words in the cleaned corpus: 14290


In [199]:
print(lem_corpus_with_bigrams[0])

['data', 'collection', 'cleaning', 'assist', 'collection', 'organization', 'data', 'data', 'cleaning', 'preprocesse', 'activity', 'ensure', 'data', 'accuracy', 'data', 'analysis', 'utilize', 'statistical', 'method', 'tool', 'assist', 'analysis', 'dataset', 'gsp', 'team', 'member', 'identify', 'trend', 'pattern', 'data', 'data', 'visualization', 'support', 'creation', 'visualization', 'report', 'communicate', 'data', 'finding', 'collaboration', 'collaborate', 'team', 'member', 'understand', 'data', 'requirement', 'provide', 'support', 'deliver', 'analytical', 'learn', 'experienced', 'team', 'member', 'actively', 'seek', 'guidance', 'generation', 'learn', 'summarize', 'communicatedatainsight', 'clear', 'understandable', 'learning', 'actively', 'participate', 'training', 'development', 'opportunity', 'enhance', 'skill', 'job', 'qualification', 'bachelor', 'degree', 'relevant', 'field', 'mathematic', 'computer', 'science', 'equivalent', 'basic', 'understanding', 'data', 'analysis', 'concep

In [200]:
lem_dictionary = Dictionary(lem_corpus_with_bigrams)
lem_doc_term_matrix = [lem_dictionary.doc2bow(doc) for doc in lem_corpus_with_bigrams]
print(lem_doc_term_matrix[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 4), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 11), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 3), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 3), (65, 1), (66, 1), (67, 2), (68, 4), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2)]


### Topic modeling with non-lemmatized corpus

In [201]:
NUM_TOPICS = 8
PATH_TO_MODEL = f"Entry_Jobs_NoLemma_LDA_{NUM_TOPICS}_topics"
lda_model = None

In [202]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.010*"algorithms" + 0.010*"analysis" + 0.009*"sentiment" + 0.009*"text" + 0.009*"quality" + 0.007*"team" + 0.007*"knowledge" + 0.007*"design" + 0.006*"best" + 0.006*"contribute"'),
 (1,
  '0.023*"learning" + 0.020*"machine" + 0.019*"models" + 0.016*"ai" + 0.010*"research" + 0.009*"skills" + 0.008*"machine_learning" + 0.008*"engineering" + 0.008*"systems" + 0.008*"performance"'),
 (2,
  '0.014*"mining" + 0.011*"data_mining" + 0.011*"programs" + 0.009*"op" + 0.009*"co" + 0.009*"school" + 0.006*"idea" + 0.005*"create" + 0.004*"new" + 0.004*"game"'),
 (3,
  '0.086*"data" + 0.016*"skills" + 0.013*"analysis" + 0.011*"tools" + 0.008*"science" + 0.008*"processes" + 0.008*"team" + 0.008*"business" + 0.008*"analytics" + 0.008*"development"'),
 (4,
  '0.064*"data" + 0.011*"skills" + 0.010*"business" + 0.010*"learning" + 0.009*"machine" + 0.008*"models" + 0.008*"teams" + 0.008*"pipelines" + 0.008*"tools" + 0.007*"systems"'),
 (5,
  '0.022*"data" + 0.014*"ability" + 0.009*"spark" + 0.009*"

In [203]:
from gensim.test.utils import datapath
lda_model.save(datapath(PATH_TO_MODEL))

In [204]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_NoLemma_{NUM_TOPICS}.html')

In [205]:
NUM_TOPICS = 3

In [206]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.009*"research" + 0.008*"learning" + 0.008*"knowledge" + 0.006*"team" + 0.006*"development" + 0.006*"design" + 0.006*"skills" + 0.005*"algorithms" + 0.005*"code" + 0.005*"systems"'),
 (1,
  '0.061*"data" + 0.012*"skills" + 0.011*"learning" + 0.009*"machine" + 0.009*"models" + 0.008*"tools" + 0.008*"business" + 0.008*"analysis" + 0.006*"science" + 0.006*"team"'),
 (2,
  '0.029*"data" + 0.013*"business" + 0.009*"skills" + 0.007*"partners" + 0.006*"design" + 0.005*"ability" + 0.005*"support" + 0.005*"different" + 0.005*"processes" + 0.004*"teams"')]

In [207]:
from gensim.test.utils import datapath
lda_model.save(datapath(PATH_TO_MODEL))

LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_NoLemma_{NUM_TOPICS}.html')

In [208]:
NUM_TOPICS = 10

In [209]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.016*"research" + 0.012*"analysis" + 0.010*"algorithms" + 0.010*"sentiment" + 0.009*"team" + 0.009*"text" + 0.009*"design" + 0.008*"contribute" + 0.008*"interns" + 0.008*"research_interns"'),
 (1,
  '0.011*"learning" + 0.011*"machine" + 0.009*"systems" + 0.009*"software" + 0.008*"infrastructure" + 0.008*"apply" + 0.007*"performance" + 0.007*"design" + 0.007*"quality" + 0.006*"skills"'),
 (2,
  '0.000*"data" + 0.000*"learning" + 0.000*"ability" + 0.000*"models" + 0.000*"skills" + 0.000*"development" + 0.000*"machine" + 0.000*"research" + 0.000*"understanding" + 0.000*"knowledge"'),
 (3,
  '0.094*"data" + 0.017*"skills" + 0.013*"analysis" + 0.012*"tools" + 0.010*"science" + 0.010*"processes" + 0.009*"ability" + 0.009*"business" + 0.009*"team" + 0.009*"analytics"'),
 (4,
  '0.038*"data" + 0.018*"learning" + 0.017*"machine" + 0.016*"models" + 0.009*"systems" + 0.009*"business" + 0.009*"ai" + 0.008*"aws" + 0.008*"skills" + 0.008*"pipelines"'),
 (5,
  '0.035*"data" + 0.009*"complex"

In [210]:
from gensim.test.utils import datapath
lda_model.save(datapath(PATH_TO_MODEL))

LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_NoLemma_{NUM_TOPICS}.html')

Effect of number of passes

In [211]:
lda_model = LdaModel(doc_term_matrix, num_topics = 7, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 25, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.012*"research" + 0.010*"analysis" + 0.008*"algorithms" + 0.007*"team" + 0.007*"quality" + 0.007*"sentiment" + 0.007*"software" + 0.007*"contribute" + 0.006*"systems" + 0.006*"collaborate"'),
 (1,
  '0.017*"learning" + 0.014*"machine" + 0.012*"ai" + 0.012*"models" + 0.009*"skills" + 0.007*"performance" + 0.007*"systems" + 0.006*"computer" + 0.006*"infrastructure" + 0.006*"training"'),
 (2,
  '0.000*"data" + 0.000*"learning" + 0.000*"models" + 0.000*"skills" + 0.000*"research" + 0.000*"machine" + 0.000*"engineering" + 0.000*"development" + 0.000*"ability" + 0.000*"technologies"'),
 (3,
  '0.091*"data" + 0.016*"skills" + 0.013*"analysis" + 0.011*"tools" + 0.009*"team" + 0.009*"ability" + 0.008*"processes" + 0.008*"science" + 0.008*"analytical" + 0.007*"reports"'),
 (4,
  '0.050*"data" + 0.018*"learning" + 0.017*"machine" + 0.015*"models" + 0.010*"pipelines" + 0.009*"skills" + 0.009*"business" + 0.009*"systems" + 0.008*"engineering" + 0.008*"aws"'),
 (5,
  '0.037*"data" + 0.011*"

In [212]:
lda_model = LdaModel(doc_term_matrix, num_topics = 7, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.011*"research" + 0.009*"analysis" + 0.008*"algorithms" + 0.008*"team" + 0.007*"quality" + 0.007*"sentiment" + 0.007*"software" + 0.007*"contribute" + 0.006*"collaborate" + 0.006*"systems"'),
 (1,
  '0.016*"learning" + 0.013*"machine" + 0.011*"ai" + 0.010*"models" + 0.009*"skills" + 0.007*"systems" + 0.007*"performance" + 0.006*"infrastructure" + 0.006*"computer" + 0.005*"training"'),
 (2,
  '0.000*"data" + 0.000*"learning" + 0.000*"models" + 0.000*"research" + 0.000*"skills" + 0.000*"machine" + 0.000*"engineering" + 0.000*"development" + 0.000*"ability" + 0.000*"technologies"'),
 (3,
  '0.093*"data" + 0.017*"skills" + 0.013*"analysis" + 0.011*"tools" + 0.009*"team" + 0.009*"science" + 0.009*"ability" + 0.009*"processes" + 0.007*"analytical" + 0.007*"business"'),
 (4,
  '0.043*"data" + 0.020*"learning" + 0.019*"machine" + 0.016*"models" + 0.010*"pipelines" + 0.009*"systems" + 0.009*"business" + 0.009*"engineering" + 0.008*"aws" + 0.008*"skills"'),
 (5,
  '0.035*"data" + 0.010*

In [213]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_NoLemma_7_Auto_50_passes.html')

Just out of curiosity

In [214]:
from gensim.models import HdpModel
# from pprint import pprint

hdp_model = HdpModel(doc_term_matrix, id2word = dictionary)
hdp_model.optimal_ordering()
hdp_model.show_topics(num_topics = 15, num_words = 10)

[(0,
  '0.010*data + 0.005*aws + 0.004*systems + 0.004*analytics + 0.004*machine + 0.004*learning + 0.004*building + 0.004*product + 0.003*manageable + 0.003*enabling'),
 (1,
  '0.005*learning + 0.004*machine + 0.004*machine_learning + 0.004*data + 0.003*models + 0.003*image + 0.002*large + 0.002*energy + 0.002*engineering + 0.002*model'),
 (2,
  '0.014*data + 0.003*environment + 0.003*present + 0.003*skills + 0.002*positions + 0.002*ability + 0.002*codepipeline + 0.002*colleagues + 0.002*participate + 0.002*thrive'),
 (3,
  '0.005*data + 0.004*requirements + 0.003*production + 0.003*company + 0.003*failure + 0.003*clauses + 0.003*machine_learning + 0.003*platforms + 0.002*codebase + 0.002*masterpieces'),
 (4,
  '0.006*data + 0.004*business + 0.003*systems + 0.003*good + 0.003*learning + 0.003*team + 0.003*overcome + 0.003*standardize + 0.002*workflows + 0.002*ems'),
 (5,
  '0.006*data + 0.003*cross + 0.003*methods + 0.003*reports + 0.003*uncovering + 0.003*skills + 0.003*insurance + 0

In [215]:
alpha, beta = hdp_model.hdp_to_lda()

In [216]:
print(alpha.shape)
print(beta.shape)

(150,)
(150, 2190)


### Topic modeling with lemmatized corpus

In [217]:
NUM_TOPICS = 8
PATH_TO_MODEL = f"Entry_Jobs_LDA_{NUM_TOPICS}_topics"
lda_model = None

In [218]:
lda_model = LdaModel(lem_doc_term_matrix, num_topics = NUM_TOPICS, id2word = lem_dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.008*"machine" + 0.008*"record" + 0.008*"public" + 0.008*"track" + 0.008*"cloud" + 0.005*"open" + 0.005*"kubernete" + 0.005*"application" + 0.005*"distribute" + 0.005*"idea"'),
 (1,
  '0.021*"model" + 0.011*"data" + 0.011*"code" + 0.010*"vision" + 0.010*"framework" + 0.010*"language" + 0.009*"knowledge" + 0.008*"proficiency" + 0.008*"research" + 0.008*"team"'),
 (2,
  '0.018*"research" + 0.014*"data" + 0.013*"ability" + 0.008*"candidate" + 0.008*"skill" + 0.007*"system" + 0.007*"research_interns" + 0.007*"intern" + 0.007*"layer" + 0.006*"innovative"'),
 (3,
  '0.036*"model" + 0.024*"machine" + 0.022*"learning" + 0.018*"ai" + 0.016*"data" + 0.013*"machine_learning" + 0.011*"learn" + 0.010*"design" + 0.010*"team" + 0.010*"engineering"'),
 (4,
  '0.021*"data" + 0.014*"application" + 0.011*"include" + 0.010*"learn" + 0.010*"real" + 0.010*"code" + 0.009*"feature" + 0.009*"cloud" + 0.009*"skill" + 0.009*"analysis"'),
 (5,
  '0.018*"data" + 0.013*"skill" + 0.011*"product" + 0.010*"pr

In [219]:
from gensim.test.utils import datapath
lda_model.save(datapath(PATH_TO_MODEL))

In [220]:
LDAvis_prepared = gensim_vis.prepare(lda_model, lem_doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

IndexError: index 1811 is out of bounds for axis 1 with size 1811

In [186]:
NUM_TOPICS = 3

In [187]:
lda_model = LdaModel(lem_doc_term_matrix, num_topics = NUM_TOPICS, id2word = lem_dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.014*"learning" + 0.010*"machine" + 0.010*"models" + 0.009*"data" + 0.009*"research" + 0.008*"knowledge" + 0.006*"design" + 0.006*"skills" + 0.005*"systems" + 0.005*"development"'),
 (1,
  '0.065*"data" + 0.011*"skills" + 0.011*"business" + 0.008*"tools" + 0.008*"learning" + 0.007*"analysis" + 0.006*"analytics" + 0.006*"machine" + 0.006*"science" + 0.006*"teams"'),
 (2,
  '0.038*"data" + 0.014*"skills" + 0.011*"learning" + 0.008*"machine" + 0.008*"team" + 0.008*"models" + 0.006*"tools" + 0.006*"analysis" + 0.006*"ability" + 0.006*"computer"')]

In [188]:
LDAvis_prepared = gensim_vis.prepare(lda_model, lem_doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [189]:
NUM_TOPICS = 10

In [None]:
lda_model = LdaModel(lem_doc_term_matrix, num_topics = NUM_TOPICS, id2word = lem_dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.013*"research" + 0.010*"analysis" + 0.009*"contribute" + 0.009*"algorithms" + 0.009*"systems" + 0.008*"data" + 0.008*"collaborate" + 0.008*"sentiment" + 0.008*"text" + 0.008*"infrastructure"'),
 (1,
  '0.056*"data" + 0.015*"clients" + 0.014*"client" + 0.014*"reports" + 0.011*"teams" + 0.009*"account_teams" + 0.009*"account" + 0.008*"findings" + 0.007*"tools" + 0.006*"analyze"'),
 (2,
  '0.021*"ability" + 0.010*"sql" + 0.010*"processes" + 0.008*"scripts" + 0.008*"excel" + 0.008*"database" + 0.008*"understand" + 0.007*"data" + 0.006*"escalation" + 0.006*"formulas"'),
 (3,
  '0.031*"data" + 0.029*"learning" + 0.025*"machine" + 0.017*"models" + 0.014*"systems" + 0.011*"machine_learning" + 0.011*"aws" + 0.008*"building" + 0.008*"product" + 0.007*"analytics"'),
 (4,
  '0.069*"data" + 0.015*"skills" + 0.014*"business" + 0.009*"team" + 0.009*"science" + 0.008*"tools" + 0.008*"learning" + 0.007*"models" + 0.007*"development" + 0.007*"analysis"'),
 (5,
  '0.015*"knowledge" + 0.011*"und

In [191]:
LDAvis_prepared = gensim_vis.prepare(lda_model, lem_doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [192]:
print("Done2!")

Done!
