### Imports and Loading the Data

In [1]:
import numpy as np

from gensim.models import LdaModel

# Used to tokenize the text; i.e. create a dictionary mapping words to integers. The dictionary can be used to create a term-document matrix.
from gensim.corpora import Dictionary

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

import spacy

from textacy import extract

In [2]:
# For topic visualizations 
import pyLDAvis.gensim_models as gensim_vis
import pyLDAvis
# For enabling HTML widget in Jupyter notebook
from pyLDAvis import enable_notebook

enable_notebook()

In [3]:
def create_list_from_csv(path):
    corpus = []
    with open(path, 'r') as file:
        lines = file.readlines()
        for line in lines:
            columns = line.split(',')   
            # Columns 1 and 2 contain the company name and the job title, both guaranteed to not include commas, and both separated by a comma. 
            # We are not analyzing this information, so we can safely discard the first two columns.
            # The third "column" contains the job description, but it may contain commas, so we use ",".join() to concatenate all the columns after the second one.
            # csv.reader()'s quotechar parameter does not seem to work for whatever reason, and this just seemed faster. 
            description = ",".join(columns[2:]).strip('"')      # strip('"') to remove leading and trailing quotes
            corpus.append(description)

    return corpus

In [4]:
strings_list = create_list_from_csv('jobs.csv') 

In [5]:
print(len(strings_list))
print(len(strings_list[-1]))
print(strings_list[-1][:200])

262
2051
 "What You'll Do Apply statistical and machine learning techniques to process and analyze unstructured textual data Develop and finetune machine learning models for tasks such as entity recognition, c


In [6]:
sum = 0
doc_lengths = []
for desc_string in strings_list:
    sum += len(desc_string.split())
    print("Number of words:", len(desc_string.split()))
    doc_lengths.append(len(desc_string.split()))
    
print(f"Total number of words in the corpus: {sum}")
print("Mean:", round(np.mean(doc_lengths),2))
print("Standard deviation:", round(np.std(doc_lengths),2))

Number of words: 178
Number of words: 45
Number of words: 180
Number of words: 180
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 329
Number of words: 198
Number of words: 142
Number of words: 134
Number of words: 134
Number of words: 180
Number of words: 291
Number of words: 291
Number of words: 134
Number of words: 134
Number of words: 230
Number of words: 552
Number of words: 391
Number of words: 118
Number of words: 275
Number of words: 293
Number of words: 355
Number of words: 307
Number of words: 366
Number of words: 209
Number of words: 666
Number of words: 191
Number of words: 448
Number of words: 139
Number of words: 216
Number of words: 142
Number of words: 51
Number of words: 109
Number of words: 172
Number of words: 116
Number of words: 151
Number of words: 115
Number of words: 136
Number of words: 271
Number of words: 177
Number of words: 165
Number of words: 177
Number of words

### Cleaning the corpus

In [7]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
nlp.Defaults.stop_words |= {"experience", "preferred", "skill", "yelp", "strong", "work", "solutions", "drive", "insights", "use", "needs", "responsibilities", "do", "particularly", "related", "leak", "radio", "understand", "apply", "self", "like", "work", "qualifications", "do", "bring", "include", "problem" }

In [8]:
"""
Old function, kept here because of how stupid it is

def clean_with_spacy(doc, lemmatize = False):

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])   # Disable the parser and named entity recognition since we only need the tokenization, lemmatization, and POS tagging
    # Add custom stop words that don't add anything to each topic.
    nlp.Defaults.stop_words |= {"experience", "preferred", "skill", "yelp", "strong", "work", "solutions", "drive", "insights", "use", "needs", "responsibilities", "do", "particularly", "related", "leak", "radio", }
    
    spacy_doc = nlp(doc.lower())

    ngrams = [
        ngram.text.replace(" ", "_")    # ngrams are separated by spaces, so we replace them with underscores
        for ngram in extract.ngrams(spacy_doc, n = 2, min_freq = 4, filter_punct = True, filter_nums = True, exclude_pos=["ORG", "DATE", "X"]) 
        if not ngram.text.__contains__("=") 
            and not ngram.text.__contains__("@") 
            and not ngram.text.__contains__("$")
    ]
    
    allowed_pos_tags = ["NOUN", "ADJ", "VERB", "ADV"]
    tokens = []

    if lemmatize:
        # Remove stopwords, punctuation, and numeric tokens
        tokens = [
            token.lemma_ 
            for token in spacy_doc 
            if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
                and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters     
                and not len(token.text) == 1                                                            # Remove single character words
                and token.pos_ in allowed_pos_tags                                                      # Keep only words that are nouns, adjectives, verbs, and adverbs
        ]          

        tokens = [token.replace("datum", "data") for token in tokens]                                   # Replace "datum" (lemma of data) with "data" for clarity
    
    else:
        tokens = [
            token.text 
            for token in spacy_doc 
            if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
                and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters     
                and not len(token.text) == 1                                                            # Remove single character words
                and token.pos_ in allowed_pos_tags                                                      # Keep only words that are nouns, adjectives, verbs, and adverbs
        ]                                                                                   
    
    return tokens + ngrams
"""

def clean_without_ngrams(doc):

    spacy_doc = nlp(doc.lower())

    # Remove stopwords, punctuation, and numeric tokens
    tokens = [
        token.text 
        for token in spacy_doc 
        if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
            and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters       
            and not len(token.text) == 1                                                            # Remove single character words
    ]                                                                         
    return tokens

def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    tokens = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        sent_tokens = []
        for token in doc: 
            if "_" in token.text:
                sent_tokens.append(token.text)
            else:
                if token.pos_ in allowed_postags:
                    sent_tokens.append(token.lemma_)
                    
        sent_tokens = [token.replace("datum", "data") for token in sent_tokens]
        tokens.append(sent_tokens)

    return tokens

Creating non-lemmatized corpus (only for testing, no longer used)

In [9]:
# corpus_with_bigrams = [clean_with_spacy(doc) for doc in corpus]
# sum = 0
# for doc in corpus_with_bigrams:
#     sum += len(doc)

# print(f"Total number of words in the cleaned corpus: {sum}")

In [10]:
# print(corpus_with_bigrams[0])

In [11]:
# dictionary = Dictionary(corpus_with_bigrams)
# doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus_with_bigrams]
# print(doc_term_matrix[0])

Creating lemmatized corpus

In [12]:
bag_of_words_list = [clean_without_ngrams(doc) for doc in strings_list]

bigram = Phrases(bag_of_words_list, min_count=10, threshold=20) 
bigram_mod = Phraser(bigram)    # For speed

# Add bigrams
bag_of_words_list = [bigram_mod[doc] for doc in bag_of_words_list]

# Lemmatize the words, exluding bigrams
bag_of_words_list = lemmatize(bag_of_words_list)

sum = 0
for doc in bag_of_words_list:
    sum += len(doc)

print(f"Total number of words in the cleaned corpus: {sum}")

Total number of words in the cleaned corpus: 36345


In [13]:
print(bag_of_words_list[0])

['data', 'collection', 'cleaning', 'assist', 'collection', 'organization', 'data', 'data', 'clean', 'preprocessing', 'activity', 'ensure', 'data', 'accuracy', 'data', 'analysis', 'utilize', 'statistical', 'method', 'tool', 'assist', 'analysis', 'dataset', 'gsp', 'team_members', 'identify', 'trends_patterns', 'data', 'data', 'visualization', 'support', 'creation', 'visualization', 'report', 'communicate', 'data', 'finding', 'collaboration', 'collaborate', 'team_members', 'data', 'requirement', 'provide', 'support', 'deliver', 'analytical', 'learn', 'experienced', 'team_members', 'actively', 'seek', 'guidance', 'generation', 'learn', 'summarize', 'communicatedatainsight', 'clear', 'understandable', 'learning', 'actively', 'participate', 'training', 'development', 'opportunity', 'enhance', 'skill', 'job', 'bachelor_degree', 'relevant', 'field', 'mathematic', 'computer_science', 'equivalent', 'basic', 'understanding', 'data', 'analysis', 'concept', 'methodology', 'familiarity', 'data', 'an

In [14]:
dictionary = Dictionary(bag_of_words_list)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in bag_of_words_list]
print(doc_term_matrix[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 4), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 11), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 3), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 2), (63, 1), (64, 3), (65, 1), (66, 1), (67, 2), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 2)]


In [70]:
print(len(dictionary))

2764


Just out of curiosity

In [15]:
from gensim.models import HdpModel
# from pprint import pprint

hdp_model = HdpModel(doc_term_matrix, id2word = dictionary)
hdp_model.optimal_ordering()
hdp_model.show_topics(num_topics = 15, num_words = 10)

[(0,
  '0.022*development + 0.009*team + 0.009*software + 0.009*participate + 0.008*engineering + 0.008*data + 0.008*system + 0.008*experience + 0.008*year + 0.007*good'),
 (1,
  '0.017*data + 0.015*model + 0.012*machine_learning + 0.007*degree + 0.007*language + 0.006*team + 0.005*ability + 0.005*processing + 0.005*high + 0.004*natural_language'),
 (2,
  '0.013*technical + 0.011*ai + 0.009*develop + 0.009*design + 0.009*year + 0.009*model + 0.008*test + 0.007*machine_learning + 0.007*data + 0.007*engineer'),
 (3,
  '0.016*model + 0.014*machine_learning + 0.011*data + 0.007*language + 0.007*degree + 0.006*team + 0.005*ability + 0.005*high + 0.005*nlp + 0.005*processing'),
 (4,
  '0.006*data + 0.003*business + 0.003*develop + 0.003*mapreduce + 0.003*participate + 0.003*class + 0.002*technology + 0.002*structured + 0.002*cassandra + 0.002*public'),
 (5,
  '0.006*data + 0.003*tensorflow + 0.003*confidential + 0.002*process + 0.002*apache_spark + 0.002*pipeline + 0.002*technology + 0.002*b

In [16]:
alpha, beta = hdp_model.hdp_to_lda()

In [17]:
print(alpha.shape)
print(beta.shape)

(150,)
(150, 2764)


### Topic modeling with lemmatized corpus

In [77]:
NUM_TOPICS = 3
PATH_TO_MODEL = f"Bigrams_All_Jobs_LDA_{NUM_TOPICS}_topics"
lda_model = None

In [None]:
lda_model_3 = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model_3.show_topics(num_topics = -1, num_words = 10)

In [65]:
from gensim.test.utils import datapath
lda_model_3.save(datapath('Jobs_Chosen_3'))

In [73]:
LDAvis_prepared = gensim_vis.prepare(lda_model_3, doc_term_matrix, dictionary, mds='pcoa')
# pyLDAvis.display(LDAvis_prepared)
# pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [22]:
NUM_TOPICS = 4

In [23]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.082*"data" + 0.016*"tool" + 0.012*"report" + 0.011*"skill" + 0.011*"analysis" + 0.010*"ability" + 0.009*"pipeline" + 0.009*"analytical" + 0.007*"optimize" + 0.007*"process"'),
 (1,
  '0.065*"data" + 0.013*"model" + 0.010*"business" + 0.010*"team" + 0.009*"pipeline" + 0.008*"skill" + 0.008*"ensure" + 0.008*"system" + 0.008*"engineering" + 0.008*"tool"'),
 (2,
  '0.031*"model" + 0.017*"ai" + 0.012*"data" + 0.011*"year" + 0.009*"machine_learning" + 0.008*"ml" + 0.008*"work" + 0.007*"deploy" + 0.007*"skill" + 0.007*"code"'),
 (3,
  '0.026*"model" + 0.025*"machine_learning" + 0.013*"ai" + 0.013*"research" + 0.011*"team" + 0.010*"develop" + 0.010*"engineering" + 0.008*"product" + 0.008*"system" + 0.008*"data"')]

In [24]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [55]:
NUM_TOPICS = 5

In [56]:
lda_model_5 = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model_5.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.071*"data" + 0.014*"tool" + 0.013*"pipeline" + 0.010*"optimize" + 0.010*"performance" + 0.009*"query" + 0.009*"governance" + 0.009*"best_practices" + 0.008*"engineer" + 0.008*"gcp"'),
 (1,
  '0.074*"data" + 0.013*"pipeline" + 0.009*"ensure" + 0.008*"engineering" + 0.008*"business" + 0.008*"analytic" + 0.007*"system" + 0.007*"azure" + 0.006*"tool" + 0.006*"support"'),
 (2,
  '0.030*"model" + 0.023*"ai" + 0.013*"machine_learning" + 0.011*"year" + 0.008*"algorithm" + 0.007*"ml" + 0.007*"code" + 0.007*"deploy" + 0.006*"system" + 0.006*"deep_learning"'),
 (3,
  '0.032*"model" + 0.024*"machine_learning" + 0.012*"research" + 0.010*"engineering" + 0.010*"ai" + 0.009*"product" + 0.006*"team" + 0.006*"technical" + 0.006*"develop" + 0.006*"deep_learning"'),
 (4,
  '0.048*"data" + 0.018*"analysis" + 0.015*"business" + 0.012*"report" + 0.011*"ability" + 0.010*"tool" + 0.009*"statistical" + 0.009*"visualization" + 0.008*"skill" + 0.008*"client"')]

In [64]:
lda_model_5.save(datapath('Jobs_Chosen_5'))

In [58]:
LDAvis_prepared = gensim_vis.prepare(lda_model_5, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [71]:
topic_words = []
for i in range(1,NUM_TOPICS+1): 
    topic = {}   
    topic[i] = LDAvis_prepared.sorted_terms(topic = i, _lambda = 0.67)[:30]["Term"].tolist()
    topic_words.append(topic)

In [72]:
for i in range(NUM_TOPICS):
    print(topic_words[i])

{1: ['data', 'pipeline', 'ensure', 'analytic', 'azure', 'engineering', 'business', 'system', 'support', 'process', 'maintain', 'tool', 'database', 'design', 'science', 'service', 'technology', 'etl', 'manage', 'processing', 'project', 'engineer', 'skill', 'include', 'aw', 'management', 'scientist', 'architecture', 'technical', 'performance']}
{2: ['model', 'machine_learning', 'research', 'engineering', 'product', 'ai', 'deep_learning', 'impact', 'llm', 'develop', 'problem', 'team', 'technical', 'fraud', 'algorithm', 'phd', 'development', 'framework', 'include', 'technique', 'high', 'ml', 'proficiency', 'domain', 'new', 'degree', 'production', 'platform', 'language', 'risk']}
{3: ['model', 'ai', 'year', 'machine_learning', 'algorithm', 'ml', 'deploy', 'code', 'deep_learning', 'training', 'feature', 'image', 'mlop', 'deployment', 'infrastructure', 'industrial', 'technique', 'project', 'experiment', 'system', 'llm', 'improve', 'agent', 'design', 'development', 'real_world', 'generative_ai

In [28]:
NUM_TOPICS = 6

In [29]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.049*"data" + 0.015*"report" + 0.008*"warehouse" + 0.007*"snowflake" + 0.007*"stakeholder" + 0.007*"pipeline" + 0.007*"visualization" + 0.006*"tool" + 0.006*"create" + 0.006*"work"'),
 (1,
  '0.062*"data" + 0.014*"business" + 0.010*"system" + 0.009*"team" + 0.008*"skill" + 0.008*"analytic" + 0.007*"analysis" + 0.007*"support" + 0.007*"design" + 0.006*"develop"'),
 (2,
  '0.024*"model" + 0.020*"ai" + 0.010*"machine_learning" + 0.009*"design" + 0.009*"team" + 0.009*"large" + 0.008*"develop" + 0.008*"language" + 0.008*"project" + 0.008*"include"'),
 (3,
  '0.024*"model" + 0.020*"machine_learning" + 0.019*"research" + 0.016*"ai" + 0.010*"development" + 0.010*"engineering" + 0.009*"system" + 0.007*"software" + 0.007*"team" + 0.007*"deep_learning"'),
 (4,
  '0.071*"data" + 0.015*"skill" + 0.015*"report" + 0.014*"analysis" + 0.012*"team" + 0.011*"ability" + 0.010*"analyze" + 0.010*"analytical" + 0.008*"client" + 0.007*"tool"'),
 (5,
  '0.054*"data" + 0.029*"model" + 0.017*"machine_le

In [30]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [31]:
NUM_TOPICS = 7

In [32]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.042*"data" + 0.010*"report" + 0.009*"warehouse" + 0.009*"snowflake" + 0.007*"tool" + 0.006*"trend" + 0.006*"intern" + 0.006*"create" + 0.005*"visualization" + 0.005*"develop"'),
 (1,
  '0.067*"data" + 0.011*"business" + 0.010*"team" + 0.010*"skill" + 0.008*"analysis" + 0.008*"ability" + 0.008*"tool" + 0.007*"analytic" + 0.007*"design" + 0.007*"system"'),
 (2,
  '0.022*"model" + 0.020*"ai" + 0.011*"large" + 0.010*"data" + 0.010*"knowledge" + 0.008*"aw" + 0.008*"llm" + 0.008*"include" + 0.008*"skill" + 0.007*"year"'),
 (3,
  '0.024*"model" + 0.022*"machine_learning" + 0.019*"ai" + 0.017*"research" + 0.011*"system" + 0.010*"development" + 0.009*"team" + 0.009*"engineering" + 0.009*"deep_learning" + 0.008*"develop"'),
 (4,
  '0.058*"data" + 0.014*"report" + 0.014*"team" + 0.012*"client" + 0.011*"skill" + 0.011*"business" + 0.010*"analysis" + 0.009*"support" + 0.009*"analytical" + 0.007*"different"'),
 (5,
  '0.053*"data" + 0.030*"model" + 0.017*"machine_learning" + 0.014*"pipelin

In [33]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [34]:
NUM_TOPICS = 10

In [35]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.047*"data" + 0.017*"report" + 0.011*"analytical" + 0.008*"tool" + 0.007*"dashboard" + 0.007*"visualization" + 0.007*"extract" + 0.007*"key" + 0.007*"channel" + 0.007*"warehouse"'),
 (1,
  '0.063*"data" + 0.012*"business" + 0.010*"team" + 0.008*"design" + 0.008*"skill" + 0.008*"analytic" + 0.008*"analysis" + 0.008*"system" + 0.007*"development" + 0.007*"tool"'),
 (2,
  '0.018*"model" + 0.018*"ai" + 0.012*"large" + 0.012*"data" + 0.011*"year" + 0.011*"infrastructure" + 0.010*"system" + 0.010*"llm" + 0.008*"machine_learning" + 0.008*"develop"'),
 (3,
  '0.024*"ai" + 0.023*"model" + 0.017*"machine_learning" + 0.016*"development" + 0.016*"research" + 0.012*"team" + 0.011*"llm" + 0.010*"high" + 0.009*"deep_learning" + 0.008*"system"'),
 (4,
  '0.087*"data" + 0.019*"report" + 0.019*"skill" + 0.017*"analysis" + 0.013*"ability" + 0.012*"team" + 0.011*"analyze" + 0.010*"analytical" + 0.010*"support" + 0.010*"process"'),
 (5,
  '0.053*"data" + 0.030*"model" + 0.018*"machine_learning" + 

In [36]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

In [37]:
NUM_TOPICS = 13

In [38]:
lda_model = LdaModel(doc_term_matrix, num_topics = NUM_TOPICS, id2word = dictionary, alpha = 'auto', eta = 'auto', passes = 50, random_state=448)
lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.025*"data" + 0.019*"research" + 0.014*"report" + 0.012*"intern" + 0.011*"trend" + 0.009*"visualization" + 0.009*"collaborate" + 0.009*"dataset" + 0.009*"generate" + 0.009*"written_verbal"'),
 (1,
  '0.037*"data" + 0.011*"team" + 0.010*"business" + 0.010*"project" + 0.009*"design" + 0.008*"technical" + 0.008*"software" + 0.008*"skill" + 0.008*"model" + 0.007*"system"'),
 (2,
  '0.038*"ai" + 0.025*"model" + 0.016*"llm" + 0.012*"development" + 0.010*"system" + 0.010*"knowledge" + 0.010*"infrastructure" + 0.010*"year" + 0.010*"include" + 0.009*"machine_learning"'),
 (3,
  '0.023*"model" + 0.012*"team" + 0.011*"machine_learning" + 0.011*"develop" + 0.011*"research" + 0.010*"deep_learning" + 0.010*"ai" + 0.009*"year" + 0.008*"language" + 0.008*"project"'),
 (4,
  '0.085*"data" + 0.022*"report" + 0.021*"analysis" + 0.021*"skill" + 0.014*"team" + 0.014*"analyze" + 0.013*"ability" + 0.013*"visualization" + 0.013*"analytical" + 0.012*"business"'),
 (5,
  '0.037*"model" + 0.030*"data" +

In [39]:
LDAvis_prepared = gensim_vis.prepare(lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, f'Jobs_LDA_{NUM_TOPICS}.html')

### Trying TF-IDF Weighting

In [40]:
from gensim.models import TfidfModel

# Duplicating these to avoid modifying the originals
tf_corpus = doc_term_matrix
tf_dictionary = dictionary

tfidf = TfidfModel(corpus=tf_corpus, id2word=tf_dictionary)

low_value = 0.03
words  = []
words_missing_in_tfidf = []
for i in range(0, len(tf_corpus)):
    bow = tf_corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(tf_dictionary[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf score 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    tf_corpus[i] = new_bow

In [59]:
idf_lda_model_3 = LdaModel(corpus=tf_corpus, id2word=tf_dictionary, num_topics=3, random_state=448, passes=50, alpha="auto", eta = "auto")
idf_lda_model_3.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.069*"data" + 0.013*"analysis" + 0.012*"tool" + 0.009*"ability" + 0.009*"report" + 0.009*"business" + 0.008*"process" + 0.007*"analytic" + 0.007*"analytical" + 0.006*"query"'),
 (1,
  '0.071*"data" + 0.011*"pipeline" + 0.008*"ensure" + 0.008*"engineering" + 0.008*"business" + 0.007*"system" + 0.006*"team" + 0.006*"analytic" + 0.006*"azure" + 0.006*"tool"'),
 (2,
  '0.033*"model" + 0.020*"machine_learning" + 0.015*"ai" + 0.008*"research" + 0.007*"algorithm" + 0.007*"product" + 0.006*"technique" + 0.006*"engineering" + 0.006*"ml" + 0.006*"deep_learning"')]

In [60]:
LDAvis_prepared = gensim_vis.prepare(idf_lda_model_3, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, 'Jobs_IDF_LDA_3.html')

In [43]:
idf_lda_model = LdaModel(corpus=tf_corpus, id2word=tf_dictionary, num_topics=4, random_state=448, passes=50, alpha="auto", eta = "auto")
idf_lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.069*"data" + 0.013*"tool" + 0.010*"pipeline" + 0.009*"report" + 0.009*"performance" + 0.008*"query" + 0.007*"ability" + 0.007*"optimize" + 0.007*"best_practices" + 0.007*"governance"'),
 (1,
  '0.081*"data" + 0.011*"business" + 0.010*"pipeline" + 0.008*"analytic" + 0.008*"ensure" + 0.007*"analysis" + 0.007*"tool" + 0.007*"engineering" + 0.007*"process" + 0.007*"support"'),
 (2,
  '0.035*"model" + 0.017*"ai" + 0.017*"machine_learning" + 0.008*"algorithm" + 0.007*"deploy" + 0.007*"year" + 0.007*"performance" + 0.007*"technique" + 0.007*"deployment" + 0.006*"tool"'),
 (3,
  '0.024*"model" + 0.018*"machine_learning" + 0.010*"ai" + 0.010*"research" + 0.009*"engineering" + 0.008*"product" + 0.006*"team" + 0.005*"develop" + 0.005*"development" + 0.005*"include"')]

In [44]:
LDAvis_prepared = gensim_vis.prepare(idf_lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, 'Jobs_IDF_LDA_4.html')

In [62]:
idf_lda_model_5 = LdaModel(corpus=tf_corpus, id2word=tf_dictionary, num_topics=5, random_state=448, passes=50, alpha="auto", eta = "auto")
idf_lda_model_5.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.071*"data" + 0.014*"tool" + 0.013*"pipeline" + 0.010*"optimize" + 0.010*"performance" + 0.009*"query" + 0.009*"governance" + 0.009*"best_practices" + 0.008*"engineer" + 0.008*"gcp"'),
 (1,
  '0.074*"data" + 0.013*"pipeline" + 0.009*"ensure" + 0.008*"engineering" + 0.008*"business" + 0.008*"analytic" + 0.007*"system" + 0.007*"azure" + 0.006*"tool" + 0.006*"support"'),
 (2,
  '0.030*"model" + 0.023*"ai" + 0.013*"machine_learning" + 0.011*"year" + 0.008*"algorithm" + 0.007*"ml" + 0.007*"code" + 0.007*"deploy" + 0.006*"system" + 0.006*"deep_learning"'),
 (3,
  '0.032*"model" + 0.024*"machine_learning" + 0.012*"research" + 0.010*"engineering" + 0.010*"ai" + 0.009*"product" + 0.006*"team" + 0.006*"technical" + 0.006*"develop" + 0.006*"deep_learning"'),
 (4,
  '0.048*"data" + 0.018*"analysis" + 0.015*"business" + 0.012*"report" + 0.011*"ability" + 0.010*"tool" + 0.009*"statistical" + 0.009*"visualization" + 0.008*"skill" + 0.008*"client"')]

In [63]:
LDAvis_prepared = gensim_vis.prepare(idf_lda_model_5, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, 'Jobs_IDF_LDA_5.html')

In [47]:
idf_lda_model = LdaModel(corpus=tf_corpus, id2word=tf_dictionary, num_topics=6, random_state=448, passes=50, alpha="auto", eta = "auto")
idf_lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.088*"data" + 0.019*"pipeline" + 0.017*"tool" + 0.013*"governance" + 0.012*"optimize" + 0.011*"performance" + 0.010*"query" + 0.010*"engineer" + 0.009*"best_practices" + 0.008*"quality"'),
 (1,
  '0.079*"data" + 0.013*"business" + 0.010*"analytic" + 0.008*"system" + 0.008*"pipeline" + 0.007*"support" + 0.007*"design" + 0.007*"database" + 0.006*"ensure" + 0.006*"process"'),
 (2,
  '0.023*"model" + 0.021*"ai" + 0.008*"training" + 0.008*"deep_learning" + 0.008*"system" + 0.008*"tool" + 0.007*"infrastructure" + 0.007*"ml" + 0.007*"image" + 0.006*"large"'),
 (3,
  '0.018*"research" + 0.017*"model" + 0.015*"machine_learning" + 0.014*"ai" + 0.008*"engineering" + 0.008*"development" + 0.007*"team" + 0.007*"system" + 0.006*"impact" + 0.006*"technical"'),
 (4,
  '0.073*"data" + 0.017*"analysis" + 0.016*"report" + 0.014*"ability" + 0.012*"analyze" + 0.010*"visualization" + 0.010*"skill" + 0.008*"tool" + 0.008*"analytical" + 0.008*"statistical"'),
 (5,
  '0.035*"model" + 0.021*"machine_le

In [48]:
LDAvis_prepared = gensim_vis.prepare(idf_lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, 'Jobs_IDF_LDA_6.html')

In [49]:
idf_lda_model = LdaModel(corpus=tf_corpus, id2word=tf_dictionary, num_topics=7, random_state=448, passes=50, alpha="auto", eta = "auto")
idf_lda_model.show_topics(num_topics = -1, num_words = 10)

[(0,
  '0.101*"data" + 0.026*"pipeline" + 0.017*"tool" + 0.013*"optimize" + 0.012*"governance" + 0.012*"query" + 0.012*"performance" + 0.011*"engineer" + 0.010*"best_practices" + 0.009*"gcp"'),
 (1,
  '0.088*"data" + 0.010*"business" + 0.009*"analysis" + 0.009*"analytic" + 0.007*"process" + 0.007*"ensure" + 0.007*"design" + 0.007*"support" + 0.007*"database" + 0.007*"skill"'),
 (2,
  '0.026*"model" + 0.018*"ai" + 0.009*"ml" + 0.009*"tool" + 0.009*"large" + 0.008*"aw" + 0.008*"training" + 0.008*"deep_learning" + 0.008*"project" + 0.006*"year"'),
 (3,
  '0.018*"research" + 0.018*"model" + 0.018*"machine_learning" + 0.016*"ai" + 0.008*"system" + 0.008*"engineering" + 0.007*"development" + 0.007*"deep_learning" + 0.006*"include" + 0.006*"llm"'),
 (4,
  '0.037*"data" + 0.015*"report" + 0.013*"team" + 0.012*"business" + 0.011*"client" + 0.009*"analysis" + 0.008*"analytical" + 0.007*"ability" + 0.007*"support" + 0.007*"visualization"'),
 (5,
  '0.036*"model" + 0.021*"machine_learning" + 0.010

In [50]:
LDAvis_prepared = gensim_vis.prepare(idf_lda_model, doc_term_matrix, dictionary, mds='pcoa')
pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared, 'Jobs_IDF_LDA_7.html')

In the end, we go with the 3 topic and 5 topic models.

Some interesting words that might not be in the lectures dictionary could be: deploy, pipeline, etl, llm, power_bi

### Finding words in the vocabulary

In [80]:
vocab = []

for item in dictionary.items():
    vocab.append(item[1])

In [81]:
print(vocab[:10])
print(len(vocab))

['ability', 'accuracy', 'actively', 'activity', 'adapt', 'analysis', 'analytical', 'assist', 'attention_detail', 'bachelor_degree']
2764


In [108]:
words_to_find = ["deploy", "pipeline", "etl", "llm", "power_bi", "generative_ai", "gcp", "spark", "hadoop", "git", "training", "simulation", "probability", "regression", "database", "experiment", "excel", "time_series", "deep_learning", "visualization", "data", "markov_chain", "optimization", "machine_learning", "pytorch", "tensorflow", "kafka", "nlp"]
for word in words_to_find:
    print(word, end=": ")
    if word in vocab:
        print(lda_model_5.get_term_topics(word, minimum_probability=0.001))
    else:
        print("not found")
        

deploy: [(1, 0.0022688073), (2, 0.0070800367), (3, 0.001913702)]
pipeline: [(0, 0.012908377), (1, 0.012512266), (2, 0.0019316918)]
etl: [(1, 0.0040628584)]
llm: [(2, 0.0043812576), (3, 0.0054251635)]
power_bi: [(4, 0.004621462)]
generative_ai: [(2, 0.0031441827), (3, 0.0017050924)]
gcp: [(0, 0.007744878), (1, 0.0016145506), (2, 0.0011161047)]
spark: [(1, 0.0027215357), (3, 0.0017538423)]
hadoop: [(1, 0.0012411025)]
git: []
training: [(2, 0.0050621596), (3, 0.0017649262), (4, 0.0023221765)]
simulation: []
probability: []
regression: []
database: [(1, 0.004898108), (4, 0.003718958)]
experiment: [(2, 0.0037261294), (3, 0.0013175071), (4, 0.001304515)]
excel: [(4, 0.0044799517)]
time_series: [(4, 0.0026590936)]
deep_learning: [(2, 0.0057217707), (3, 0.00581918)]
visualization: [(1, 0.0022268298), (4, 0.008709503)]
data: [(0, 0.07078444), (1, 0.07411064), (4, 0.047758993)]
markov_chain: not found
optimization: [(0, 0.0028213423), (1, 0.0013771141), (2, 0.0027666984), (3, 0.0028853973)]
mach