### Imports and Loading the Data

In [2]:
import numpy as np

# Used to tokenize the text; i.e. create a dictionary mapping words to integers. The dictionary can be used to create a term-document matrix.
from gensim.corpora import Dictionary

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

import spacy

from textacy import extract

In [3]:
def create_list_from_csv(path):
    corpus = []
    with open(path, 'r') as file:
        lines = file.readlines()
        for line in lines:
            columns = line.split(',')   
            # Columns 1 and 2 contain the company name and the job title, both guaranteed to not include commas, and both separated by a comma. 
            # We are not analyzing this information, so we can safely discard the first two columns.
            # The third "column" contains the job description, but it may contain commas, so we use ",".join() to concatenate all the columns after the second one.
            # csv.reader()'s quotechar parameter does not seem to work for whatever reason, and this just seemed faster. 
            description = ",".join(columns[2:]).strip('"')      # strip('"') to remove leading and trailing quotes
            corpus.append(description)

    return corpus

In [4]:
corpus = create_list_from_csv('jobs.csv') 

In [5]:
print(len(corpus))
print(len(corpus[-1]))
print(corpus[-1][:500])

98
1812
 "What You'll Do Analyze, design, develop, test, review, document and troubleshoot data pipeline / ELT solutions against multiple structured and unstructured data sources. Support our team of analysts through developing requirements and delivering solutions. Develop code to scrape public websites for data and perform ELT processes. Maintain, monitor, and support production ELT processes and respond to error and emergency issues. Who You Are You have excellent knowledge and experience with Big Da


In [6]:
sum = 0
doc_lengths = []
for doc in corpus:
    sum += len(doc.split())
    print("Number of words:", len(doc.split()))
    doc_lengths.append(len(doc.split()))
    
print("Standard deviation:", np.std(doc_lengths))
print("Mean:", np.mean(doc_lengths))
print(f"Total number of words in the corpus: {sum}")

Number of words: 178
Number of words: 45
Number of words: 180
Number of words: 180
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 284
Number of words: 329
Number of words: 198
Number of words: 142
Number of words: 134
Number of words: 134
Number of words: 180
Number of words: 291
Number of words: 291
Number of words: 134
Number of words: 134
Number of words: 230
Number of words: 552
Number of words: 391
Number of words: 118
Number of words: 275
Number of words: 293
Number of words: 355
Number of words: 307
Number of words: 366
Number of words: 209
Number of words: 666
Number of words: 191
Number of words: 448
Number of words: 139
Number of words: 244
Number of words: 216
Number of words: 142
Number of words: 51
Number of words: 109
Number of words: 172
Number of words: 116
Number of words: 151
Number of words: 115
Number of words: 136
Number of words: 271
Number of words: 177
Number of words: 165
Number of words

### Cleaning the corpus

In [8]:
def clean_with_spacy(doc):

    nlp = spacy.load("en_core_web_sm")
    # Add custom stop words that don't add anything to each topic.
    nlp.Defaults.stop_words |= {"experience", "preferred", "skill", "yelp"}
    
    spacy_doc = nlp(doc.lower())

    ngrams = [
        ngram.text.replace(" ", "_")    # ngrams are separated by spaces, so we replace them with underscores
        for ngram in extract.ngrams(spacy_doc, n = 2, min_freq = 4, filter_punct = True, filter_nums = True, exclude_pos=["PROPN", "ORG", "DATE", "X"]) 
        if not ngram.text.__contains__("=") 
            and not ngram.text.__contains__("@") 
            and not ngram.text.__contains__("$")
    ]
    
    allowed_pos_tags = ["NOUN", "ADJ", "VERB", "ADV"]
    
    # Remove stopwords, punctuation, and numeric tokens
    tokens = [
        token.text 
        for token in spacy_doc 
        if not token.is_stop and not token.is_punct and not token.is_digit and token.is_alpha       # Keep only words that are not stop words
            and token.text not in ["_", "+", "=", "\n","-","*","<",">"]                             # Remove special characters     
            and not len(token.text) == 1                                                            # Remove single character words
            and token.pos_ in allowed_pos_tags                                                      # Keep only words that are nouns, adjectives, verbs, and adverbs
    ]                                                                             
    
    return tokens + ngrams

In [9]:
corpus_with_bigrams = [clean_with_spacy(doc) for doc in corpus]
sum = 0
for doc in corpus_with_bigrams:
    sum += len(doc)

print(f"Total number of words in the cleaned corpus: {sum}")

Total number of words in the cleaned corpus: 14872


In [26]:
print(corpus_with_bigrams[0])

['data', 'collection', 'cleaning', 'assist', 'collection', 'organization', 'data', 'data', 'cleaning', 'preprocessing', 'activities', 'ensure', 'data', 'accuracy', 'data', 'analysis', 'utilize', 'statistical', 'methods', 'tools', 'assist', 'analysis', 'datasets', 'work', 'gsp', 'team', 'members', 'identify', 'trends', 'patterns', 'insights', 'data', 'data', 'visualization', 'support', 'creation', 'visualizations', 'reports', 'communicating', 'data', 'findings', 'collaboration', 'collaborate', 'team', 'members', 'understand', 'data', 'requirements', 'provide', 'support', 'delivering', 'analytical', 'solutions', 'learn', 'experienced', 'team', 'members', 'actively', 'seek', 'guidance', 'generation', 'learn', 'summarize', 'communicatedatainsights', 'clear', 'understandable', 'learning', 'actively', 'participate', 'training', 'development', 'opportunities', 'enhance', 'skills', 'job', 'qualifications', 'bachelor', 'degree', 'relevant', 'field', 'mathematics', 'computer', 'science', 'equiva

In [27]:
dictionary = Dictionary(corpus_with_bigrams)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus_with_bigrams]
print(doc_term_matrix[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 4), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 11), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 3), (47, 1), (48, 1), (49, 3), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 3), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 4), (73, 1), (74, 1), (75, 2), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 2)]


In [28]:
NUM_TOPICS = 5
PATH_TO_MODEL = f"Entry_Jobs_Test_LDA_{NUM_TOPICS}_topics"
lda_model = None

In [1]:
from gensim.models import LdaModel

lda_model = LdaModel(doc_term_matrix, num_topics=NUM_TOPICS, id2word = dictionary, alpha = "auto", passes = 10)
lda_model.show_topics(num_words=20)

NameError: name 'doc_term_matrix' is not defined

In [30]:
lda_model.top_topics(doc_term_matrix, dictionary=dictionary, coherence='u_mass')

[([(0.02891476, 'data'),
   (0.008088132, 'work'),
   (0.0072611975, 'business'),
   (0.0070102685, 'analytics'),
   (0.006782891, 'skills'),
   (0.0049247625, 'analysis'),
   (0.0049240515, 'learning'),
   (0.0044365465, 'software'),
   (0.0044176574, 'tools'),
   (0.0042466777, 'develop'),
   (0.004184552, 'systems'),
   (0.004161931, 'design'),
   (0.0041446867, 'insights'),
   (0.004088707, 'solutions'),
   (0.0040054517, 'machine'),
   (0.0039584856, 'knowledge'),
   (0.003935261, 'python'),
   (0.0038474668, 'sql'),
   (0.0038412695, 'understanding'),
   (0.0037641556, 'strong')],
  -0.5648223318265668),
 ([(0.07230802, 'data'),
   (0.01074812, 'skills'),
   (0.009567367, 'business'),
   (0.0080453735, 'work'),
   (0.007714178, 'analysis'),
   (0.007149902, 'strong'),
   (0.006488929, 'python'),
   (0.006410108, 'ability'),
   (0.0063387807, 'team'),
   (0.006117394, 'science'),
   (0.005660055, 'tools'),
   (0.0056115407, 'systems'),
   (0.005559718, 'learning'),
   (0.005268041

In [31]:
from gensim.test.utils import datapath
lda_model.save(datapath(PATH_TO_MODEL))

In [33]:
from gensim.models import HdpModel
# from pprint import pprint

hdp_model = HdpModel(doc_term_matrix, id2word = dictionary)
hdp_model.show_topics()

[(0,
  '0.014*data + 0.007*business + 0.005*systems + 0.004*machine + 0.004*aws + 0.004*product + 0.004*learning + 0.004*skills + 0.004*analytics + 0.004*building + 0.003*use + 0.003*yelp + 0.003*kafka + 0.003*apache + 0.003*users + 0.003*knowledge + 0.003*scale + 0.003*work + 0.003*content + 0.003*strong'),
 (1,
  '0.009*data + 0.003*consistent + 0.003*design + 0.003*workshops + 0.003*ai + 0.003*required + 0.002*skills + 0.002*client + 0.002*gitlab + 0.002*analytical + 0.002*procedures + 0.002*updated + 0.002*unlock + 0.002*cloud + 0.002*configurations + 0.002*science + 0.002*existing + 0.002*cd + 0.002*evaluation + 0.002*work'),
 (2,
  '0.011*data + 0.004*term + 0.003*successful + 0.003*schema + 0.003*learning + 0.003*analysis + 0.003*submitted + 0.002*trusted + 0.002*starting + 0.002*industrial + 0.002*sql + 0.002*machine + 0.002*elt + 0.002*uncover + 0.002*improvements + 0.002*lifecycle + 0.002*reference + 0.002*spark + 0.002*goolge + 0.002*cloud'),
 (3,
  '0.008*data + 0.003*atten

In [34]:
alpha, beta = hdp_model.hdp_to_lda()

In [35]:
print(alpha.shape)
print(beta.shape)

(150,)
(150, 2291)
