In this notebook we train an LDA model with 25 topics on the full text of a job dataset (the cleaned jobs from the Kaggle job recommendation challenge).

# connect to drive

In [None]:
# Mount Google Drive so the notebook can read the dataset and write the models.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# import

In [None]:
!pip install pyLDAvis



In [None]:
import ast
import os
import warnings

import pandas as pd
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel
from gensim.test.utils import datapath
from gensim import  models

#spacy
import spacy
#vis
import pyLDAvis
import pyLDAvis.gensim

# prepare data

In [None]:
# Load the cleaned job postings from Drive and show a preview of the frame.
jobs_csv_path = "/content/drive/MyDrive/data/kaggle_recommend/all_jobs.csv"
jobs = pd.read_csv(jobs_csv_path)
jobs

  and should_run_async(code)


Unnamed: 0.1,Unnamed: 0,JobID,City,State,Country,clean_title,clean_title_l,clean_full_text,text_non_stop,extracted_skills
0,0,457960,Raleigh,NC,US,data specialist,data specialist,data specialist functional area sales support ...,data specialist functional area sales support ...,"['data', 'sales', 'support', 'office', 'reloca..."
1,1,457962,Raleigh,NC,US,global infrastructure operations manager,global infrastructure operations manager,global infrastructure operations manager funct...,global infrastructure operations manager funct...,"['global', 'infrastructure', 'operations', 'se..."
2,2,457967,Raleigh,NC,US,investment analyst,investment analyst,investment analyst functional area accounting ...,investment analyst functional area accounting ...,"['investment', 'accounting', 'finance', 'offic..."
3,3,457998,Chandler,AZ,US,back office medical assistant up to 12 hr,back office medical assistant up to 12 hr,back office medical assistant up to 12 hr b...,back office medical assistant 12 hr back offic...,"['office', 'office', 'clients', 'healthcare', ..."
4,4,457999,Buffalo,NY,US,medical biller,medical biller,medical biller company overview for over 15 ye...,medical biller company overview 15 years clien...,"['client', 'patient support', 'health safety',..."
...,...,...,...,...,...,...,...,...,...,...
309995,179995,948968,Baltimore,MD,US,magazines account executive,magazines account executive,magazines account executive account executive ...,magazines account executive account executive ...,"['magazines', 'magazines', 'product', 'adverti..."
309996,179996,949002,Spring Lake,MI,US,seasonal restaurant staff,seasonal restaurant staff,seasonal restaurant staff we are looking for e...,seasonal restaurant staff looking experienced ...,"['hospitality', 'servers', 'ess', 'servers', '..."
309997,179997,949034,Davenport,IA,US,freight loader,freight loader,freight loader freight handlers needed monday...,freight loader freight handlers needed monday ...,"['freight', 'freight', 'openings', 'freight', ..."
309998,179998,949834,Chicago,IL,US,senior manager field marketing,senior manager field marketing,senior manager field marketing organizationa...,senior manager field marketing organizational ...,"['field marketing', 'institutional', 'services..."


❗: there are **310001 rows**

# prepare to train model

In [None]:
# First we need to split the text into words (tokens).
# `read_csv` loads empty text cells as NaN (a float), on which a plain
# `x.split()` raises AttributeError; treat missing text as an empty document,
# which yields an empty token list.
jobs['tokens'] = jobs['text_non_stop'].fillna('').astype(str).str.split()

In [None]:
# Build the gensim dictionary from the tokenized job texts.
# It maps every unique token encountered in the corpus to a unique integer ID.
dictionary = corpora.Dictionary(documents=jobs['tokens'])

In [None]:
# The corpus is the collection of documents analysed by LDA.
# Every document becomes a bag-of-words: a list of (token ID, frequency) pairs,
# where the IDs come from `dictionary` and the frequency is the number of
# times that token appears in the document.
corpus = list(map(dictionary.doc2bow, jobs['tokens']))

# coherence & perplexity


In [None]:
# Define a function to train an LDA model and compute its coherence score and perplexity bound
def train_lda_compute_metrics(num_topics, corpus, dictionary, texts, *,
                              passes=20, chunksize=10000, alpha=0.01, eta=0.1,
                              random_state=100, update_every=1, coherence='c_v'):
    """Train a gensim LDA model and evaluate it.

    Parameters
    ----------
    num_topics : int
        Number of latent topics to fit.
    corpus : iterable of bag-of-words documents
        Passed to ``LdaModel`` for training and to ``log_perplexity`` for evaluation.
    dictionary : gensim dictionary
        Used as ``id2word`` for the model and as the coherence dictionary.
    texts : iterable of token lists
        Tokenized documents handed to ``CoherenceModel``.
    passes, chunksize, alpha, eta, random_state, update_every :
        Forwarded unchanged to ``LdaModel``. The defaults reproduce the
        settings this notebook was originally run with.
    coherence : str
        Coherence measure name forwarded to ``CoherenceModel`` (default ``'c_v'``).

    Returns
    -------
    tuple
        ``(lda_model, coherence_score, perplexity)``.

        Note: ``perplexity`` is the raw value of ``LdaModel.log_perplexity``.
        Per the gensim documentation this is the per-word likelihood bound
        (typically negative), not the perplexity itself; the perplexity is
        ``2 ** (-bound)``.
    """
    # Train LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        update_every=update_every,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        eta=eta,
    )

    # Compute coherence score
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence=coherence)
    coherence_score = coherence_model.get_coherence()

    # Compute the per-word likelihood bound on the training corpus
    perplexity = lda_model.log_perplexity(corpus)

    return lda_model, coherence_score, perplexity


In [None]:
# Train LDA models with different numbers of topics and compute metrics.
# For the full-text LDA we only train with 25 topics, to compare with the skills-only model.
num_topics_list = [25]
results = []
texts = jobs['tokens']

# Make sure the output folder exists *before* the long training run, so that a
# missing directory cannot make the save fail after the model has been trained.
model_dir = "/content/drive/MyDrive/LDA_models"
os.makedirs(model_dir, exist_ok=True)

for num_topics in num_topics_list:
    lda_model, coherence_score, perplexity = train_lda_compute_metrics(num_topics, corpus, dictionary, texts)
    results.append({'num_topics': num_topics, 'coherence': coherence_score, 'perplexity': perplexity})
    # Use the plain absolute path. gensim's `datapath` helper is meant for
    # locating gensim's own bundled test data, not for building output paths.
    temp_file = f"{model_dir}/lda_model_{num_topics}_allText"
    lda_model.save(temp_file)

# Convert results to a DataFrame for easier analysis and display it
results_df = pd.DataFrame(results)
results_df

  and should_run_async(code)


Unnamed: 0,num_topics,coherence,perplexity
0,25,0.513138,-7.659146


# save model

In [None]:
# Persist the trained 25-topic model to Drive.
# The path is already absolute, so it is used directly instead of going through
# gensim's `datapath` helper (which is meant for gensim's bundled test data).
temp_file = "/content/drive/MyDrive/LDA_models/lda_model_allText_25"
lda_model.save(temp_file)

# print the top 10 words of each topic

In [None]:
# Show the 10 highest-weighted words of every topic.
# `print_topics` returns only 20 topics by default, which would silently drop
# 5 of the 25 topics trained here; `num_topics=-1` asks for all of them.
lda_model.print_topics(num_topics=-1, num_words=10)

  and should_run_async(code)


[(5,
  '0.012*"inventory" + 0.012*"warehouse" + 0.011*"driver" + 0.011*"work" + 0.011*"lift" + 0.010*"must" + 0.009*"truck" + 0.009*"ability" + 0.008*"able" + 0.008*"equipment"'),
 (16,
  '0.048*"sales" + 0.042*"marketing" + 0.031*"entry" + 0.030*"level" + 0.028*"management" + 0.019*"customer" + 0.018*"manager" + 0.016*"business" + 0.016*"service" + 0.011*"advertising"'),
 (2,
  '0.037*"food" + 0.026*"restaurant" + 0.025*"manager" + 0.018*"service" + 0.012*"customer" + 0.010*"quality" + 0.009*"managers" + 0.008*"team" + 0.008*"guest" + 0.008*"experience"'),
 (0,
  '0.036*"automotive" + 0.027*"service" + 0.019*"auto" + 0.014*"parts" + 0.014*"technician" + 0.014*"vehicle" + 0.013*"customer" + 0.012*"repair" + 0.010*"customers" + 0.009*"vehicles"'),
 (8,
  '0.015*"benefits" + 0.014*"company" + 0.014*"work" + 0.013*"insurance" + 0.010*"team" + 0.010*"opportunity" + 0.010*"career" + 0.009*"paid" + 0.009*"life" + 0.008*"dental"'),
 (22,
  '0.023*"business" + 0.014*"management" + 0.011*"exper

# Visualize

In [None]:
# Render the interactive pyLDAvis view of the trained topics.
pyLDAvis.enable_notebook()

# Silence warnings only while the visualisation data is prepared, instead of
# disabling every warning for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, mds="mmds", R=30)

vis

  and should_run_async(code)
