# LDA visualization: Topic label 0-19 -  APPROACH 1
## NOTE: Approach 1 for calculate Dominant topic

**APPROACH 1:** (Based on the user-defined-function by Lacic et al. - This one was used during feature engineering)
Method:
- Extracting words from the text.
- Checking which of these words are present in the feature_names of the LDA model.
- For each topic, calculate the sum of word importances (from the LDA components) only for the words that overlap with the document.
- Assign the topic with the maximum summed importance as the dominant topic for the document.


**APPROACH 2:** This is the more standard way of determining the dominant topic using scikit-learn's LDA.
Method: 
- Directly transforming the document-term matrix with the LDA model to get the topic distribution for each document. 
- Assign the topic with the highest probability as the dominant topic for the document.

APPROACH 1 is based on the raw occurrence of words from the document in the LDA components, whereas APPROACH 2 is based on the probabilistic topic distribution of the document as outputted by the LDA model.

In [1]:
import numpy as np
import pandas as pd
import subprocess
import argparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from bs4 import BeautifulSoup
import pickle

In [2]:
# Run this for reproduce
# Gets the current working directory
import os
cwd = os.getcwd()
print("Working directory:", cwd)
# Go up one directory from working directory
os.chdir("..")

Working directory: /Users/anhtth/Library/CloudStorage/OneDrive-UniversityofTwente/2023 UT- THESIS/1-Code/0.cb12_main/nb_xai_viz


In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
interim_path = "./cb12-interim/"

# Load LDA model and feature names

In [5]:
# LDA model for jobs

lda_title = pickle.load(open(interim_path + "lda_title.model", 'rb'))
tf_feature_names_title = pickle.load(open(interim_path + "lda_title.fnames", 'rb'))

lda_desc = pickle.load(open(interim_path + "lda_desc.model", 'rb'))
tf_feature_names_desc = pickle.load(open(interim_path + "lda_desc.fnames", 'rb'))

lda_req = pickle.load(open(interim_path + "lda_req.model", 'rb'))
tf_feature_names_req = pickle.load(open(interim_path + "lda_req.fnames", 'rb'))

#display_topics(lda_title, tf_feature_names_title, 10)
#display_topics(lda_desc, tf_feature_names_desc, 10)
#display_topics(lda_req, tf_feature_names_req, 10)

In [6]:
# LDA model for work history
lda_user = pickle.load(open(interim_path + "lda_user_history.model", 'rb'))
tf_feature_names_user = pickle.load(open(interim_path + "lda_user_history.fnames", 'rb'))

# Load user defined function used in LDA

In [7]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def calc_lda(df, no_features = 1000, no_topics = 20):
    # LDA can only use raw term counts for LDA because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, 
                                    min_df=2, 
                                    max_features=no_features, 
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(df)
    tf_feature_names = tf_vectorizer.get_feature_names_out()

    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, 
                                    max_iter=5, 
                                    learning_method='online', 
                                    learning_offset=50.,random_state=0).fit(tf)


    no_top_words = 5
    display_topics(lda, tf_feature_names, no_top_words)
    return lda, tf_feature_names

In [8]:
def apply_topics(text, model, feature_names, fn_dict):
    words = re.findall(r"[\w']+", text)
        
    overlap = [value for value in feature_names if value in words]
    
    max_sum_t = -1.0
    max_topic_idx = None
    for topic_idx, topic in enumerate(model.components_):
        sum_t = 0.0
        for word in overlap:
            sum_t += topic[fn_dict[word]]
        
        if sum_t > max_sum_t:
            max_sum_t = sum_t
            max_topic_idx = topic_idx
    
    return max_topic_idx

        
def gen_fn_dict(feature_names):
    fn_dict = {}
    f_idx = 0
    
    for name in feature_names:
        fn_dict[name] = f_idx
        f_idx += 1
        
    return fn_dict

fn_dict_req = gen_fn_dict(tf_feature_names_req)
fn_dict_desc = gen_fn_dict(tf_feature_names_desc)
fn_dict_title = gen_fn_dict(tf_feature_names_title)
fn_dict_user = gen_fn_dict(tf_feature_names_user)

# Import data

In [9]:
# Raw jobs data with text
jobs = pd.read_csv("./data_processed/jobset_clean.csv")

In [10]:
# jobset after LDA (with 3 columns of topic model)
job_lda = pd.read_csv(interim_path + "jobset_lda.csv")

In [11]:
# jobset only contain LDA topic model
job_fm = pd.read_csv('./data_interim/jobs_fm.csv')

In [12]:
jobs["Requirements"].fillna("", inplace=True)
jobs["Description"].fillna("", inplace=True)
jobs["Title"].fillna("", inplace=True)

In [13]:
len(jobs) == len(job_lda)

True

In [14]:
len(job_lda) == len(job_fm)

True

## User work history corpus

In [15]:
# user set after LDA (corpus: column: JobTitle, dominant topic: column: WorkHistoryTopic)
user_lda = pd.read_csv(interim_path + "user_history_lda.csv")

In [16]:
# user only contain LDA topic model and discretized feature of user profile
user_fm = pd.read_csv('./data_interim/users_fm.csv')

In [17]:
len(user_lda) == len(user_fm)

True

# Load the tf_vectorizer used before training LDA model

In [18]:
np.random.seed(0) 
tf_vectorizer = CountVectorizer(max_df=0.95, 
                                min_df=2, 
                                max_features=1000, 
                                stop_words='english')


# Topic contribution for each document

In [19]:
# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# User-define function for topic contribution calculation

In [20]:
import re

def get_topic_contributions(text, model, feature_names, fn_dict):
    words = re.findall(r"[\w']+", text)
    overlap = [value for value in feature_names if value in words]
    
    topic_scores = []
    for topic in model.components_:
        sum_t = sum(topic[fn_dict[word]] for word in overlap if word in fn_dict)
        topic_scores.append(sum_t)
    
    # Normalize the scores to make them more interpretable (similar to probabilities)
    total_score = sum(topic_scores)
    topic_scores = [score / total_score if total_score > 0 else score for score in topic_scores]
    
    return topic_scores

In [21]:
# Get dataframe of topic contribution (QUICK version, using feature names and feature dictionary)
# DO THIS AFTER FIT TRANSFORM CountVectorizer on the new corpus
def get_df_topic_contributions(lda_model, fn_dict, feature_names, corpus):

    topic_contributions = [get_topic_contributions(doc, lda_model, feature_names, fn_dict) for doc in corpus]

    # Convert to a DataFrame
    df_topic_contributions = pd.DataFrame(topic_contributions, columns=[f"Topic{i}" for i in range(lda_model.n_components)])

    # Extract dominant topic for each document
    df_topic_contributions['dominant_topic'] = df_topic_contributions.idxmax(axis=1)
    df_topic_contributions['JobID'] = jobs['JobID'].values
    
    return df_topic_contributions

In [22]:
# Get dataframe of topic contribution (LONG VERSION: re-extract feature names and feature dictionary)
# DO THIS AFTER FIT TRANSFORM CountVectorizer on the new corpus
def get_df_topic_contributions_v2(lda_model, corpus):
    # Create the feature name dictionary for fast lookup
    fn_dict = {word: idx for idx, word in enumerate(tf_vectorizer.get_feature_names_out())}

    topic_contributions = [get_topic_contributions(doc, lda_model, tf_vectorizer.get_feature_names_out(), fn_dict) for doc in corpus]

    # Convert to a DataFrame
    df_topic_contributions = pd.DataFrame(topic_contributions, columns=[f"Topic{i}" for i in range(lda_model.n_components)])

    # Extract dominant topic for each document
    df_topic_contributions['dominant_topic'] = df_topic_contributions.idxmax(axis=1)
    df_topic_contributions['JobID'] = jobs['JobID'].values
    
    return df_topic_contributions

## Corpus: Job Title

In [23]:
np.random.seed(0)
tf_title = tf_vectorizer.fit_transform(jobs["Title"])

In [24]:
%%time
df_topic_contributions_title = get_df_topic_contributions(lda_model = lda_title, 
                                                          feature_names = tf_feature_names_title,
                                                         fn_dict = fn_dict_title,
                                                         corpus = jobs["Title"])

CPU times: user 1min 29s, sys: 914 ms, total: 1min 29s
Wall time: 1min 30s


In [25]:
df_topic_contributions_title.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic,JobID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,7
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,9


In [26]:
# Check with the use case (JobID = 32115) >> Dominant topic: 0
df_topic_contributions_title[df_topic_contributions_title.JobID==32115].T

Unnamed: 0,823340
Topic0,0.0
Topic1,0.0
Topic2,0.0
Topic3,0.0
Topic4,0.0
Topic5,0.0
Topic6,0.0
Topic7,0.0
Topic8,0.0
Topic9,0.0


In [27]:
%%time
df_topic_contributions_title_v2 = get_df_topic_contributions_v2(lda_model = lda_title, corpus = jobs["Title"])

CPU times: user 8min 17s, sys: 5.93 s, total: 8min 23s
Wall time: 8min 33s


In [28]:
df_topic_contributions_title_v2.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic,JobID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,7
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,9


In [29]:
# Check with the use case (JobID = 32115) >> Dominant topic: 0
df_topic_contributions_title_v2[df_topic_contributions_title_v2.JobID==32115].T

Unnamed: 0,823340
Topic0,0.0
Topic1,0.0
Topic2,0.0
Topic3,0.0
Topic4,0.0
Topic5,0.0
Topic6,0.0
Topic7,0.0
Topic8,0.0
Topic9,0.0


In [30]:
df_topic_contributions_title.to_csv('./nb_xai_viz/job_title_topic_contribution_approach1.csv', header=True, index=False)

## Corpus: Job Requirement

In [31]:
np.random.seed(0) 
tf_req = tf_vectorizer.fit_transform(jobs["Requirements"])

In [32]:
%%time
df_topic_contributions_req = get_df_topic_contributions(lda_model = lda_req,  
                                                        fn_dict = fn_dict_req,
                                                       feature_names = tf_feature_names_req,
                                                       corpus = jobs["Requirements"])

CPU times: user 21min 42s, sys: 12.1 s, total: 21min 54s
Wall time: 31min 52s


In [33]:
df_topic_contributions_req.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic,JobID
0,2.956947e-07,0.031004,0.03512688,0.03265922,0.03829639,0.3963397,2.956947e-07,0.0120084,0.09558092,0.001114,...,0.004045866,2.956947e-07,0.003714875,2.956947e-07,0.05202218,2.956947e-07,0.1630038,2.956947e-07,Topic5,1
1,2.343465e-07,0.01487741,0.02097691,0.07910635,2.343465e-07,0.1839718,2.343465e-07,0.03861221,0.1106454,0.004833,...,0.003021517,0.001848669,0.007936995,2.343465e-07,0.03684743,0.0004965933,0.3553368,0.001207556,Topic18,4
2,0.355774,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,0.2182099,2.618084e-07,2.618084e-07,8.3e-05,...,2.618084e-07,2.618084e-07,2.618084e-07,0.4259291,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,Topic15,7
3,0.355774,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,0.2182099,2.618084e-07,2.618084e-07,8.3e-05,...,2.618084e-07,2.618084e-07,2.618084e-07,0.4259291,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,Topic15,8
4,0.355774,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,0.2182099,2.618084e-07,2.618084e-07,8.3e-05,...,2.618084e-07,2.618084e-07,2.618084e-07,0.4259291,2.618084e-07,2.618084e-07,2.618084e-07,2.618084e-07,Topic15,9


In [34]:
# Check with the use case (JobID = 32115) >> Dominant topic: 15
df_topic_contributions_req[df_topic_contributions_req.JobID==32115].T

Unnamed: 0,823340
Topic0,0.355774
Topic1,0.0
Topic2,0.0
Topic3,0.0
Topic4,0.0
Topic5,0.0
Topic6,0.21821
Topic7,0.0
Topic8,0.0
Topic9,0.000083


In [35]:
df_topic_contributions_req.to_csv('./nb_xai_viz/job_req_topic_contribution_approach1.csv', header=True, index=False)

## Corpus: Job Description

In [36]:
np.random.seed(0)
tf_desc = tf_vectorizer.fit_transform(jobs["Description"])

In [37]:
%%time
df_topic_contributions_desc = get_df_topic_contributions(lda_model = lda_desc,
                                                         fn_dict = fn_dict_desc,
                                                         feature_names = tf_feature_names_desc,
                                                         corpus = jobs["Description"]
                                                        )

CPU times: user 55min 59s, sys: 35.1 s, total: 56min 34s
Wall time: 2h 58min 29s


In [38]:
df_topic_contributions_desc.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic,JobID
0,0.008765,0.135354,0.116275,0.024496,0.037504,0.029715,0.017361,0.054652,0.038463,0.136434,...,0.025763,0.037027,0.027957,0.151726,0.03187471,0.052917,0.008848,0.013383,Topic15,1
1,0.027675,0.081932,0.054512,0.015138,0.040189,0.072165,0.020001,0.069018,0.017697,0.1265,...,0.020893,0.024925,0.034422,0.127947,0.03084547,0.055423,0.101503,0.02252,Topic15,4
2,0.001574,0.038947,0.251121,0.048345,0.014032,0.027268,0.011726,0.031536,0.019484,0.012822,...,0.026225,0.012098,0.000548,0.014039,8.161023e-08,0.062874,0.385774,0.024747,Topic18,7
3,0.028836,0.035223,0.055944,0.01653,0.058781,0.110032,0.026151,0.159088,0.012459,0.10766,...,0.024002,0.028109,0.023838,0.071605,0.0120159,0.061189,0.067034,0.031525,Topic7,8
4,0.014834,0.086596,0.11095,0.021228,0.033253,0.073304,0.028704,0.158103,0.028009,0.033722,...,0.014658,0.107279,0.035702,0.035205,0.02336701,0.059354,0.013637,0.036099,Topic7,9


In [39]:
# Check with the use case (JobID = 32115) >> Dominant topic: 13
df_topic_contributions_desc[df_topic_contributions_desc.JobID==32115].T

Unnamed: 0,823340
Topic0,0.019659
Topic1,0.050619
Topic2,0.033473
Topic3,0.0
Topic4,0.000345
Topic5,0.019219
Topic6,0.002961
Topic7,0.038565
Topic8,0.000845
Topic9,0.05107


In [40]:
df_topic_contributions_desc.to_csv('./nb_xai_viz/job_desc_topic_contribution_approach1.csv', header=True, index=False)

## Corpus: User Work History

In [41]:
np.random.seed(0)
tf_vectorizer = CountVectorizer(max_df=0.95, 
                                min_df=2, 
                                max_features=1000, 
                                stop_words='english')
tf_user = tf_vectorizer.fit_transform(user_lda["JobTitle"])

In [42]:
%%time

topic_contributions = [get_topic_contributions(doc, lda_user,  tf_feature_names_desc, fn_dict_user) for doc in user_lda["JobTitle"]]

# Convert to a DataFrame
df_topic_contributions_user = pd.DataFrame(topic_contributions, columns=[f"Topic{i}" for i in range(lda_user.n_components)])

# Extract dominant topic for each document
df_topic_contributions_user['dominant_topic'] = df_topic_contributions_user.idxmax(axis=1)
df_topic_contributions_user['UserID'] = user_lda['UserID'].values

CPU times: user 22.2 s, sys: 410 ms, total: 22.6 s
Wall time: 23.2 s


In [43]:
df_topic_contributions_user.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,...,Topic12,Topic13,Topic14,Topic15,Topic16,Topic17,Topic18,Topic19,dominant_topic,UserID
0,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.997995,...,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,Topic9,80
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,123
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,162
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,178
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Topic0,344


In [44]:
# Check with the use case (UserID=13) >> Dominant topic: 0
df_topic_contributions_user[df_topic_contributions_user.UserID==13].T

Unnamed: 0,112440
Topic0,0.0
Topic1,0.0
Topic2,0.0
Topic3,0.0
Topic4,0.0
Topic5,0.0
Topic6,0.0
Topic7,0.0
Topic8,0.0
Topic9,0.0


In [45]:
df_topic_contributions_user.to_csv('./nb_xai_viz/user_topic_contribution_approach1.csv', header=True, index=False)

## Check with the Approach 1 in feature engineering

In [46]:
# function to get documents index for job corpus, given JobID
def get_doc_idx(j_id):
    job = job_lda[job_lda.JobID==j_id]
    j_idx = job.index.values[0]
    return j_idx

In [47]:
# function to get documents index for work history corpus, given UserID
def get_user_idx(u_id):
    user = user_lda[user_lda.UserID==u_id]
    u_idx = user.index.values[0]
    return u_idx

### Calculate Dominant Topic: Use case

In [48]:
## Test one one Job ID, using apply_topics function:
user_id = 13
job_id = 32115

In [49]:
job_doc_idx = get_doc_idx(j_id=job_id)
job_doc_idx

823340

In [50]:
job_lda[job_lda.JobID==job_id].T

Unnamed: 0,823340
JobID,32115
WindowID,6
Title,"Specialized Transportation Services, Inc"
Description,Specialized Transportation Service...
Requirements,Please refer to the Job Description to view th...
City,Smyrna
State,TN
Country,US
Zip5,37167.0
StartDate,2012-05-12 00:10:16.827


In [51]:
jobs['Requirements'][job_doc_idx]

'Please refer to the Job Description to view the requirements for this job'

In [52]:
# Check with the apply_topics function for the document
import re
apply_topics(text = jobs['Requirements'][job_doc_idx],
             model = lda_req,
             feature_names = tf_feature_names_req,
             fn_dict = fn_dict_req)

15

In [53]:
# Check with the apply_topics function for the document
import re
apply_topics(text = jobs['Description'][job_doc_idx],
             model = lda_desc,
             feature_names = tf_feature_names_desc,
             fn_dict = fn_dict_desc)

13

In [54]:
# Check with the apply_topics function for the document
import re
apply_topics(text = jobs['Title'][job_doc_idx],
             model = lda_title,
             feature_names = tf_feature_names_title,
             fn_dict = fn_dict_title)

0

In [55]:
user_doc_idx = get_user_idx(u_id=user_id)
user_doc_idx

112440

In [56]:
fn_dict_user = gen_fn_dict(tf_feature_names_user)

In [57]:
# Check with the apply_topics function for the document
import re
apply_topics(text = user_lda['JobTitle'][user_doc_idx],
             model = lda_user,
             feature_names = tf_feature_names_user,
             fn_dict = fn_dict_user)

0

In [58]:
user_lda[user_lda.UserID == user_id].T

Unnamed: 0,112440
UserID,13
Split,Test
City,Philadelphia
State,PA
Country,US
DegreeType,4
WorkHistoryCount,6
TotalYearsExperience,5.0
CurrentlyEmployed,1
ManagedOthers,0


# Extract key words of each topic: This is SAME as approach 2.

## Corpus: Job Requirement

In [59]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(lda_req.components_)

In [60]:
# Assign Column and Index
df_topic_keywords.columns = tf_feature_names_req
df_topic_keywords.index = range(lda_req.n_components)

In [61]:
# View
df_topic_keywords.head()

Unnamed: 0,00,000,10,100,12,18,20,2008,2010,2011,...,workplace,works,world,worldwide,write,writing,written,www,year,years
0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,0.05,0.05,0.05,0.05,1361.466694,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,21464.033692,8684.14021
2,0.05,4292.025472,0.05,18536.52497,0.05,0.05,1007.729235,4430.107048,6439.336763,20623.206581,...,0.05,0.05,31313.1593,24271.220062,0.05,0.05,0.05,15408.136096,9489.089066,20565.339058
3,0.05,0.05,10368.269183,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,19634.155828,0.05,0.05,3720.944683,6149.962119,5772.109333,0.05,0.05,0.05
4,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [62]:
df_topic_keywords

Unnamed: 0,00,000,10,100,12,18,20,2008,2010,2011,...,workplace,works,world,worldwide,write,writing,written,www,year,years
0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,0.05,0.05,0.05,0.05,1361.466694,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,21464.033692,8684.14021
2,0.05,4292.025472,0.05,18536.52497,0.05,0.05,1007.729235,4430.107048,6439.336763,20623.206581,...,0.05,0.05,31313.1593,24271.220062,0.05,0.05,0.05,15408.136096,9489.089066,20565.339058
3,0.05,0.05,10368.269183,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,19634.155828,0.05,0.05,3720.944683,6149.962119,5772.109333,0.05,0.05,0.05
4,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
5,0.05,0.05,0.05,0.05,5818.698363,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,1082.218266,0.05,0.05,56983.176544,266491.939944
6,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
7,0.05,347.966106,0.05,0.05,0.05,0.05,0.05,0.05,0.075369,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,7690.49379,0.05
8,0.05,0.05,18831.699654,333.767658,1746.451547,0.05,970.070134,0.05,0.050001,0.05,...,0.05,0.05,0.05,0.05,0.05,12977.879336,45.808385,0.05,0.05,67043.64854
9,0.050015,12160.779116,11866.062894,0.05,913.440178,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,27496.250923,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [63]:
df_topic_keywords.to_csv('./nb_xai_viz/job_req_topic_keywords_approach1.csv', header=True, index=True)

## Corpus: Job Description

In [64]:
# Topic-Keyword Matrix
df_topic_keywords_desc = pd.DataFrame(lda_desc.components_)

# Assign Column and Index
df_topic_keywords_desc.columns = tf_feature_names_desc
df_topic_keywords_desc.index = range(lda_desc.n_components)


In [65]:
df_topic_keywords_desc

Unnamed: 0,00,000,10,100,11,12,15,20,25,30,...,working,workplace,works,world,worldwide,writing,written,www,year,years
0,0.05,0.05,3.264541,0.05,0.05,0.05,0.05,0.05,0.05,3497.62436,...,4273.550079,0.05,48.271523,0.05,0.05,0.05,0.05,0.05,0.05,0.05
1,0.05,0.05,12646.866854,818.453927,10091.653927,6811.934789,5929.036888,3540.319509,564.109433,0.05,...,10018.303184,1673.283596,43618.640186,0.05,0.05,4162.005182,17697.794633,0.05,12.556032,0.05
2,0.05,0.05,0.05,4694.930393,0.05,0.05,0.05,5057.061399,8591.560014,7353.41096,...,12416.465897,0.05,0.053319,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,13769.521919,0.05
4,0.05,0.05,4202.714637,2664.474262,917.699589,1035.89437,3600.87704,5770.457303,3314.049233,11158.716234,...,28742.38156,386.245189,2210.342776,0.05,0.05,0.05,0.05,0.05,0.05,0.05
5,0.05,27786.007292,9318.566667,6279.719871,594.37802,0.05,6654.103267,0.05,6778.284877,13468.381086,...,31299.749829,0.05,0.05,0.05,0.05,0.05,0.05,0.05,19958.641661,40372.766363
6,0.05,0.05,853.837073,0.05,444.957222,2481.314629,542.327332,1026.409155,0.05,249.197264,...,3463.528931,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,11614.225365
7,0.05,17932.224981,0.05,0.05,0.05,0.05,2196.765585,18907.980112,2475.455406,18298.003366,...,42114.172021,0.05,0.05,54828.466376,0.05,0.05,0.05,0.05,24706.819352,10671.762754
8,0.05,0.05,280.967995,305.314792,0.05,408.000231,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,6613.264361
9,0.05,0.08548,0.05,1690.870421,0.05,0.05,0.05,0.05,0.05,3053.173782,...,24741.226449,0.05,13934.380059,3918.29954,0.05,0.05,0.05,0.051665,2889.534366,1558.560955


In [66]:
df_topic_keywords_desc.to_csv('./nb_xai_viz/job_desc_topic_keywords_approach1.csv', header=True, index=True)

## Job Title

In [67]:
# Topic-Keyword Matrix
df_topic_keywords_title = pd.DataFrame(lda_title.components_)

# Assign Column and Index
df_topic_keywords_title.columns = tf_feature_names_title
df_topic_keywords_title.index = range(lda_title.n_components)


In [68]:
df_topic_keywords_title

Unnamed: 0,00,000,10,100,100k,11,12,15,1st,20,...,windows,wireless,women,work,worker,workers,worth,writer,year,york
0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,1645.076567
1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,0.05,0.05,0.05,0.05,436.277161,0.05,0.05,0.05,0.05,0.05,...,0.05,7317.705574,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,6401.12415,0.05,0.05,0.05,0.05,0.05,0.05
5,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
6,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
7,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
8,0.05,0.05,0.05,0.05,0.05,0.05,0.05,403.108653,0.05,0.05,...,0.05,0.05,0.05,0.05,4193.270574,0.05,0.05,0.05,0.05,0.05
9,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [69]:
df_topic_keywords_title.to_csv('./nb_xai_viz/job_title_topic_keywords_approach1.csv', header=True, index=True)

## Corpus: User Work History

In [70]:
# Topic-Keyword Matrix
df_topic_keywords_user = pd.DataFrame(lda_user.components_)

# Assign Column and Index
df_topic_keywords_user.columns = tf_feature_names_user
df_topic_keywords_user.index = range(lda_user.n_components)


In [71]:
df_topic_keywords_user

Unnamed: 0,2nd,3rd,abuse,academic,access,account,accountant,accountantaccountant,accountantaccounting,accountantaccounts,...,west,wireless,women,work,worker,workercustomer,writer,yard,year,youth
0,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,131.785991,0.05,0.05,0.05,0.05,0.05,0.05,503.523953
1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
2,0.05,0.05,0.05,0.05,307.790743,0.05,0.05,0.05,243.001563,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
3,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,168.578521,227.028006,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
4,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
5,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
6,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
7,0.05,0.05,0.05,0.05,0.05,0.05,0.05,148.376247,0.05,0.05,...,0.05,0.05,0.05,923.269287,0.05,0.05,0.05,0.05,0.05,0.05
8,0.05,0.05,204.336457,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,2539.551074,0.05,603.928435,0.05,0.05,0.05
9,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


In [72]:
df_topic_keywords_user.to_csv('./nb_xai_viz/user_topic_keywords_approach1.csv', header=True, index=True)

# Plot Worldcloud (SAME AS APPROACH 2) >> SKIP

# UDF: Extract key words contribute to a dominant topic
- Purpose: Show the word in the document (Job Requirement, Job Description, Job Title, or WorkHistory) that contribute the most to the given topic.
- This has to be customized due to different approach to calculate dominant topic

In [73]:
import re

def get_top_word_single(text, model, feature_names, fn_dict, top_n=10):
    words = re.findall(r"[\w']+", text)
    overlap = [value for value in feature_names if value in words]
    
    max_sum_t = -1.0
    max_topic_idx = None
    max_topic_keywords = []
    
    for topic_idx, topic in enumerate(model.components_):
        sum_t = 0.0
        word_contributions = []

        for word in overlap:
            word_contribution = topic[fn_dict[word]] if word in fn_dict else 0
            sum_t += word_contribution
            word_contributions.append((word, word_contribution))
        
        # Sort words based on contribution
        sorted_contributions = sorted(word_contributions, key=lambda x: x[1], reverse=True)
        top_keywords = [word for word, _ in sorted_contributions[:top_n]]
        
        if sum_t > max_sum_t:
            max_sum_t = sum_t
            max_topic_idx = topic_idx
            max_topic_keywords = top_keywords
    
    return max_topic_idx, max_topic_keywords


In [74]:
%%time
doc_index = 0  # for example, change this to the index of the document you're interested in
document = jobs["Description"].iloc[doc_index]
dominant_topic, top_keywords = get_top_word_single(document, 
                                                   model = lda_desc, 
                                                   feature_names = tf_feature_names_desc,
                                                   fn_dict = fn_dict_desc,
                                                   top_n=20)

print(f"Dominant Topic: {dominant_topic}")
print(f"Top Keywords: {', '.join(top_keywords)}")


Dominant Topic: 15
Top Keywords: design, technical, development, support, business, team, issues, including, integration, developing, architecture, include, job, practices, ensure, best, customer, provides, operations, resources
CPU times: user 2.63 ms, sys: 482 µs, total: 3.11 ms
Wall time: 3.88 ms


In [75]:
# Validate with dominant topic during feature engineering
apply_topics(text = jobs['Description'][doc_index],
             model = lda_desc,
             feature_names = tf_feature_names_desc,
             fn_dict = fn_dict_desc)

15

In [76]:
job_lda.iloc[doc_index].T

JobID                                                           1
WindowID                                                        1
Title                            Security Engineer Technical Lead
Description     Security Clearance Required:  Top Secret Job N...
Requirements    SKILL SET Network Security tools: Webdefend We...
City                                                   Washington
State                                                          DC
Country                                                        US
Zip5                                                        20531
StartDate                                 2012-03-07 13:17:01.643
EndDate                                       2012-04-06 23:59:59
ReqTopic                                                        5
DescTopic                                                      15
TitTopic                                                        0
Name: 0, dtype: object