# LDA on jobs desriptions, requirements, title

- Input: CB12, jobset_clean.csv
- FYI: https://github.com/lacic/session-knn-ae

In [1]:
import numpy as np
import pandas as pd
import subprocess
import argparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from bs4 import BeautifulSoup
import pickle

In [2]:
# Read data
jobs = pd.read_csv("./data_processed/jobset_clean.csv")
#jobs = jobs.rename(columns={"JobID": "item_id", "State": "state", "Country": "country", "City": "city", "Zip5": "zip5"})
#jobs = jobs.set_index("item_id")

  jobs = pd.read_csv("./data_processed/jobset_clean.csv")


In [3]:
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050509 entries, 0 to 1050508
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   JobID         1050509 non-null  int64 
 1   WindowID      1050509 non-null  int64 
 2   Title         1050509 non-null  object
 3   Description   1050289 non-null  object
 4   Requirements  1047776 non-null  object
 5   City          1050507 non-null  object
 6   State         1050508 non-null  object
 7   Country       1050506 non-null  object
 8   Zip5          658844 non-null   object
 9   StartDate     1050509 non-null  object
 10  EndDate       1050500 non-null  object
dtypes: int64(2), object(9)
memory usage: 88.2+ MB


In [5]:
print("Unique cities: " + str(len(jobs.City.unique())))
print("Unique states: " + str(len(jobs.State.unique())))
print("Unique zip codes: " + str(len(jobs.Zip5.unique())))
print("Unique countries: " + str(len(jobs.Country.unique())))


Unique cities: 10998
Unique states: 61
Unique zip codes: 43853
Unique countries: 66


In [6]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def calc_lda(df, no_features = 1000, no_topics = 20):
    # LDA can only use raw term counts for LDA because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, 
                                    min_df=2, 
                                    max_features=no_features, 
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(df)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, 
                                    max_iter=5, 
                                    learning_method='online', 
                                    learning_offset=50.,random_state=0).fit(tf)


    no_top_words = 5
    display_topics(lda, tf_feature_names, no_top_words)
    return lda, tf_feature_names

In [7]:
jobs.isnull().values.any()

True

In [8]:
jobs['Title'].isnull().values.sum()

0

In [9]:
jobs['Description'].isnull().values.sum()

220

In [10]:
jobs['Requirements'].isnull().values.sum()

2733

In [11]:
jobs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050509 entries, 0 to 1050508
Data columns (total 11 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   JobID         1050509 non-null  int64 
 1   WindowID      1050509 non-null  int64 
 2   Title         1050509 non-null  object
 3   Description   1050289 non-null  object
 4   Requirements  1047776 non-null  object
 5   City          1050507 non-null  object
 6   State         1050508 non-null  object
 7   Country       1050506 non-null  object
 8   Zip5          658844 non-null   object
 9   StartDate     1050509 non-null  object
 10  EndDate       1050500 non-null  object
dtypes: int64(2), object(9)
memory usage: 88.2+ MB


In [12]:
jobs["Requirements"].fillna("", inplace=True)
jobs["Description"].fillna("", inplace=True)
jobs["Title"].fillna("", inplace=True)

In [13]:
lda_req, tf_feature_names_req = calc_lda(jobs["Requirements"])



Topic 0:
requirements resident car meet items
Topic 1:
care license nurse nursing driver
Topic 2:
company companies aflac insurance fortune
Topic 3:
ability management business knowledge team
Topic 4:
bull ndash matco 350 road
Topic 5:
experience years degree required preferred
Topic 6:
job description refer view requirements
Topic 7:
sales business insurance outside marketing
Topic 8:
experience systems development engineering design
Topic 9:
professionals office industry half contact
Topic 10:
work opportunity time retail home
Topic 11:
able ability required work experience
Topic 12:
server responsible windows maintaining wireless
Topic 13:
accounting finance automotive franchise financial
Topic 14:
manager marketing sales management customer
Topic 15:
job pt temp conditions representatives
Topic 16:
benefits opportunity employer equal com
Topic 17:
maintenance mechanical electrical repair industrial
Topic 18:
skills experience ability communication strong
Topic 19:
fast paced status

In [14]:
lda_desc, tf_feature_names_desc = calc_lda(jobs["Description"])

Topic 0:
client clients staffing candidates firm
Topic 1:
procedures ensure policies duties appropriate
Topic 2:
customer customers service sales products
Topic 3:
store retail sales manager customer
Topic 4:
equipment maintenance work safety manufacturing
Topic 5:
benefits dental medical insurance life
Topic 6:
bull aerotek status hands job
Topic 7:
career looking team work opportunities
Topic 8:
security technology network technical services
Topic 9:
business management development develop marketing
Topic 10:
services company world global industry
Topic 11:
sales business marketing training outside
Topic 12:
financial credit business services banking
Topic 13:
com time apply position resume
Topic 14:
care patient health medical patients
Topic 15:
project design technical development support
Topic 16:
accounting financial data reports reporting
Topic 17:
rsquo work environment team amp
Topic 18:
experience skills ability years required
Topic 19:
school employer education training equa

In [15]:
lda_title, tf_feature_names_title = calc_lda(jobs["Title"])

Topic 0:
level entry restaurant truck needed
Topic 1:
time associate accountant franchise warehouse
Topic 2:
support openings tech operator teller
Topic 3:
sales representative account insurance outside
Topic 4:
senior financial designer work healthcare
Topic 5:
specialist management services maintenance general
Topic 6:
engineer director software nursing product
Topic 7:
sr development agent lead technical
Topic 8:
technician clinical mechanic bilingual hr
Topic 9:
service customer nurse consultant registered
Topic 10:
therapist marketing home health physical
Topic 11:
manager project store center trainee
Topic 12:
analyst business supervisor benefits loan
Topic 13:
quality job team shift production
Topic 14:
rn care executive lpn security
Topic 15:
driver cdl class instructor residential
Topic 16:
retail developer automotive java web
Topic 17:
assistant medical administrative occupational human
Topic 18:
coordinator travel recruiter process experience
Topic 19:
administrator ii profe

In [16]:
%%time
interim_path = "./cb12-interim/"
pickle.dump(lda_title, open(interim_path + "lda_title.model", 'wb'), protocol=4)
pickle.dump(tf_feature_names_title, open(interim_path + "lda_title.fnames", 'wb'), protocol=4)

pickle.dump(lda_desc, open(interim_path + "lda_desc.model", 'wb'), protocol=4)
pickle.dump(tf_feature_names_desc, open(interim_path + "lda_desc.fnames", 'wb'), protocol=4)

pickle.dump(lda_req, open(interim_path + "lda_req.model", 'wb'), protocol=4)
pickle.dump(tf_feature_names_req, open(interim_path + "lda_req.fnames", 'wb'), protocol=4)

CPU times: user 2.79 ms, sys: 4.63 ms, total: 7.42 ms
Wall time: 2.37 s


In [17]:
lda_title = pickle.load(open(interim_path + "lda_title.model", 'rb'))
tf_feature_names_title = pickle.load(open(interim_path + "lda_title.fnames", 'rb'))

lda_desc = pickle.load(open(interim_path + "lda_desc.model", 'rb'))
tf_feature_names_desc = pickle.load(open(interim_path + "lda_desc.fnames", 'rb'))

lda_req = pickle.load(open(interim_path + "lda_req.model", 'rb'))
tf_feature_names_req = pickle.load(open(interim_path + "lda_req.fnames", 'rb'))

#display_topics(lda_title, tf_feature_names_title, 10)
#display_topics(lda_desc, tf_feature_names_desc, 10)
#display_topics(lda_req, tf_feature_names_req, 10)

In [18]:
def apply_topics(text, model, feature_names, fn_dict):
    words = re.findall(r"[\w']+", text)
        
    overlap = [value for value in feature_names if value in words]
    
    max_sum_t = -1.0
    max_topic_idx = None
    for topic_idx, topic in enumerate(model.components_):
        sum_t = 0.0
        for word in overlap:
            sum_t += topic[fn_dict[word]]
        
        if sum_t > max_sum_t:
            max_sum_t = sum_t
            max_topic_idx = topic_idx
    
    return max_topic_idx

        
def gen_fn_dict(feature_names):
    fn_dict = {}
    f_idx = 0
    
    for name in feature_names:
        fn_dict[name] = f_idx
        f_idx += 1
        
    return fn_dict

fn_dict_req = gen_fn_dict(tf_feature_names_req)
fn_dict_desc = gen_fn_dict(tf_feature_names_desc)
fn_dict_title = gen_fn_dict(tf_feature_names_title)


In [19]:
import re

jobs['ReqTopic'] = jobs.apply(lambda x: apply_topics(x['Requirements'], lda_req, tf_feature_names_req, fn_dict_req), axis=1)
jobs['DescTopic'] = jobs.apply(lambda x: apply_topics(x['Description'], lda_desc, tf_feature_names_desc, fn_dict_desc), axis=1)
jobs['TitTopic'] = jobs.apply(lambda x: apply_topics(x['Title'], lda_title, tf_feature_names_title, fn_dict_title), axis=1)


jobs.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate,ReqTopic,DescTopic,TitTopic
0,1,1,Security Engineer Technical Lead,Security Clearance Required: Top Secret Job N...,SKILL SET Network Security tools: Webdefend We...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59,5,15,0
1,4,1,SAP Business Analyst WM,NO Corp. to Corp resumes are being considered ...,WHAT YOU NEED: Four year college degreeMinimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59,18,15,0
2,7,1,P T HUMAN RESOURCES ASSISTANT,P T HUMAN RESOURCES ASSISTANT 1-2 ye...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59,15,18,0
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59,15,7,0
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59,15,7,0


In [20]:
jobs.JobID.unique()

array([      1,       4,       7, ..., 1116294, 1116304, 1116307])

In [21]:
# Export data
jobs.to_csv(interim_path + "jobset_lda.csv", header=True, index=False)
len(jobs)


1050509

In [22]:
jobs.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate,ReqTopic,DescTopic,TitTopic
0,1,1,Security Engineer Technical Lead,Security Clearance Required: Top Secret Job N...,SKILL SET Network Security tools: Webdefend We...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59,5,15,0
1,4,1,SAP Business Analyst WM,NO Corp. to Corp resumes are being considered ...,WHAT YOU NEED: Four year college degreeMinimum...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59,18,15,0
2,7,1,P T HUMAN RESOURCES ASSISTANT,P T HUMAN RESOURCES ASSISTANT 1-2 ye...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59,15,18,0
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59,15,7,0
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59,15,7,0
