# LDA user profile


In [1]:
import caffeine
caffeine.on(display=False)

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn import metrics

In [3]:
# Load clean dataset for user profile and work history
user_set = pd.read_csv("./data_interim/user_set_cleaned.csv")
work_history = pd.read_csv('./data_interim/work_history_cleaned.csv')

In [4]:
user_set.head()

Unnamed: 0,UserID,Split,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,80,Train,Williamstown,NJ,US,1,5,11.0,1,1,5
1,123,Train,Baton Rouge,LA,US,4,1,9.0,1,0,0
2,162,Train,Long Beach,CA,US,5,10,25.0,0,0,0
3,178,Train,Greenville,SC,US,1,6,35.0,0,1,4
4,344,Train,Newport News,VA,US,1,3,7.0,1,0,0


In [5]:
work_history.head()

Unnamed: 0,UserID,WindowID,Split,JobTitle
0,80,1,Train,"Auto Publishing/Electro Mechanical Technician,..."
1,80,1,Train,"Enhanced Baker Cell Technician, EBC Technician"
2,80,1,Train,Lead was was also given the position as Drug ...
3,80,1,Train,"Sales Associate, Installer"
4,123,1,Train,Lead Hostess and Takeout Server


In [6]:
len(user_set)

152292

In [8]:
group_work_history = pd.DataFrame(work_history.groupby("UserID").JobTitle.sum().reset_index())

In [10]:
group_work_history.head()

Unnamed: 0,UserID,JobTitle
0,7,Front End Supervisor/CashierCustomer Service R...
1,9,CookMechanicFile Clerk
2,13,Pennsylvania MentorStudent WorkerInternship in...
3,16,Senior Messaging Engineer (Exchange Lead)Excha...
4,26,Information Technology ManagerConsultantSuppor...


In [11]:
group_work_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152292 entries, 0 to 152291
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   UserID    152292 non-null  int64 
 1   JobTitle  152292 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [12]:
user_merge = user_set.merge(group_work_history, on='UserID', how='left')

In [13]:
user_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152292 entries, 0 to 152291
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   UserID                152292 non-null  int64  
 1   Split                 152292 non-null  object 
 2   City                  152292 non-null  object 
 3   State                 152078 non-null  object 
 4   Country               152292 non-null  object 
 5   DegreeType            152292 non-null  int64  
 6   WorkHistoryCount      152292 non-null  int64  
 7   TotalYearsExperience  152292 non-null  float64
 8   CurrentlyEmployed     152292 non-null  int64  
 9   ManagedOthers         152292 non-null  int64  
 10  ManagedHowMany        152292 non-null  int64  
 11  JobTitle              152292 non-null  object 
dtypes: float64(1), int64(6), object(5)
memory usage: 15.1+ MB


In [14]:
import subprocess
import argparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from bs4 import BeautifulSoup
import pickle

In [15]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def calc_lda(df, no_features = 1000, no_topics = 20):
    # LDA can only use raw term counts for LDA because it is a probabilistic graphical model
    tf_vectorizer = CountVectorizer(max_df=0.95, 
                                    min_df=2, 
                                    max_features=no_features, 
                                    stop_words='english')
    tf = tf_vectorizer.fit_transform(df)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA
    lda = LatentDirichletAllocation(n_components=no_topics, 
                                    max_iter=5, 
                                    learning_method='online', 
                                    learning_offset=50.,random_state=0).fit(tf)


    no_top_words = 5
    display_topics(lda, tf_feature_names, no_top_words)
    return lda, tf_feature_names

In [16]:
lda_user, tf_feature_names_user = calc_lda(user_merge["JobTitle"])



Topic 0:
cashier counselor property lab retail
Topic 1:
manager senior maintenance owner shift
Topic 2:
support operations technical rep member
Topic 3:
sales manager resources human representative
Topic 4:
business officer leader manager managersales
Topic 5:
manager marketing lead store managerassistant
Topic 6:
specialist administrator manager secretary resource
Topic 7:
development staff social work managerdirector
Topic 8:
operator production worker receiving forklift
Topic 9:
technician engineer management agent quality
Topic 10:
service customer representative inventory representativecustomer
Topic 11:
account analyst president time systems
Topic 12:
supervisor control warehouse field contract
Topic 13:
desk financial center program designer
Topic 14:
coordinator care nurse health case
Topic 15:
director project executive general consultant
Topic 16:
assistant office administrative medical billing
Topic 17:
associate data driver intern certified
Topic 18:
clerk services team ent

In [18]:
interim_path = "./data_interim_lda/"
pickle.dump(lda_user, open(interim_path + "lda_user_history.model", 'wb'), protocol=4)
pickle.dump(tf_feature_names_user, open(interim_path + "lda_user_history.fnames", 'wb'), protocol=4)


In [20]:
def apply_topics(text, model, feature_names, fn_dict):
    words = re.findall(r"[\w']+", text)
        
    overlap = [value for value in feature_names if value in words]
    
    max_sum_t = -1.0
    max_topic_idx = None
    for topic_idx, topic in enumerate(model.components_):
        sum_t = 0.0
        for word in overlap:
            sum_t += topic[fn_dict[word]]
        
        if sum_t > max_sum_t:
            max_sum_t = sum_t
            max_topic_idx = topic_idx
    
    return max_topic_idx

        
def gen_fn_dict(feature_names):
    fn_dict = {}
    f_idx = 0
    
    for name in feature_names:
        fn_dict[name] = f_idx
        f_idx += 1
        
    return fn_dict


In [21]:
fn_dict_user = gen_fn_dict(tf_feature_names_user)

In [23]:
%%time
import re

user_merge['WorkHistoryTopic'] = user_merge.apply(lambda x: apply_topics(x['JobTitle'], lda_user, tf_feature_names_user, fn_dict_user), axis=1)


CPU times: user 24.1 s, sys: 124 ms, total: 24.2 s
Wall time: 24.3 s


In [24]:
# Export data
user_merge.to_csv(interim_path + "user_history_lda.csv", header=True, index=False)

In [25]:
user_merge.head()

Unnamed: 0,UserID,Split,City,State,Country,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,JobTitle,WorkHistoryTopic
0,80,Train,Williamstown,NJ,US,1,5,11.0,1,1,5,"Auto Publishing/Electro Mechanical Technician,...",9
1,123,Train,Baton Rouge,LA,US,4,1,9.0,1,0,0,Lead Hostess and Takeout Server,0
2,162,Train,Long Beach,CA,US,5,10,25.0,0,0,0,Student AssistantGraduate AssistantTreasurerOw...,0
3,178,Train,Greenville,SC,US,1,6,35.0,0,1,4,CHEMICAL MANAGERChemical Management Company Sp...,0
4,344,Train,Newport News,VA,US,1,3,7.0,1,0,0,Restaurant ServerCashier,0
