# LDA using Scikit Learn

In [25]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

**Importing the data**

In [26]:
df = pd.read_csv("Company_Dataset/dataset_others.csv")
df = df.dropna(subset=["company_Link"])
df.head(3)

Unnamed: 0,Company,job_name,job_link,ML Labeled Function,city,country,Function,employment_type,remote,seniority level,Job Status,Date Reviewed,data analyst,company_Link,job_location,job_details,job_id,posting_error,description,Clusters_Labelled
0,Spectrum,Sr Technical Writer,https://sjobs.brassring.com/TGnewUI/Search/hom...,Other,Englewood,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,JOB SUMMARYThis position is responsible for de...,4175936f-1a2c-4635-a536-4348d5bf89f5,,https://www.smartrecruiters.com/Humanity/74399...,,,7
1,Spectrum,Production Specialist I - Spectrum News 13 Orl...,https://sjobs.brassring.com/TGnewUI/Search/hom...,Other,Orlando,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,"Spectrum Networks is looking for enthusiastic,...",fc34ab5f-ee2d-4145-81c4-126f52b1df35,,https://www.smartrecruiters.com/Humanity/74399...,,,6
2,Spectrum,"Manager, Advanced Advertising Sales - Ad Sales",https://sjobs.brassring.com/TGnewUI/Search/hom...,Other,Los Angeles,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,Spectrum Reach is looking for a dynamic Manage...,60391e39-17ba-4de2-a564-ede2412d6791,,https://www.smartrecruiters.com/Humanity/74399...,,,0


**Data Cleaning**

In [27]:
# Convert to list
data = df['company_Link'].values.tolist()
print(data[0])

JOB SUMMARYThis position is responsible for developing and designing complex instructional and informational tools needed to assure safe, appropriate and effective use of engineering technology. Combines multi-media knowledge and strong communication skills with technical expertise to educate across the entire spectrum of users' abilities, technical experience, and visual and auditory capabilities.MAJOR DUTIES AND RESPONSIBILITIESWork closely with the subject matter experts, at all levels within the organization, to record and share technical documentation. Guides partners through discovery conversations to uncover the degree of detail and key information.Document operational procedures, engineering design documents, methods of procedures, implementation guides and engineering drawings.Manage the archiving, tracking and updating of reference materials across multiple media platforms. Responsible for QA, configuration management and version control of all documentationDesign, create mat

**Tokenize**

In [28]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print(data_words[:1])

[['job', 'summarythis', 'position', 'is', 'responsible', 'for', 'developing', 'and', 'designing', 'complex', 'instructional', 'and', 'informational', 'tools', 'needed', 'to', 'assure', 'safe', 'appropriate', 'and', 'effective', 'use', 'of', 'engineering', 'technology', 'combines', 'multi', 'media', 'knowledge', 'and', 'strong', 'communication', 'skills', 'with', 'technical', 'expertise', 'to', 'educate', 'across', 'the', 'entire', 'spectrum', 'of', 'users', 'abilities', 'technical', 'experience', 'and', 'visual', 'and', 'auditory', 'capabilities', 'major', 'duties', 'and', 'closely', 'with', 'the', 'subject', 'matter', 'experts', 'at', 'all', 'levels', 'within', 'the', 'organization', 'to', 'record', 'and', 'share', 'technical', 'documentation', 'guides', 'partners', 'through', 'discovery', 'conversations', 'to', 'uncover', 'the', 'degree', 'of', 'detail', 'and', 'key', 'information', 'document', 'operational', 'procedures', 'engineering', 'design', 'documents', 'methods', 'of', 'proce

**Stemming**

In [29]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy ‘en’ model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=["NOUN", "VERB"]) #select noun and verb
print(data_lemmatized[:2])

['job position be develop design tool need assure use engineering technology combine medium knowledge communication skill expertise educate spectrum user abilitie experience auditory capability duty matter expert level organization record share documentation guide partner discovery conversation uncover degree detail information document procedure engineering design document method procedure implementation guide engineering drawing manage archive tracking updating reference material medium platform configuration management version control create material conduct class train other documentation technique practice edit information accuracy readability review content test scenario ensure material be serve intend purpose stay writing editing communication platform practice educate writer require skill ability ability articulate translate document method ability utilize learn range technology tool relate ability utilize learn range change ability design create format degree communication des

**Vectorizing**

In [30]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
# minimum reqd occurences of a word 
                             stop_words='english',             
# remove stop words
                             lowercase=True,                   
# convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  
# num chars > 3
                             # max_features=50000,             
# max number of uniq words    
                            ngram_range=(1, 3))
data_vectorized = vectorizer.fit_transform(data_lemmatized)
data_vectorized

<687x8981 sparse matrix of type '<class 'numpy.int64'>'
	with 315407 stored elements in Compressed Sparse Row format>

**LDA Model using sklearn**

In [31]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                      max_iter=10,               
# Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          
# Random state
                                      batch_size=128,            
# n docs in each learning iter
                                      evaluate_every = -1,       
# compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               
# Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # Model attributes

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=100)


In [32]:
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
 evaluate_every=-1, learning_decay=0.7,
 learning_method="online", learning_offset=10.0,
 max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
 n_components=10, n_jobs=-1, perp_tol=0.1,
 random_state=100, topic_word_prior=None,
 total_samples=1000000.0, verbose=0)

LatentDirichletAllocation(learning_method='online', n_jobs=-1, random_state=100)

**Diagnosing model performance with perplexity and log-likelihood**

In [33]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(data_vectorized))
# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))
# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -2852044.439039415
Perplexity:  1188.597938401916
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 20,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


**Finding out Optimum number of topics using GridSearch**

In [35]:
# Define Search Param
search_params = {'n_components': [5, 10, 15, 20, 25], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0), n_jobs=1,
       param_grid={'n_topics': [5, 10, 15, 20, 25], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

GridSearchCV(error_score='raise',
             estimator=LatentDirichletAllocation(learning_method=None,
                                                 n_jobs=1),
             n_jobs=1,
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_topics': [5, 10, 15, 20, 25]},
             return_train_score='warn')

In [36]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -669859.1337022403
Model Perplexity:  1445.6643499819388


Grid Search has concluded that a maximum of 10 topics is optimal.

**Dominant topic in each job posting**

In [37]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Cluster " + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = ["Job " + str(i) for i in range(len(data))]
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic["dominant_topic"] = dominant_topic
# Styling
def color_green(val):
 color = 'green' if val > .1 else 'black'
 return "color: {col}".format(col=color)
def make_bold(val):
 weight = 700 if val > .1 else 400
 return "font-weight: {weight}".format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Cluster 0,Cluster 1,Cluster 2,Cluster 3,Cluster 4,Cluster 5,Cluster 6,Cluster 7,Cluster 8,Cluster 9,dominant_topic
Job 0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.68,0.0,0.09,7
Job 1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6
Job 2,0.0,0.0,0.0,0.48,0.0,0.0,0.02,0.17,0.0,0.33,3
Job 3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,0.0,0.0,7
Job 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99,0.0,0.0,7
Job 5,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.47,0.0,0.0,2
Job 6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Job 7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7
Job 8,0.0,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.67,0.0,8
Job 9,0.0,0.0,0.43,0.0,0.0,0.0,0.0,0.57,0.0,0.0,7


Relevant topics are in green

**Checking the clusters**

Top 20 keywords of each cluster:

In [38]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=20)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Cluster '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Cluster 0,learn,test,education,team,ensure,integrity,payroll,experience,make,work,customer,application,testing,join,job,aos,turnitin,service,institution,year
Cluster 1,aos,datum,consumer,work,payroll,team,client,solution,information,status,nielseniq,action,good,aos happen,happen,provide,world,tax,insight,experience
Cluster 2,equipment,work,company,network,perform,use,ability,repair,require,test,plant,record,include,tool,ladder,maintenance,fiber,power,operate,standard
Cluster 3,advertising,reach,spectrum reach,account,campaign,sale,spectrum,client,team,medium,position,support,advertising sale,research,solution,market,marketing,customer,screen,ability
Cluster 4,internship,include,opportunity,experience,business,field,team,people,spectrum,industry,development,program,intern,nielsen,aos,design,time,aoll,project,work
Cluster 5,washburn,student,employment,employee,job,auburn,auburn washburn,provide,include,law,origin age,race,race color,origin,age,religion,color,race color religion,color religion,disability
Cluster 6,news,team,network,ability,spectrum,work,story,environment,skill,include,spectrum network,community,drive,skill ability,content,break,write,look,produce,break news
Cluster 7,work,customer,ability,experience,service,support,team,job,company,field,management,include,knowledge,product,business,require,communication,ensure,skill,provide
Cluster 8,customer,team,sale,store,work,experience,management,spectrum,employee,manager,ability,provide,build,skill,meet,product,service,environment,culture,care
Cluster 9,client,team,sale,datum,work,aos,business,experience,solution,company,provide,status,manage,information,management,consumer,develop,service,opportunity,drive


In [39]:
#Topics = ["Update Version/Fix Crash Problem","Download/Internet Access","Learn and Share","Card Payment","Notification/Support", 
          #"Account Problem", "Device/Design/Password", "Language/Recommend/Screen Size", "Graphic/ Game Design/ Level and Coin", "Photo/Search"]
df_topic_keywords["Job Type"]= ""
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,...,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19,Job Type
Cluster 0,learn,test,education,team,ensure,integrity,payroll,experience,make,work,...,application,testing,join,job,aos,turnitin,service,institution,year,
Cluster 1,aos,datum,consumer,work,payroll,team,client,solution,information,status,...,action,good,aos happen,happen,provide,world,tax,insight,experience,
Cluster 2,equipment,work,company,network,perform,use,ability,repair,require,test,...,record,include,tool,ladder,maintenance,fiber,power,operate,standard,
Cluster 3,advertising,reach,spectrum reach,account,campaign,sale,spectrum,client,team,medium,...,support,advertising sale,research,solution,market,marketing,customer,screen,ability,
Cluster 4,internship,include,opportunity,experience,business,field,team,people,spectrum,industry,...,program,intern,nielsen,aos,design,time,aoll,project,work,
Cluster 5,washburn,student,employment,employee,job,auburn,auburn washburn,provide,include,law,...,race,race color,origin,age,religion,color,race color religion,color religion,disability,
Cluster 6,news,team,network,ability,spectrum,work,story,environment,skill,include,...,community,drive,skill ability,content,break,write,look,produce,break news,
Cluster 7,work,customer,ability,experience,service,support,team,job,company,field,...,include,knowledge,product,business,require,communication,ensure,skill,provide,
Cluster 8,customer,team,sale,store,work,experience,management,spectrum,employee,manager,...,provide,build,skill,meet,product,service,environment,culture,care,
Cluster 9,client,team,sale,datum,work,aos,business,experience,solution,company,...,status,manage,information,management,consumer,develop,service,opportunity,drive,


In [41]:
df_document_topics.to_excel("LDA_Scikit_Cluster_Labeled.xlsx")

**Source:** https://yanlinc.medium.com/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6