# LDA Model

## Importing Libraries

In [6]:
# Standard library
import glob
import json
import warnings

# Numerics / data handling
import numpy as np
import pandas as pd

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import TfidfModel

#spacy
import spacy
from nltk.corpus import stopwords
from nltk.corpus import stopwords as nltk_stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

warnings.filterwarnings("ignore", category=DeprecationWarning)

## Preparing the Data

In [7]:
# Load the job-postings export; the path is relative to the notebook's working directory.
# (pandas is imported in the import cell at the top of the notebook.)
data = pd.read_csv('Company_Dataset/dataset_others.csv')
data.head(3)

Unnamed: 0,Company,job_name,job_link,ML Labeled Function,city,country,Function,employment_type,remote,seniority level,Job Status,Date Reviewed,data analyst,company_Link,job_location,job_details,job_id,posting_error,description
0,Spectrum,"Outside Sales Representative | $5,000 Sign On ...",https://sjobs.brassring.com/TGnewUI/Search/hom...,Sales,Opelika,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,"$5,000 Sign On Bonus* + $2,500 training pay + ...",d74c82fb-27f8-435c-b54b-14ebccc7e9cd,,https://www.smartrecruiters.com/Humanity/74399...,,
1,Spectrum,Sr Technical Writer,https://sjobs.brassring.com/TGnewUI/Search/hom...,Other,Englewood,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,JOB SUMMARYThis position is responsible for de...,4175936f-1a2c-4635-a536-4348d5bf89f5,,https://www.smartrecruiters.com/Humanity/74399...,,
2,Spectrum,Advertising Account Executive- New Business,https://sjobs.brassring.com/TGnewUI/Search/hom...,Sales,Bay City,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,Tenacious go-getter. Inquisitive problem solve...,11a21cd7-86c2-48a4-96b9-83fc59e428c1,,https://www.smartrecruiters.com/Humanity/74399...,,


In [8]:
# Read the word list through the `nltk_stopwords` alias (import cell) rather than through
# `stopwords` itself: this cell rebinds `stopwords` to a plain list, so calling
# `stopwords.words(...)` would fail with AttributeError when the cell is run a second time.
stopwords = nltk_stopwords.words("english")
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# NOTE(review): the printed sample shows job-description text in `company_Link`, so the
# CSV columns appear shifted relative to the header — confirm against the raw file.
description_column = 'company_Link'
data_desc = data[description_column]
print(data_desc[0])

$5,000 Sign On Bonus* + $2,500 training pay + unlimited commissionAt A GlanceFull-time territory sales role ideal for sales pros and individuals looking to launch their Sales career who appreciate flexibility, with career advancement opportunities and unlimited commissionA role that is ideal for goal-oriented professionals who thrive on meeting new people, selling our essential services, and being on the moveBenefits include paid comprehensive training, excellent health benefits, paid time off, free Spectrum services, education assistance, and more***Get Up To SpeedThrough virtual classroom training and face-to-face sessions, our hands-on training philosophy partners you with established pros to learn the sales skills needed to close the deals in no time. Throughout your career at Spectrum, you will always have the continuing support and encouragement of your fellow Outside Sales Representative peers, Sales Managers, and company leadership.You Have Unlimited PotentialAs a part of our O

## Lemmatization

In [11]:
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Args:
        texts: Iterable of strings to process.
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to keep.

    Returns:
        A list with one string per input text: the lemmas of the kept tokens,
        in their original order, joined by single spaces. A text with no kept
        tokens yields an empty string.
    """
    # The parser and NER components are disabled; only POS tags and lemmas are read here.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    keep = frozenset(allowed_postags)
    # nlp.pipe feeds the texts through the pipeline in batches instead of
    # invoking the full pipeline separately for every document.
    return [
        " ".join(token.lemma_ for token in doc if token.pos_ in keep)
        for doc in nlp.pipe(texts)
    ]

# Replace missing descriptions with empty strings before casting: a plain cast would
# turn NaN into the literal text "nan", which could then enter the corpus as a token.
# NOTE(review): assumes the column can contain missing values — confirm against the data.
data_desc = data_desc.fillna("").astype('str')
lemmatized_texts = lemmatization(data_desc)
print(lemmatized_texts[0])

training pay unlimited time territory sale role ideal sale pro individual look launch sale career appreciate flexibility career advancement opportunity unlimited commissiona role be ideal goal orient professional thrive meet new people sell essential service be movebenefit include pay comprehensive training excellent health benefit pay time free service education assistance more***get virtual classroom training face face session hand training philosophy partner establish pro learn sale skill need close deal time career always have continue support encouragement fellow peer sale manager company leadership have unlimited part outside team potential earn be limitless right now we‚Äôre offer training pay unlimited commission!we‚äôll have spectrum training team building be priority want succeed territory sale offer support such peer mentor regular group huddlesongoe cutting edge trainingsolid lead give strong startkeep move motivated dedicated member team have opportunity enhance sale skill

## Tokenization

In [12]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (called with ``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenize the lemmatized descriptions, then preview the first document's tokens.
data_words = gen_words(lemmatized_texts)
first_doc_tokens = data_words[0]
print(first_doc_tokens)

['training', 'pay', 'unlimited', 'time', 'territory', 'sale', 'role', 'ideal', 'sale', 'pro', 'individual', 'look', 'launch', 'sale', 'career', 'appreciate', 'flexibility', 'career', 'advancement', 'opportunity', 'unlimited', 'commissiona', 'role', 'be', 'ideal', 'goal', 'orient', 'professional', 'thrive', 'meet', 'new', 'people', 'sell', 'essential', 'service', 'be', 'movebenefit', 'include', 'pay', 'comprehensive', 'training', 'excellent', 'health', 'benefit', 'pay', 'time', 'free', 'service', 'education', 'assistance', 'more', 'get', 'virtual', 'classroom', 'training', 'face', 'face', 'session', 'hand', 'training', 'philosophy', 'partner', 'establish', 'pro', 'learn', 'sale', 'skill', 'need', 'close', 'deal', 'time', 'career', 'always', 'have', 'continue', 'support', 'encouragement', 'fellow', 'peer', 'sale', 'manager', 'company', 'leadership', 'have', 'unlimited', 'part', 'outside', 'team', 'potential', 'earn', 'be', 'limitless', 'right', 'now', 'we', 'aore', 'offer', 'training', '

## Bigrams and Trigrams

In [13]:
# Bigrams and trigrams
# First learn two-word phrases from the tokenized documents ...
bigram_phrases = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=50,
)
# ... then learn longer phrases on top of the bigram-merged documents.
bigrammed_words = bigram_phrases[data_words]
trigram_phrases = gensim.models.Phrases(bigrammed_words, threshold=50)

# Wrap both trained models in Phraser objects, which are what gets applied to documents.
bigram, trigram = (
    gensim.models.phrases.Phraser(phrases)
    for phrases in (bigram_phrases, trigram_phrases)
)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every tokenized document.

    Returns a list (one entry per document) rather than a one-shot generator,
    so the result can be indexed and iterated more than once.
    """
    return [bigram[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the module-level ``bigram`` and then ``trigram`` phrasers to every document.

    The bigram phraser is applied inside this function, so ``texts`` should be the
    plain tokenized documents. Returns a list (one entry per document) rather than a
    one-shot generator, so the result can be indexed and iterated more than once.
    """
    return [trigram[bigram[doc]] for doc in texts]

# Materialize both results as lists so they can be inspected or iterated repeatedly;
# a bare generator would be exhausted after its first use.
data_bigrams = list(make_bigrams(data_words))
# make_trigrams applies the bigram phraser itself, so it is fed the plain tokens;
# passing data_bigrams would run the bigram phraser a second time.
data_bigrams_trigrams = list(make_trigrams(data_words))
data_bigrams_trigrams[0]

['training',
 'pay',
 'unlimited',
 'time',
 'territory',
 'sale',
 'role',
 'ideal',
 'sale',
 'pro_individual',
 'look_launch',
 'sale',
 'career_appreciate_flexibility',
 'career',
 'advancement',
 'opportunity',
 'unlimited_commissiona',
 'role',
 'be',
 'ideal',
 'goal',
 'orient',
 'professional',
 'thrive',
 'meet',
 'new',
 'people',
 'sell',
 'essential',
 'service',
 'be',
 'movebenefit',
 'include',
 'pay',
 'comprehensive',
 'training',
 'excellent',
 'health',
 'benefit',
 'pay',
 'time',
 'free',
 'service',
 'education_assistance',
 'more',
 'get_virtual_classroom',
 'training',
 'face',
 'face',
 'session_hand',
 'training',
 'philosophy_partner_establish_pro',
 'learn',
 'sale',
 'skill',
 'need_close_deal',
 'time',
 'career',
 'always',
 'have',
 'continue',
 'support_encouragement_fellow',
 'peer',
 'sale',
 'manager',
 'company',
 'leadership',
 'have',
 'unlimited',
 'part',
 'outside',
 'team',
 'potential',
 'earn',
 'be',
 'limitless_right',
 'now_we_aore',
 'o

## Creating Dictionary

In [14]:
# TF-IDF removal: drop terms that score low within each document.
# (TfidfModel is imported in the import cell at the top of the notebook.)
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03  # terms scoring below this within a document are removed from it
words = []  # log of every removed term, in document order
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    doc_tfidf = tfidf[bow]  # score the document once and reuse the result
    tfidf_ids = {term_id for term_id, _ in doc_tfidf}
    low_value_words = [term_id for term_id, value in doc_tfidf if value < low_value]
    # The words with tf-idf score 0 will be missing from the model output.
    # This is computed BEFORE building `drops`, so the log below records the
    # current document's missing terms rather than the previous document's.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    drop_ids = set(drops)
    new_bow = [entry for entry in bow if entry[0] not in drop_ids]
    corpus[i] = new_bow

## LDA Topic modeling

In [15]:
# Training settings are gathered in one mapping so they can be read at a glance.
lda_params = {
    "corpus": corpus,
    "id2word": id2word,
    "num_topics": 30,
    "random_state": 100,  # fixed seed
    "update_every": 1,
    "chunksize": 100,
    "passes": 10,
    "alpha": "auto",
}
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

## Visualizing the Data

In [16]:
# Turn on inline rendering, then build the visualization data from the trained model,
# the filtered corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=30,
)
vis

  default_term_info = default_term_info.sort_values(


In [17]:
# Save the visualization as an HTML file.
html_path = "lda_final.html"
pyLDAvis.save_html(vis, html_path)