# Day 4 - Word2Vec on job offers

In [62]:
import pandas as pd

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, LdaMulticore
from gensim.models.phrases import Phrases, Phraser

from ast import literal_eval

from gensim.corpora import Dictionary
import pyLDAvis
from pyLDAvis import gensim

In [13]:
# Read the job-offer CSV into a DataFrame; the trailing expression reports (rows, columns).
df = pd.read_csv(filepath_or_buffer='data/job_ofer.csv')
df.shape

(36109, 8)

In [14]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


## Word2Vec

In [16]:
# Tokenize every job title with gensim's simple_preprocess (one token list per row).
corpus = df['title'].apply(simple_preprocess)

In [17]:
# Display the tokenized titles.
corpus

0                            [machine, learning, engineer]
1           [deep, learning, applied, researcher, chicago]
2                            [machine, learning, engineer]
3                     [machine, learning, data, scientist]
4                                       [cloud, architect]
                               ...                        
36104    [cdl, drivers, hauling, to, canada, no, touch,...
36105    [research, scientist, human, computer, interac...
36106    [clinical, post, market, surveillance, special...
36107    [excellent, teaching, opportunity, in, china, ...
36108                [assistant, director, talent, culure]
Name: title, Length: 36109, dtype: object

In [19]:
# Train 100-dimensional word vectors on the tokenized titles.
model = Word2Vec(
    sentences=corpus,
    size=100,
    window=2,
    min_count=1,
)

In [20]:
# List the vocabulary entries the title model ranks as most similar to 'machine'.
model.wv.most_similar('machine')

[('deep', 0.9477141499519348),
 ('edge', 0.8719989061355591),
 ('captivate', 0.8263579607009888),
 ('inference', 0.8172296285629272),
 ('scientists', 0.8115745186805725),
 ('nlp', 0.8066504597663879),
 ('natural', 0.8048179149627686),
 ('researcher', 0.803945779800415),
 ('computer', 0.7986159324645996),
 ('algorithm', 0.7892760038375854)]

### Phrases example

In [24]:
# Toy sentences in which 'a' is always directly followed by 'b'.
# NOTE(review): this rebinds the global `corpus`, which held the tokenized titles
# in the Word2Vec section above; re-running that training cell afterwards would
# use these four toy sentences instead.
corpus = [list(sentence) for sentence in ('abc', 'abx', 'yab', 'qabz')]

# Fit the phrase detector on the toy sentences and wrap it in a Phraser.
bigram = Phraser(
    Phrases(corpus, min_count=1, threshold=1)
)

In [25]:
# Apply the toy phraser to a new sentence; the output shows 'a' and 'b' merged into 'a_b'.
bigram[['k','a','b','c']]

['k', 'a_b', 'c']

### Title + phrases

In [28]:
# Tokenize the job titles and fit a bigram phraser on the resulting token lists.
title_corpus = df['title'].apply(simple_preprocess)
title_bigram = Phraser(
    Phrases(title_corpus, min_count=1, threshold=1)
)

In [29]:
# Run every tokenized title through the phraser so detected bigrams become single tokens.
title_corpus_phrase = list(map(lambda tokens: title_bigram[tokens], title_corpus))

In [30]:
# Retrain on the phrase-merged titles. This rebinds `model` from the unigram run.
model = Word2Vec(
    sentences=title_corpus_phrase,
    size=100,
    window=2,
    min_count=1,
)

In [31]:
# Same query as before, now against the model trained on phrase-merged titles.
model.wv.most_similar('machine')

[('franklin_tn', 0.8684641718864441),
 ('licensed_massage', 0.859023928642273),
 ('competitive_pay', 0.8582509756088257),
 ('division_risk', 0.858134925365448),
 ('für_den', 0.8565014004707336),
 ('lead_people', 0.8562629222869873),
 ('start_at', 0.8554681539535522),
 ('the_north', 0.8550440669059753),
 ('sfdc_solution', 0.8550319075584412),
 ('als', 0.8549489378929138)]

In [38]:
def prepare_corpus(corpus, bigram):
    """Build training sentences that contain both phrase tokens and raw tokens.

    For every sentence in ``corpus`` the result holds ``bigram[sentence]``
    followed by the sentence itself, concatenated with ``+``.

    Args:
        corpus: iterable of token sequences.
        bigram: object indexable by a token sequence (e.g. a fitted Phraser)
            that returns a sequence of the same type as the sentence.

    Returns:
        A list with one concatenated sequence per input sentence.
    """
    augmented = []
    for tokens in corpus:
        # Phrase-merged view first, then the original tokens.
        augmented.append(bigram[tokens] + tokens)
    return augmented

In [33]:
# Sample title used by the next cells to inspect tokenization and phrase merging.
t = 'Deep Learning Applied Researcher - Chicago'

In [34]:
# Tokenize the sample title; the output shows lowercase tokens with the '-' dropped.
simple_preprocess(t)

['deep', 'learning', 'applied', 'researcher', 'chicago']

In [35]:
# Same title after the phraser; the output shows 'deep' and 'learning' merged into 'deep_learning'.
title_bigram[simple_preprocess(t)]

['deep_learning', 'applied', 'researcher', 'chicago']

In [40]:
# Train on titles augmented with their phrase-merged form (phrases + raw tokens per title).
title_model = Word2Vec(
    sentences=prepare_corpus(title_corpus, title_bigram),
    size=100,
    window=2,
    min_count=1,
)

In [51]:
# Entries the phrase-augmented title model ranks as most similar to 'nlp'.
title_model.wv.most_similar('nlp')

[('ml', 0.9942561984062195),
 ('deep_learning', 0.9924538135528564),
 ('principal_data', 0.991608738899231),
 ('computer_vision', 0.9906652569770813),
 ('deep', 0.9869205951690674),
 ('computer', 0.9851434230804443),
 ('big_data', 0.9849541783332825),
 ('alpharetta', 0.9847122430801392),
 ('applied_research', 0.983695387840271),
 ('applied', 0.9816345572471619)]

## Description

In [44]:
# NOTE(review): this redefines the identically named helper from the
# "Title + phrases" section with the same behaviour; one definition would suffice.
def prepare_corpus(corpus, bigram):
    """Return, for each sentence, its phrase-merged form followed by the sentence.

    Args:
        corpus: iterable of token sequences.
        bigram: object indexable by a token sequence (e.g. a fitted Phraser).

    Returns:
        A list with ``bigram[sentence] + sentence`` for every sentence in ``corpus``.
    """
    def with_phrases(sentence):
        return bigram[sentence] + sentence

    return list(map(with_phrases, corpus))

In [45]:
# Tokenize the job descriptions and fit a bigram phraser on them.
# NOTE(review): in the df.head() preview the description values look like
# stringified lists; they are tokenized here as raw strings - confirm this is intended.
desc_corpus = df['description'].apply(simple_preprocess)
desc_bigram = Phraser(
    Phrases(desc_corpus, min_count=1, threshold=1)
)

In [47]:
# Train word vectors on descriptions augmented with their phrase-merged form.
desc_model = Word2Vec(
    sentences=prepare_corpus(desc_corpus, desc_bigram),
    size=100,
    window=2,
    min_count=1,
)

In [48]:
# Entries the description model ranks as most similar to 'machine'.
desc_model.wv.most_similar('machine')

[('ml', 0.607627272605896),
 ('algorithms', 0.5784522891044617),
 ('cnc', 0.577143669128418),
 ('instrument', 0.5701477527618408),
 ('reinforcement', 0.5657170414924622),
 ('computer', 0.5645312070846558),
 ('using_machine', 0.5606918334960938),
 ('implanting', 0.559789776802063),
 ('deploy_machine', 0.5596036911010742),
 ('inference', 0.5577758550643921)]

## Title topics: LDA model and pyLDAvis visualization

In [58]:
%time dictionary = Dictionary(title_corpus)

CPU times: user 717 ms, sys: 42.6 ms, total: 760 ms
Wall time: 761 ms


In [59]:
# Peek at the first ten (id, token) pairs of the dictionary.
list(dictionary.items())[:10]

[(0, 'engineer'),
 (1, 'learning'),
 (2, 'machine'),
 (3, 'applied'),
 (4, 'chicago'),
 (5, 'deep'),
 (6, 'researcher'),
 (7, 'data'),
 (8, 'scientist'),
 (9, 'architect')]

In [60]:
%time bow_corpus = [dictionary.doc2bow(s) for s in title_corpus]

CPU times: user 365 ms, sys: 41.2 ms, total: 406 ms
Wall time: 406 ms


In [63]:
%time lda_model = LdaMulticore(bow_corpus, id2word=dictionary, \
                               num_topics=50, passes=20, workers=8)

CPU times: user 1min 6s, sys: 11.8 s, total: 1min 17s
Wall time: 2min 25s


In [64]:
# Print every topic id with its weighted top words.
# Descriptive loop names are used so the loop does not overwrite the sample-title
# variable `t` that an earlier cell defines and later cells read.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}\n'.format(topic_id, topic_terms))

Topic: 0 
Words: 0.353*"engineer" + 0.133*"software" + 0.081*"senior" + 0.032*"developer" + 0.020*"cloud" + 0.018*"end" + 0.016*"full" + 0.016*"stack" + 0.016*"sr" + 0.014*"front"

Topic: 1 
Words: 0.143*"president" + 0.143*"vice" + 0.032*"audit" + 0.029*"energy" + 0.025*"graduate" + 0.024*"internal" + 0.024*"inc" + 0.022*"program" + 0.019*"senior" + 0.019*"manager"

Topic: 2 
Words: 0.130*"intern" + 0.091*"area" + 0.050*"content" + 0.046*"fall" + 0.044*"bay" + 0.041*"professional" + 0.028*"capital" + 0.024*"trade" + 0.021*"tutor" + 0.019*"manager"

Topic: 3 
Words: 0.116*"engineering" + 0.101*"maintenance" + 0.094*"room" + 0.085*"officer" + 0.064*"care" + 0.043*"manager" + 0.040*"chief" + 0.019*"technician" + 0.015*"department" + 0.014*"kitchen"

Topic: 4 
Words: 0.125*"engineer" + 0.068*"learning" + 0.067*"design" + 0.061*"machine" + 0.056*"systems" + 0.041*"manufacturing" + 0.038*"process" + 0.035*"principal" + 0.030*"electrical" + 0.029*"with"

Topic: 5 
Words: 0.355*"associate" + 

In [65]:
# Prepare the interactive pyLDAvis view of the fitted topics and render it inline.
lda_vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
