# Day 5 - Doc2Vec

In [34]:
import pandas as pd

from gensim.utils import simple_preprocess
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phraser, Phrases

In [9]:
df = pd.read_csv('data/job_ofer.csv')
df.shape

(36109, 8)

In [10]:
df.head()

Unnamed: 0,title,company_name,address,description,seniority_level,employment_type,job_function,industries
0,Machine Learning Engineer,Intellipro Group Inc,"Palo Alto, CA, US","['About The Company', ""W*** is reshaping the f...",Entry level,Full-time,Engineering,Information Technology and Services
1,Deep Learning Applied Researcher - Chicago,Ethosia,"Chicago, IL, US","['תיאור המשרה', 'Deep learning for Computer Vi...",Associate,Full-time,Other,Information Technology and Services
2,Machine Learning Engineer,Motorola Solutions,"Chicago, IL, US","['Company Overview', 'At Motorola Solutions, w...",Entry level,Full-time,Engineering,Information Technology and Services
3,Machine Learning / Data Scientist,Proprius LLC,"San Francisco, CA, US",['Our client is a digital invention agency foc...,Entry level,Full-time,Engineering,Information Technology and Services
4,Cloud Architect,TCS,"Framingham, Massachusetts, United States","['Technical/Functional Skills', ' ', 'Good to ...",Mid-Senior level,Full-time,Engineering,Information Technology and Services


In [11]:
title_corpus = df['title'].map(simple_preprocess)

In [13]:
title_tagged = [TaggedDocument(words=s, tags=[i]) for i,s in enumerate(title_corpus)]

In [30]:
title_model = Doc2Vec(vector_size=300, window_size=5, min_count=1)
title_model.build_vocab(title_tagged)

In [19]:
%time title_model.train(title_tagged, \
                        total_examples = title_model.corpus_count, epochs=10)

CPU times: user 43.5 s, sys: 7.86 s, total: 51.4 s
Wall time: 42.1 s


In [21]:
title_model.docvecs.most_similar(0)

[(20980, 0.9116872549057007),
 (32040, 0.9041644930839539),
 (4798, 0.9030004739761353),
 (3679, 0.8965742588043213),
 (2127, 0.896297812461853),
 (1595, 0.8962187767028809),
 (23476, 0.8917381167411804),
 (13371, 0.8909156322479248),
 (28993, 0.8902729749679565),
 (7171, 0.8889036178588867)]

In [22]:
df[df.index==0].title

0    Machine Learning Engineer
Name: title, dtype: object

In [27]:
df[df.index==20980].title

20980    Computer Vision/Machine Learning Algorithm Eng...
Name: title, dtype: object

In [29]:
df[df.index==4798].title

4798    Tech Lead, React Native
Name: title, dtype: object

### Phrases

In [36]:
title_bigram = Phraser(Phrases(title_corpus, min_count=1, threshold=1))
def prepare_corpus(corpus, bigram):
    return [bigram[s] + s for s in corpus]

ext_corp = prepare_corpus(title_corpus, title_bigram)

In [39]:
title_tagged_ext = [TaggedDocument(words=s, tags=[i]) for i,s in enumerate(ext_corp)]

In [46]:
title_model = Doc2Vec(vector_size=300, window_size=3, min_count=2)
title_model.build_vocab(title_tagged_ext)

In [47]:
%time title_model.train(title_tagged, \
                        total_examples = title_model.corpus_count, epochs=10)

CPU times: user 42.5 s, sys: 6.85 s, total: 49.3 s
Wall time: 39.7 s


In [48]:
title_model.docvecs.most_similar(0)

[(24695, 0.9099168181419373),
 (16587, 0.8849876523017883),
 (15460, 0.883492112159729),
 (9435, 0.881622850894928),
 (2, 0.8813334107398987),
 (13742, 0.8661069869995117),
 (26577, 0.8614345788955688),
 (16079, 0.860126793384552),
 (366, 0.8578841090202332),
 (14437, 0.8521586656570435)]

In [50]:
df[df.index==0].title

0    Machine Learning Engineer
Name: title, dtype: object

In [51]:
df[df.index==24695].title

24695    Machine Learning Engineer · FinCrime
Name: title, dtype: object

In [52]:
df[df.index==16587].title

16587    Machine Learning Engineer – Personalization
Name: title, dtype: object

In [53]:
df[df.index==15460].title

15460    Senior Machine Learning Engineer, Data Platform
Name: title, dtype: object