# Load Doc2Vec model

In [0]:
from gensim.models.doc2vec import Doc2Vec
import pandas as pd

# Load the Doc2Vec model stored at this relative path.
model = Doc2Vec.load('doc2vec_model/doc2vec_model_cont_er')

# Load corpus

In [0]:
from gensim.models.doc2vec import TaggedLineDocument
# Wrap the corpus file in gensim's TaggedLineDocument reader; iterating it
# yields objects exposing `.words` and `.tags` (used by the cells below).
corpus = TaggedLineDocument("doc2vec_data/corpus_continuing_er.cor")

In [0]:
# Peek at the tags of the first corpus document only, then stop iterating.
for first_doc in corpus:
    print(first_doc.tags)
    break

[0]


In [0]:
# Number of document vectors held by the loaded model.
len(model.docvecs)

457277

# Assessing doc2vec model

In [0]:
def find_corpus_docs_similar_docs(topn=10):
    """Rank every corpus document against its own re-inferred vector.

    For each document in the module-level ``corpus`` a vector is inferred with
    the module-level ``model`` and the ``topn`` nearest stored document
    vectors are looked up.

    Args:
        topn: How many nearest documents to request per query (default 10).

    Yields:
        ``(doc_id, rank, similar_document_ids)`` where ``doc_id`` is the
        document's first tag, ``similar_document_ids`` is the list of ids
        returned by ``most_similar`` and ``rank`` is the position of
        ``doc_id`` in that list, or ``999999999999`` when it is absent.
    """
    # Dead value marking "document not found among its own nearest neighbours".
    not_found_rank = 999999999999
    for doc in corpus:
        doc_id = doc.tags[0]
        inferred_vector = model.infer_vector(doc.words)
        sims = model.docvecs.most_similar([inferred_vector], topn=topn)
        # Use a distinct loop name so the query's doc_id is never shadowed.
        similar_document_ids = [similar_id for similar_id, _ in sims]
        try:
            rank = similar_document_ids.index(doc_id)
        except ValueError:
            rank = not_found_rank
        yield doc_id, rank, similar_document_ids

In [0]:
# Create the generator only; no similarity is computed until it is iterated,
# and it can be consumed just once.
similar_docs_gen = find_corpus_docs_similar_docs()

In [0]:
# One slot per document vector; each record is stored at its own document id.
similarities = [None] * len(model.docvecs)
for record in similar_docs_gen:
    current_id = record[0]
    # Progress heartbeat on every 10,000th document id.
    if current_id % 10000 == 0:
        print("Current doc ID %d" % current_id)
    similarities[current_id] = record

In [0]:
# Turn the (doc_id, rank, top-10 ids) records into a three-column DataFrame.
df = pd.DataFrame(similarities, columns=['doc_id','most_similar_index','top10_similarity_ranking'])

In [0]:
# Preview the first ten evaluation rows.
df.head(10)

NameError: name 'df' is not defined

## Saving findings

In [0]:
# Persist the evaluation as gzip-compressed JSON Lines (one record per line).
df.to_json('findings/doc2vec_evaluation.json.gz', orient='records', lines=True, compression='gzip')

# Load evaluation

In [0]:
# Reload the saved evaluation, using the same format options it was written with.
ev_df = pd.read_json('findings/doc2vec_evaluation.json.gz', orient='records', lines=True, compression='gzip')

In [0]:
import collections

# Tally how many documents landed at each 'most_similar_index' value.
ranking_counter = collections.Counter(ev_df['most_similar_index'])

# Documents whose own id came back as the single closest match (index 0).
num = ranking_counter[0]
# Only indices 0-9 are summed, so rows whose 'most_similar_index' falls outside
# that range (e.g. the dead value used for "not in the top 10") are excluded
# from the denominator.
denom = sum(ranking_counter[i] for i in range(0, 10))

# Guard against ZeroDivisionError when no row has an index in 0-9.
share_ranked_first = (num / denom) * 100 if denom else 0.0
# The wording reflects the denominator above: it is not the share of ALL documents.
print("%f%% of the documents found in their own top 10 are ranked first" % share_ranked_first)

93.856985% of the documents are ranked first


In [0]:
# Materialise the corpus as a list so documents can be fetched by position.
l_corpus = list(corpus)

In [0]:
# Draw ten evaluation rows; the fixed seed keeps the sample reproducible.
sampled = ev_df.sample(n=10, random_state=56)

# Swap every id in each row's ranking list for the corpus document stored at
# that position in l_corpus.
comparing_docs = sampled['top10_similarity_ranking'].transform(
    lambda ranked_ids: [l_corpus[doc_idx] for doc_idx in ranked_ids]
)

In [0]:
# Series.items() yields (index label, value) pairs. It replaces the deprecated
# Series.iteritems(), which pandas 2.0 removed.
for doc_id, similar_docs in comparing_docs.items():
    print("================================")
    print("For document #%d" % doc_id)
    # Show only the three closest documents for each sampled row.
    for idx, doc in enumerate(similar_docs[:3]):
        print("most similar #%d" % (idx + 1), doc)
    print("================================")

For document #181094
most similar #1 TaggedDocument(['duties', 'data', 'entry', 'filing', 'customer', 'service', 'editing', 'journal', 'entry', "'s", 'sap', 'tax', 'payments', 'water', 'payments', 'daily', 'reports', 'telephone', 'inquiry', "'s"], [181094])
most similar #2 TaggedDocument(['greet', 'costumer', "'s", 'serve', 'water', 'take', 'order'], [291849])
most similar #3 TaggedDocument(['greeting', 'customers', 'serving', 'alcohol', 'food', 'taking', 'payments', 'scheduling', 'staff', "'s", 'shifts'], [306937])
For document #383085
most similar #1 TaggedDocument(['heaven', "'s", 'floral', 'windsor', 'sept', 'present', 'assistant', 'event', 'staff', 'servers', 'lists', 'duties', 'like', 'greeting', 'guests', 'leading', 'table', 'responding', 'requests', 'checking', 'table', 'decoration', 'refilling', 'glasses', 'anticipating', 'guest', 'needs', 'keeping', 'premises', 'clean'], [383085])
most similar #2 TaggedDocument(['responsible', 'greeting', 'customers', 'leading', 'tables', 'de

In [0]:
from gensim.test.utils import common_corpus, common_dictionary

# Entry count of the common_dictionary fixture imported from gensim.test.utils.
len(common_dictionary)

12

In [0]:
# compute_top5_onet_occupation_similarity only queries documents whose id is
# greater than num_occupation_data - 1.
num_occupation_data = 1110
# One slot per document vector in the model.
onet_occupation_similarity = [None] * len(model.docvecs) # make it simple, wasting space but okay

def compute_top5_onet_occupation_similarity():
    """Yield the closest leading documents for every later corpus document.

    The first ``num_occupation_data`` document vectors (ids
    ``0 .. num_occupation_data - 1``) form the candidate set. Every document in
    the module-level ``corpus`` with a larger id is used as a query: its vector
    is inferred with ``model`` and compared against the candidates only.

    Yields:
        ``(doc_id, similar_doc_id, similarity_val)`` tuples, up to five per
        query document, in the order returned by ``most_similar``.
    """
    for doc in corpus:
        doc_id = doc.tags[0]
        if doc_id < num_occupation_data:
            # Candidate documents are never used as queries.
            continue
        query_vector = model.infer_vector(doc.words)
        # most_similar slices the stored vectors as [clip_start:clip_end], so
        # clip_end is exclusive. It must be num_occupation_data (not
        # num_occupation_data - 1) to keep the last candidate document.
        nearest = model.docvecs.most_similar(
            positive=[query_vector],
            clip_start=0,
            clip_end=num_occupation_data,
            topn=5,
        )
        for similar_doc_id, similarity_val in nearest:
            yield doc_id, similar_doc_id, similarity_val

In [0]:
onet_similarity_gen = compute_top5_onet_occupation_similarity()

In [0]:
onet_similarity_l = list(onet_similarity_gen)

  if np.issubdtype(vec.dtype, np.int):


In [0]:
df = pd.DataFrame(onet_similarity_l, columns=['doc_id', 'similar_doc_id', 'similarity'])
df.to_json('findings/resume_job_title_onet_similarity.json.gz', orient='records', lines=True, compression='gzip')

In [0]:
df.head(20)

Unnamed: 0,doc_id,similar_doc_id,similarity
0,1110,697,0.248268
1,1110,333,0.240068
2,1110,391,0.218263
3,1110,289,0.209814
4,1110,358,0.209138
5,1111,47,0.288252
6,1111,678,0.259915
7,1111,476,0.244159
8,1111,40,0.238234
9,1111,5,0.227943
