In [90]:
import gzip
import json
import random
import re
import string
from collections import Counter

import numpy as np
import PyPDF2
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [91]:
def filter_punctuation(tokens):
    """Return the tokens that are not made up purely of punctuation.

    A token is dropped when every one of its characters is in
    ``string.punctuation``; this covers single marks (",") as well as
    multi-character marks ("...", "--"). Tokens that merely contain
    punctuation ("u.s.", "don't") are kept. Empty strings are dropped.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Use a set of characters: testing ``word in string.punctuation`` would be
    # a *substring* test, which lets tokens such as "..." or "--" through.
    punct_chars = set(string.punctuation)
    return [
        word for word in tokens
        if not all(char in punct_chars for char in word)
    ]


In [92]:
def read_and_clean_jds(infile):
    """Read a gzipped, line-delimited JSON file of job postings and clean it.

    Each non-blank line is parsed as a JSON object with the keys
    ``'category'`` and ``'job_description'``.

    * The description has all ``string.punctuation`` characters removed,
      whitespace runs collapsed to one space, is lower-cased and tokenised.
      Note that punctuation is deleted rather than replaced by a space, so
      "part-time" becomes the single token "parttime".
    * The category has "/" and whitespace runs replaced by "_" and is
      lower-cased.

    Parameters
    ----------
    infile : str
        Path to the ``.gz`` file.

    Returns
    -------
    (list of list of str, list of str)
        Token lists and category labels, index-aligned.
    """
    print("\nReading and cleaning text from {}".format(infile))
    jds = []
    categories = []
    # Build the punctuation-stripping table once instead of once per line.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # Explicit UTF-8: without it the text is decoded with the platform's
    # default encoding, which breaks on non-ASCII characters on some systems.
    with gzip.open(infile, 'rt', encoding='utf-8') as f:
        for line in tqdm(f):
            # A blank line (e.g. a trailing newline) is not valid JSON.
            if not line.strip():
                continue

            json_line = json.loads(line)

            category = re.sub(r'\/', '_', json_line['category'])
            category_text = re.sub(r'\s+', '_', category)

            jd_text = json_line['job_description'].translate(strip_punctuation)
            jd_text = re.sub(r'\s+', ' ', jd_text)
            jd_tokens = filter_punctuation(word_tokenize(jd_text.lower()))

            categories.append(category_text.lower())
            jds.append(jd_tokens)

    return jds, categories

In [93]:
def read_and_clean_resume_pdf(filename):
    """Extract the text of a PDF resume and normalise it for tokenisation.

    The text of every page is concatenated, lower-cased, every character
    outside ``a-z`` is replaced by a space, and whitespace runs are
    collapsed to a single space.

    Parameters
    ----------
    filename : str
        Path of the PDF file (passed straight to ``PyPDF2.PdfFileReader``).

    Returns
    -------
    str
        The normalised resume text.
    """
    pdfReader = PyPDF2.PdfFileReader(filename)

    # Collect the text page by page, then join once.
    page_texts = []
    for page_number in range(pdfReader.numPages):
        page_texts.append(pdfReader.getPage(page_number).extractText())
    raw_text = ''.join(page_texts)

    # Pre-tokenisation normalisation: letters only, single spaces.
    letters_only = re.sub('[^a-z]', ' ', raw_text.lower())
    return re.sub(r'\s+', ' ', letters_only)

In [94]:
def split_training_set(jds, labels, test_size=0.3, random_seed=42):
    """Split documents and labels into stratified train and test sets.

    Delegates to ``train_test_split`` with ``stratify=labels`` and prints
    the label counts of both resulting label sets.

    Parameters
    ----------
    jds : sequence
        The documents.
    labels : sequence
        One label per document; also used for stratification.
    test_size : float, optional
        Forwarded to ``train_test_split`` as ``test_size``.
    random_seed : int, optional
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    splits = train_test_split(
        jds,
        labels,
        test_size=test_size,
        random_state=random_seed,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = splits

    # Report the class distribution of each split.
    reports = (
        ("Training set label counts: {}", y_train),
        ("Test set     label counts: {}", y_test),
    )
    for template, split_labels in reports:
        print(template.format(Counter(split_labels)))

    return X_train, X_test, y_train, y_test

In [59]:
# Gzipped file of job postings, read line by line as JSON by read_and_clean_jds.
job_description_data = 'small_indeed_jd.ldjson.gz'
# jds: one token list per posting; categories: the matching normalised labels.
jds, categories = read_and_clean_jds(job_description_data)
# Sanity check: both lists get one entry per posting, so the counts should match.
print(len(jds), len(categories))


Reading and cleaning text from small_indeed_jd.ldjson.gz


987it [00:01, 508.69it/s]

987 987





In [95]:
# Stratified split using the defaults of split_training_set (test_size=0.3, random_seed=42).
X_train, X_test, y_train, y_test = split_training_set(jds, categories)


Training set label counts: Counter({'healthcare': 87, 'administrative': 79, 'construction_facilities': 57, 'manufacturing_mechanical': 50, 'computer_internet': 49, 'sales': 48, 'restaurant_food_service': 43, 'transportation_logistics': 40, 'customer_service': 37, 'education_training': 23, 'accounting_finance': 22, 'retail': 22, 'engineering_architecture': 20, 'human_resources': 20, 'marketing_advertising_pr': 16, 'upper_management_consulting': 15, 'arts_entertainment_publishing': 10, 'hospitality_travel': 8, 'banking_loans': 8, 'insurance': 6, 'telecommunications': 6, 'non-profit_volunteering': 6, 'law_enforcement_security': 6, 'real_estate': 4, 'government_military': 4, 'pharmaceutical_bio-tech': 3, 'legal': 1})
Test set     label counts: Counter({'healthcare': 37, 'administrative': 34, 'construction_facilities': 24, 'manufacturing_mechanical': 21, 'computer_internet': 21, 'sales': 21, 'restaurant_food_service': 18, 'transportation_logistics': 17, 'customer_service': 16, 'education_tr

In [96]:

def tag_document(list_of_list_of_words):
    """Lazily wrap each token list in a ``TaggedDocument``.

    The tag of each document is a one-element list holding its position
    in the input, counted from 0.

    Parameters
    ----------
    list_of_list_of_words : iterable of list of str
        The tokenised documents.

    Yields
    ------
    TaggedDocument
        One per input document, in input order.
    """
    position = 0
    for tokens in list_of_list_of_words:
        yield TaggedDocument(tokens, [position])
        position += 1

In [46]:
def train_model(data, model_name='doc2vec_latest_pynb', vector_size=50, min_count=10, epochs=50):
    """Train a Doc2Vec model on tokenised documents and save it to disk.

    Parameters
    ----------
    data : iterable of list of str
        The tokenised training documents; each is tagged with its position
        via ``tag_document``.
    model_name : str, optional
        Path the trained model is saved to.
    vector_size : int, optional
        Forwarded to ``Doc2Vec`` as ``vector_size``.
    min_count : int, optional
        Forwarded to ``Doc2Vec`` as ``min_count``.
    epochs : int, optional
        Forwarded to ``Doc2Vec`` as ``epochs``.

    Returns
    -------
    str
        ``model_name``, i.e. the path the model was saved to.

    Raises
    ------
    ValueError
        If ``data`` contains no documents.
    """
    # Materialise the corpus: it is iterated by build_vocab and again by train.
    tagged_data = list(tag_document(data))
    if not tagged_data:
        raise ValueError("Cannot train a Doc2Vec model on an empty corpus")

    # Initialise the model, build its vocabulary, then train.
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

    model.save(model_name)
    print("Model saved")
    return model_name

In [105]:
# Sanity-check tag_document on the first three training documents.
# tag_document returns a generator, which can be consumed only once, so the
# result is materialised into a list before it is inspected.
tagged_x = list(tag_document(X_train[:3]))
tagged_x[:1]

[]

In [48]:
# Load the baseline model. The file name must match the one train_model saves
# to by default ('doc2vec_latest_pynb', all lower-case); a differently-cased
# name only resolves on case-insensitive filesystems.
model_name = 'doc2vec_latest_pynb'
try:
    baseline_model = Doc2Vec.load(model_name)
except FileNotFoundError:
    # No saved model yet (e.g. a fresh checkout): train on the training split,
    # which also saves the model, then load the saved file.
    model_name = train_model(X_train)
    baseline_model = Doc2Vec.load(model_name)

In [50]:
# Extract the resume text from the PDF: lower-cased, letters and single spaces only.
resume = read_and_clean_resume_pdf('resume1.pdf')
# Debug aid (disabled): print("Resume \n", resume)
# Tokenise the cleaned text; the tokens are later passed to infer_vector.
resume_tokens = word_tokenize(resume)
# Debug aid (disabled): print(resume_tokens)

['vedansh', 'shrivastavavedansh', 'gmail', 'com', 'bhopal', 'dehradun', 'education', 'bachelor', 'of', 'technology', 'b', 'tech', 'computer', 'science', 'with', 'specialization', 'in', 'bigdata', 'university', 'of', 'petroleum', 'and', 'energy', 'studies', 'senior', 'secondary', 'xii', 'science', 'delhi', 'public', 'school', 'cbse', 'board', 'year', 'of', 'completion', 'secondary', 'x', 'delhi', 'public', 'school', 'cbse', 'board', 'year', 'of', 'completion', 'internships', 'head', 'of', 'volunteers', 'arushi', 'ngo', 'bhopal', 'may', 'jun', 'the', 'internship', 'was', 'for', 'handling', 'the', 'physically', 'challenged', 'students', 'recording', 'books', 'for', 'visually', 'challenged', 'students', 'trainings', 'deep', 'learning', 'specialization', 'deeplearning', 'ai', 'online', 'oct', 'nov', 'machine', 'learning', 'stanford', 'university', 'coursera', 'online', 'mar', 'may', 'apache', 'spark', 'with', 'scala', 'hands', 'on', 'bigdata', 'udemy', 'online', 'feb', 'feb', 'projects', 'f

In [71]:
# Take one held-out job description (index 24 of the test split), show its
# tokens, and keep a punctuation-filtered copy for the similarity comparison.
raw_test_jd = X_test[24]
print(raw_test_jd)
test_jd = filter_punctuation(raw_test_jd)

['parttime', 'store', 'clerk', 'job', 'id', '20412021', 'location', '509', 'n', 'ankeny', 'blvd', 'ankeny', 'ia', '50021', 'type', 'regular', 'parttime', 'nonexempt', 'starting', 'pay', 'rate', '1500hour', 'some', 'weekend', 'availability', 'required', 'have', 'you', 'been', 'looking', 'for', 'retail', 'work', 'that', 'supports', 'a', 'great', 'mission', 'are', 'you', 'passionate', 'about', 'upcycling', 'and', 'thrifting', 'check', 'out', 'how', 'good', 'your', 'workday', 'could', 'be', 'as', 'a', 'store', 'clerk', 'for', 'goodwill', 'by', 'joining', 'our', 'retail', 'team', 'as', 'a', 'store', 'clerk', 'your', 'work', 'will', 'aid', 'our', 'mission', 'of', 'improving', 'the', 'quality', 'of', 'life', 'for', 'all', 'individuals', 'by', 'providing', 'skills', 'training', 'and', 'helping', 'people', 'find', 'jobs', 'you', 'will', 'be', 'the', 'first', 'point', 'of', 'contact', 'for', 'goodwill', 'customers', 'and', 'the', 'face', 'they', 'remember', 'we', 'are', 'looking', 'for', 'enthus

In [121]:
# Embed the resume and the selected job description with the baseline model.
v1 = baseline_model.infer_vector(resume_tokens)
v2 = baseline_model.infer_vector(test_jd)
print(v1)
print(v2)

# Cosine similarity: dot(v1, v2) / (|v1| * |v2|).
# If either vector has zero length the ratio is undefined; report 0.0 then.
norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
cos_sim = np.dot(v1, v2) / norm_product if norm_product else 0.0
print("Baseline Cosine Similarity:\n", round(cos_sim, 3))


[-0.79615664  2.0496933  -3.710262    2.3002439  -1.9355851   2.0683818
 -1.0377407   0.44553    -4.7310266   1.9009143  -0.9832995  -1.630151
  0.39742747  4.949778    2.2433662   1.6613435   0.9010183  -0.02044732
  0.4024497   0.14856085  1.1665983  -2.8814588  -0.7574075   1.8949652
 -0.56534064 -2.1479695  -2.2632341   1.610586   -1.2831848  -1.4628993
  2.754947    2.859974    0.6548168   1.9437877  -0.5107176  -1.1868858
 -0.9437332  -4.1944766  -0.25953102 -1.3360353  -0.36016396 -0.6593772
 -0.6282683  -1.5428878  -0.02956845  2.1297455  -1.9636341  -0.5933804
  2.2594023   0.00584629]
[-0.5202444   0.48078534 -0.46594632 -2.4538963  -2.7204158  -0.1848196
  0.55382746 -1.5595003  -0.4827493  -1.1593653  -0.15400176 -0.5057762
  1.9684969   1.1163969   1.4469483  -0.68277186 -2.2832112  -0.81727505
 -1.8638242   2.0957997   0.71377605  0.18107471  0.05174188 -0.44147855
 -2.1581647   0.8624479   0.28595883  0.3098966   1.3386638  -0.84458137
  2.1193812   0.30126932 -0.7783356

In [111]:
# Tag every training document with its index and keep the result as a list,
# so it can be indexed and iterated repeatedly; show the first entry.
training_data = [tagged_doc for tagged_doc in tag_document(X_train)]
print(training_data[:1])


[TaggedDocument(words=['hello', 'we', 'are', 'looking', 'to', 'fill', 'dishwasher', 'position', 'in', 'our', 'brand', 'new', 'indian', 'restaurant', 'on', 'rainbow', 'and', '215', 'bilingual', 'speakers', 'are', 'welcome', 'to', 'apply', 'we', 'are', 'also', 'looking', 'for', 'other', 'positions', 'like', 'kitchen', 'helper', 'line', 'cooks', 'etc', 'please', 'mention', 'the', 'position', 'you', 'are', 'applying', 'for', 'job', 'type', 'fulltime', 'pay', '975', '1050', 'per', 'hour', 'benefits', 'employee', 'assistance', 'program', 'referral', 'program', 'physical', 'setting', 'bar', 'casual', 'dining', 'restaurant', 'fine', 'dining', 'restaurant', 'schedule', '10', 'hour', 'shift', '12', 'hour', 'shift', '8', 'hour', 'shift', 'day', 'shift', 'evening', 'shift', 'every', 'weekend', 'holidays', 'monday', 'to', 'friday', 'weekend', 'availability', 'ability', 'to', 'commuterelocate', 'las', 'vegas', 'nv', 'reliably', 'commute', 'or', 'planning', 'to', 'relocate', 'before', 'starting', 'wo

In [117]:
# Assessing the model: re-infer a vector for every training document and
# record the position at which the document itself appears in the list of
# most similar document vectors (0 means it is its own nearest match).

ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(training_data):
    inferred_vector = baseline_model.infer_vector(tagged_doc.words)
    # Rank all stored document vectors against the inferred one.
    sims = baseline_model.dv.most_similar([inferred_vector], topn=len(baseline_model.dv))
    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the entry at position 1 of the ranking for later inspection.
    second_ranks.append(sims[1])

counter = Counter(ranks)
print(counter)

Counter({0: 690})


In [118]:
# Show the document currently referenced by doc_id, followed by the entries
# at selected positions of its similarity ranking `sims`.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(training_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % baseline_model)
positions = [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in positions:
    match = sims[index]
    matched_words = ' '.join(training_data[match[0]].words)
    print(u'%s %s: «%s»\n' % (label, match, matched_words))

Document (689): «description please note the anticipated schedule for this position will be from 1000 pm to 630 am tuesday through saturday here at the washington state department of veterans affairs wdva we are passionate about our mission of “ serving those who served ” as a national leader in our advocacy for more than 500000 veterans and their family members we strive to connect them to earned benefits as well as innovative programs focused on their overall health and wellness in addition we provide critical community services through a variety of programs and at our four state veterans homes located in orting port orchard spokane and walla walla these locations provide medicare and medicaid nursing home care to honorably discharged veterans including in some instances their spouses widows or gold star we are in need of a nightshift rn2 to provide nursing related care to residents in our spokane veterans home this includes assistance with activities of daily living monitoring and r

In [120]:
# Pick a random training document (`random` is imported in the notebook's
# import cell) and compare it with the entry stored for it in second_ranks.
doc_id = random.randint(0, len(training_data) - 1)

# second_ranks[doc_id] is the (document id, similarity) pair found at
# position 1 of that document's similarity ranking.
print('Train Document ({}): <<{}>>\n'.format(doc_id, ' '.join(training_data[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: <<{}>>\n'.format(sim_id, ' '.join(training_data[sim_id[0]].words)))

Train Document (127): <<as a warehousedelivery driver you will be responsible for receiving and placing inventory into stock picking and packing orders for delivery timely and accurate delivery andor pickup of goods to specified locations loading and unloading trucks and trailers warehouse housekeeping and inventory control you will know youre a match to this position if you are a selfstarter organized and familiar with the delivery area previous warehouse experience maintenance experience andor delivery experience preferred primary duties load and unload trucks in an accurate and efficient manner become familiar with delivery routes and customers preferences perform warehouse duties when called upon which includes materials handling shipping and receiving picking and packing orders qualifications must be 21 years of age or older with a valid drivers license cdl not required current medical certification card or ability to obtain a card must meet companys requirements to drive company 