In [6]:
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import PyPDF2
from tqdm import tqdm
from collections import Counter
import json
import re
import string
import gzip
import numpy as np
from nltk.tokenize import word_tokenize


In [12]:
def filter_punctuation(tokens):
    """Remove tokens that are made up entirely of punctuation characters.

    A token is dropped when every one of its characters appears in
    ``string.punctuation`` (so ``','``, ``'...'`` and ``"''"`` are all
    removed). Tokens that merely contain punctuation, such as
    ``'full-time'`` or ``'9.75'``, are kept. Empty strings are dropped too.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the punctuation-only tokens removed, in input order.
    """
    # A set gives per-character membership; testing `word in <str>` would be a
    # substring search and miss multi-character tokens such as '...'.
    punct_chars = set(string.punctuation)
    return [
        word
        for word in tokens
        if not all(ch in punct_chars for ch in word)
    ]


In [57]:
def read_and_clean_jds(infile):
    """Read a gzipped, line-delimited JSON file of job postings and tokenize it.

    Every non-blank line must be a JSON object with the keys ``'category'``
    and ``'job_description'``.

    Args:
        infile: Path to the gzip-compressed, line-delimited JSON file.

    Returns:
        A ``(jds, categories)`` pair of equal-length lists. ``jds[i]`` is the
        list of lower-cased tokens of the i-th description. ``categories[i]``
        is the matching label, lower-cased, with ``/`` and runs of whitespace
        replaced by ``_``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'category'`` or ``'job_description'``.
    """
    print("\nReading and cleaning text from {}".format(infile))
    jds = []
    categories = []

    # Build the deletion table once rather than once per record.
    # NOTE(review): this *deletes* ASCII punctuation, so "commute/relocate"
    # becomes "commuterelocate"; confirm that merging is intended.
    strip_punct = str.maketrans('', '', string.punctuation)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with gzip.open(infile, 'rt', encoding='utf-8') as f:
        for line in tqdm(f):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            json_line = json.loads(line)

            category = re.sub(r'\/', '_', json_line['category'])
            category_text = re.sub(r'\s+', '_', category)

            jd_text = json_line['job_description'].translate(strip_punct)
            jd_text = re.sub(r'\s+', ' ', jd_text)
            jd_tokens = filter_punctuation(word_tokenize(jd_text.lower()))

            categories.append(category_text.lower())
            jds.append(jd_tokens)

    return jds, categories

In [5]:
def read_and_clean_resume_pdf(filename):
    """Extract the text of a PDF and normalize it to lower-case ASCII words.

    The text of all pages is joined with a space (so the last word of one
    page cannot fuse with the first word of the next), lower-cased, every
    character outside ``a-z`` is replaced by a space, and runs of whitespace
    are collapsed to a single space. Digits, punctuation and non-ASCII
    letters are therefore removed.

    Args:
        filename: Path (or file object) passed straight to the PyPDF2 reader.

    Returns:
        The normalized text as a single string. It is not stripped, so it may
        start or end with one space.
    """
    if hasattr(PyPDF2, 'PdfReader'):
        # Current PyPDF2 API; the PdfFileReader/getPage/extractText names
        # raise an error in PyPDF2 3.x.
        reader = PyPDF2.PdfReader(filename)
        # `or ''` guards against a page yielding no text.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    else:
        # Fallback for old PyPDF2 releases that only ship the legacy API.
        reader = PyPDF2.PdfFileReader(filename)
        page_texts = [
            reader.getPage(i).extractText() for i in range(reader.numPages)
        ]

    resume_text = ' '.join(page_texts)

    # pre normalize tokenization
    resume_text = resume_text.lower()
    resume_text = re.sub('[^a-z]', ' ', resume_text)
    resume_text = re.sub(r'\s+', ' ', resume_text)
    return resume_text

In [58]:
def split_training_set(jds, labels, test_size=0.3, random_seed=42):
    """Make a stratified train/test split and report each side's label counts.

    Args:
        jds: Sequence of samples (here, tokenized job descriptions).
        labels: Sequence of labels parallel to ``jds``; also used as the
            stratification target.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_seed: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        jds,
        labels,
        test_size=test_size,
        random_state=random_seed,
        stratify=labels,
    )
    X_train, X_test, y_train, y_test = parts

    # Print the class distribution of both partitions, training side first.
    for message, subset_labels in (
        ("Training set label counts: {}", y_train),
        ("Test set     label counts: {}", y_test),
    ):
        print(message.format(Counter(subset_labels)))

    return X_train, X_test, y_train, y_test

In [59]:
# Load the job postings and tokenize each description. The path is relative to
# the notebook's working directory.
job_description_data = 'small_indeed_jd.ldjson.gz'
jds, categories = read_and_clean_jds(job_description_data)
# Sanity check: the two lists are built in parallel, so the lengths must match.
print(len(jds), len(categories))


Reading and cleaning text from small_indeed_jd.ldjson.gz


987it [00:01, 508.69it/s]

987 987





In [60]:
# Stratified split using split_training_set's defaults (test_size=0.3, random_seed=42).
X_train, X_test, y_train, y_test = split_training_set(jds, categories)


Training set label counts: Counter({'healthcare': 87, 'administrative': 79, 'construction_facilities': 57, 'manufacturing_mechanical': 50, 'computer_internet': 49, 'sales': 48, 'restaurant_food_service': 43, 'transportation_logistics': 40, 'customer_service': 37, 'education_training': 23, 'accounting_finance': 22, 'retail': 22, 'engineering_architecture': 20, 'human_resources': 20, 'marketing_advertising_pr': 16, 'upper_management_consulting': 15, 'arts_entertainment_publishing': 10, 'hospitality_travel': 8, 'banking_loans': 8, 'insurance': 6, 'telecommunications': 6, 'non-profit_volunteering': 6, 'law_enforcement_security': 6, 'real_estate': 4, 'government_military': 4, 'pharmaceutical_bio-tech': 3, 'legal': 1})
Test set     label counts: Counter({'healthcare': 37, 'administrative': 34, 'construction_facilities': 24, 'manufacturing_mechanical': 21, 'computer_internet': 21, 'sales': 21, 'restaurant_food_service': 18, 'transportation_logistics': 17, 'customer_service': 16, 'education_tr

In [27]:

def tag_document(list_of_list_of_words):
    """Lazily wrap each token list in a TaggedDocument tagged with its position.

    Args:
        list_of_list_of_words: Iterable whose items are lists of tokens.

    Yields:
        ``TaggedDocument(tokens, [position])`` for each item, where
        ``position`` is the item's zero-based index in the input.
    """
    position = 0
    for tokens in list_of_list_of_words:
        yield TaggedDocument(tokens, [position])
        position += 1

In [46]:
def train_model(data, vector_size=50, min_count=10, epochs=50,
                model_name='doc2vec_latest_pynb'):
    """Train a Doc2Vec model on tokenized documents and save it to disk.

    Args:
        data: Iterable of token lists, one per document. Each document is
            tagged with its position via ``tag_document``.
        vector_size: Forwarded to ``Doc2Vec`` as ``vector_size``.
        min_count: Forwarded to ``Doc2Vec`` as ``min_count``.
        epochs: Forwarded to ``Doc2Vec`` as ``epochs``; also used for training.
        model_name: Path the trained model is saved to.

    Returns:
        ``model_name``, i.e. the path to pass to ``Doc2Vec.load``.
    """
    # Materialize the generator: the corpus is handed to both build_vocab()
    # and train(), and a generator could only be consumed once.
    tagged_data = list(tag_document(data))

    # initialize model
    model = Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    # vocabulary building
    model.build_vocab(tagged_data)

    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_name)
    print("Model saved")
    return model_name

In [38]:
# Sanity check: tag the first three training documents and display the first
# resulting TaggedDocument to confirm its (words, [index]) structure.
tagged_x = tag_document(X_train[:3])
list(tagged_x)[0]

TaggedDocument(words=['hello', 'we', 'are', 'looking', 'to', 'fill', 'dishwasher', 'position', 'in', 'our', 'brand', 'new', 'indian', 'restaurant', 'on', 'rainbow', 'and', '215.', 'bilingual', 'speakers', 'are', 'welcome', 'to', 'apply', 'we', 'are', 'also', 'looking', 'for', 'other', 'positions', 'like', 'kitchen', 'helper', 'line', 'cooks', 'etc', 'please', 'mention', 'the', 'position', 'you', 'are', 'applying', 'for', 'job', 'type', 'full-time', 'pay', '9.75', '10.50', 'per', 'hour', 'benefits', 'employee', 'assistance', 'program', 'referral', 'program', 'physical', 'setting', 'bar', 'casual', 'dining', 'restaurant', 'fine', 'dining', 'restaurant', 'schedule', '10', 'hour', 'shift', '12', 'hour', 'shift', '8', 'hour', 'shift', 'day', 'shift', 'evening', 'shift', 'every', 'weekend', 'holidays', 'monday', 'to', 'friday', 'weekend', 'availability', 'ability', 'to', 'commute/relocate', 'las', 'vegas', 'nv', 'reliably', 'commute', 'or', 'planning', 'to', 'relocate', 'before', 'starting',

In [48]:
# Load the model persisted by train_model(). The file name must match the one
# given to model.save() there exactly ('doc2vec_latest_pynb'); a differently
# cased spelling only resolves on case-insensitive filesystems.
# To retrain instead of loading, run: model_name = train_model(X_train)
baseline_model = Doc2Vec.load('doc2vec_latest_pynb')

In [50]:
# Extract and normalize the resume text, then split it into tokens; the tokens
# are fed to the model's infer_vector() further down.
resume = read_and_clean_resume_pdf('resume1.pdf')
# print("Resume \n", resume)
resume_tokens = word_tokenize(resume)
# print(resume_tokens)

['vedansh', 'shrivastavavedansh', 'gmail', 'com', 'bhopal', 'dehradun', 'education', 'bachelor', 'of', 'technology', 'b', 'tech', 'computer', 'science', 'with', 'specialization', 'in', 'bigdata', 'university', 'of', 'petroleum', 'and', 'energy', 'studies', 'senior', 'secondary', 'xii', 'science', 'delhi', 'public', 'school', 'cbse', 'board', 'year', 'of', 'completion', 'secondary', 'x', 'delhi', 'public', 'school', 'cbse', 'board', 'year', 'of', 'completion', 'internships', 'head', 'of', 'volunteers', 'arushi', 'ngo', 'bhopal', 'may', 'jun', 'the', 'internship', 'was', 'for', 'handling', 'the', 'physically', 'challenged', 'students', 'recording', 'books', 'for', 'visually', 'challenged', 'students', 'trainings', 'deep', 'learning', 'specialization', 'deeplearning', 'ai', 'online', 'oct', 'nov', 'machine', 'learning', 'stanford', 'university', 'coursera', 'online', 'mar', 'may', 'apache', 'spark', 'with', 'scala', 'hands', 'on', 'bigdata', 'udemy', 'online', 'feb', 'feb', 'projects', 'f

In [66]:
# Pick one held-out job description (index 20 of the test split) to compare
# against the resume.
test_jd = X_test[20]
print(test_jd)
# Remove the tokens filter_punctuation treats as punctuation. This rebinds
# test_jd, so the list printed above is the unfiltered one.
test_jd = filter_punctuation(test_jd)
# test_jd
# string.punctuation

['our',
 'company',
 'equus',
 'workforce',
 'solutions',
 'overview',
 'join',
 'us',
 'in',
 'transforming',
 'peoples',
 'lives',
 'and',
 'their',
 'communities',
 'career',
 'advisers',
 'provide',
 'guidance',
 'about',
 'career',
 'choice',
 'employment',
 'training',
 'and',
 'further',
 'education',
 'opportunities',
 'to',
 'clients',
 'including',
 'young',
 'people',
 'and',
 'the',
 'unemployed',
 'right',
 'now',
 'we',
 'have',
 'an',
 'exciting',
 'opportunity',
 'for',
 'you',
 'to',
 'join',
 'our',
 'team',
 'as',
 'a',
 'tanf',
 'career',
 'navigator',
 'responsibilities',
 'prepares',
 'organizes',
 'and',
 'maintains',
 'accurate',
 'updated',
 'information',
 'in',
 'both',
 'electronic',
 'and',
 'paper',
 'participant',
 'files',
 'reflecting',
 'the',
 'entire',
 'history',
 'of',
 'a',
 'program',
 'participant',
 'including',
 'log',
 'of',
 'supportive',
 'services',
 'issued',
 'provides',
 'case',
 'management',
 'with',
 'a',
 'focus',
 'on',
 'helping',

In [67]:
# Infer one embedding for the resume and one for the selected job description.
v1 = baseline_model.infer_vector(resume_tokens)
v2 = baseline_model.infer_vector(test_jd)

# Cosine similarity: dot(v1, v2) / (||v1|| * ||v2||).
dot_product = np.dot(np.array(v1), np.array(v2))
norm_product = np.linalg.norm(np.array(v1)) * np.linalg.norm(np.array(v2))
cos_sim = dot_product / norm_product
print("Baseline Cosin Similarity:\n", round(cos_sim, 3))


Baseline Cosin Similarity:
 0.071
