In [157]:
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import PyPDF2
from tqdm import tqdm
from collections import Counter
import json
import re
import string
import gzip
import numpy as np
from nltk.tokenize import word_tokenize
from traceback_with_variables import activate_by_import


In [180]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def load_stopwords(filename):
    stopwords = [] # ASSIGNMENT: replace this with your code
    with open(filename, "r") as f:
        for line in tqdm(f):
            stopwords.append(re.sub(r'\n','',line))
    return set(stopwords)

In [195]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import split_on_space
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_punctuation


def clean_strings(data):
    stop_words = load_stopwords('mallet_en_stoplist.txt')
    data = remove_stopwords(data)
    data = strip_multiple_whitespaces(data)
    data = strip_non_alphanum(data)
    data = strip_punctuation(data)
    data = split_on_space(data.lower())
    return data

In [123]:
def filter_punctuation(tokens):
    punct = string.punctuation
    return [word   for word in tokens   if word not in punct ]


In [159]:
def read_and_clean_jds(infile):
    print("\nReading and cleaning text from {}".format(infile))
    jds = []
    categories = []
    with gzip.open(infile,'rt') as f:
        for line in tqdm(f):
            json_line = ''
            try:
                json_line = json.loads(line)
            except:
                print("An execption occurred loading the following line")
                print(line)
                next
            category = json_line['category']
            category = re.sub(r'\/','_',category)
            jd = json_line['job_description']
            jd_text = jd.translate(str.maketrans('','',string.punctuation))
            jd_text = re.sub(r'\s+', ' ', jd_text)
            # jd_tokens = filter_punctuation(word_tokenize(jd_text.lower()))
            jd_tokens = clean_strings(jd_text)
            category_text = re.sub(r'\s+', '_', category)
            categories.append(category_text.lower())
            jds.append(jd_tokens)
            
    return jds, categories

In [None]:
def read_and_clean_resume_pdf(filename):
    # resume_path = 'resume1.pdf'
    resume_text = ''
    pdfReader = PyPDF2.PdfFileReader(filename)
    for i in range(pdfReader.numPages):
        pageObj = pdfReader.getPage(i)
        resume_text += pageObj.extractText()
    
    # pre normalize tokenization
    resume_text = resume_text.lower()
    resume_text = re.sub('[^a-z]', ' ', resume_text) 
    resume_text = re.sub(r'\s+', ' ', resume_text)
    return resume_text

In [130]:
def split_training_set(jds, labels, test_size=0.3, random_seed=42):
    X_train, X_test, y_train, y_test = train_test_split(jds, labels, test_size=test_size, random_state=random_seed, stratify=labels)
    print("Training set label counts: {}".format(Counter(y_train)))
    print("Test set     label counts: {}".format(Counter(y_test)))
    return X_train, X_test, y_train, y_test

In [162]:
job_description_data = 'indeed_job_posting.ldjson.gz'
jds, categories = read_and_clean_jds(job_description_data)
print(len(jds), len(categories))


Reading and cleaning text from indeed_job_posting.ldjson.gz


29928it [00:55, 539.79it/s]

29928 29928





In [163]:
X_train, X_test, y_train, y_test = split_training_set(jds, categories)
dict(Counter(y_test))


Training set label counts: Counter({'healthcare': 2553, 'administrative': 2503, 'manufacturing_mechanical': 1575, 'construction_facilities': 1559, 'computer_internet': 1528, 'sales': 1362, 'transportation_logistics': 1297, 'restaurant_food_service': 1205, 'customer_service': 990, 'retail': 788, 'accounting_finance': 725, 'engineering_architecture': 720, 'education_training': 664, 'upper_management_consulting': 476, 'human_resources': 470, 'marketing_advertising_pr': 446, 'banking_loans': 329, 'non-profit_volunteering': 255, 'insurance': 233, 'hospitality_travel': 229, 'law_enforcement_security': 223, 'arts_entertainment_publishing': 192, 'legal': 160, 'pharmaceutical_bio-tech': 143, 'real_estate': 128, 'telecommunications': 127, 'government_military': 69})
Test set     label counts: Counter({'healthcare': 1094, 'administrative': 1073, 'manufacturing_mechanical': 675, 'construction_facilities': 668, 'computer_internet': 655, 'sales': 584, 'transportation_logistics': 556, 'restaurant_foo

{'restaurant_food_service': 517,
 'healthcare': 1094,
 'marketing_advertising_pr': 191,
 'non-profit_volunteering': 109,
 'human_resources': 202,
 'law_enforcement_security': 95,
 'insurance': 100,
 'hospitality_travel': 98,
 'administrative': 1073,
 'computer_internet': 655,
 'manufacturing_mechanical': 675,
 'construction_facilities': 668,
 'transportation_logistics': 556,
 'sales': 584,
 'retail': 337,
 'education_training': 285,
 'upper_management_consulting': 204,
 'government_military': 30,
 'customer_service': 424,
 'accounting_finance': 311,
 'engineering_architecture': 309,
 'pharmaceutical_bio-tech': 61,
 'real_estate': 55,
 'banking_loans': 141,
 'arts_entertainment_publishing': 82,
 'telecommunications': 55,
 'legal': 68}

In [164]:
dict_test_count = dict(Counter(y_test))

In [165]:

def tag_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        
        yield TaggedDocument(list_of_words, [i])

In [166]:
def train_model(data):
    
    # tagged_data = [TaggedDocument(words = word_tokenize(_d.lower()), tags = [str(i)]) for i, _d in enumerate(data)]

    tagged_data = list(tag_document(data))
    # initialize model
    model = Doc2Vec(vector_size=50, min_count=10, epochs=50)
    # vocabulary building
    model.build_vocab(tagged_data)
   
    print(type(tagged_data))
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    model_name = 'doc2vec_latest_pynb_2'
    model.save(model_name)
    print("Model saved")
    return model_name

In [167]:
# model_name = train_model(X_train)
# baseline_model = Doc2Vec.load(model_name)
X_train[:2]
tagged_x = tag_document(X_train[:3])
list(tagged_x)[:1]
# train_data = list(tagged_x)
# train_data[:1]
# list(tagged_x)[0]
# training_corpus = list(tagged_x)[0]
# print(training_corpus[:2])

[TaggedDocument(words=['description', 'purpose', 'of', 'position', 'the', 'dc', 'loss', 'prevention', 'manager', 'assists', 'in', 'the', 'detention', 'of', 'employees', 'and', 'contracted', 'vendors', 'for', 'theft', 'fraud', 'or', 'combative', 'situations', 'the', 'dc', 'loss', 'prevention', 'manager', 'observes', 'cameras', 'at', 'length', 'while', 'protecting', 'the', 'company', '’', 's', 'assets', 'and', 'maintains', 'a', 'strong', 'loss', 'prevention', 'and', 'safety', 'culture', 'in', 'the', 'warehouse', 'to', 'reduce', 'losses', 'and', 'liabilities', 'essential', 'duties', 'and', 'responsibilities', 'ensure', 'adequate', 'loss', 'prevention', 'and', 'security', 'staffing', 'within', 'the', 'dc', 'and', 'ensure', 'employees', 'are', 'properly', 'trained', 'supervise', 'motivate', 'and', 'develop', 'loss', 'prevention', 'team', 'members', 'manage', 'all', 'investigations', 'of', 'merchandise', 'loss', 'including', 'internal', 'theft', 'maintain', 'a', 'close', 'working', 'relation

In [168]:
model_name = train_model(X_train)
# model _2 is trained with larger dataset
baseline_model = Doc2Vec.load('doc2Vec_latest_pynb_2')

<class 'list'>
Model saved


In [169]:
baseline_model = Doc2Vec.load('doc2Vec_latest_pynb_2')

In [170]:
resume = read_and_clean_resume_pdf('resume1.pdf')
# print("Resume \n", resume)
resume_tokens = word_tokenize(resume)
# print(resume_tokens)

In [171]:
test_jd = X_test[24]
print(test_jd)
test_jd = filter_punctuation(test_jd)
# test_jd
# string.punctuation

['at', 'american', 'signature', 'inc', 'we', 'believe', 'everyone', 'has', 'the', 'right', 'to', 'a', 'wellfurnished', 'life', 'every', 'day', 'our', 'customers', 'embark', 'upon', 'their', 'personal', 'style', 'journey—an', 'adventure', 'to', 'that', 'purpose', 'we', 'are', 'committed', 'to', 'providing', 'our', 'customers', 'with', 'an', 'easy', 'and', 'fun', 'furniture', 'shopping', 'experience', 'the', 'visual', 'merchandiser', 'is', 'critical', 'in', 'ensuring', 'we', 'meet', 'that', 'goal', 'by', 'implementing', 'the', 'visual', 'merchandising', 'strategy', 'this', 'person', 'creates', 'appealing', 'and', 'eyecatching', 'visual', 'displays', 'that', 'lead', 'the', 'customer', 'through', 'the', 'entire', 'store', 'this', 'role', 'reports', 'directly', 'to', 'the', 'store', 'manager', 'some', 'of', 'the', 'functions', 'the', 'visual', 'merchandiser', 'associate', 'will', 'perform', 'embodies', 'our', 'values', 'adventure', 'guides', 'serve', 'others', 'and', 'own', 'it', 'responsib

In [172]:
v1 = baseline_model.infer_vector(resume_tokens)
v2 = baseline_model.infer_vector(test_jd)
print(v1)
print(v2)

cos_sim = (np.dot(np.array(v1), np.array(v2))) / (np.linalg.norm(np.array(v1)) * np.linalg.norm(np.array(v2)))
print("Baseline Cosin Similarity:\n", round(cos_sim, 3))


[-0.02817023 -2.9834063  -2.1680255  -0.99298805 -1.6784064   0.9995433
  0.5104036   0.9826045  -0.64675534  0.45878452 -1.1378207   0.17948298
  0.12890847 -1.6156888  -0.11455686  1.8475986   0.18894452  0.28700772
  2.5126631  -2.887785   -0.52730125  0.19553296 -2.4441578  -2.4779186
 -0.7345245   3.1311193  -0.11598843  1.1395485  -2.3057697   1.8959346
  1.076565    4.558837   -4.481152    0.84730405 -2.2348993   2.4392035
 -4.746667    0.37996256 -0.19597453  2.712077    0.82944036 -1.0827668
 -1.5195162  -0.67740834  0.13965823  0.05648417 -0.54383105 -0.9709262
  3.3701053   1.2368729 ]
[-1.6653608   1.643185    0.32922906 -1.3868117   1.5204774  -2.930758
  0.24913858  0.7447512  -2.5980022  -1.0749512   0.83170336 -0.7779447
  0.4174793  -2.8418672   1.1400396  -2.6061044  -0.08156357  1.7677779
 -2.6633978   0.7569284   0.2568745   1.5723798   0.52084875  1.1488518
 -0.27910826  2.93171    -2.195914    0.50587744 -0.00839286  0.5554253
  0.9636893   1.1173477   0.1205392  

In [173]:
# training_corpus = list(tagged_x)
training_data = list(tag_document(X_train))
print(list(training_data)[:1])


[TaggedDocument(words=['description', 'purpose', 'of', 'position', 'the', 'dc', 'loss', 'prevention', 'manager', 'assists', 'in', 'the', 'detention', 'of', 'employees', 'and', 'contracted', 'vendors', 'for', 'theft', 'fraud', 'or', 'combative', 'situations', 'the', 'dc', 'loss', 'prevention', 'manager', 'observes', 'cameras', 'at', 'length', 'while', 'protecting', 'the', 'company', '’', 's', 'assets', 'and', 'maintains', 'a', 'strong', 'loss', 'prevention', 'and', 'safety', 'culture', 'in', 'the', 'warehouse', 'to', 'reduce', 'losses', 'and', 'liabilities', 'essential', 'duties', 'and', 'responsibilities', 'ensure', 'adequate', 'loss', 'prevention', 'and', 'security', 'staffing', 'within', 'the', 'dc', 'and', 'ensure', 'employees', 'are', 'properly', 'trained', 'supervise', 'motivate', 'and', 'develop', 'loss', 'prevention', 'team', 'members', 'manage', 'all', 'investigations', 'of', 'merchandise', 'loss', 'including', 'internal', 'theft', 'maintain', 'a', 'close', 'working', 'relation

In [174]:
# assesing the model

ranks = []
second_ranks = []
for doc_id in range(len(training_data)):
    inferred_vector = baseline_model.infer_vector(training_data[doc_id].words)
    sims = baseline_model.dv.most_similar([inferred_vector], topn=len(baseline_model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

counter = Counter(ranks)
print(counter)

Counter({0: 20592, 1: 302, 2: 40, 3: 8, 4: 3, 5: 2, 7: 1, 6: 1})


In [175]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(training_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % baseline_model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(training_data[sims[index][0]].words)))
# for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
#     print(u'%s %s: «%s»\n' % (label, sims[index], ' training corupurs'))

Document (20948): «background and drug screen required dot medical card required load box truck with telecom equipment and to deliver to the leads in the area load the debris from the locations take it back to the warehousedumpster 75 deliveries 25 general warehouse work shipping receiving cleaning up etc serious inquiries only job types fulltime contract pay 1600 1800 per hour benefits 401k life insurance paid time off schedule monday to friday education high school or equivalent preferred shift availability day shift preferred work location one location»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc10,s0.001,t3):

MOST (20948, 0.9767364263534546): «background and drug screen required dot medical card required load box truck with telecom equipment and to deliver to the leads in the area load the debris from the locations take it back to the warehousedumpster 75 deliveries 25 general warehouse work shipping receiving cleaning up etc serious inquiries only job types fullt

In [176]:
import random
doc_id = random.randint(0, len(training_data)-1)

# compare and print the second most similar document
print('Train Document ({}): <<{}>>\n'.format(doc_id, ' '.join(training_data[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: <<{}>>\n'.format(sim_id, ' '.join(training_data[sim_id[0]].words)))

Train Document (3110): <<senior electrical engineer mks instruments power solutions products provides the plasma and control solutions that is a key supplier to the worlds ’ largest electronics producers plasma control presents many technical challenges that needs innovative thinking and inquisitive engineering minds if this sounds like you then we want to meet you we are looking for an exceptional senior electrical engineer who will design develop modify and evaluate electronic systems for high power equipment for the semiconductor processing industry as part of a crossfunctional team you will help determine design architecture lead system development analyze systemlevel performance data and drive problem solving exercises to ensure reliable designs to meet customer requirements this position is located in rochester ny what you will do design and productize leading edge high efficiency rf power equipment for thin films processing familiarity with power amplifier topologies class ab cl

In [177]:

cat_dict_test_clean = {cat: [] for cat in y_test}
# print(cat_dict_test_clean)
for i, v in enumerate(X_test):
    
    cat_dict_test_clean[y_test[i]].append(v)
# len(cat_dict_test_clean.values())
# len(X_test)
print(Counter(y_test))
len(cat_dict_test_clean['telecommunications'])

Counter({'healthcare': 1094, 'administrative': 1073, 'manufacturing_mechanical': 675, 'construction_facilities': 668, 'computer_internet': 655, 'sales': 584, 'transportation_logistics': 556, 'restaurant_food_service': 517, 'customer_service': 424, 'retail': 337, 'accounting_finance': 311, 'engineering_architecture': 309, 'education_training': 285, 'upper_management_consulting': 204, 'human_resources': 202, 'marketing_advertising_pr': 191, 'banking_loans': 141, 'non-profit_volunteering': 109, 'insurance': 100, 'hospitality_travel': 98, 'law_enforcement_security': 95, 'arts_entertainment_publishing': 82, 'legal': 68, 'pharmaceutical_bio-tech': 61, 'real_estate': 55, 'telecommunications': 55, 'government_military': 30})


55

In [178]:
metadata = {}
infered_vector_test = {} # contains, categor-wise, inferred doc vecs for each document in the test set
for cat, docs in cat_dict_test_clean.items():
    infered_vector_test[cat] = [baseline_model.infer_vector(doc) for doc in list(docs)]
    metadata[cat] = len(infered_vector_test[cat])
print(infered_vector_test['sales'][0])
print(metadata)

[-0.54000103 -1.7815771  -0.01822292  1.3970417  -0.6052036  -1.6282384
  0.67797077 -1.5123386  -1.3855098   1.7619376   1.4525471  -0.23652257
  0.53050756 -1.4712969   0.18086226  0.10112073  0.12608562 -0.78034097
 -2.7077274   2.5269616   0.11072759  0.86593026 -0.76860225  1.3190836
 -3.257572    1.0147134  -0.14026503  0.05470254 -1.2703363   2.824476
  1.6340835   1.1205475   0.6687991  -2.3340898  -0.23435831 -0.3902723
  0.88627875  0.6476853   0.12789269 -0.32600844  0.83350426  1.46178
 -1.7818257  -1.0799935   0.04380473 -1.0037669   0.41418263  0.884764
  3.266313   -1.4845078 ]
{'restaurant_food_service': 517, 'healthcare': 1094, 'marketing_advertising_pr': 191, 'non-profit_volunteering': 109, 'human_resources': 202, 'law_enforcement_security': 95, 'insurance': 100, 'hospitality_travel': 98, 'administrative': 1073, 'computer_internet': 655, 'manufacturing_mechanical': 675, 'construction_facilities': 668, 'transportation_logistics': 556, 'sales': 584, 'retail': 337, 'educ

In [179]:
import csv

def write_to_csv(input, output_file, delimeter='\t'):
    with open(output_file, "w") as f:
        writer = csv.writer(f, delimiter=delimeter)
        writer.writerows(input)
        
veclist_metadata = []
veclist = []

for cat in cat_dict_test_clean.keys():
    for tag in [cat]*metadata[cat]:
        veclist_metadata.append([tag])
    for vec in infered_vector_test[cat]:
        veclist.append(list(vec))
        
write_to_csv(veclist, "doc2vec_indeed_large_jds_test_vectors.csv")
write_to_csv(veclist_metadata, "doc2vec_indeed_large_jds_test_vectors_metadata.csv")

In [187]:
type(X_train[0])
x = ['a','b','c']
' '.join(x)
all_td = [jd for sub_jd in X_train for jd in sub_jd]

   

In [189]:
my_counter = Counter(all_td)


In [190]:
def print_sorted_items(dict, n=10, order='ascending'):
    if order == 'descending':
        multiplier = -1
    else:
        multiplier = 1
    ranked = sorted(dict.items(), key=lambda x: x[1] * multiplier)
    for key, value in ranked[:n] :
        print(key, value)

In [194]:
print_sorted_items(my_counter, n=100, order='descending')

and 587819
to 335509
the 293906
of 240816
a 170158
in 160023
with 128349
for 123051
or 93793
is 76329
as 75727
work 68534
experience 62337
our 62282
be 61826
are 58128
you 57103
we 55613
on 50352
all 49756
an 44674
will 40053
that 40000
’ 38331
required 38223
ability 38167
team 36806
job 36728
by 35861
this 35820
skills 34851
at 33720
other 32366
must 30653
your 28384
service 25796
s 25568
have 25090
management 24976
position 24829
including 24021
preferred 23044
customer 22934
time 22830
company 22311
insurance 21852
from 21454
health 20960
business 20896
years 20723
information 20200
care 19912
requirements 19869
support 19447
knowledge 19439
benefits 19416
location 18910
environment 18900
duties 18837
not 18494
may 18183
services 18050
provide 17847
employees 17362
working 16791
ensure 16622
employee 16622
their 16579
responsibilities 16327
education 16215
new 15583
program 15512
one 15394
development 15299
training 15284
equipment 15166
sales 14984
able 14843
perform 14729
customer