In [163]:
## import necessary libraries

import pandas as pd
import nltk
import string
import gensim
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from sent2vec.vectorizer import Vectorizer
from sent2vec.splitter import Splitter
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.scripts.glove2word2vec import glove2word2vec

In [19]:
df = pd.read_csv('./potential-talents - Aspiring human resources - seeking human resources.csv')
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


## Word2Vec

In [129]:
job_desc_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
    
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional Human Resources'
        elif desc[word] in ['EPIK', 'Celal', 'Bayar', 'Humber', 'ENGIE', 'Buckhead', 'Luxottica', 'Beneteau', 'ScottMadden', 
                            'Nortia', 'Schwan', 'Endemol', 'JTI', 'Styczynski', 'Westfield', 'Kokomo', 'Delphi', 'Loparex']:
            desc[word] = ''
        desc[word] = desc[word].lower()

    ## remove stopwords
    desc = [token for token in desc if token not in stopwords.words('english')]
    desc = [token for token in desc if not token.isdigit()]
    
    desc = ' '.join(desc)
    desc = nltk.word_tokenize(desc)
    
    job_desc_list.append(desc)

In [130]:
job_desc_list.append(['seeking', 'human', 'resources'])

In [131]:
job_desc_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'human',
  'resources',
  'senior',
  'professional',
  'human',
  'resources'],
 ['huma

In [132]:
vectorizer = Vectorizer()
vectorizer.word2vec(job_desc_list, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
vectors = vectorizer.vectors

In [133]:
dist_1 = spatial.distance.cosine(vectors[-1], vectors[0])
dist_2 = spatial.distance.cosine(vectors[-1], vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.611827164888382, dist-2: 0.29940420389175415


In [89]:
test_df = df
test_df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [134]:
scores = []
for num in range(len(job_desc_list) - 1):
    scores.append(1 - spatial.distance.cosine(vectors[num], vectors[-1]))
    
test_df['word2vec_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.929219,0.388173
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.948172,0.104410
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.984994,0.700596
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.973138,0.291818
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.935457,0.229988
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.914459,0.729863
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.982065,0.750962
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.976543,0.282647
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.983066,0.133268


## BERT

In [84]:
bert_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
        
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional in Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional in Human Resources'
            
    desc = ' '.join(desc)
    bert_list.append(desc)

In [85]:
bert_list.append('seeking human resources')

In [86]:
bert_list

['2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 'Native English Teacher at EPIK English Program in Korea',
 'Aspiring Human Resources Professional',
 'People Development Coordinator at Ryan',
 'Advisory Board Member at Celal Bayar University',
 'Aspiring Human Resources Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Human Resources Senior Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Seeking Human Resources HRIS and Generalist Positions',
 'Student at Chapman University',
 'SVP Chief Human Resources Officer Marketing Communications CSR Officer ENGIE Houston The Woodlands Energy General Professional in Human Resources Senior Professional in Human Resources',
 'Human Resources Coordinator at InterContinental Buckhead Atlanta',
 '2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 '2019 C T Bauer Colleg

In [87]:
vectorizer = Vectorizer()
vectorizer.bert(bert_list)
vectors = vectorizer.vectors

Vectorization done on cpu device


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [88]:
dist_1 = spatial.distance.cosine(vectors[-1], vectors[0])
dist_2 = spatial.distance.cosine(vectors[-1], vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.07078111171722412, dist-2: 0.01500558853149414


In [98]:
scores = []
for num in range(len(bert_list) - 1):
    scores.append(1 - spatial.distance.cosine(vectors[num], vectors[-1]))
    
test_df['bert_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.929219
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.948172
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.984994
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.973138
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.935457
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.914459
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.982065
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.976543
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.983066


In [136]:
test_df.sort_values(by=['word2vec_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.994894,0.920869
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.996201,0.903864
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.996201,0.903864
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.975402,0.83922
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101
73,74,Human Resources Professional,Greater Boston Area,16,0.991686,0.793325
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.974341,0.767376


## TF-IDF

In [147]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(bert_list)

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [154]:
test_df['tfidf_fit'] = cosine_sim[-1][:-1]

In [162]:
test_df.sort_values(by=['word2vec_fit'], ascending=False).head(50)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,tfidf_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.994894,0.920869,0.625865
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.996201,0.903864,0.646155
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.996201,0.903864,0.646155
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.975402,0.83922,0.552318
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101,0.447527
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101,0.447527
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101,0.447527
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.974998,0.808101,0.447527
73,74,Human Resources Professional,Greater Boston Area,16,0.991686,0.793325,0.351856
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.974341,0.767376,0.170633


## GloVe

In [165]:
glove_path = './glove.6B.300d.txt'
word2vec_output_file = 'glove.6B.300d.txt.word2vec'
glove2word2vec(glove_path, word2vec_output_file)

(400000, 300)

In [167]:
vectorizer = Vectorizer()
vectorizer.word2vec(job_desc_list, pretrained_vectors_path= './glove.6B.300d.txt')
vectors = vectorizer.vectors

ValueError: invalid literal for int() with base 10: 'the'

In [None]:
dist_1 = spatial.distance.cosine(vectors[-1], vectors[0])
dist_2 = spatial.distance.cosine(vectors[-1], vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

In [None]:
scores = []
for num in range(len(bert_list) - 1):
    scores.append(1 - spatial.distance.cosine(vectors[num], vectors[-1]))
    
test_df['bert_fit'] = scores
test_df