In [48]:
## import necessary libraries

import pandas as pd
import nltk
import string
import gensim
import itertools
import numpy as np
from collections import Counter
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from sent2vec.vectorizer import Vectorizer
from sent2vec.splitter import Splitter
from gensim.models import Word2Vec
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from glove import Glove

from gensim.matutils import corpus2csc
from gensim.corpora import Dictionary
from gensim.scripts.glove2word2vec import glove2word2vec
from mittens import GloVe

In [2]:
df = pd.read_csv('./potential-talents - Aspiring human resources - seeking human resources.csv')
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


## Word2Vec

In [3]:
job_desc_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
    
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional Human Resources'
            
        ## remove words that are not in pretrained vectors
        elif desc[word] in ['EPIK', 'Celal', 'Bayar', 'Humber', 'ENGIE', 'Buckhead', 'Luxottica', 'Beneteau', 'ScottMadden', 
                            'Nortia', 'Schwan', 'Endemol', 'JTI', 'Styczynski', 'Westfield', 'Kokomo', 'Delphi', 'Loparex']:
            desc[word] = ''
        desc[word] = desc[word].lower()

    ## remove stopwords
    desc = [token for token in desc if token not in stopwords.words('english')]
    desc = [token for token in desc if not token.isdigit()]
    
    desc = ' '.join(desc)
    desc = nltk.word_tokenize(desc)
    
    job_desc_list.append(desc)

In [4]:
job_desc_list.append(['seeking', 'human', 'resources'])

In [5]:
job_desc_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'human',
  'resources',
  'senior',
  'professional',
  'human',
  'resources'],
 ['huma

In [6]:
vectorizer = Vectorizer()
vectorizer.word2vec(job_desc_list, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
vectors = vectorizer.vectors

In [7]:
dist_1 = spatial.distance.cosine(vectors[-1], vectors[0])
dist_2 = spatial.distance.cosine(vectors[-1], vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.611827164888382, dist-2: 0.29940420389175415


In [8]:
test_df = df
test_df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [9]:
scores = []
for num in range(len(job_desc_list) - 1):
    scores.append(1 - spatial.distance.cosine(vectors[num], vectors[-1]))
    
test_df['word2vec_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268


In [10]:
test_df.sort_values(by=['word2vec_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,,0.767376


## BERT

In [11]:
bert_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
        
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional in Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional in Human Resources'
        
    desc = ' '.join(desc)
    bert_list.append(desc)

In [12]:
bert_list.append('seeking human resources')

In [13]:
bert_list

['2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 'Native English Teacher at EPIK English Program in Korea',
 'Aspiring Human Resources Professional',
 'People Development Coordinator at Ryan',
 'Advisory Board Member at Celal Bayar University',
 'Aspiring Human Resources Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Human Resources Senior Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Seeking Human Resources HRIS and Generalist Positions',
 'Student at Chapman University',
 'SVP Chief Human Resources Officer Marketing Communications CSR Officer ENGIE Houston The Woodlands Energy General Professional in Human Resources Senior Professional in Human Resources',
 'Human Resources Coordinator at InterContinental Buckhead Atlanta',
 '2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 '2019 C T Bauer Colleg

In [97]:
vectorizer = Vectorizer()
vectorizer.bert(bert_list)
vectors = vectorizer.vectors

Vectorization done on cpu device


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [101]:
dist_1 = spatial.distance.cosine(vectors[-1], vectors[0])
dist_2 = spatial.distance.cosine(vectors[-1], vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.07078111171722412, dist-2: 0.01500558853149414


In [106]:
len(bert_list)

105

In [16]:
scores = []
for num in range(len(bert_list) - 1):
    scores.append(1 - spatial.distance.cosine(vectors[num], vectors[-1]))
    
test_df['bert_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457
...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066


In [17]:
test_df.sort_values(by=['bert_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985


## TF-IDF

In [18]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(bert_list)

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
test_df['tfidf_fit'] = cosine_sim[-1][:-1]

In [20]:
test_df.sort_values(by=['tfidf_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702


## GloVe

In [40]:
def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}
 
    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))
 
    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
 
    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
 
    # Loop through the bigrams taking the current and previous word,
    # and the number of occurrences of the bigram.
    for bigram in bigram_freq:
        current = bigram[0][1]
        previous = bigram[0][0]
        count = bigram[1]
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    co_occurrence_matrix = np.matrix(co_occurrence_matrix)
 
    # return the matrix and the index
    return co_occurrence_matrix, vocab_index
 
# Create one list using many lists
data = list(itertools.chain.from_iterable(job_desc_list))
matrix, vocab_index = generate_co_occurrence_matrix(data)
 
 
data_matrix = pd.DataFrame(matrix, index=vocab_index,
                             columns=vocab_index)
print(data_matrix)

             general  ryan  environment  portfolio  csr  officer  staffing  \
general          0.0   0.0          0.0        0.0  0.0      0.0       0.0   
ryan             0.0   0.0          0.0        0.0  0.0      0.0       0.0   
environment      0.0   0.0          0.0        0.0  0.0      0.0       0.0   
portfolio        0.0   0.0          0.0        0.0  0.0      0.0       0.0   
csr              0.0   0.0          0.0        0.0  0.0      0.0       0.0   
...              ...   ...          ...        ...  ...      ...       ...   
generalist       0.0   0.0          0.0        0.0  0.0      0.0       0.0   
gis              0.0   0.0          0.0        0.0  0.0      0.0       0.0   
paint            0.0   0.0          0.0        0.0  0.0      0.0       0.0   
bachelor         0.0   0.0          0.0        0.0  0.0      0.0       0.0   
senior           0.0   0.0          0.0        0.0  0.0      0.0       0.0   

             conflict  software  team  ...  teacher  informatio

In [83]:
from collections import defaultdict

def co_occurrence(sentences, window_size):
    d = defaultdict(int)
    vocab = set()
    for text in sentences:
        # preprocessing (use tokenizer instead)
        text = text.lower().split()
        # iterate over sentences
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)  # add to vocab
            next_token = text[i+1 : i+1+window_size]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1
    
    # formulate the dictionary into dataframe
    vocab = sorted(vocab) # sort vocab
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
    return df

In [86]:
test_list = ['i love programming', 'i love math', 'i tolerate biology']

In [89]:
co_occurrence(test_list, 1).to_numpy()

array([[0, 0, 0, 0, 0, 1],
       [0, 0, 2, 0, 0, 1],
       [0, 2, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 1, 0, 0, 0, 0]], dtype=int16)

In [94]:
co_occ = co_occurrence(bert_list, 1)

In [95]:
co_occ

Unnamed: 0,2019,2020,2621,408,709,a,about,administration,administrative,admissions,...,up,victoria,wellington,western,westfield,with,within,woodlands,work,world
2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2621,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
408,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
709,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
with,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
within,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
woodlands,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
work,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
glove = GloVe()

In [91]:
vectors = glove.fit(co_occ)

ValueError: operands could not be broadcast together with shapes (161,100) (202,100) (161,100) 

In [77]:
test_mat = pd.DataFrame(co_occur(test_list), index=vocab_index, columns=vocab_index)
test_mat

Unnamed: 0,biology,i,love,math,programming,tolerate
biology,0,1,0,0,0,1
i,1,0,2,1,1,1
love,0,2,0,1,1,0
math,0,1,1,0,0,0
programming,0,1,1,0,0,0
tolerate,1,1,0,0,0,0


In [47]:
matrix

matrix([[0., 0., 0., 0., 1., 0.],
        [1., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.]])

In [35]:
dist_1 = spatial.distance.cosine(vectors[-1], vectors[0])
dist_2 = spatial.distance.cosine(vectors[-1], vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.6372775068708971, dist-2: 0.9346757793690581


In [38]:
scores = []
for num in range(len(bert_list) - 1):
    scores.append(spatial.distance.cosine(vectors[num], vectors[-1]))
    
test_df['GloVe_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,GloVe_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219,0.097827,0.637278
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172,0.000000,0.956693
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994,0.292820,0.934676
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138,0.000000,0.869236
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457,0.000000,0.971912
...,...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459,0.283608,1.001570
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065,0.172445,1.004240
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543,0.000000,1.076448
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066,0.000000,1.094846


In [39]:
test_df.sort_values(by=['GloVe_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,GloVe_fit
90,91,Lead Official at Western Illinois University,Greater Chicago Area,39,,0.232249,0.960595,0.0,1.203981
66,67,"Human Resources, Staffing and Recruiting Profe...","Jackson, Mississippi Area",500+,,0.673157,0.962998,0.140963,1.189573
7,8,HR Senior Specialist,San Francisco Bay Area,500+,,0.699413,0.984148,0.224059,1.186325
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,,0.750962,0.982967,0.172445,1.174395
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994,0.29282,1.17052
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,1.168125
103,104,Director Of Administration at Excellence Logging,"Katy, Texas",500+,,0.281777,0.968243,0.0,1.166776
58,59,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138,0.0,1.14592
18,19,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219,0.097827,1.129295
76,77,Human Resources|\nConflict Management|\nPolici...,Dallas/Fort Worth Area,409,,0.621532,0.972505,0.076297,1.125646


In [None]:
ranknet