In [23]:
## import necessary libraries

import pandas as pd
import nltk
import string
import gensim
import itertools
import csv
import numpy as np
from collections import Counter
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sent2vec.vectorizer import Vectorizer
from sent2vec.splitter import Splitter
from gensim.models import Word2Vec
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from glove import Glove

from gensim.matutils import corpus2csc
from gensim.corpora import Dictionary
from gensim.scripts.glove2word2vec import glove2word2vec
from mittens import GloVe

In [2]:
## Load the candidate table from the CSV export; `job_title` is the column every later cell reads.
df = pd.read_csv('./potential-talents - Aspiring human resources - seeking human resources.csv')
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


## Word2Vec

In [3]:
## Build the tokenised corpus for the word2vec vectoriser: one list of lowercase
## tokens per job title, without punctuation, stopwords or digit-only tokens.

## write out acronyms for human resource positions
## (matched case-sensitively, before the token is lowercased)
hr_acronyms = {
    'HR': 'Human Resources',
    'CHRO': 'Chief Human Resources Officer',
    'GPHR': 'General Professional Human Resources',
    'SPHR': 'Senior Professional Human Resources',
}

## remove words that are not in pretrained vectors
w2v_unknown_words = {
    'EPIK', 'Celal', 'Bayar', 'Humber', 'ENGIE', 'Buckhead', 'Luxottica', 'Beneteau', 'ScottMadden',
    'Nortia', 'Schwan', 'Endemol', 'JTI', 'Styczynski', 'Westfield', 'Kokomo', 'Delphi', 'Loparex',
}

## Loop-invariant helpers, built once instead of once per token / per title.
english_stopwords = set(stopwords.words('english'))
punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

job_desc_list = []

for title in df['job_title']:
    ## remove punctuation from job description, then split into tokens
    tokens = nltk.word_tokenize(title.translate(punctuation_to_space))

    ## expand acronyms, blank out unknown words, lowercase everything
    cleaned = []
    for token in tokens:
        if token in hr_acronyms:
            token = hr_acronyms[token]
        elif token in w2v_unknown_words:
            token = ''
        cleaned.append(token.lower())

    ## remove stopwords and digit-only tokens
    kept = [token for token in cleaned
            if token not in english_stopwords and not token.isdigit()]

    ## join + re-tokenise so multi-word acronym expansions become separate
    ## tokens and the blanked-out entries disappear
    job_desc_list.append(nltk.word_tokenize(' '.join(kept)))

In [4]:
## Append the query phrase as the final entry; later cells compare every title vector against vectors[-1].
## Running this cell more than once appends the query again.
job_desc_list.append(['seeking', 'human', 'resources'])

In [5]:
job_desc_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'human',
  'resources',
  'senior',
  'professional',
  'human',
  'resources'],
 ['huma

In [6]:
## Vectorise the tokenised titles with sent2vec, pointing it at the local GoogleNews word2vec binary.
## NOTE(review): presumably this yields one vector per entry of job_desc_list, in order — later cells index `vectors` that way; confirm against sent2vec.
vectorizer = Vectorizer()
vectorizer.word2vec(job_desc_list, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
vectors = vectorizer.vectors

In [7]:
## Spot check: cosine distance from the last vector (the query) to entries 0 and 2.
query_vec = vectors[-1]
dist_1, dist_2 = (spatial.distance.cosine(query_vec, vectors[row]) for row in (0, 2))
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.611827164888382, dist-2: 0.29940420389175415


In [8]:
## Work on a copy: the score columns added to `test_df` below must not leak into the raw `df`.
## (A plain `test_df = df` only creates a second name for the same DataFrame.)
test_df = df.copy()
test_df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [9]:
## Cosine similarity (1 - cosine distance) between each title vector and the
## query vector stored at the end of `vectors`.
query_vector = vectors[-1]
scores = [
    1 - spatial.distance.cosine(vectors[idx], query_vector)
    for idx in range(len(job_desc_list) - 1)
]

test_df['word2vec_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268


In [10]:
test_df.sort_values(by=['word2vec_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,,0.767376


## BERT

In [11]:
## Sentences for the BERT vectoriser: punctuation replaced by spaces and HR
## acronyms written out; original casing and stopwords are kept.

## write out acronyms for human resource positions
acronym_expansions = {
    'HR': 'Human Resources',
    'CHRO': 'Chief Human Resources Officer',
    'GPHR': 'General Professional in Human Resources',
    'SPHR': 'Senior Professional in Human Resources',
}

## remove punctuation from job description (every punctuation mark becomes a space)
strip_punctuation = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

bert_list = []
for title in df['job_title']:
    words = nltk.word_tokenize(title.translate(strip_punctuation))
    bert_list.append(' '.join(acronym_expansions.get(w, w) for w in words))

In [12]:
## Append the query sentence as the final entry; the scoring cells compare every title against the last vector.
## Running this cell more than once appends the query again.
bert_list.append('seeking human resources')

In [13]:
bert_list

['2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 'Native English Teacher at EPIK English Program in Korea',
 'Aspiring Human Resources Professional',
 'People Development Coordinator at Ryan',
 'Advisory Board Member at Celal Bayar University',
 'Aspiring Human Resources Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Human Resources Senior Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Seeking Human Resources HRIS and Generalist Positions',
 'Student at Chapman University',
 'SVP Chief Human Resources Officer Marketing Communications CSR Officer ENGIE Houston The Woodlands Energy General Professional in Human Resources Senior Professional in Human Resources',
 'Human Resources Coordinator at InterContinental Buckhead Atlanta',
 '2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 '2019 C T Bauer Colleg

In [14]:
## Vectorise the cleaned sentences with sent2vec's BERT backend (the logged output names distilbert-base-uncased).
## Note: this rebinds `vectorizer` and `vectors`, replacing the word2vec results from the earlier section.
vectorizer = Vectorizer()
vectorizer.bert(bert_list)
vectors = vectorizer.vectors

Vectorization done on cpu device


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
## Spot check on the BERT vectors: cosine distance from the query (last vector) to entries 0 and 2.
query_vec = vectors[-1]
dist_1, dist_2 = [spatial.distance.cosine(query_vec, vectors[row]) for row in (0, 2)]
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.07078111171722412, dist-2: 0.01500558853149414


In [16]:
len(bert_list)

105

In [17]:
## Score every title against the query sentence, whose vector is the last one:
## similarity = 1 - cosine distance.
target_vector = vectors[-1]
title_count = len(bert_list) - 1  # the final entry is the query itself
scores = [
    1 - spatial.distance.cosine(vectors[idx], target_vector)
    for idx in range(title_count)
]

test_df['bert_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457
...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066


In [18]:
test_df.sort_values(by=['bert_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985


## TF-IDF

In [19]:
## Fit TF-IDF on the same sentences used for BERT (query sentence last) and
## compute all pairwise cosine similarities between the resulting rows.
## Note: this rebinds `vectorizer`, which previously held the sent2vec Vectorizer.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(bert_list)

## cosine_sim[i][j] is the similarity between sentence i and sentence j
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [20]:
test_df['tfidf_fit'] = cosine_sim[-1][:-1]

In [21]:
test_df.sort_values(by=['tfidf_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702


## GloVe

In [23]:
from collections import defaultdict

def co_occurrence(sentences, window_size):
    """Count how often two words appear within `window_size` positions of each other.

    Each sentence is lowercased and split on whitespace. Every token is paired
    with the tokens that follow it inside the window; pairs are unordered.

    Parameters
    ----------
    sentences : iterable of str
        Raw sentences to scan.
    window_size : int
        Number of following tokens paired with each token.

    Returns
    -------
    pandas.DataFrame
        Symmetric table of counts, indexed and labelled by the sorted vocabulary.
    """
    pair_counts = defaultdict(int)
    seen_words = set()
    for sentence in sentences:
        # naive tokenisation: lowercase, then split on whitespace
        tokens = sentence.lower().split()
        seen_words.update(tokens)
        for pos, current in enumerate(tokens):
            window = tokens[pos + 1 : pos + 1 + window_size]
            for neighbour in window:
                # sort the pair so (a, b) and (b, a) share one counter
                pair = tuple(sorted((neighbour, current)))
                pair_counts[pair] += 1

    # lay the counts out as a square table over the sorted vocabulary
    ordered_vocab = sorted(seen_words)
    size = len(ordered_vocab)
    table = pd.DataFrame(data=np.zeros((size, size), dtype=np.int16),
                         index=ordered_vocab,
                         columns=ordered_vocab)
    for (first, second), count in pair_counts.items():
        # write both triangles to keep the table symmetric
        table.at[first, second] = count
        table.at[second, first] = count
    return table

In [24]:
def glove2dict(glove_filename):
    """Load a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every non-empty line holds a token followed by its space-separated
    float components. Blank lines are skipped.

    Parameters
    ----------
    glove_filename : str or path-like
        Path to the UTF-8 encoded vectors file.

    Returns
    -------
    dict
        Maps each token to a 1-D numpy float array of its components.
        If a token occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a component cannot be parsed as a float.
    """
    # newline='' hands line-ending handling to the csv module, as its docs require.
    with open(glove_filename, encoding='utf-8', newline='') as f:
        # QUOTE_NONE: quote characters are ordinary token text, not field quoting.
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {}
        for line in reader:
            if not line:
                # blank lines come back as [] and would fail on line[0]
                continue
            embed[line[0]] = np.array(list(map(float, line[1:])))
    return embed

## Parse the local GloVe text file into a {token: vector} dict.
## NOTE(review): no later cell shown here reads `pre_glove`; the GloVe scores are computed from `model` instead — confirm whether this load is still needed.
glove_path = "./glove.6B.300d.txt"
pre_glove = glove2dict(glove_path)

In [52]:
'human resources'.split()

['human', 'resources']

In [87]:
## Build lowercase, space-joined titles for the GloVe lookup: punctuation,
## stopwords and digit-only tokens removed, HR acronyms written out.

## write out acronyms for human resource positions
## (matched case-sensitively, before the token is lowercased)
glove_acronyms = {
    'HR': 'Human Resources',
    'CHRO': 'Chief Human Resources Officer',
    'GPHR': 'General Professional in Human Resources',
    'SPHR': 'Senior Professional in Human Resources',
}

## tokens blanked out before the lookup
## NOTE(review): presumably these are missing from the GloVe vocabulary — confirm
glove_dropped_words = {'HRIS', 'ENGIE', 'ScottMadden', 'Styczynski', 'Nortia', 'Loparex'}

## Loop-invariant helpers, built once instead of once per token / per title.
english_stopwords = set(stopwords.words('english'))
punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

glove_list = []

for title in df['job_title']:
    ## remove punctuation from job description, then split into tokens
    tokens = nltk.word_tokenize(title.translate(punctuation_to_space))

    ## expand acronyms, blank out dropped words, lowercase everything
    cleaned = []
    for token in tokens:
        if token in glove_acronyms:
            token = glove_acronyms[token]
        elif token in glove_dropped_words:
            token = ''
        cleaned.append(token.lower())

    ## remove stopwords and digit-only tokens; blanked tokens stay as '' so the
    ## joined string can contain double spaces, exactly as before
    kept = [token for token in cleaned
            if token not in english_stopwords and not token.isdigit()]

    glove_list.append(' '.join(kept))

## the query sentence goes last
glove_list.append('seeking human resources')
glove_list

['c bauer college business graduate magna cum laude aspiring human resources professional',
 'native english teacher epik english program korea',
 'aspiring human resources professional',
 'people development coordinator ryan',
 'advisory board member celal bayar university',
 'aspiring human resources specialist',
 'student humber college aspiring human resources generalist',
 'human resources senior specialist',
 'student humber college aspiring human resources generalist',
 'seeking human resources  generalist positions',
 'student chapman university',
 'svp chief human resources officer marketing communications csr officer  houston woodlands energy general professional in human resources senior professional in human resources',
 'human resources coordinator intercontinental buckhead atlanta',
 'c bauer college business graduate magna cum laude aspiring human resources professional',
 'c bauer college business graduate magna cum laude aspiring human resources professional',
 'native

In [34]:
## NOTE(review): `test_vocab` and `test_list` are not defined in any cell shown in this notebook — confirm where they come from.
cv = CountVectorizer(ngram_range=(1,1), vocabulary=test_vocab)
X = cv.fit_transform(test_list)
## presumably X is the sparse document-term count matrix, so X.T * X is a term-by-term co-occurrence matrix — confirm
Xc = (X.T * X)
## zero the diagonal (a term paired with itself)
Xc.setdiag(0)
## dense copy for display
coocc_ar = Xc.toarray()

In [35]:
coocc_ar

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0]], dtype=int64)

In [26]:
co_occ = co_occurrence(bert_list, 1)

In [27]:
co_occ

Unnamed: 0,2019,2020,2621,408,709,a,about,administration,administrative,admissions,...,up,victoria,wellington,western,westfield,with,within,woodlands,work,world
2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2621,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
408,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
709,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
with,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
within,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
woodlands,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
work,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
glove = GloVe()

In [91]:
## NOTE(review): the saved run of this cell ended in a ValueError about shapes (161,100) vs (202,100);
## presumably `glove` had already been fitted on a vocabulary of a different size — confirm, and re-create the estimator before refitting if so.
## Note: on success this rebinds `vectors`, replacing the BERT vectors.
vectors = glove.fit(co_occ)

ValueError: operands could not be broadcast together with shapes (161,100) (202,100) (161,100) 

In [63]:
test = 'Mark zuckerberg owns the facebook company'

one = [i.lower() for i in test.split()]
one

['mark', 'zuckerberg', 'owns', 'the', 'facebook', 'company']

In [62]:
## NOTE(review): this import sits mid-notebook; the other imports live in the first cell.
import gensim.downloader as api

## Load the "glove-wiki-gigaword-300" vectors through gensim's downloader; get_vector() indexes `model` by word.
model = api.load("glove-wiki-gigaword-300")

[===-----------------------------------------------] 6.2% 23.4/376.1MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





NameError: name 'preprocess' is not defined

In [88]:
def preprocess(s):
    """Split `s` on whitespace and return the pieces lowercased, in order."""
    return list(map(str.lower, s.split()))

def get_vector(s):
    """Return the element-wise sum of the `model` vectors of the words in `s`.

    Words are obtained with preprocess(); each one is looked up in the
    module-level `model`.
    """
    word_vectors = [model[token] for token in preprocess(s)]
    stacked = np.array(word_vectors)
    return np.sum(stacked, axis=0)

print('s0 vs s1 ->',1 - spatial.distance.cosine(get_vector(glove_list[0]), get_vector(glove_list[-1])))

s0 vs s1 -> 0.4665846824645996


In [89]:
## Cosine similarity between each GloVe title vector and the query vector.
## The query is the last entry of glove_list; its vector is computed once,
## and the loop runs over glove_list itself rather than borrowing the length
## of the word2vec corpus.
query_vector = get_vector(glove_list[-1])
scores = [
    1 - spatial.distance.cosine(get_vector(title), query_vector)
    for title in glove_list[:-1]
]

test_df['GloVe_fit'] = scores
test_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,GloVe_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219,0.097827,0.466585
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172,0.000000,0.329688
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994,0.292820,0.799220
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138,0.000000,0.529448
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457,0.000000,0.295936
...,...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459,0.283608,0.809268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065,0.172445,0.810235
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543,0.000000,0.412289
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066,0.000000,0.459518


In [90]:
test_df.sort_values(by=['GloVe_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,GloVe_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,0.948254
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,0.948254
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865,0.945992
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318,0.914078
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686,0.351856,0.864729
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193,0.18337,0.849824


In [91]:
## Drop the `fit` column. Reassigning instead of using inplace=True keeps the
## cell re-runnable: errors='ignore' makes a second run a no-op rather than a
## KeyError, and any other name bound to the same frame is left untouched.
test_df = test_df.drop(columns='fit', errors='ignore')
test_df

Unnamed: 0,id,job_title,location,connection,word2vec_fit,bert_fit,tfidf_fit,GloVe_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.388173,0.929219,0.097827,0.466585
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.104410,0.948172,0.000000,0.329688
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.700596,0.984994,0.292820,0.799220
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.291818,0.973138,0.000000,0.529448
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.229988,0.935457,0.000000,0.295936
...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.729863,0.914459,0.283608,0.809268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.750962,0.982065,0.172445,0.810235
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.282647,0.976543,0.000000,0.412289
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.133268,0.983066,0.000000,0.459518
