# Potential Talents

I will be attempting to rank potential job candidates based on a query and return a list of candidates that best fit the query. Once ranked I will be re-ranking candidates based on ideal candidates from the first list.

In [1]:
## import necessary libraries

import pandas as pd
import nltk
import string
import gensim
import itertools
import numpy as np
import gensim.downloader as api
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sent2vec.vectorizer import Vectorizer
from sent2vec.splitter import Splitter
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

## Data Description

> **Id:** Unique identifier for candidate  
> **Job_title:** Job title for candidate  
> **Location:** Geographical location for candidate   
> **Connections:** Number of connections candidate has  
> **Fit:** How fit the candidate is for the role (numeric between 0-1)

In [96]:
df = pd.read_csv('./potential-talents - Aspiring human resources - seeking human resources.csv')
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


I will be getting word vectors for each candidates job title and comparing its cosine distance with the query word vectors to find the best fitting candidates. I will use Word2Vec, BERT, GloVe and TF-IDF to see which word vectors give the closest fit to the query.

## Word2Vec

In [3]:
## clean up data and make a list that can be run through the Word2Vec model

def word2vec_list(df, query):
    clean_list = []

    for num in range(len(df)):
        desc = df.iloc[num]['job_title']
        ## remove punctuation from job description    
        for symbol in list(string.punctuation):
            desc = desc.replace(symbol, ' ')

        ## write out acronyms for human resource positions
        desc = nltk.word_tokenize(desc)
        for word in range(len(desc)):
            if desc[word] == 'HR':
                desc[word] = 'Human Resources'
            elif desc[word] == 'CHRO':
                desc[word] = 'Chief Human Resources Officer'
            elif desc[word] == 'GPHR':
                desc[word] = 'General Professional Human Resources'
            elif desc[word] == 'SPHR':
                desc[word] = 'Senior Professional Human Resources'

            ## remove words that are not in pretrained vectors
            ## most of these words are the names of colleges
            elif desc[word] in ['EPIK', 'Celal', 'Bayar', 'Humber', 'ENGIE', 'Buckhead', 'Luxottica', 'Beneteau', 'ScottMadden', 
                                'Nortia', 'Schwan', 'Endemol', 'JTI', 'Styczynski', 'Westfield', 'Kokomo', 'Delphi', 'Loparex']:
                desc[word] = ''
            desc[word] = desc[word].lower()

        ## remove stopwords
        desc = [token for token in desc if token not in stopwords.words('english')]
        desc = [token for token in desc if not token.isdigit()]

        desc = ' '.join(desc)
        desc = nltk.word_tokenize(desc)

        clean_list.append(desc)
    
    ## tokenize query and add to list
    clean_list.append(query.lower().split())
    
    return clean_list

In [4]:
job_desc_list = word2vec_list(df, 'seeking human resources')

In [5]:
job_desc_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'human',
  'resources',
  'senior',
  'professional',
  'human',
  'resources'],
 ['huma

In [6]:
## get word vectors using Google's pretrained word vectors

vectorizer = Vectorizer()
vectorizer.word2vec(job_desc_list, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
word2vec_vectors = vectorizer.vectors

In [7]:
dist_1 = spatial.distance.cosine(word2vec_vectors[-1], word2vec_vectors[0])
dist_2 = spatial.distance.cosine(word2vec_vectors[-1], word2vec_vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.611827164888382, dist-2: 0.29940420389175415


In [8]:
## create a new dataframe to add vector fit scores

vector_df = df
vector_df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [9]:
## append word2vec scores to dataframe

scores = []
for num in range(len(job_desc_list) - 1):
    scores.append(1 - spatial.distance.cosine(word2vec_vectors[num], word2vec_vectors[-1]))
    
vector_df['word2vec_fit'] = scores
vector_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268


In [10]:
vector_df.sort_values(by=['word2vec_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,,0.767376


It looks like the word2vec model works really well with the top fitting results returning candidates that match the query 'seeking human resources'

## BERT

Next we'll add BERT vectors to our dataframe. The sent2vec package that I am using does not require tokenization for BERT vectors so we will not be needing to clean our data as much as we did with the Word2Vec model.

In [11]:
def bert_list(df, query):
    bert_list = []

    for num in range(len(df)):
        desc = df.iloc[num]['job_title']
        ## remove punctuation from job description    
        for symbol in list(string.punctuation):
            desc = desc.replace(symbol, ' ')

        ## write out acronyms for human resource positions
        desc = nltk.word_tokenize(desc)
        for word in range(len(desc)):
            if desc[word] == 'HR':
                desc[word] = 'Human Resources'
            elif desc[word] == 'CHRO':
                desc[word] = 'Chief Human Resources Officer'
            elif desc[word] == 'GPHR':
                desc[word] = 'General Professional in Human Resources'
            elif desc[word] == 'SPHR':
                desc[word] = 'Senior Professional in Human Resources'

        desc = ' '.join(desc)
        bert_list.append(desc)
        
    bert_list.append(query)
    
    return bert_list

In [12]:
bert_job_list = bert_list(df, 'seeking human resources')

In [13]:
bert_job_list

['2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 'Native English Teacher at EPIK English Program in Korea',
 'Aspiring Human Resources Professional',
 'People Development Coordinator at Ryan',
 'Advisory Board Member at Celal Bayar University',
 'Aspiring Human Resources Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Human Resources Senior Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Seeking Human Resources HRIS and Generalist Positions',
 'Student at Chapman University',
 'SVP Chief Human Resources Officer Marketing Communications CSR Officer ENGIE Houston The Woodlands Energy General Professional in Human Resources Senior Professional in Human Resources',
 'Human Resources Coordinator at InterContinental Buckhead Atlanta',
 '2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 '2019 C T Bauer Colleg

In [14]:
vectorizer = Vectorizer()
vectorizer.bert(bert_job_list)
bert_vectors = vectorizer.vectors

Vectorization done on cpu device


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
dist_1 = spatial.distance.cosine(bert_vectors[-1], bert_vectors[0])
dist_2 = spatial.distance.cosine(bert_vectors[-1], bert_vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.07078111171722412, dist-2: 0.01500558853149414


In [16]:
scores = []
for num in range(len(bert_job_list) - 1):
    scores.append(1 - spatial.distance.cosine(bert_vectors[num], bert_vectors[-1]))
    
vector_df['bert_fit'] = scores
vector_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457
...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066


In [17]:
vector_df.sort_values(by=['bert_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985


## TF-IDF

Next we'll add TF-IDF vectors to our dataframe.

In [18]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(bert_job_list)

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
vector_df['tfidf_fit'] = cosine_sim[-1][:-1]

In [20]:
vector_df.sort_values(by=['tfidf_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702


## GloVe

Finally we'll add GloVe vectors to the dataframe. We will be using pretrained glove embeddings to get our word vectors.

In [21]:
def glove_list(df, query):
    glove_list = []

    for num in range(len(df)):
        desc = df.iloc[num]['job_title']
        ## remove punctuation from job description    
        for symbol in list(string.punctuation):
            desc = desc.replace(symbol, ' ')

        ## write out acronyms for human resource positions
        desc = nltk.word_tokenize(desc)
        for word in range(len(desc)):
            if desc[word] == 'HR':
                desc[word] = 'Human Resources'
            elif desc[word] == 'CHRO':
                desc[word] = 'Chief Human Resources Officer'
            elif desc[word] == 'GPHR':
                desc[word] = 'General Professional in Human Resources'
            elif desc[word] == 'SPHR':
                desc[word] = 'Senior Professional in Human Resources'
            ## remove words not found in pretrained embeddings
            elif desc[word] in ['HRIS', 'ENGIE', 'ScottMadden', 'Styczynski', 'Nortia', 'Loparex']:
                desc[word] = ''
            desc[word] = desc[word].lower()

        desc = [token for token in desc if token not in stopwords.words('english')]
        desc = [token for token in desc if not token.isdigit()]

        desc = ' '.join(desc)
        desc = nltk.word_tokenize(desc)
        glove_list.append(desc)

    glove_list.append(query.lower().split())
    
    return glove_list

In [22]:
glove_job_list = glove_list(df, 'seeking human resources')
glove_job_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'epik', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'celal', 'bayar', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student',
  'humber',
  'college',
  'aspiring',
  'human',
  'resources',
  'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student',
  'humber',
  'college',
  'aspiring',
  'human',
  'resources',
  'generalist'],
 ['seeking', 'human', 'resources', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'in',
  'human',
  'resour

In [23]:
## we will be downloading pretrained GloVe embeddings which has been trained on Wikipedia using gensim

model = api.load("glove-wiki-gigaword-300")

In [24]:
## create a function to get word vectors using the gensim model

def get_vector(s):
    return np.sum(np.array([model[i] for i in s]), axis=0)

In [25]:
glove_vectors = []

for num in range(len(glove_job_list)):
    glove_vectors.append(get_vector(glove_job_list[num]))

In [26]:
glove_vectors

[array([ 1.5282625 ,  3.2461998 ,  0.10179698,  0.43414286, -1.1617588 ,
         0.42638186, -1.2995371 , -2.175614  , -1.2447441 , -8.75541   ,
         2.066736  ,  3.179113  ,  1.047955  , -1.439763  ,  0.32221   ,
         2.150875  ,  0.134926  ,  0.38139823,  0.97562855, -2.576674  ,
        -2.153926  ,  1.1725923 ,  0.6002661 ,  3.276764  , -2.073533  ,
         2.584117  ,  3.2218032 ,  2.790596  ,  0.09734902, -3.9332678 ,
         1.0110134 , -1.9817    ,  0.35711497,  0.30539995, -6.045522  ,
        -0.35889903, -0.39025107,  0.576265  , -0.925467  , -0.43875903,
         0.46786496, -1.8324845 , -0.24037302,  2.687144  , -0.933758  ,
        -1.515815  ,  3.75725   ,  4.4700537 ,  2.0354638 , -4.00129   ,
        -1.041018  ,  0.926556  ,  1.005497  ,  0.06708105, -0.4698538 ,
        -1.8851821 ,  3.2804098 , -1.1563601 , -0.366996  ,  0.54470396,
         1.0268004 ,  2.4263701 ,  2.331198  ,  0.21122198, -0.24890622,
        -1.463427  , -0.04659796,  0.23804508,  0.2

In [27]:
scores = []
for num in range(len(glove_job_list) - 1):
    scores.append(1 - spatial.distance.cosine(glove_vectors[num], glove_vectors[-1]))
    
vector_df['glove_fit'] = scores
vector_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,glove_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219,0.097827,0.466585
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172,0.000000,0.329688
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994,0.292820,0.799220
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138,0.000000,0.529448
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457,0.000000,0.295936
...,...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459,0.283608,0.809268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065,0.172445,0.810235
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543,0.000000,0.412289
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066,0.000000,0.459518


In [28]:
vector_df.sort_values(by=['glove_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,glove_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,0.948254
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,0.948254
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865,0.945992
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318,0.914078
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686,0.351856,0.864729
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193,0.18337,0.849824


In [29]:
vector_df.drop('fit', axis=1, inplace=True)
vector_df

Unnamed: 0,id,job_title,location,connection,word2vec_fit,bert_fit,tfidf_fit,glove_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.388173,0.929219,0.097827,0.466585
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.104410,0.948172,0.000000,0.329688
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.700596,0.984994,0.292820,0.799220
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.291818,0.973138,0.000000,0.529448
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.229988,0.935457,0.000000,0.295936
...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.729863,0.914459,0.283608,0.809268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.750962,0.982065,0.172445,0.810235
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.282647,0.976543,0.000000,0.412289
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.133268,0.983066,0.000000,0.459518


Now that I have all the fit scores, I took a look at each models top results and have concluded that the Word2Vec vectors provide the best fit for the query. I will be using that to create the RankNet model.

## RankNet

In [80]:
## create numpy arrays of the word2vec vectors as input data and the fit scores as the target variable

X = pd.DataFrame(word2vec_vectors)
y = np.array(vector_df['word2vec_fit'])

In [81]:
## drop the query string from the input data
X.drop([104], inplace=True)

## I need to create a numpy array for query id, in this case we only have 1 query id so I will be creating a numpy array 
## with the same number of rows as our input data with 1's to group the data together
X['qid'] = 1
qid = np.array(X['qid'])

## drop the query id from input data
X.drop('qid', axis=1, inplace=True)
X = X.to_numpy()

In [32]:
X

array([[-0.06260172,  0.0594991 ,  0.01339213, ..., -0.0257899 ,
        -0.01607577,  0.10813395],
       [ 0.0164388 ,  0.0230306 ,  0.06384277, ...,  0.03059896,
         0.00439453,  0.02514648],
       [-0.09277344,  0.03723145,  0.08280945, ..., -0.04348755,
        -0.08966064,  0.03756714],
       ...,
       [ 0.18185425, -0.05010986, -0.08358765, ...,  0.0144043 ,
         0.13219261,  0.0296936 ],
       [ 0.01285807,  0.10728963,  0.02231852, ...,  0.00384521,
         0.00382487, -0.1295573 ],
       [-0.0721283 , -0.02363586,  0.03528214, ...,  0.01784515,
         0.1071167 ,  0.14154053]], dtype=float32)

In [33]:
## run the data through the RankNet model

from LambdaRankNN  import RankNetNN

ranker = RankNetNN(input_size=X.shape[1], hidden_layer_sizes=(128, 64,), activation=('relu', 'relu',))
ranker.fit(X, y, qid, epochs=35)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
ndcg: 0.9999550217795616


In [34]:
pred = ranker.predict(X)
pred

array([-27.791878 , -96.43367  ,  40.360817 , -38.84929  , -62.30091  ,
        53.04063  ,   1.6328974,  40.892204 ,   1.6328974,  70.12957  ,
       -72.92934  ,  31.001757 ,  10.664214 , -27.791878 , -27.791878 ,
       -96.43367  ,  40.360817 , -38.84929  , -27.791878 , -96.43367  ,
        40.360817 , -38.84929  , -62.30091  ,  53.04063  ,   1.6328974,
        40.892204 ,  32.651554 ,  88.92209  ,  32.651554 ,  88.92209  ,
       -27.791878 , -96.43367  ,  40.360817 , -38.84929  , -62.30091  ,
        53.04063  ,   1.6328974,  40.892204 ,   1.6328974,  70.12957  ,
       -72.92934  ,  31.001757 ,  10.664214 , -27.791878 , -96.43367  ,
        40.360817 , -38.84929  , -62.30091  ,  53.04063  ,   1.6328974,
        40.892204 ,   1.6328974,  70.12957  , -72.92934  ,  31.001757 ,
        10.664214 , -27.791878 ,  40.360817 , -38.84929  ,  53.04063  ,
        40.892204 ,  70.12957  , -72.92934  ,  31.001757 ,  10.664214 ,
        -3.6417851,  26.490097 ,  65.70101  , -11.596133 ,  -3.9

In [35]:
ranker.evaluate(X, y, qid, eval_at=10)

ndcg@10: 0.9998997466715455


In [36]:
## normalize predictions between 0 and 1

norm_pred = (pred - np.min(pred)) / (np.max(pred) - np.min(pred))
norm_pred

array([0.35862917, 0.        , 0.714703  , 0.3008581 , 0.17833164,
       0.78095055, 0.51236326, 0.7174793 , 0.51236326, 0.8702342 ,
       0.12280186, 0.6658052 , 0.55954874, 0.35862917, 0.35862917,
       0.        , 0.714703  , 0.3008581 , 0.35862917, 0.        ,
       0.714703  , 0.3008581 , 0.17833164, 0.78095055, 0.51236326,
       0.7174793 , 0.67442477, 0.96841854, 0.67442477, 0.96841854,
       0.35862917, 0.        , 0.714703  , 0.3008581 , 0.17833164,
       0.78095055, 0.51236326, 0.7174793 , 0.51236326, 0.8702342 ,
       0.12280186, 0.6658052 , 0.55954874, 0.35862917, 0.        ,
       0.714703  , 0.3008581 , 0.17833164, 0.78095055, 0.51236326,
       0.7174793 , 0.51236326, 0.8702342 , 0.12280186, 0.6658052 ,
       0.55954874, 0.35862917, 0.714703  , 0.3008581 , 0.78095055,
       0.7174793 , 0.8702342 , 0.12280186, 0.6658052 , 0.55954874,
       0.4848049 , 0.6422334 , 0.84709656, 0.44324625, 0.48342222,
       0.65392596, 0.5944628 , 0.8746846 , 0.838392  , 0.77511

In [37]:
vector_df.sort_values('word2vec_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,bert_fit,tfidf_fit,glove_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.920869,0.994894,0.625865,0.945992
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.996201,0.646155,0.948254
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.996201,0.646155,0.948254
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.83922,0.975402,0.552318,0.914078
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.974998,0.447527,0.873041
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.974998,0.447527,0.873041
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.974998,0.447527,0.873041
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.974998,0.447527,0.873041
73,74,Human Resources Professional,Greater Boston Area,16,0.793325,0.991686,0.351856,0.864729
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.767376,0.974341,0.170633,0.753141


In [38]:
## create a new dataframe to add ranknet scores

ranknet_df = vector_df.drop(['bert_fit', 'tfidf_fit', 'glove_fit'], axis=1)
ranknet_df['ranknet_fit'] = norm_pred
ranknet_df.sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.920869,1.0
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.968419
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.968419
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.83922,0.874685
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.870234
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.870234
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.870234
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.870234
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.767376,0.847097
73,74,Human Resources Professional,Greater Boston Area,16,0.793325,0.838392


The ranknet model performs very well in ranking candidates based on the word2vec fit score with the top candidates all being ranked the same as the original word2vec fit scores. Next let's create a function to star candidates and update our ranknet model based on the preferred candidates.

In [39]:
def star_candidate():
    star_cand = []
    x = ''

    while x != 'done':
        x = ''
        while not x.isdigit():
            x = input('Specify best candidate by id, when finished type "done": ')
            if x == 'done':
                break

        star_cand.append(x)

    star_cand = star_cand[:-1]
    
    return star_cand

In [40]:
star_cand = star_candidate()

Specify best candidate by id, when finished type "done": 15
Specify best candidate by id, when finished type "done": 55
Specify best candidate by id, when finished type "done": 78
Specify best candidate by id, when finished type "done": 7
Specify best candidate by id, when finished type "done": done


In [41]:
## subtract 1 from id to get index

star_cand = [int(i)-1 for i in star_cand]

In [42]:
star_cand

[14, 54, 77, 6]

In [82]:
## create a new column named star_score to update scores based on starred candidates

ranknet_df['star_score'] = ranknet_df['word2vec_fit']
ranknet_df.iloc[star_cand,[6]] = 1
ranknet_df.iloc[star_cand]

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
14,15,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.388173,0.594211,1.0
54,55,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,0.689503,0.595744,1.0
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.750962,0.804317,1.0
6,7,Student at Humber College and Aspiring Human R...,Kanada,61,0.600016,0.546995,1.0


I created a new column called star_score and updated starred candidates scores to a 1 to allow the ranknet model to train on the updated score.

In [83]:
## create new target variable with updated starred candidates

y = np.array(ranknet_df['star_score'])

In [84]:
ranker = RankNetNN(input_size=X.shape[1], hidden_layer_sizes=(128, 64,), activation=('relu', 'relu',))
ranker.fit(X, y, qid, epochs=35)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
ndcg: 0.9757146717839669


In [85]:
ranker.predict(X)

array([ 3.4388417e-01, -8.0197525e+01,  4.2854481e+00, -1.6946510e+01,
       -4.0995636e+01,  5.0149212e+00,  2.0325110e+00,  3.1637673e+00,
        2.0325110e+00,  6.4725933e+00, -5.3131245e+01,  3.5060163e+00,
        1.1935713e+00,  3.4388417e-01,  3.4388417e-01, -8.0197525e+01,
        4.2854481e+00, -1.6946510e+01,  3.4388417e-01, -8.0197525e+01,
        4.2854481e+00, -1.6946510e+01, -4.0995636e+01,  5.0149212e+00,
        2.0325110e+00,  3.1637673e+00,  3.2299366e+00,  7.4573684e+00,
        3.2299366e+00,  7.4573684e+00,  3.4388417e-01, -8.0197525e+01,
        4.2854481e+00, -1.6946510e+01, -4.0995636e+01,  5.0149212e+00,
        2.0325110e+00,  3.1637673e+00,  2.0325110e+00,  6.4725933e+00,
       -5.3131245e+01,  3.5060163e+00,  1.1935713e+00,  3.4388417e-01,
       -8.0197525e+01,  4.2854481e+00, -1.6946510e+01, -4.0995636e+01,
        5.0149212e+00,  2.0325110e+00,  3.1637673e+00,  2.0325110e+00,
        6.4725933e+00, -5.3131245e+01,  3.5060163e+00,  1.1935713e+00,
      

In [86]:
ranker.evaluate(X, y, qid, eval_at=10)

ndcg@10: 0.9011320170117706


In [87]:
norm_pred = (ranker.predict(X) - np.min(ranker.predict(X))) / (np.max(ranker.predict(X)) - np.min(ranker.predict(X)))
ranknet_df['ranknet_fit'] = norm_pred

In [88]:
ranknet_df.sort_values('star_score', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
54,55,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,0.689503,0.950613,1.0
6,7,Student at Humber College and Aspiring Human R...,Kanada,61,0.600016,0.933879,1.0
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.750962,0.988119,1.0
14,15,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.388173,0.914701,1.0
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.920869,1.0,0.920869
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995488,0.903864
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995488,0.903864
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.83922,0.98586,0.83922
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101


In [89]:
ranknet_df.sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.920869,1.0,0.920869
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995488,0.903864
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995488,0.903864
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.750962,0.988119,0.750962
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.750962,0.988119,1.0
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.83922,0.98586,0.83922
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101


In [92]:
## dataframe without the starred candidates 

ranknet_df.drop(star_cand).sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.920869,1.0,0.920869
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995488,0.903864
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995488,0.903864
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.750962,0.988119,0.750962
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.83922,0.98586,0.83922
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.984304,0.808101
73,74,Human Resources Professional,Greater Boston Area,16,0.793325,0.976464,0.793325


The ranknet model was able to update its scores based on the starred candidates and return an updated list that takes the starred candidates into account. Let's see if the ranknet model performs just as well with a different query.

## RankNet Model with a Different Query

I'll be using the query 'aspiring human resources' to rank the potential job candidates. I will be using Word2Vec vectors for this because it had the best performing word embeddings for the previous query.

In [93]:
list_2 = word2vec_list(df, 'aspiring human resources')
list_2

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'human',
  'resources',
  'senior',
  'professional',
  'human',
  'resources'],
 ['huma

In [94]:
## get vectors with the new query

vectorizer = Vectorizer()
vectorizer.word2vec(list_2, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
word2vec_vectors = vectorizer.vectors

In [100]:
## append vector distance of job description and query to dataframe

df_2 = df.drop('fit', axis=1)

scores = []
for num in range(len(job_desc_list) - 1):
    scores.append(1 - spatial.distance.cosine(word2vec_vectors[num], word2vec_vectors[-1]))
    
df_2['word2vec_fit'] = scores
df_2.sort_values('word2vec_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.945019
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542


In [101]:
X = pd.DataFrame(word2vec_vectors)
y = np.array(df_2['word2vec_fit'])

X.drop([104], inplace=True)
X['qid'] = 1
qid = np.array(X['qid'])
X.drop('qid', axis=1, inplace=True)
X = X.to_numpy()

In [102]:
ranker = RankNetNN(input_size=X.shape[1], hidden_layer_sizes=(128, 64,), activation=('relu', 'relu',))
ranker.fit(X, y, qid, epochs=35)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
ndcg: 0.9999957101147271


In [109]:
pred = (ranker.predict(X) - np.min(ranker.predict(X))) / (np.max(ranker.predict(X)) - np.min(ranker.predict(X)))
df_2['ranknet_fit'] = pred
df_2.sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,1.0,0.945019
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,1.0,0.945019
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,1.0,0.945019
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,1.0,0.945019
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,1.0,0.945019
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,1.0,1.0
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.945019,1.0,0.945019
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.883866,0.90542
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.883866,0.90542
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.883866,0.90542


In [110]:
star_cand = star_candidate()

Specify best candidate by id, when finished type "done": 6
Specify best candidate by id, when finished type "done": 10
Specify best candidate by id, when finished type "done": 25
Specify best candidate by id, when finished type "done": 39
Specify best candidate by id, when finished type "done": 52
Specify best candidate by id, when finished type "done": 74
Specify best candidate by id, when finished type "done": done


In [111]:
## subtract 1 from id to get index
star_cand = [int(i)-1 for i in star_cand]

df_2['star_score'] = df_2['word2vec_fit']
df_2.iloc[star_cand,[6]] = 1
df_2.iloc[star_cand]

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.883866,1.0
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.693193,0.603403,1.0
24,25,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.813642,1.0
38,39,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.813642,1.0
51,52,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.813642,1.0
73,74,Human Resources Professional,Greater Boston Area,16,0.865165,0.832696,1.0


In [114]:
y = np.array(df_2['star_score'])

ranker = RankNetNN(input_size=X.shape[1], hidden_layer_sizes=(128, 64,), activation=('relu', 'relu',))
ranker.fit(X, y, qid, epochs=35)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
ndcg: 0.9911428184991445


In [115]:
pred = (ranker.predict(X) - np.min(ranker.predict(X))) / (np.max(ranker.predict(X)) - np.min(ranker.predict(X)))
df_2['ranknet_fit'] = pred
df_2.sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
73,74,Human Resources Professional,Greater Boston Area,16,0.865165,1.0,1.0
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.945019,0.985955,0.945019
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
36,37,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.982069,0.815372
6,7,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.982069,0.815372


In [117]:
## dataframe ranked by word2vec score for comparison

df_2.sort_values('word2vec_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.945019,0.985955,0.945019
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.980887,0.90542
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.980887,1.0
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90542,0.980887,0.90542


In [116]:
## dataframe without the starred candidates 

df_2.drop(star_cand).sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.945019,0.985955,0.945019
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.945019,0.985955,0.945019
6,7,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.982069,0.815372
49,50,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.982069,0.815372
8,9,Student at Humber College and Aspiring Human R...,Kanada,61,0.815372,0.982069,0.815372


After starring the candidates, we can see that the ranknet model learned from the starred candidates like our previous query. The learning after starring is more apparent here because we can see that the "Student at Humber College..." rises in rank with our ranknet model when compared to just ranking with the word2vec fit score. The model is able to learn to rank  more easily when you star candidates that were similar to the query to begin with. 

# Conclusion

The learning to rank model after ranking with starred candidates had a NDCG score of 0.975 for the first query and a NDCG score of 0.991 for the second query. The model is able to learn to rank more accurately when you star candidates that were similar to the query to begin with based on the higher NDCG score of the second query. With such a high NDCG score, I am confident in the model to correctly rerank potential candidates based on preferred candidates. 

# Appendix

# Different RankNet Model

In [55]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class RankNet(nn.Module):
    
    def __init__(self, num_feature):
        super(RankNet, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(num_feature, 512),         # Linear layer - linear projection
            nn.Dropout(0.5),                     # Regularization - Drops some of the neurons in a random fashion
            nn.LeakyReLU(0.2, inplace=True),     # Activation function - swap dropout and relu
            nn.Linear(512, 256),
            nn.Dropout(0.5),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1),
            nn.Sigmoid()                         # Applies a function on the output and brings the output bw 0 and 1. Softmax is replacement
        )
        self.output_sig = nn.Sigmoid()

    def forward(self, input_1, input_2):
        s1 = self.model(input_1)
        s2 = self.model(input_2)
        out = self.output_sig(s1-s2)
        return out
    
    def predict(self, input_):
        s = self.model(input_)
        return s

In [56]:
## clean up data and make a list that can be run through the Word2Vec model

def ranknet_list(desc_list):
    clean_list = []

    for num in range(len(desc_list)):
        desc = desc_list[num]
        ## remove punctuation from job description    
        for symbol in list(string.punctuation):
            desc = desc.replace(symbol, ' ')

        ## write out acronyms for human resource positions
        desc = nltk.word_tokenize(desc)
        for word in range(len(desc)):
            if desc[word] == 'HR':
                desc[word] = 'Human Resources'
            elif desc[word] == 'CHRO':
                desc[word] = 'Chief Human Resources Officer'
            elif desc[word] == 'GPHR':
                desc[word] = 'General Professional Human Resources'
            elif desc[word] == 'SPHR':
                desc[word] = 'Senior Professional Human Resources'

            ## remove words that are not in pretrained vectors
            ## most of these words are the names of colleges
            elif desc[word] in ['EPIK', 'Celal', 'Bayar', 'Humber', 'ENGIE', 'Buckhead', 'Luxottica', 'Beneteau', 'ScottMadden', 
                                'Nortia', 'Schwan', 'Endemol', 'JTI', 'Styczynski', 'Westfield', 'Kokomo', 'Delphi', 'Loparex']:
                desc[word] = ''
            desc[word] = desc[word].lower()

        ## remove stopwords
        desc = [token for token in desc if token not in stopwords.words('english')]
        desc = [token for token in desc if not token.isdigit()]

        desc = ' '.join(desc)
        desc = nltk.word_tokenize(desc)

        clean_list.append(desc)
    
    return clean_list

In [57]:
random_row_1 = ranknet_df.sample(n = 5000, replace = True)
random_row_2 = ranknet_df.sample(n = 5000, replace = True)
job_title_list_ranknet1 = ranknet_list(list(random_row_1['job_title']))
job_title_list_ranknet2 = ranknet_list(list(random_row_2['job_title']))

In [58]:
vectorizer = Vectorizer()
vectorizer.word2vec(job_title_list_ranknet1, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
doc1 = np.array(vectorizer.vectors)

vectorizer = Vectorizer()
vectorizer.word2vec(job_title_list_ranknet2, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
doc2 = np.array(vectorizer.vectors)

doc1 = torch.from_numpy(doc1).float()
doc2 = torch.from_numpy(doc2).float()

In [59]:
y_1 = list(random_row_1['star_score'])
y_2 = list(random_row_2['star_score'])
y = torch.tensor([1.0 if y1_i>y2_i else 0.5 if y1_i==y2_i else 0.0 for y1_i, y2_i in zip(y_1, y_2)]).float()

y = y.unsqueeze(1)

In [60]:
rank_model = RankNet(num_feature = 300)
# optimizer = torch.optim.Adam(rank_model.parameters())         
optimizer = torch.optim.SGD(rank_model.parameters(), lr = 0.01, momentum = 0.9)         # experiment with optimizer
loss_fun = torch.nn.BCELoss()

In [61]:
epoch = 2000
losses = []

for i in range(epoch):
    rank_model.zero_grad()
    y_pred = rank_model(doc1, doc2)
    loss = loss_fun(y_pred,y)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    #print(y1, y2, y, y_pred)
    
    if i % 100 == 0:
        print('Epoch{}, loss : {}'.format(i, loss.item()))

Epoch0, loss : 0.6930458545684814
Epoch100, loss : 0.6887785196304321
Epoch200, loss : 0.6791598200798035
Epoch300, loss : 0.6494277119636536
Epoch400, loss : 0.5845950841903687
Epoch500, loss : 0.5449822545051575
Epoch600, loss : 0.5309036374092102
Epoch700, loss : 0.5275877118110657
Epoch800, loss : 0.5241197347640991
Epoch900, loss : 0.522300124168396
Epoch1000, loss : 0.5205228328704834
Epoch1100, loss : 0.5205462574958801
Epoch1200, loss : 0.5199341773986816
Epoch1300, loss : 0.5188631415367126
Epoch1400, loss : 0.5178435444831848
Epoch1500, loss : 0.5178009867668152
Epoch1600, loss : 0.5164840221405029
Epoch1700, loss : 0.5165194869041443
Epoch1800, loss : 0.5173816680908203
Epoch1900, loss : 0.5155891180038452


In [63]:
pred = rank_model.predict(torch.from_numpy(X))
ranknet_df['ranknet_fit'] = norm_pred

In [68]:
ranknet_df.sort_values('ranknet_fit', ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,word2vec_fit,ranknet_fit,star_score
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.920869,1.0,0.920869
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995216,0.903864
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.903864,0.995216,0.903864
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.750962,0.993609,0.750962
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.750962,0.993609,1.0
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.83922,0.989833,0.83922
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.987375,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.987375,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.987375,0.808101
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.808101,0.987375,0.808101
