In [1]:
## import necessary libraries

import pandas as pd
import nltk
import string
import gensim
import itertools
import numpy as np
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sent2vec.vectorizer import Vectorizer
from sent2vec.splitter import Splitter
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('./potential-talents - Aspiring human resources - seeking human resources.csv')
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


## Word2Vec

In [3]:
job_desc_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
    
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional Human Resources'
            
        ## remove words that are not in pretrained vectors
        ## most of these words are the names of colleges
        elif desc[word] in ['EPIK', 'Celal', 'Bayar', 'Humber', 'ENGIE', 'Buckhead', 'Luxottica', 'Beneteau', 'ScottMadden', 
                            'Nortia', 'Schwan', 'Endemol', 'JTI', 'Styczynski', 'Westfield', 'Kokomo', 'Delphi', 'Loparex']:
            desc[word] = ''
        desc[word] = desc[word].lower()

    ## remove stopwords
    desc = [token for token in desc if token not in stopwords.words('english')]
    desc = [token for token in desc if not token.isdigit()]
    
    desc = ' '.join(desc)
    desc = nltk.word_tokenize(desc)
    
    job_desc_list.append(desc)

In [4]:
job_desc_list.append(['seeking', 'human', 'resources'])

In [5]:
job_desc_list

[['c',
  'bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resources',
  'professional'],
 ['native', 'english', 'teacher', 'english', 'program', 'korea'],
 ['aspiring', 'human', 'resources', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'university'],
 ['aspiring', 'human', 'resources', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['human', 'resources', 'senior', 'specialist'],
 ['student', 'college', 'aspiring', 'human', 'resources', 'generalist'],
 ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions'],
 ['student', 'chapman', 'university'],
 ['svp',
  'chief',
  'human',
  'resources',
  'officer',
  'marketing',
  'communications',
  'csr',
  'officer',
  'houston',
  'woodlands',
  'energy',
  'general',
  'professional',
  'human',
  'resources',
  'senior',
  'professional',
  'human',
  'resources'],
 ['huma

In [6]:
vectorizer = Vectorizer()
vectorizer.word2vec(job_desc_list, pretrained_vectors_path= './GoogleNews-vectors-negative300.bin')
word2vec_vectors = vectorizer.vectors

In [7]:
dist_1 = spatial.distance.cosine(word2vec_vectors[-1], word2vec_vectors[0])
dist_2 = spatial.distance.cosine(word2vec_vectors[-1], word2vec_vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.611827164888382, dist-2: 0.29940420389175415


In [8]:
vector_df = df
vector_df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [9]:
scores = []
for num in range(len(job_desc_list) - 1):
    scores.append(1 - spatial.distance.cosine(word2vec_vectors[num], word2vec_vectors[-1]))
    
vector_df['word2vec_fit'] = scores
vector_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268


In [10]:
vector_df.sort_values(by=['word2vec_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,,0.767376


## BERT

In [11]:
bert_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
        
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional in Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional in Human Resources'
    
    desc = ' '.join(desc)
    bert_list.append(desc)

In [12]:
bert_list.append('seeking human resources')

In [13]:
bert_list

['2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 'Native English Teacher at EPIK English Program in Korea',
 'Aspiring Human Resources Professional',
 'People Development Coordinator at Ryan',
 'Advisory Board Member at Celal Bayar University',
 'Aspiring Human Resources Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Human Resources Senior Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'Seeking Human Resources HRIS and Generalist Positions',
 'Student at Chapman University',
 'SVP Chief Human Resources Officer Marketing Communications CSR Officer ENGIE Houston The Woodlands Energy General Professional in Human Resources Senior Professional in Human Resources',
 'Human Resources Coordinator at InterContinental Buckhead Atlanta',
 '2019 C T Bauer College of Business Graduate Magna Cum Laude and aspiring Human Resources professional',
 '2019 C T Bauer Colleg

In [14]:
vectorizer = Vectorizer()
vectorizer.bert(bert_list)
bert_vectors = vectorizer.vectors

Vectorization done on cpu device


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
dist_1 = spatial.distance.cosine(bert_vectors[-1], bert_vectors[0])
dist_2 = spatial.distance.cosine(bert_vectors[-1], bert_vectors[2])
print('dist_1: {0}, dist-2: {1}'.format(dist_1, dist_2))

dist_1: 0.07078111171722412, dist-2: 0.01500558853149414


In [16]:
scores = []
for num in range(len(bert_list) - 1):
    scores.append(1 - spatial.distance.cosine(bert_vectors[num], bert_vectors[-1]))
    
vector_df['bert_fit'] = scores
vector_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457
...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066


In [17]:
vector_df.sort_values(by=['bert_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,0.743151,0.987985


## TF-IDF

In [18]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(bert_list)

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [19]:
vector_df['tfidf_fit'] = cosine_sim[-1][:-1]

In [20]:
vector_df.sort_values(by=['tfidf_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527
26,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702
28,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,,0.696512,0.965308,0.406702


## GloVe

In [21]:
glove_list = []

for num in range(len(df)):
    desc = df.iloc[num]['job_title']
    ## remove punctuation from job description    
    for symbol in list(string.punctuation):
        desc = desc.replace(symbol, ' ')
        
    ## write out acronyms for human resource positions
    desc = nltk.word_tokenize(desc)
    for word in range(len(desc)):
        if desc[word] == 'HR':
            desc[word] = 'Human Resources'
        elif desc[word] == 'CHRO':
            desc[word] = 'Chief Human Resources Officer'
        elif desc[word] == 'GPHR':
            desc[word] = 'General Professional in Human Resources'
        elif desc[word] == 'SPHR':
            desc[word] = 'Senior Professional in Human Resources'
        elif desc[word] in ['HRIS', 'ENGIE', 'ScottMadden', 'Styczynski', 'Nortia', 'Loparex']:
            desc[word] = ''
        desc[word] = desc[word].lower()
    
    desc = [token for token in desc if token not in stopwords.words('english')]
    desc = [token for token in desc if not token.isdigit()]
    
    desc = ' '.join(desc)
    glove_list.append(desc)

glove_list.append('seeking human resources')
glove_list

['c bauer college business graduate magna cum laude aspiring human resources professional',
 'native english teacher epik english program korea',
 'aspiring human resources professional',
 'people development coordinator ryan',
 'advisory board member celal bayar university',
 'aspiring human resources specialist',
 'student humber college aspiring human resources generalist',
 'human resources senior specialist',
 'student humber college aspiring human resources generalist',
 'seeking human resources  generalist positions',
 'student chapman university',
 'svp chief human resources officer marketing communications csr officer  houston woodlands energy general professional in human resources senior professional in human resources',
 'human resources coordinator intercontinental buckhead atlanta',
 'c bauer college business graduate magna cum laude aspiring human resources professional',
 'c bauer college business graduate magna cum laude aspiring human resources professional',
 'native

In [22]:
import gensim.downloader as api

model = api.load("glove-wiki-gigaword-300")

In [23]:
def preprocess(s):
    return [i.lower() for i in s.split()]

def get_vector(s):
    return np.sum(np.array([model[i] for i in preprocess(s)]), axis=0)

In [24]:
glove_vectors = []

for num in range(len(glove_list)):
    glove_vectors.append(get_vector(glove_list[num]))

In [25]:
scores = []
for num in range(len(glove_list) - 1):
    scores.append(1 - spatial.distance.cosine(glove_vectors[num], glove_vectors[-1]))
    
vector_df['glove_fit'] = scores
vector_df

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,glove_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,0.388173,0.929219,0.097827,0.466585
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,0.104410,0.948172,0.000000,0.329688
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,0.700596,0.984994,0.292820,0.799220
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,0.291818,0.973138,0.000000,0.529448
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,,0.229988,0.935457,0.000000,0.295936
...,...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,0.729863,0.914459,0.283608,0.809268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,0.750962,0.982065,0.172445,0.810235
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,0.282647,0.976543,0.000000,0.412289
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,0.133268,0.983066,0.000000,0.459518


In [26]:
vector_df.sort_values(by=['glove_fit'], ascending=False).head(15)

Unnamed: 0,id,job_title,location,connection,fit,word2vec_fit,bert_fit,tfidf_fit,glove_fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,0.948254
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,,0.903864,0.996201,0.646155,0.948254
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,0.920869,0.994894,0.625865,0.945992
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,,0.83922,0.975402,0.552318,0.914078
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,,0.808101,0.974998,0.447527,0.873041
73,74,Human Resources Professional,Greater Boston Area,16,,0.793325,0.991686,0.351856,0.864729
87,88,Human Resources Management Major,"Milpitas, California",18,,0.74754,0.990193,0.18337,0.849824


In [27]:
vector_df.drop('fit', axis=1, inplace=True)
vector_df

Unnamed: 0,id,job_title,location,connection,word2vec_fit,bert_fit,tfidf_fit,glove_fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.388173,0.929219,0.097827,0.466585
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.104410,0.948172,0.000000,0.329688
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.700596,0.984994,0.292820,0.799220
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.291818,0.973138,0.000000,0.529448
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.229988,0.935457,0.000000,0.295936
...,...,...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.729863,0.914459,0.283608,0.809268
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.750962,0.982065,0.172445,0.810235
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.282647,0.976543,0.000000,0.412289
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.133268,0.983066,0.000000,0.459518


## RankNet

In [49]:
X = pd.DataFrame(word2vec_vectors)
y = vector_df['word2vec_fit']

In [50]:
X.drop([104], inplace=True)
X['qid'] = 1
qid = X['qid']
X.drop('qid', axis=1, inplace=True)

In [51]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.062602,0.059499,0.013392,0.121714,-0.008286,0.020477,0.097799,-0.148356,-0.068639,-0.041682,...,-0.108190,-0.031993,-0.188354,0.020640,0.003052,-0.072347,0.014048,-0.025790,-0.016076,0.108134
1,0.016439,0.023031,0.063843,0.074544,0.031146,0.078664,0.088704,-0.015747,-0.097514,0.070394,...,-0.070374,0.099121,0.027929,0.025553,-0.006673,-0.075826,-0.082746,0.030599,0.004395,0.025146
2,-0.092773,0.037231,0.082809,0.029816,-0.020706,0.129272,0.140808,-0.086487,-0.114258,-0.032104,...,-0.119751,-0.025116,-0.212158,-0.032745,-0.136597,-0.031494,0.108368,-0.043488,-0.089661,0.037567
3,-0.050491,0.047333,0.044434,0.122192,-0.088943,-0.039001,-0.011597,-0.158203,-0.051735,-0.092186,...,0.006683,0.106628,-0.076370,0.084045,-0.046692,-0.178894,0.052185,-0.148071,0.028534,0.019531
4,-0.051636,-0.194458,0.125610,0.040283,0.007477,-0.017090,0.012085,-0.207157,0.087555,-0.115967,...,0.041992,-0.043983,-0.144287,0.130920,0.050659,0.089478,-0.059753,0.098572,0.075655,-0.086868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,-0.043230,0.059992,-0.007956,0.068551,-0.028818,0.006404,0.054729,-0.122018,0.012521,0.002887,...,-0.100917,0.020273,-0.124704,0.023564,-0.027666,0.015769,0.072056,-0.062395,-0.036556,0.031067
100,0.012126,0.026571,0.083333,0.181356,-0.121582,0.048340,0.282878,-0.080078,-0.018880,-0.111816,...,-0.131004,-0.089193,-0.243490,0.131755,-0.111165,0.035156,0.136678,-0.098796,0.087484,0.057780
101,0.181854,-0.050110,-0.083588,0.055550,-0.091339,0.002075,-0.153183,0.021851,0.176392,-0.056519,...,0.102661,0.156403,-0.072754,0.123596,0.133789,0.095886,0.148682,0.014404,0.132193,0.029694
102,0.012858,0.107290,0.022319,0.083171,0.132080,-0.069417,0.041829,-0.033203,0.125814,0.144613,...,-0.151367,0.051717,-0.107096,-0.110026,-0.047201,-0.131673,0.133703,0.003845,0.003825,-0.129557


In [30]:
import tensorflow as tf
from tensorflow.keras import layers, activations, losses, Model, Input
from tensorflow.nn import leaky_relu
import numpy as np
from itertools import combinations
from tensorflow.keras.utils import plot_model, Progbar
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# model architecture
class RankNet(Model):
    def __init__(self):
        super().__init__()
        self.dense = [layers.Dense(16, activation=leaky_relu), layers.Dense(8, activation=leaky_relu)]
        self.o = layers.Dense(1, activation='linear')
        self.oi_minus_oj = layers.Subtract()
    
    def call(self, inputs):
        xi, xj = inputs
        densei = self.dense[0](xi)
        densej = self.dense[0](xj)
        for dense in self.dense[1:]:
            densei = dense(densei)
            densej = dense(densej)
        oi = self.o(densei)
        oj= self.o(densej)
        oij = self.oi_minus_oj([oi, oj])
        output = layers.Activation('sigmoid')(oij)
        return output
    
    def build_graph(self):
        x = [Input(shape=(10)), Input(shape=(10))]
        return Model(inputs=x, outputs=self.call(x))

# visualize model architecture
plot_model(RankNet().build_graph(), show_shapes=False)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [53]:
# put data into pairs
xi = []
xj = []
pij = []
pair_id = []
pair_query_id = []
for q in np.unique(qid):
    query_idx = np.where(qid == q)[0]
    for pair_idx in combinations(query_idx, 2):
        pair_query_id.append(q)
        
        pair_id.append(pair_idx)
        i = pair_idx[0]
        j = pair_idx[1]
        xi.append(X[i])
        xj.append(X[j])
        
        if y[i] == y[j]:
            _pij = 0.5
        elif y[i] > y[j]:
            _pij = 1
        else: 
            _pij = 0
        pij.append(_pij)
        
xi = np.array(xi)
xj = np.array(xj)
pij = np.array(pij)
pair_query_id = np.array(pair_query_id)

xi_train, xi_test, xj_train, xj_test, pij_train, pij_test, pair_id_train, pair_id_test = train_test_split(
    xi, xj, pij, pair_id, test_size=0.2, stratify=pair_query_id)

In [54]:
ranknet = RankNet()
ranknet.compile(optimizer='adam', loss='binary_crossentropy')
history = ranknet.fit([xi_train, xj_train], pij_train, epochs=50, batch_size=1, validation_data=([xi_test, xj_test], pij_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [59]:
from LambdaRankNN  import RankNetNN

ranker = RankNetNN(input_size=X.shape[0])
ranker.fit(X, y, qid, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


AttributeError: 'tuple' object has no attribute 'as_list'