In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Potential Talents

#### In this task we are going to rank each record in a list of potential candidates based on how well it matches a certain keyword. The goal is for talent sourcing agents to be able to find suitable candidates for a role faster, more reliably, and more efficiently.

#### We will test two different methods: a classic TF-IDF, and Google Gemini's pre-trained embedding models

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [9]:
# The candidate list lives in a Google Sheet; its export endpoint serves the sheet as CSV.
sheet_id = "117X6i53dKiO7w6kuA1g1TpdTlv1173h_dPlJt5cNNMU"
csv_url = "https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv".format(
    sheet_id=sheet_id
)
df = pd.read_csv(csv_url)

In [11]:
# Remove the sheet's own `fit` column; the ranking cells below add their own
# `fit` score to copies of this frame.
# errors='ignore' keeps the cell re-runnable: `df` is rebound here, so running
# the cell a second time would otherwise raise KeyError because `fit` is gone.
df = df.drop(columns=['fit'], errors='ignore')
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


In [12]:
# Quick size check of the candidate frame: (rows, columns).
df.shape

(104, 4)

## TF-IDF

We first look at a simple tf-idf vectorization.

In [13]:
# TF-IDF vectorizer for the job titles.
# Unigrams only; passing ngram_range=(1, 2) would add bigram features as well.
tfidf = TfidfVectorizer(
    # Accept single-character tokens too (scikit-learn's default pattern
    # only keeps tokens of two or more word characters).
    token_pattern=r'(?u)\b\w+\b',
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
)

# Missing titles become empty strings, the same guard the MPNet cell applies
# before encoding. TfidfVectorizer rejects NaN documents, so without it a
# single blank title would abort the fit.
combined_sparse = tfidf.fit_transform(df['job_title'].fillna(""))

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every title against the query in the fitted TF-IDF space and rank by it.
keyword = "Aspiring Human Resources"
query_vec = tfidf.transform([keyword])

# One query row against all titles -> flatten to one score per row of `df`.
sims = cosine_similarity(query_vec, combined_sparse).ravel()

df_tfidf_k1 = df.assign(fit=sims)
df_sorted = (
    df_tfidf_k1
    .sort_values(by='fit', ascending=False)
    .reset_index(drop=True)
)

df_sorted.head(10)

Unnamed: 0,id,job_title,location,connection,fit
0,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
1,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.753591
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
3,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
4,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
5,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
6,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
7,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.695679
8,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.695679
9,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.695679


In [15]:
# Second query against the same fitted TF-IDF space.
keyword = "Seeking Human Resources"
query_vec = tfidf.transform([keyword])

# One query row against all titles -> flatten to one score per row of `df`.
sims = cosine_similarity(query_vec, combined_sparse).ravel()

df_tfidf_k2 = df.assign(fit=sims)
df_sorted = (
    df_tfidf_k2
    .sort_values(by='fit', ascending=False)
    .reset_index(drop=True)
)

df_sorted.head(10)

Unnamed: 0,id,job_title,location,connection,fit
0,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.664933
1,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.664933
2,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.644851
3,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.620589
4,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.488364
5,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.488364
6,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.488364
7,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.488364
8,29,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.476206
9,27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.476206


## Huggingface/MPNet

Now, with TF-IDF as a baseline, we will move on to using a pretrained embedding model — MPNet in this case.

In [16]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence encoder; `model` and `title_embeddings` are
# reused by the later ranking cells, so the titles are only encoded once.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

keyword = "Aspiring Human Resources"
df_mpnet_k1 = df.copy()

# Missing titles are encoded as empty strings.
titles = df_mpnet_k1['job_title'].fillna("").tolist()

title_embeddings = model.encode(titles)
keyword_embedding = model.encode([keyword])

# Single query -> take row 0 of the similarity matrix: one score per title.
similarities = model.similarity(keyword_embedding, title_embeddings)[0]

df_mpnet_k1['fit'] = similarities
df_ranked = (
    df_mpnet_k1
    .sort_values('fit', ascending=False)
    .reset_index(drop=True)
)

df_ranked.head(20)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,job_title,location,connection,fit
0,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
1,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
2,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
3,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.879234
4,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
5,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
6,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
7,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.864964
8,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.864964
9,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.864964


In [19]:
# Second query: only the keyword is encoded, the title embeddings are reused.
keyword = "Seeking Human Resources"
keyword_embedding = model.encode([keyword])

# Single query -> take row 0 of the similarity matrix: one score per title.
similarities = model.similarity(keyword_embedding, title_embeddings)[0]

df_mpnet_k2 = df.copy()
df_mpnet_k2['fit'] = similarities
df_ranked = (
    df_mpnet_k2
    .sort_values('fit', ascending=False)
    .reset_index(drop=True)
)

df_ranked.head(20)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,job_title,location,connection,fit
0,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.880791
1,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.880791
2,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.820258
3,67,"Human Resources, Staffing and Recruiting Profe...","Jackson, Mississippi Area",500+,0.745163
4,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.735197
5,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.735197
6,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.735197
7,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.735197
8,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.726518
9,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.726518


The built-in `similarity` method used here also computes cosine similarity, which makes the scores directly comparable with the TF-IDF ones. MPNet gives higher scores than TF-IDF for this particular keyword. MPNet is also better in that it gives "Aspiring Human Resources Professional" and "Aspiring Human Resources Specialist" very similar scores (0.879 vs. 0.865 for the keyword "Aspiring Human Resources", against 0.754 vs. 0.696 with TF-IDF), which makes sense because the last word is arbitrary and shouldn't make much difference.

## Re-Ranking/Starred Candidate

Now we need to implement the re-ranking logic. We will revise the ranking once a candidate is starred as the best choice. To do so, we take the starred candidate's job title as the new search keyword, rank the other records by their similarity to this new reference, and generate a new ranking.

We already have the vector space, so we will use the retrieval query to map the starred record's job title into that space and calculate the similarities the same way we did before.

In [22]:
# Pick the starred candidate by row position. iloc is positional, so row 73
# is the record whose `id` is 74 (see the output below).
starred = df.iloc[73]
starred

id                                      74
job_title     Human Resources Professional
location               Greater Boston Area
connection                              16
Name: 73, dtype: object

In [23]:
# The starred candidate's job title, used as the query text in the next cell.
starred.job_title

'Human Resources Professional'

In [24]:
# Re-rank around the starred candidate: their job title becomes the query text.
search_keyword = starred.job_title
keyword_embedding = model.encode([search_keyword])

# Title embeddings from the earlier MPNet cell are reused; row 0 holds one
# score per title for this single query.
similarities = model.similarity(keyword_embedding, title_embeddings)[0]

df_starred = df.copy()
df_starred['fit'] = similarities
df_ranked = (
    df_starred
    .sort_values('fit', ascending=False)
    .reset_index(drop=True)
)

df_ranked.head(20)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,job_title,location,connection,fit
0,74,Human Resources Professional,Greater Boston Area,16,1.0
1,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
2,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.84059
3,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
4,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
5,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
6,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
7,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
8,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.82942
9,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.82942
