In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Potential Talents

#### In this task we are going to rank each record in a list of potential candidates based on how well they match a certain keyword. The goal is for talent sourcing agents to be able to faster and more reliably and efficiently find suitable candidates for role.

#### We will test two different methods: a calssic TF-IDF, and google gemini's pre-trained embedding models

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
sheet_id = "117X6i53dKiO7w6kuA1g1TpdTlv1173h_dPlJt5cNNMU"
csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv"
df = pd.read_csv(csv_url)

In [3]:
df = df.drop(columns=['fit'])
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


In [4]:
df.shape

(104, 4)

## TF-IDF

We first look at a simple tf-idf vectorization.

In [5]:
tfidf = TfidfVectorizer(
    token_pattern=r'(?u)\b\w+\b',
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    # ngram_range=(1,2)
)

combined_sparse = tfidf.fit_transform(df['job_title'])

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

keyword = "Aspiring Human Resources"

query_vec = tfidf.transform([keyword])

sims = cosine_similarity(query_vec, combined_sparse).flatten()

df['fit'] = sims

df_sorted = df.sort_values(by='fit', ascending=False).reset_index(drop=True)

df_sorted.head(10)

Unnamed: 0,id,job_title,location,connection,fit
0,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
1,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.753591
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
3,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
4,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
5,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
6,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.753591
7,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.695679
8,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.695679
9,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.695679


## Google Gemini Embedding

Here we will use a pre-trained model, google gemini embedding models. We will need to use two task types, retrieval document to create an embedding of the records, and retrieval query to take a search keyword an map it into the same space.

In [7]:
from google import genai
from google.genai import types

genai.__version__

'1.9.0'

In [8]:
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

client = genai.Client(api_key=GOOGLE_API_KEY)

for model in client.models.list():
  if 'embedContent' in model.supported_actions:
    print(model.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


In [9]:
texts = df['job_title'].to_list()

title_embeddings_result = client.models.embed_content(
    model='models/text-embedding-004',
    contents=texts[:100],
    config=types.EmbedContentConfig(task_type='RETRIEVAL_DOCUMENT'))

In [10]:
search_keyword = "aspiring human resources"

keyword_embedding_result = client.models.embed_content(
    model='models/text-embedding-004',
    contents=keyword,
    config=types.EmbedContentConfig(task_type='RETRIEVAL_QUERY'))

In [11]:
title_embeddings = np.array([e.values for e in title_embeddings_result.embeddings])
keyword_embeddings = np.array([k.values for k in keyword_embedding_result.embeddings])

In [12]:
similarity_scores = cosine_similarity(keyword_embeddings, title_embeddings)

In [13]:
df_subset = df.iloc[:100].copy()

df_subset['fit'] = similarity_scores[0]
df_ranked = df_subset.sort_values('fit', ascending=False).reset_index(drop=True)

df_ranked.head(10)

Unnamed: 0,id,job_title,location,connection,fit
0,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.641288
1,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.641288
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.641288
3,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.641288
4,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.641288
5,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.641288
6,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.641288
7,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.638048
8,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.638048
9,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.637967


## Huggingface/MPNet

Gemini gave overal lower similarity scores compared to tfidf. As the last embeeding model we will now use and compare MPNet through huggingface.

In [15]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

df = df.drop(columns=['fit'])

titles = df['job_title'].fillna("").tolist()
keyword = "Aspiring Human Resources"

title_embeddings = model.encode(titles)  
keyword_embedding = model.encode([keyword])

similarities = model.similarity(keyword_embedding, title_embeddings)[0]

df['fit'] = similarities
df_ranked = df.sort_values('fit', ascending=False).reset_index(drop=True)

df_ranked.head(20)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,job_title,location,connection,fit
0,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
1,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
2,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
3,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.879234
4,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
5,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
6,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.879234
7,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.864964
8,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.864964
9,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.864964


MPNet is giving higher scores even compared to tfidf for this particular keyword. One more thing that both Gemini and MPNet were superior in, was the fact that they give "Asspiring Human Resources Professional" and "Aspiring Human Resources Specialist" very similar scores, which makes sense because the last word is arbitrary and shouldn't make much difference.

## Re-Ranking/Starred Candidate

Now we need to implement the re-ranking logic. We will revise the ranking once the a candidate is starred as the best choice. In order to do so, we will take starred candidates job title as the new search keyword, rank other records based on similarity with this new reference this time and generate a new ranking.

We already have the vector space, we will use th retrieval query to map the starred record job title into that space and calculate the similarities same way we did before.

In [16]:
starred = df.iloc[73]
starred

id                                      74
job_title     Human Resources Professional
location               Greater Boston Area
connection                              16
fit                               0.754995
Name: 73, dtype: object

In [17]:
starred.job_title

'Human Resources Professional'

In [18]:
search_keyword = starred.job_title

keyword_embedding = model.encode([search_keyword])

similarities = model.similarity(keyword_embedding, title_embeddings)[0]

df['fit'] = similarities
df_ranked = df.sort_values('fit', ascending=False).reset_index(drop=True)

df_ranked.head(20)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,id,job_title,location,connection,fit
0,74,Human Resources Professional,Greater Boston Area,16,1.0
1,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
2,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.84059
3,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
4,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
5,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
6,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
7,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.84059
8,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.82942
9,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.82942
