## 1. Importing neccessary libraries

In [1]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install scikit-learn

!pip install tensorflow
!pip install transformers
!pip install sentence-transformers

!pip install nltk
!pip install spacy
!python -m spacy download en






























Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;3m[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use
the full pipeline package name 'en_core_web_sm' instead.[0m




[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import random

import re

import tensorflow as tf
import transformers

import spacy
from nltk.stem import PorterStemmer

In [3]:
pd.set_option('display.max_colwidth', 200)

## 2. Loading dataset

In [4]:
dataset_original= pd.read_csv("C:\\Users\\agnes\\Documents\\apziva_ai_residency\\project3\\data\\potential-talents.csv")
dataset_original.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Program in Korea),Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


# Inspecting & Cleaning the dataset

In [5]:
dataset_cleaned_temp = dataset_original.copy()

In [6]:
dataset_cleaned_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB


In [7]:
dataset_cleaned_temp.describe()

Unnamed: 0,id,fit
count,104.0,0.0
mean,52.5,
std,30.166206,
min,1.0,
25%,26.75,
50%,52.5,
75%,78.25,
max,104.0,


### Checking missing data

In [8]:
dataset_cleaned_temp.isnull().sum()

id              0
job_title       0
location        0
connection      0
fit           104
dtype: int64

### Checking duplicates

In [9]:
dataset_cleaned_temp.duplicated().sum()

0

### Remove unnecessary words & Replace abbreviations

In [10]:
dataset_cleaned_temp['job_title'].value_counts()

2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 7
Aspiring Human Resources Professional                                                                                    7
Student at Humber College and Aspiring Human Resources Generalist                                                        7
People Development Coordinator at Ryan                                                                                   6
Native English Teacher at EPIK (English Program in Korea)                                                                5
Aspiring Human Resources Specialist                                                                                      5
HR Senior Specialist                                                                                                     5
Student at Chapman University                                                                                            4
SVP, CHRO, Marke

In [11]:
spacy_nlp = spacy.load('en_core_web_sm')
spacy_nlp.pipe_names

stemmer = PorterStemmer()

In [12]:
abbreviations_to_replace = {
    'GPHR': 'Global Professional in Human Resources',
    'CSR': 'Corporate Social Responsibility',
    'MES': 'Manufacturing Execution Systems',
    'SPHR': 'Senior Professional in Human Resources',
    'SVP': 'Senior Vice President',
    'GIS': 'Geographic Information System',
    'RRP': 'Reduced Risk Products',
    'CHRO': 'Chief Human Resources Officer',
    'HRIS': 'Human resources information system',
    'HR': 'Human resources',
}

def replace_abbreviations(sentence):
    replaced_sentence = sentence
    for abbreviation, replacement in abbreviations_to_replace.items():
        # Create a regular expression pattern to match the whole word
        pattern = r'\b{}\b'.format(re.escape(abbreviation))
    
        # Use re.sub() to replace the word in the sentence
        replaced_sentence = re.sub(pattern, replacement, replaced_sentence, flags=re.IGNORECASE)

    return replaced_sentence

In [13]:
def clean_sentence(sentence):
    # Remove special characters
    new_sentence = re.sub(r'[+*,.|(){}&\-\']', '', sentence)

    # Replce abbreviations
    new_sentence = replace_abbreviations(new_sentence)
    
    words = new_sentence.split()
    
    # Stemming
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))
        
    # Lemmatization
    lemmatized_words = []
    doc = spacy_nlp(" ".join(stemmed_words))
    for token in doc:
        if not token.is_stop:
            lemmatized_words.append(token.lemma_)

    return " ".join(lemmatized_words)

In [14]:
dataset_cleaned_temp['job_title_cleaned'] = dataset_cleaned_temp['job_title'].apply(clean_sentence)
print(dataset_cleaned_temp['job_title_cleaned'].head())

0    2019 ct bauer colleg busi graduat magna cum laud aspir human resourc profession
1                                   nativ english teacher epik english program korea
2                                                     aspir human resourc profession
3                                                         peopl develop coordin ryan
4                                           advisori board member celal bayar univer
Name: job_title_cleaned, dtype: object


In [15]:
print([*set(dataset_cleaned_temp["job_title_cleaned"].str.split().agg(sum, axis = 0))])

['lab', 'reloc', 'director', 'energi', 'environ', 'western', 'employ', 'chapman', 'managementbenefit', 'loui', 'medic', 'loparex', 'help', 'long', 'paint', '7092621', 'social', 'analyt', 'leader', 'excel', 'offici', 'work', 'admiss', 'energet', 'generalist', 'teamfocus', 'intellig', 'wellington', 'group', 'manag', 'ct', 'servic', 'peopl', 'staf', 'experienc', 'repre', 'softwar', 'programm', 'offic', '2019', 'endemol', 'retir', 'engag', 'log', 'engin', 'world', 'scienc', 'biolog', 'resourc', 'scottmadden', 'system', 'laud', 'atlanta', 'open', 'market', 'beach', 'profession', 'professional', 'partner', 'creat', 'environment', 'risk', 'product', 'portfolio', 'delphi', 'bauer', 'nativ', 'set', '!', 'global', 'reduc', 'colleg', 'armi', 'kokomo', 'state', 'student', 'ey', 'center', 'westfield', 'passion', 'undergradu', 'heil', 'beneteau', 'art', 'jti', 'bachelor', 'brand', 'develop', 'engi', 'entrylevel', 'victoria', 'coordin', 'junior', 'corpor', 'japan', 'member', 'lead', 'commun', 'liber'

In [16]:
dataset_cleaned = dataset_cleaned_temp.copy()

# Preprocessing

In [17]:
dataset_preprocessed = dataset_cleaned.copy()

## Setup BERT & Utils

In [18]:
from transformers import BertTokenizer, TFBertModel
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [19]:
def get_bert_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        # Tokenize input sentence
        encoded_inputs = bert_tokenizer(sentence, padding=True, truncation=True, return_tensors='tf')
    
        # Generate BERT embeddings
        outputs = bert_model(encoded_inputs)
        hidden_states = outputs.last_hidden_state

        # Apply pooling strategy - averaging
        pooled = tf.reduce_mean(hidden_states, axis=1)
        embeddings.append(pooled.numpy().reshape(-1))
    
    return np.array(embeddings)

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

def encode_and_get_similarity(data, queries, search_columns, output_columns):
    data = data.copy()
    
    embeddings = {}
    queries_embeddings = []
    
    # without replacing the abbreviations with their full meaning, we will get very bad results
    for index, query in enumerate(queries):
        query = replace_abbreviations(query)
        query = clean_sentence(query)
        queries_embeddings.append(get_bert_embeddings([query]))
        
    queries_embeddings_mean = np.mean(queries_embeddings, axis=0)
    # queries_embeddings_mean = get_bert_embeddings('Aspiring Human Resources Professional')

    for index, column in enumerate(search_columns):
        sentences = dataset_preprocessed[column].tolist()

        # Encoding
        embeddings[column] = get_bert_embeddings(sentences)

        # Cosine Similarity
        cosine_similarities = cosine_similarity(
            queries_embeddings_mean,
            embeddings[column]
        )        
        data[output_columns[index]] = cosine_similarities[0]
    
    return data

# Ranking

## Search Queries/Keywords

In [22]:
queries = [
    # 'Aspiring Human Resources Professional',
    'aspiring human resources',
    'seeking human resources'
]

## Get Embeddings & Similarities

In [23]:
dataset_preprocessed = encode_and_get_similarity(dataset_preprocessed, queries, ['job_title_cleaned'], ['bert_similarity'])

## First Rank

In [25]:
dataset_preprocessed.sort_values(by='bert_similarity', ascending=False).head(20)

Unnamed: 0,id,job_title,location,connection,fit,job_title_cleaned,bert_similarity
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspir human resourc specialist,0.918638
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspir human resourc specialist,0.918638
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspir human resourc specialist,0.918638
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspir human resourc specialist,0.918638
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,,aspir human resourc specialist,0.918638
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,,seek human resourc posit,0.899539
67,68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,,human resourc specialist luxottica,0.890844
87,88,Human Resources Management Major,"Milpitas, California",18,,human resourc manag major,0.883633
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,human resourc generalist loparex,0.86799
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspir human resourc profession,0.860986


## Starred Candidates

Mark them as favorite/bookmark

In [26]:
starred_ids = [int(item) for item in input("Enter the ids of the candidates you want to star (separate by spaces): ").split()]

Enter the ids of the candidates you want to star (separate by spaces): 3


## Second Rank (Re-Rank)

- similar to bookmark
- First way:  Marging the keypharse and the starred title
- Second way: one more column of scores (starred), use the starred job title as a keyword

In [27]:
dataset_preprocessed.loc[dataset_preprocessed['id'].isin(starred_ids), 'is_starred'] = 1
dataset_preprocessed.loc[~dataset_preprocessed['id'].isin(starred_ids), 'is_starred'] = 0

In [28]:
def get_starred_score(data):
    data = data.copy()
    
    # Starred Queries
    queries = data[data['is_starred'] == 1]['job_title_cleaned']
    
    similarities = []
    for query in queries:
        print('START: ' + query)
        data = encode_and_get_similarity(data, [query], ['job_title_cleaned'], ['starred_similarity'])
        similarities.append(data['starred_similarity'])
        
        
    starred_similarity = np.mean(similarities, axis=0)
    
    return starred_similarity

In [None]:
dataset_preprocessed['starred_similarity'] = get_starred_score(dataset_preprocessed)

START: aspir human resourc profession


In [None]:
dataset_preprocessed['mean_similarity'] = dataset_preprocessed[['bert_similarity', 'starred_similarity']].mean(axis=1)

In [None]:
dataset_preprocessed[['job_title', 'is_starred', 'bert_similarity', 'starred_similarity', 'mean_similarity']].sort_values(by=['mean_similarity', 'is_starred'], ascending=False).head(20)