In [6]:
# import main libraries
import numpy as np
import torch
import os
import pandas as pd


import sys

sys.path.append("../")
from src import config
from src import embeddings
from sentence_transformers import SentenceTransformer
import faiss.contrib.torch_utils

import warnings

warnings.filterwarnings("ignore")

## EDA

In [14]:
# Read the raw candidate table: a ";"-separated CSV located in the data
# directory one level above the notebook's working directory.
talent_csv_path = os.path.join("..", config.DATA_DIR, "potential-talents.csv")
talent_data = pd.read_csv(talent_csv_path, sep=";")

In [15]:
talent_data

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"?zmir, T�rkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,


In [18]:
talent_data.describe(include="all")

Unnamed: 0,id,job_title,location,connection,fit
count,104.0,104,104,104,0.0
unique,,52,41,33,
top,,2019 C.T. Bauer College of Business Graduate (...,Kanada,500+,
freq,,7,12,44,
mean,52.5,,,,
std,30.166206,,,,
min,1.0,,,,
25%,26.75,,,,
50%,52.5,,,,
75%,78.25,,,,


## Preprocessing

### before computing embeddings

* Lowercase all text fields
* Separate run-together capitalized words
    * "ThisisAWord" -> "This is A Word"

* fix_white_space

* remove_non_alphanumeric
* remove_stop_words
* text_to_paragraphs
* text_to_sentences
* tokenization/lemmatization 

In [25]:
# Switch the working directory to the project root so that later cells can use
# root-relative paths (e.g. config.DATA_DIR without a leading "..").
current_dir = os.getcwd()

# The project root is the folder that contains the `src` package imported by
# this notebook. If we are already there (for example because this cell was
# re-run), stay put: unconditionally moving to the parent would climb one more
# level on every execution and break every relative path that follows.
if os.path.isdir(os.path.join(current_dir, "src")):
    parent_dir = current_dir
else:
    # Get the parent directory path, which should be the root folder
    parent_dir = os.path.dirname(current_dir)

os.chdir(parent_dir)

# Verify the new working directory
new_dir = os.getcwd()
print("Current working directory:", new_dir)

Current working directory: d:\01-Projects\Apziva\03-Potential talent


In [34]:
# from src import preprocessor
from src import utils
from src import preprocess
import logging


# Bind the logger object exposed by the project's utils module to a short local name
logger = utils.logger

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# 1. Collect and preprocess data.
# NOTE(review): this cell only logs the start of the step. The original note said
# the preprocessed data would be dumped into a new file, but no cell visible in this
# notebook writes the preprocessed text (only the embeddings JSON and the FAISS
# index are written) — confirm whether that dump is still intended.
logger.info("Collecting and preprocessing data ...")

2024-10-06 23:13:39,417 - src.utils - INFO - Collecting and preprocessing data ...


In [29]:
# Work on a copy so that columns added during preprocessing do not modify the loaded frame
talent_data_copy = talent_data.copy()

In [38]:
preprocess.preprocess_text("2019 C.T. Bauer College of Business Graduate")

'ct bauer college business graduate'

In [39]:
# Run every raw job title through the project's text preprocessor and keep the
# result next to the original in a new "job_title_p" column.
preprocessed_titles = talent_data_copy["job_title"].apply(preprocess.preprocess_text)
talent_data_copy["job_title_p"] = preprocessed_titles
talent_data_copy

Unnamed: 0,id,job_title,location,connection,fit,job_title_p
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,,ct bauer college business graduate magna cum l...
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,,native english teacher epik english program korea
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,,aspire human resource professional
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,,people development coordinator ryan
4,5,Advisory Board Member at Celal Bayar University,"?zmir, T�rkiye",500+,,advisory board member celal bayar university
...,...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,,aspire human resource manager graduate seek en...
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,,human resource generalist loparex
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,,business intelligence analytic traveler
102,103,Always set them up for Success,Greater Los Angeles Area,500+,,set success


### Embedding

In [None]:
# Pair each candidate id with the embedding computed from its preprocessed title.
candidate_ids = talent_data_copy["id"].values.tolist()
preprocessed_title_list = talent_data_copy["job_title_p"].values.tolist()
title_embeddings = embeddings.get_embeddings_from_lemmatized_sentences(
    preprocessed_title_list
)
data = dict(zip(candidate_ids, title_embeddings))

# Output file name: <TEXT_SECTION_TYPE>_<TRAIN_DATA_INPUT_TYPES joined by "_">_embeddings.json
embeddings_file_name = (
    f"{config.TEXT_SECTION_TYPE}_"
    + "_".join(config.TRAIN_DATA_INPUT_TYPES)
    + "_embeddings.json"
)
embeddings_path = os.path.join(config.DATA_DIR, "processed", embeddings_file_name)

# Persist the id -> embedding mapping with the project's JSON helper.
utils.save_json(data, embeddings_path)

## Training

In [8]:
# Load a pre-trained sentence-embedding model by name

model = SentenceTransformer("msmarco-MiniLM-L-12-v3")
# Raw job titles (the un-preprocessed "job_title" column), in the row order of talent_data
job_titles = talent_data["job_title"].tolist()

# Embed the job titles; row i of the result corresponds to job_titles[i]

job_titles_embds = model.encode(job_titles)



In [9]:
job_titles_embds

array([[-0.25508878, -0.01373229, -0.07645729, ..., -0.55478364,
         0.1249141 , -0.47570327],
       [ 0.32636008,  0.2628865 ,  0.51103956, ...,  0.4303046 ,
         0.05353639,  0.1499273 ],
       [ 0.44917667,  0.24064577, -0.04250324, ..., -0.33236706,
        -0.332711  , -0.1096751 ],
       ...,
       [ 0.04733843, -0.01758945, -0.2557778 , ..., -0.3086499 ,
         0.1094522 , -0.33761653],
       [-0.10538442,  0.06323341, -0.53019553, ..., -0.40946347,
        -0.20917243,  0.24782464],
       [ 0.24558017, -0.26688632,  0.31220445, ..., -0.65013444,
        -0.1799025 ,  0.20887013]], dtype=float32)

In [10]:
# Embedding dimensionality: the number of columns of the embedding matrix
d = job_titles_embds.shape[1]
d

384

In [11]:
import faiss
import time

# Create an index using FAISS.
# Cast to float32 explicitly before handing the matrix to FAISS.
job_titles_embds = np.array(job_titles_embds, dtype="float32")
index = faiss.IndexFlatL2(job_titles_embds.shape[1])

# The bare expression `index.is_trained` was evaluated and thrown away; turn it
# into an actual check so an untrained index fails loudly before vectors are added.
assert index.is_trained, "FAISS index must be trained before adding vectors"
index.add(job_titles_embds)

# Sanity check: the freshly built index holds exactly one vector per job title row.
assert index.ntotal == len(job_titles_embds), "unexpected number of indexed vectors"

# Persist the index to disk and load it back, so the rest of the notebook works
# with the same object a later session would get from the saved file.
faiss.write_index(index, "index_job_titles")
index = faiss.read_index("index_job_titles")

In [14]:
def search(query, k=5):
    """Return the job titles whose embeddings are closest to ``query``.

    The query is embedded with the notebook-level ``model``, looked up in the
    notebook-level FAISS ``index``, and the returned row ids are mapped back to
    text through ``job_titles``. The raw FAISS result and the elapsed time are
    printed as a side effect.

    Args:
        query: Free-text search string.
        k: Maximum number of neighbours to retrieve. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A list of job-title strings in the order returned by the index. It can
        be shorter than ``k`` when the index holds fewer than ``k`` vectors.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # perf_counter is the monotonic clock meant for measuring elapsed time.
    t = time.perf_counter()
    query_vector = model.encode([query])
    # For a single query the result is a (distances, ids) pair of 1 x k arrays.
    top_k = index.search(query_vector, k)
    print(top_k)
    print("totaltime: {}".format(time.perf_counter() - t))
    # FAISS fills missing neighbours with id -1. Skip those: Python would
    # otherwise read job_titles[-1] and silently return the last title.
    return [job_titles[_id] for _id in top_k[1].tolist()[0] if _id >= 0]

In [15]:
results = search("Aspiring human resources")
results

(array([[7.5216384, 7.5216384, 7.5216384, 7.5216384, 7.5216384]],
      dtype=float32), array([[ 2, 16, 20, 32, 45]], dtype=int64))
totaltime: 0.026020050048828125


['Aspiring Human Resources Professional',
 'Aspiring Human Resources Professional',
 'Aspiring Human Resources Professional',
 'Aspiring Human Resources Professional',
 'Aspiring Human Resources Professional']

In [16]:
results2 = search("seeking human resources")
results2

(array([[ 8.325108,  8.325108, 12.726274, 19.924831, 20.216654]],
      dtype=float32), array([[27, 29, 98, 93,  2]], dtype=int64))
totaltime: 0.036566972732543945


['Seeking Human Resources Opportunities',
 'Seeking Human Resources Opportunities',
 'Seeking Human Resources Position',
 'Seeking Human  Resources Opportunities. Open to travel and relocation.',
 'Aspiring Human Resources Professional']