In [9]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from google import genai

In [None]:
# Read the Gemini API key from the environment instead of hard-coding it in the
# notebook, so the secret never ends up in version control or shared outputs.
# Falls back to the previous empty string when GEMINI_API_KEY is not set.
KEY = os.environ.get("GEMINI_API_KEY", "")
client = genai.Client(api_key=KEY)

In [11]:
# Load the preprocessed resumes table. The columns read in this notebook are
# "Resume_str", "Category" and "ID".
df_resumes = pd.read_csv('../../PreProcessingResumes/processed_data/Resumes.csv')

# Convert resumes column to a list
resumes = df_resumes["Resume_str"].tolist()
    
# Encode category labels as integers
# (`labels` holds one integer per resume; `category_names` lists the original
# category strings in the encoder's class order.)
encoder = LabelEncoder()
labels = encoder.fit_transform(df_resumes["Category"])
category_names = encoder.classes_.tolist()

# Load the SBERT model for generating sentence embeddings
sbert_model = SentenceTransformer('all-MiniLM-L12-v2')

# Embed every resume once up front; the job loop later compares each job
# embedding against `resumes_embed` with cosine similarity.
resumes_embed = sbert_model.encode(resumes, show_progress_bar=True)

Batches:   0%|          | 0/78 [00:00<?, ?it/s]

In [12]:
def select_next_job(df_jobs, output_csv):
    """Pick one random job that has not been written to the results CSV yet.

    Parameters
    ----------
    df_jobs : pandas.DataFrame
        Candidate jobs; must contain a ``job_id`` column.
    output_csv : str or path-like
        Results file whose ``job_id`` column lists the jobs already processed.
        A missing or completely empty file means "nothing processed yet".

    Returns
    -------
    pandas.Series
        The randomly sampled row of ``df_jobs``.

    Raises
    ------
    ValueError
        If every job in ``df_jobs`` already appears in ``output_csv``.
    """
    try:
        processed = pd.read_csv(output_csv)['job_id'].unique()
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run: the results file has not been created (or holds no data),
        # so no job has been processed so far.
        processed = []

    available = df_jobs[~df_jobs['job_id'].isin(processed)]
    if available.empty:
        # Fail with an explicit message instead of the opaque error that
        # DataFrame.sample raises on an empty frame.
        raise ValueError(
            f"No unprocessed jobs left: every job_id already appears in {output_csv}"
        )

    # Randomly sample one available job
    return available.sample(n=1).iloc[0]

In [13]:
def requestLLM(resume_text, job_description, max_retries=3, retry_delay=5):
    """
    Ask Gemini how well a resume matches a job description.

    The prompt asks the model to answer with a single integer between 0 and 10
    and nothing else.

    Parameters
    ----------
    resume_text : str
        Full text of the resume.
    job_description : str
        Full text of the job description.
    max_retries : int, optional
        How many times to retry after a server-side (5xx) error before giving
        up. Defaults to 3.
    retry_delay : float, optional
        Base delay in seconds between retries; it doubles after every failed
        attempt. Defaults to 5.

    Returns
    -------
    str
        The model's reply with newlines and surrounding whitespace removed
        (expected to be the integer score as text). An empty string is
        returned when the response carries no text.

    Raises
    ------
    google.genai.errors.ServerError
        If the service still fails after ``max_retries`` retries.
    """
    # Imported locally so the function is self-contained; the package is the
    # same one the notebook already imports `genai` from.
    from google.genai import errors as genai_errors

    prompt = f"""Analyze the match between this RESUME and JOB DESCRIPTION. 
    As a Human Resources recruiter, your task is to evaluate whether the professional field of the candidate aligns with that of the job description.
    
    RESUME:
    {resume_text}
    
    JOB DESCRIPTION:
    {job_description}
    
    Given a resume and a job description, assign an integer score from 0 to 10 indicating how well the resume matches the position.
    
    Provide only a single integer score representing how well this resume matches the job requirements.
    Don't provide any explanation, just return the integer score between 0 and 10.
    """

    attempts = max(int(max_retries), 0) + 1
    for attempt in range(attempts):
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=prompt,
            )
            break
        except genai_errors.ServerError:
            # Transient 5xx errors (e.g. 503 UNAVAILABLE) should not abort a
            # long-running batch: back off and retry, re-raising on the last try.
            if attempt == attempts - 1:
                raise
            time.sleep(retry_delay * (2 ** attempt))

    # `response.text` may be None when the response contains no text part.
    content = (response.text or '').replace('\n', '').strip()
    return content

In [14]:
def random_indices(similarity_vector, n_best=5, n_random=5, rng=None):
    """Select the best-matching resumes plus a random sample of the rest.

    Parameters
    ----------
    similarity_vector : numpy.ndarray
        1-D array of similarity scores, one entry per resume.
    n_best : int, optional
        Number of top-scoring indices to keep. Defaults to 5.
    n_random : int, optional
        Number of additional indices drawn at random, without replacement,
        from the remaining pool. Defaults to 5. If the pool is smaller, every
        element of the pool is returned instead of raising.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Random source used for the draw. When omitted, NumPy's global random
        state (``np.random``) is used, so ``np.random.seed`` controls the result.

    Returns
    -------
    numpy.ndarray
        The ``n_best`` highest-scoring indices (best first) followed by the
        randomly drawn indices.
    """
    # Indices ordered from highest to lowest similarity.
    sorted_indices = similarity_vector.argsort()[::-1]

    best_idx = sorted_indices[:n_best]

    # Pool for the random draw: everything after the best ones, except the two
    # lowest-scoring entries, which the original selection also left out.
    # NOTE(review): the reason for dropping the last two is not documented - confirm.
    remaining_indices = sorted_indices[n_best:-2]

    # Never ask for more samples than the pool holds (replace=False would raise).
    n_take = min(n_random, len(remaining_indices))
    if n_take > 0:
        chooser = np.random if rng is None else rng
        random_idx = chooser.choice(remaining_indices, size=n_take, replace=False)
    else:
        random_idx = remaining_indices[:0]

    return np.concatenate([best_idx, random_idx])

In [15]:
def get_similarity_score(job_sample, similarity_vector, indices):
    """Score the selected resumes against one job with both SBERT and Gemini.

    For every position in ``indices`` the matching row of ``df_resumes`` is
    sent to ``requestLLM`` together with the job description. One record is
    collected per resume, holding the job id/title, the resume id/category,
    the cosine similarity scaled by 10 and rounded, and the Gemini reply.
    The function pauses two seconds after each request.

    Returns a DataFrame with one row per entry of ``indices``.
    """
    records = []

    for cv_idx in indices:
        resume_row = df_resumes.iloc[cv_idx]
        llm_reply = requestLLM(resume_row['Resume_str'], job_sample['description'])

        record = {
            'job_id': job_sample['job_id'],
            'job_title': job_sample['title'],
            'cv_id': resume_row['ID'],
            'cv_category': resume_row['Category'],
            # Cosine similarity rescaled to the 0-10 range used in the prompt.
            'similarity_score': round(similarity_vector[cv_idx] * 10),
            'gemini_score': llm_reply,
        }
        records.append(record)

        # Short pause between consecutive LLM requests.
        time.sleep(2)

    return pd.DataFrame(records)

In [19]:
TIMES = 30
OUTPUT_CSV = "compare.csv"
JOBS_CSV = '../../PreProcessingJobs/processed_data/JobDescription.csv'
JOB_PAUSE_SECONDS = 60

# The jobs table does not change between iterations, so load it once
# instead of re-reading the CSV on every pass.
df_jobs = pd.read_csv(JOBS_CSV)

for i in range(TIMES):

    # Pick a job that has no rows in OUTPUT_CSV yet.
    job_sample = select_next_job(df_jobs, OUTPUT_CSV)

    job_desc = job_sample['description']
    job_embed = sbert_model.encode([job_desc], show_progress_bar=False)

    # One cosine similarity per resume, flattened to a 1-D vector.
    similarity_vector = cosine_similarity(job_embed, resumes_embed).flatten()

    # Fetch random CVs to evaluate cosine similarity
    indices = random_indices(similarity_vector)

    matches = get_similarity_score(job_sample, similarity_vector, indices)

    # Append results to the CSV file. Write the header when the file is new
    # or still empty, so the first data block is never left without column names.
    header = not os.path.exists(OUTPUT_CSV) or os.path.getsize(OUTPUT_CSV) == 0
    matches.to_csv(OUTPUT_CSV, mode='a', index=False, header=header)

    # Sleep between jobs to avoid exceeding Gemini API rate limits; there is
    # nothing left to wait for after the final job.
    if i < TIMES - 1:
        time.sleep(JOB_PAUSE_SECONDS)

ServerError: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The service is currently unavailable.', 'status': 'UNAVAILABLE'}}