In [25]:
import pandas as pd
import os
import datetime
import dateparser
import numpy as np
from dotenv import load_dotenv
import json
from tqdm import tqdm

from pydantic import BaseModel, Field
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

# SOURCE

In [6]:
# CSV file names of the Hugging Face dataset, keyed by split name.
splits = {'train': 'train.csv', 'test': 'test.csv'}

# Only the training split is loaded here; the 'test' entry is not read in this cell.
train_csv_url = f"hf://datasets/cnamuangtoun/resume-job-description-fit/{splits['train']}"
df = pd.read_csv(train_csv_url)

In [7]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# The previous `os.environ[K] = os.getenv(K)` was a no-op when the key was set
# and raised an opaque "TypeError: str expected, not NoneType" when it was not.
# Check explicitly instead, without binding the secret to a notebook variable.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it or add it to a .env file "
        "before running this notebook."
    )

# BRONZE TABLE

In [8]:
# Chat model shared by the resume and job-description extraction calls below.
llm_settings = {
    "model": "gemini-2.0-flash",
    "temperature": 0,
    "max_tokens": None,
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**llm_settings)

## Resume Feature Extraction

In [13]:
# Define models
    
# One work-experience entry of a resume (used as the item type of Resume.experience).
# Every field defaults to None, so an entry still validates when a detail is missing.
# NOTE(review): documented with `#` comments rather than a docstring on purpose — a
# model docstring would presumably end up in the JSON schema that the output parser
# hands to the LLM; confirm before converting.
class Experience(BaseModel):
    role: Optional[str] = Field(None, description="The job title or position held")
    company: Optional[str] = Field(None, description="The name of the company")
    date_start: Optional[datetime.datetime] = Field(None, description="The start date of the job")
    # NOTE(review): the sample resume output in this notebook shows a date_end with a
    # time-of-day component ("2023-11-21T15:16:19"), which looks like a stand-in for an
    # ongoing role — confirm how "present" should be represented.
    date_end: Optional[datetime.datetime] = Field(None, description="The end date of the job")
    role_description: Optional[str] = Field(None, description="A description of the responsibilities and achievements in the role")

# One education entry of a resume (used as the item type of Resume.education).
# Every field defaults to None, so an entry still validates when a detail is missing.
class Education(BaseModel):
    degree: Optional[str] = Field(None, description="The academic degree obtained")
    institution: Optional[str] = Field(None, description="The name of the educational institution")
    date_start: Optional[datetime.datetime] = Field(None, description="The start date of the education program")
    date_end: Optional[datetime.datetime] = Field(None, description="The end date of the education program")
    # NOTE(review): typed as float, so non-numeric grades (e.g. letter grades or
    # honours classes) presumably cannot be stored here — confirm this is intended.
    grade: Optional[float] = Field(None, description="The GPA or final grade, if available")
    description: Optional[str] = Field(None, description="Additional details about the education")

# Top-level schema for the structured data extracted from one resume.
# Scalar fields are optional (default None); the list fields are declared with
# `...` and are therefore required, although an empty list is accepted (the sample
# output in this notebook has "languages": []).
class Resume(BaseModel):
    name: Optional[str] = Field(None, description="Full name of the person")
    location_preference: Optional[str] = Field(None, description="Preference for their work location / remote, if stated")
    # NOTE(review): the field name is misspelled ("authorizaton"). It is also the key
    # in the dumped JSON, so renaming it changes the output format — coordinate with
    # any consumer of the saved files before fixing.
    work_authorizaton: Optional[str] = Field(None, description="Work authorization that the person holds, such as citizenship, if stated")
    employment_type_preference: Optional[str] = Field(
        None,
        description="Type of employment the resume is looking for such as Full-time, Part-time, Contract, Freelance, or Internship, if stated"
    )
    hard_skills: List[str] = Field(..., description="A list of hard or technical skills mentioned in the resume")
    soft_skills: List[str] = Field(..., description="A list of soft skills mentioned in the resume, such as communication, teamwork, and leadership")
    languages: List[str]= Field(..., description="A list of language proficiencies mentioned in the resume")
    # Nested entries are validated against the Experience / Education models.
    experience: List[Experience] = Field(..., description="A list of past work experiences")
    education: List[Education] = Field(..., description="A list of educational qualifications")

# Create the parser for the Resume schema. parse_with_llm() uses it twice: to get
# the format instructions placed in the prompt and to parse the model's reply.
resume_parser = PydanticOutputParser(pydantic_object=Resume)

In [11]:
# Prompt for resume extraction. {text} and {format_instructions} are filled in
# by parse_with_llm() at call time.
resume_system_message = "You are a helpful assistant that extracts structured information from resumes."
resume_human_message = "Extract the following information from the resume:\n\n{text}\n\n{format_instructions}"

resume_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", resume_system_message),
        ("human", resume_human_message),
    ]
)

## Job Description Feature Extraction

In [14]:
# Define the model for job descriptions

# Schema for the structured data extracted from one job description.
# Scalar fields are optional (default None); the list fields are declared with
# `...` and are therefore required, although an empty list is accepted (the sample
# output in this notebook has "job_responsibilities": []).
class JD(BaseModel):
    company_name: Optional[str] = Field(None, description="Name of the company posting the job")
    role_title: Optional[str] = Field(None, description="The title or name of the job role being offered")
    employment_type: Optional[str] = Field(None, description="Type of employment, such as Full-time, Part-time, Contract, Freelance, or Internship")
    about_the_company: Optional[str] = Field(None, description="A brief overview or description of the company")
    job_responsibilities: List[str] = Field(..., description="A list of key duties, tasks, or responsibilities associated with the job")
    # NOTE(review): the description says "required or preferred", so must-have and
    # nice-to-have skills land in the same list; the sample output also contains
    # duplicates (e.g. "SQL", "AWS") — confirm whether downstream scoring should dedupe.
    required_hard_skills: List[str] = Field(..., description="A list of technical or hard skills required or preferred for the job")
    required_soft_skills: List[str] = Field(..., description="A list of soft skills or character required or preferred for the job")
    required_language_proficiencies: List[str] = Field(..., description="A list of language proficiencies required for the job")
    required_work_authorization: Optional[str] = Field(None, description="Work authorization required for the job")
    job_location: Optional[str] = Field(None, description="Location where the job is based, such as a city or remote")
# Create the parser for the JD schema. parse_with_llm() uses it twice: to get
# the format instructions placed in the prompt and to parse the model's reply.
jd_parser = PydanticOutputParser(pydantic_object=JD)

In [15]:
# Prompt for job-description extraction. {text} and {format_instructions} are
# filled in by parse_with_llm() at call time.
jd_system_message = "You are a helpful assistant that extracts structured information from job descriptions."
jd_human_message = "Extract the following information from the job description:\n\n{text}\n\n{format_instructions}"

jd_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", jd_system_message),
        ("human", jd_human_message),
    ]
)

## Parse

In [16]:
# Create a function to parse text with llm
def parse_with_llm(text, prompt_template, parser, llm):
    """Run one extraction round-trip: build the prompt, call the model, parse the reply.

    Args:
        text: Raw document text substituted for ``{text}`` in the prompt.
        prompt_template: Object exposing ``format_messages(text=..., format_instructions=...)``.
        parser: Object exposing ``get_format_instructions()`` and ``parse(str)``.
        llm: Object exposing ``invoke(messages)`` whose result has a ``content`` attribute.

    Returns:
        Whatever ``parser.parse`` returns for the model's reply content.

    Any exception raised by the template, the model call or the parser propagates
    to the caller unchanged.
    """
    format_instructions = parser.get_format_instructions()
    messages = prompt_template.format_messages(
        text=text,
        format_instructions=format_instructions,
    )
    llm_reply = llm.invoke(messages)
    return parser.parse(llm_reply.content)

### Parse resume

In [19]:
# Try the resume extraction on a single row (positional index 6236).
sample_resume_text = df['resume_text'].iloc[6236]
parsed_data = parse_with_llm(sample_resume_text, resume_prompt_template, resume_parser, llm)

In [18]:
# Show the parsed resume as indented JSON.
parsed_data_json = parsed_data.model_dump_json(indent=2)
print(parsed_data_json)

{
  "name": null,
  "location_preference": null,
  "work_authorizaton": null,
  "employment_type_preference": null,
  "hard_skills": [
    "Data Management",
    "Database Management",
    "Data Compilation",
    "Attention to Detail",
    "Data Review",
    "Microsoft Office Suite",
    "Document Management and Storage",
    "Multitasking and Prioritization",
    "Time Management",
    "Data Verification",
    "Administrative Support",
    "Microsoft Access",
    "Microsoft Excel",
    "Adobe Software"
  ],
  "soft_skills": [
    "Decision Making",
    "Service-Oriented",
    "Self-Starter",
    "Workflow Management",
    "Team player"
  ],
  "languages": [],
  "experience": [
    {
      "role": "Data Entry Specialist",
      "company": "Sonic Healthcare Usa",
      "date_start": "2020-09-01T00:00:00",
      "date_end": "2023-11-21T15:16:19",
      "role_description": "Input client information into spreadsheets and company database to provide leaders with quick access to essential cl

### Parse job description

In [20]:
# Show the raw job description paired with the sample resume (positional index 6236).
sample_jd_text = df['job_description_text'].iloc[6236]
print(sample_jd_text)

Hi,
Hope you are doing great today. Please find the job description below. Let me know your job interest as soon as possible. I will highly appreciate it if you can refer somebody suitable for this position. 
Role: Data Engineer (Oracle and DataStage).Location: RemoteContract Position
Job Description:RoleResponsibilities:Skills: Oracle, Datastage, UNIX, PLSQL, SQL. Good to have: AWS, Matillion, Snowflake. Data engineering experience; expert level experience with SQL. Experience with the cloud (AWS, Azure andor Google Cloud Platform).  Experience in cloud-based data warehouses (Snowflake, Google BigQuery, Amazon Redshift, Azure Synapse Analytics).  Experience with cloud-based ETLELT tools (Matillion, Glue, Data Factory) and data modelling.  Experience with version control systems (Git, SVN).  Understanding of and willingness to embrace Agile Principles. 
Looking forward to your response . 
Shubhanshu Tripathishubhanshu.t@cblsolutions.com 469-947-7816 (Ext  209)Cerebral Technologies, Inc

In [21]:
# Try the job-description extraction on the same single row (positional index 6236).
# Note: the batch loop further down assigns to `parsed_jd` again.
jd_text_for_sample = df['job_description_text'].iloc[6236]
parsed_jd = parse_with_llm(jd_text_for_sample, jd_prompt_template, jd_parser, llm)

In [22]:
# Show the parsed job description as indented JSON.
parsed_jd_json = parsed_jd.model_dump_json(indent=2)
print(parsed_jd_json)

{
  "company_name": "Cerebral Technologies, Inc",
  "role_title": "Data Engineer",
  "employment_type": "Contract",
  "about_the_company": null,
  "job_responsibilities": [],
  "required_hard_skills": [
    "Oracle",
    "Datastage",
    "UNIX",
    "PLSQL",
    "SQL",
    "AWS",
    "Matillion",
    "Snowflake",
    "Data engineering",
    "SQL",
    "AWS",
    "Azure",
    "Google Cloud Platform",
    "Snowflake",
    "Google BigQuery",
    "Amazon Redshift",
    "Azure Synapse Analytics",
    "Matillion",
    "Glue",
    "Data Factory",
    "Git",
    "SVN"
  ],
  "required_soft_skills": [
    "Agile Principles"
  ],
  "required_language_proficiencies": [],
  "required_work_authorization": null,
  "job_location": "Remote"
}


### Parse 10 rows

In [33]:
# Small sample for the batch run: the first five and the last five rows of df.
first_rows = df.head(5)
last_rows = df.tail(5)
df_subset = pd.concat([first_rows, last_rows])
df_subset

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit
6236,SummaryResults-driven Data Entry Clerk with ex...,"Hi,\nHope you are doing great today. Please fi...",Good Fit
6237,Professional SummaryWith the attitude of learn...,Job Title: DHT - Front End Software Engineer W...,Good Fit
6238,Summary• \nOver\nThree years of extensi...,LHH Recruitment Solutions is looking for a Sof...,Good Fit
6239,ProfileAbility to prioritize and multi-task in...,Our client is a growing Medical Device company...,Good Fit
6240,SummaryFull stack Software Engineer with 8+ ye...,Robert Half is looking for a Senior Full Stack...,Good Fit


In [None]:
# Parse every sampled row with the LLM and save the structured results as JSON:
#   examples/resume/<idx>.json  and  examples/jd/<idx>.json
resume_output_dir = os.path.join('examples', 'resume')
jd_output_dir = os.path.join('examples', 'jd')

# Create the output folders up front. Without this, open(..., "w") raises
# FileNotFoundError on a fresh checkout, and the except clause below would
# report it as a parsing error for every single row.
os.makedirs(resume_output_dir, exist_ok=True)
os.makedirs(jd_output_dir, exist_ok=True)

# `parsed_resume` and `parsed_jd` stay bound after the loop; the scoring cells
# further down read them, so they refer to the last row parsed successfully.
for idx, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    try:
        # Process resume
        parsed_resume = parse_with_llm(resume_text, resume_prompt_template, resume_parser, llm)
        # mode="json" yields JSON-compatible values (e.g. datetimes as strings) for json.dump
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        resume_output_path = os.path.join(resume_output_dir, f"{idx}.json")
        with open(resume_output_path, "w") as f:
            json.dump(parsed_resume_dict, f, indent=2)

        # Process JD
        parsed_jd = parse_with_llm(jd_text, jd_prompt_template, jd_parser, llm)
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        jd_output_path = os.path.join(jd_output_dir, f"{idx}.json")
        with open(jd_output_path, "w") as f:
            json.dump(parsed_jd_dict, f, indent=2)

    except Exception as e:
        # Best-effort batch: report the failure and move on to the next row.
        # A failure while handling the resume also skips that row's JD.
        print(f"Error parsing row {idx}: {e}")

100%|██████████| 10/10 [00:20<00:00,  2.07s/it]


# SILVER

# GOLD

## Get scores

In [37]:
# Embedding model used by the skill and job-title similarity cells below.
embedding_settings = {
    "model": "models/text-embedding-004",
    "task_type": "SEMANTIC_SIMILARITY",
}
embedding_model = GoogleGenerativeAIEmbeddings(**embedding_settings)

In [39]:
# Embed each hard skill separately: one vector per required skill (JD side) and
# one per owned skill (resume side).
# `parsed_jd` / `parsed_resume` are whatever the kernel last bound to those names —
# as the notebook is ordered, the final row handled by the batch loop above.
required_skill_names = parsed_jd.required_hard_skills
embeddings_required_skills = embedding_model.embed_documents(required_skill_names)

owned_skill_names = parsed_resume.hard_skills
embeddings_skills_owned = embedding_model.embed_documents(owned_skill_names)

In [40]:
def _unit_rows(vectors):
    """Return `vectors` as a 2-D float array whose rows are scaled to unit L2 norm.

    An empty input yields an array of shape (0, 0) instead of the 1-D array that
    np.array([]) produces, on which np.linalg.norm(..., axis=1) would raise.
    An all-zero row is left as zeros rather than being divided by zero.
    """
    matrix = np.asarray(vectors, dtype=float)
    if matrix.size == 0:
        return np.zeros((0, 0))
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1.0, norms)


# Normalize embeddings to unit vectors (L2 norm)
required_skills = _unit_rows(embeddings_required_skills)
skills_owned = _unit_rows(embeddings_skills_owned)

# Compute cosine similarity matrix by dot product:
# rows follow the required skills, columns follow the owned skills.
if required_skills.size and skills_owned.size:
    similarity_matrix = np.dot(required_skills, skills_owned.T)
else:
    # One side has no skills at all: keep the (required x owned) shape, with no scores.
    similarity_matrix = np.zeros((len(required_skills), len(skills_owned)))

In [42]:
# Minimum cosine similarity for a required skill's closest resume skill to be reported.
# NOTE(review): at 0.6 the sample run pairs e.g. "Redux" with "Redmine" (0.63), so
# this cut-off probably needs tuning.
SKILL_MATCH_THRESHOLD = 0.6

required_skill_list = parsed_jd.required_hard_skills
owned_skill_list = parsed_resume.hard_skills

# similarity_matrix must be (required skills x owned skills). The same name is
# rebound to a 1-D job-title similarity vector later in the notebook, so fail
# loudly if this cell is re-run against the wrong array.
assert similarity_matrix.shape == (len(required_skill_list), len(owned_skill_list)), (
    "similarity_matrix does not match the current skill lists; "
    "re-run the skill similarity cell first"
)

best_matches = []

# With no resume skills every row is empty and argmax() would raise, so there is
# nothing to match.
if owned_skill_list:
    for i, req_skill in enumerate(required_skill_list):
        # Closest owned skill for this required skill
        j = similarity_matrix[i].argmax()
        score = similarity_matrix[i, j]
        if score >= SKILL_MATCH_THRESHOLD:
            best_matches.append((req_skill, owned_skill_list[j], score))

# Print
for req_skill, own_skill, score in best_matches:
    print(f"Required: {req_skill}  <=> Best Owned: {own_skill}  | Similarity: {score:.2f}")

Required: PostgreSQL  <=> Best Owned: PostgreSQL  | Similarity: 1.00
Required: Express  <=> Best Owned: EF  | Similarity: 0.63
Required: React  <=> Best Owned: HTML5  | Similarity: 0.63
Required: NodeJS  <=> Best Owned: AngularJS  | Similarity: 0.73
Required: Redux  <=> Best Owned: Redmine  | Similarity: 0.63
Required: HTML  <=> Best Owned: HTML  | Similarity: 1.00
Required: CSS  <=> Best Owned: CSS  | Similarity: 1.00
Required: JavaScript  <=> Best Owned: jQuery  | Similarity: 0.86
Required: JSON  <=> Best Owned: JSON  | Similarity: 1.00
Required: Git  <=> Best Owned: GIT  | Similarity: 0.95
Required: REST  <=> Best Owned: REST  | Similarity: 1.00
Required: Firebase  <=> Best Owned: Hangfire  | Similarity: 0.62
Required: Material-UI  <=> Best Owned: AngularJS  | Similarity: 0.63
Required: D3js  <=> Best Owned: jQuery  | Similarity: 0.72
Required: Docker (Compose)  <=> Best Owned: Composer  | Similarity: 0.67
Required: AWS  <=> Best Owned: AWS EC2  | Similarity: 0.85


In [47]:
# JD.role_title and Experience.role are both Optional in the schemas, so either
# can be None after parsing; handle that before calling the embedding model.
if not parsed_jd.role_title:
    raise ValueError("parsed_jd.role_title is missing; cannot score job-title similarity")

# Skip experience entries whose role was not extracted. The rows of
# embeddings_experience_titles line up with experience_titles, which may be
# shorter than parsed_resume.experience.
experience_titles = [exp.role for exp in parsed_resume.experience if exp.role]

embeddings_role_name = embedding_model.embed_query(parsed_jd.role_title)
embeddings_experience_titles = embedding_model.embed_documents(experience_titles)

In [43]:
# Inspect the JD role title that the resume's experience titles are compared against.
parsed_jd.role_title

'Senior Full Stack Engineer (PERN Stack)'

In [48]:
# Inspect the resume's job titles, in the order of parsed_resume.experience.
[exp.role for exp in parsed_resume.experience]

['Software Developer',
 'Software .Net Developer',
 'Software Engineer and Professor']

In [49]:
# Scale the JD role-title embedding (a single vector) to unit length.
role_vector = np.array(embeddings_role_name)
role_name = role_vector / np.linalg.norm(role_vector)

# Scale each experience-title embedding (one row per title) to unit length.
experience_vectors = np.array(embeddings_experience_titles)
experiences = experience_vectors / np.linalg.norm(experience_vectors, axis=1, keepdims=True)

# Cosine similarity of every experience title with the role title: a 1-D array
# with one score per experience. (Transposing the 1-D role vector is a no-op,
# so it is omitted.)
# NOTE(review): this rebinds `similarity_matrix`, which earlier held the
# skills matrix; re-running the best-match cell after this one would read
# this vector instead.
similarity_matrix = np.dot(experiences, role_name)

In [50]:
# One cosine similarity per resume experience title against the JD role title.
similarity_matrix

array([0.65028087, 0.62905722, 0.6121288 ])