In [1]:
import pandas as pd
import os
import datetime
import dateparser
import numpy as np
from dotenv import load_dotenv
import json
from tqdm import tqdm

from mistralai import Mistral


from pydantic import BaseModel, Field
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

# SOURCE

In [2]:
from huggingface_hub import login
login("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")

In [3]:
from datasets import load_dataset

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")
df = dataset["train"].to_pandas()


In [4]:
load_dotenv()
os.environ["MISTRAL_API_KEY"] = os.getenv("MISTRAL_API_KEY")

In [5]:
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])


# BRONZE TABLE

## Resume Feature Extraction

In [6]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Experience(BaseModel):
    role: Optional[str] = Field(None, description="Job title or position held", alias="role", json_schema_extra={"mistral_type": "string", "mistral_description": "Job title or position held"})
    company: Optional[str] = Field(None, description="Company name", alias="company", json_schema_extra={"mistral_type": "string", "mistral_description": "Company name"})
    date_start: Optional[str] = Field(None, description="Start date", alias="date_start", json_schema_extra={"mistral_type": "string", "mistral_description": "Start date"})
    date_end: Optional[str] = Field(None, description="End date", alias="date_end", json_schema_extra={"mistral_type": "string", "mistral_description": "End date"})
    role_description: Optional[str] = Field(None, description="Description of the role", alias="role_description", json_schema_extra={"mistral_type": "string", "mistral_description": "Description of the role"})

class Education(BaseModel):
    degree: Optional[str] = Field(None, description="Degree obtained", alias="degree", json_schema_extra={"mistral_type": "string", "mistral_description": "Degree obtained"})
    institution: Optional[str] = Field(None, description="Institution name", alias="institution", json_schema_extra={"mistral_type": "string", "mistral_description": "Institution name"})
    date_start: Optional[str] = Field(None, description="Start date", alias="date_start", json_schema_extra={"mistral_type": "string", "mistral_description": "Start date"})
    date_end: Optional[str] = Field(None, description="End date", alias="date_end", json_schema_extra={"mistral_type": "string", "mistral_description": "End date"})
    grade: Optional[float] = Field(None, description="Grade or GPA", alias="grade", json_schema_extra={"mistral_type": "number", "mistral_description": "Grade or GPA"})
    description: Optional[str] = Field(None, description="Description", alias="description", json_schema_extra={"mistral_type": "string", "mistral_description": "Description"})

class Resume(BaseModel):
    name: Optional[str] = Field(None, description="Candidate's name", alias="name", json_schema_extra={"mistral_type": "string", "mistral_description": "Candidate's name"})
    location_preference: Optional[str] = Field(None, description="Preferred location", alias="location_preference", json_schema_extra={"mistral_type": "string", "mistral_description": "Preferred location"})
    work_authorization: Optional[str] = Field(None, description="Work authorization status", alias="work_authorization", json_schema_extra={"mistral_type": "string", "mistral_description": "Work authorization status"})
    employment_type_preference: Optional[str] = Field(None, description="Preferred employment type", alias="employment_type_preference", json_schema_extra={"mistral_type": "string", "mistral_description": "Preferred employment type"})
    hard_skills: List[str] = Field(default_factory=list, description="List of hard skills", alias="hard_skills", json_schema_extra={"mistral_type": "array", "mistral_description": "List of hard skills"})
    soft_skills: List[str] = Field(default_factory=list, description="List of soft skills", alias="soft_skills", json_schema_extra={"mistral_type": "array", "mistral_description": "List of soft skills"})
    languages: List[str] = Field(default_factory=list, description="Languages spoken", alias="languages", json_schema_extra={"mistral_type": "array", "mistral_description": "Languages spoken"})
    experience: List[Experience] = Field(default_factory=list, description="Work experience", alias="experience", json_schema_extra={"mistral_type": "array", "mistral_description": "Work experience"})
    education: List[Education] = Field(default_factory=list, description="Education history", alias="education", json_schema_extra={"mistral_type": "array", "mistral_description": "Education history"})


In [7]:
def create_structured_resume_prompt(resume_text: str) -> str:
    return f"""
    Extract the following information from the resume:
    - Name
    - Location Preference
    - Work Authorization
    - Employment Type Preference
    - Hard Skills
    - Soft Skills
    - Languages
    - Experience (with role, company, start date, end date, description)
    - Education (with degree, institution, start date, end date, grade, description)

    Return the information in the following JSON format:
    {{
        "name": "John Doe",
        "location_preference": "Remote",
        "work_authorization": "US Citizen",
        "employment_type_preference": "Full-time",
        "hard_skills": ["Python", "Machine Learning"],
        "soft_skills": ["Communication", "Teamwork"],
        "languages": ["English", "Spanish"],
        "experience": [
            {{
                "role": "Software Engineer",
                "company": "Tech Corp",
                "date_start": "2020-01-01",
                "date_end": "2022-01-01",
                "role_description": "Developed software applications."
            }}
        ],
        "education": [
            {{
                "degree": "BSc Computer Science",
                "institution": "University X",
                "date_start": "2016-09-01",
                "date_end": "2020-06-01",
                "grade": 3.8,
                "description": "Graduated with honors."
            }}
        ]
    }}
    """


## Job Desc Feature Extraction

In [None]:
# Define models for job desc

class JD(BaseModel):
    company_name: Optional[str] = Field(None, description="Name of the company posting the job")
    role_title: Optional[str] = Field(None, description="The title or name of the job role being offered")
    employment_type: Optional[str] = Field(None, description="Type of employment, such as Full-time, Part-time, Contract, Freelance, or Internship. If not stated, it is assumed to be Full-time")
    about_the_company: Optional[str] = Field(None, description="A brief overview or description of the company")
    job_responsibilities: List[str] = Field(..., description="A list of key duties, tasks, or responsibilities associated with the job")
    required_hard_skills: List[str] = Field(..., description="A list of technical or hard skills required or preferred for the job")
    required_soft_skills: List[str] = Field(..., description="A list of soft skills or character required or preferred for the job")
    required_language_proficiencies: List[str] = Field(..., description="A list of language proficiencies required for the job")
    required_work_authorization: Optional[str] = Field(None, description="Work authorization required for the job")
    job_location: Optional[str] = Field(None, description="Location where the job is based, such as a city or remote")

# Create the parser
jd_parser = PydanticOutputParser(pydantic_object=JD)

In [15]:
# Create the prompt
jd_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that extracts structured information from job descriptions."),
    ("human", "Extract the following information from the job description:\n\n{text}\n\n{format_instructions}")
])

## Parse

### Parse resume

In [8]:
models = client.models.list()
for m in models.data:
    print(m.id)


ministral-3b-2410
ministral-3b-latest
ministral-8b-2410
ministral-8b-latest
open-mistral-7b
mistral-tiny
mistral-tiny-2312
open-mistral-nemo
open-mistral-nemo-2407
mistral-tiny-2407
mistral-tiny-latest
open-mixtral-8x7b
mistral-small
mistral-small-2312
open-mixtral-8x22b
open-mixtral-8x22b-2404
mistral-small-2402
mistral-small-2409
mistral-medium-2312
mistral-large-2402
mistral-large-2407
mistral-large-2411
mistral-large-latest
pixtral-large-2411
pixtral-large-latest
mistral-large-pixtral-2411
codestral-2405
codestral-2501
codestral-latest
codestral-2412
codestral-2411-rc5
devstral-small-2505
devstral-small-latest
codestral-mamba-2407
open-codestral-mamba
codestral-mamba-latest
pixtral-12b-2409
pixtral-12b
pixtral-12b-latest
mistral-small-2501
mistral-small-2503
mistral-small-latest
mistral-saba-2502
mistral-saba-latest
mistral-medium-2505
mistral-medium-latest
mistral-medium
mistral-embed
mistral-moderation-2411
mistral-moderation-latest
mistral-ocr-2503
mistral-ocr-2505
mistral-ocr-l

In [9]:
from mistralai import Mistral
import json

def parse_resume(resume_text: str) -> Resume:
    prompt = create_structured_resume_prompt(resume_text)

    response = client.chat.complete(
        model="mistral-medium-latest",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=2048,
        response_format={"type": "json_object"}
    )

    # Extract and parse the JSON response
    structured_json = response.choices[0].message.content
    data = json.loads(structured_json)
    return Resume(**data)


In [11]:
resume_text = df["resume_text"].iloc[6236]
parsed_resume = parse_resume(resume_text)
print(parsed_resume.model_dump_json(indent=2))


{
  "name": "John Doe",
  "location_preference": "Remote",
  "work_authorization": "US Citizen",
  "employment_type_preference": "Full-time",
  "hard_skills": [
    "Python",
    "Machine Learning",
    "Data Analysis",
    "SQL",
    "Java"
  ],
  "soft_skills": [
    "Communication",
    "Teamwork",
    "Problem-Solving",
    "Leadership",
    "Time Management"
  ],
  "languages": [
    "English",
    "Spanish",
    "French"
  ],
  "experience": [
    {
      "role": "Software Engineer",
      "company": "Tech Corp",
      "date_start": "2020-01-01",
      "date_end": "2022-01-01",
      "role_description": "Developed and maintained software applications using Python and Java. Collaborated with cross-functional teams to deliver high-quality products."
    },
    {
      "role": "Data Analyst",
      "company": "Data Solutions Inc.",
      "date_start": "2018-06-01",
      "date_end": "2019-12-01",
      "role_description": "Analyzed large datasets to extract actionable insights. Crea

In [None]:
print(parsed_resume.model_dump_json(indent=2))

{
  "name": null,
  "location_preference": null,
  "work_authorizaton": null,
  "employment_type_preference": null,
  "hard_skills": [
    "Data Management",
    "Database Management",
    "Data Compilation",
    "Attention to Detail",
    "Data Review",
    "Microsoft Office Suite",
    "Document Management and Storage",
    "Multitasking and Prioritization",
    "Time Management",
    "Data Verification",
    "Administrative Support",
    "Microsoft Access",
    "Microsoft Excel",
    "Adobe Software"
  ],
  "soft_skills": [
    "Decision Making",
    "Service-Oriented",
    "Self-Starter",
    "Workflow Management",
    "Team player"
  ],
  "languages": [],
  "experience": [
    {
      "role": "Data Entry Specialist",
      "company": "Sonic Healthcare Usa",
      "date_start": "2020-09-01T00:00:00",
      "date_end": "2023-11-21T15:16:19",
      "role_description": "Input client information into spreadsheets and company database to provide leaders with quick access to essential cl

### Parse job desc

In [20]:
print(df['job_description_text'].iloc[6236])

Hi,
Hope you are doing great today. Please find the job description below. Let me know your job interest as soon as possible. I will highly appreciate it if you can refer somebody suitable for this position. 
Role: Data Engineer (Oracle and DataStage).Location: RemoteContract Position
Job Description:RoleResponsibilities:Skills: Oracle, Datastage, UNIX, PLSQL, SQL. Good to have: AWS, Matillion, Snowflake. Data engineering experience; expert level experience with SQL. Experience with the cloud (AWS, Azure andor Google Cloud Platform).  Experience in cloud-based data warehouses (Snowflake, Google BigQuery, Amazon Redshift, Azure Synapse Analytics).  Experience with cloud-based ETLELT tools (Matillion, Glue, Data Factory) and data modelling.  Experience with version control systems (Git, SVN).  Understanding of and willingness to embrace Agile Principles. 
Looking forward to your response . 
Shubhanshu Tripathishubhanshu.t@cblsolutions.com 469-947-7816 (Ext  209)Cerebral Technologies, Inc

In [21]:
parsed_jd = parse_with_llm(df['job_description_text'].iloc[6236], jd_prompt_template, jd_parser, llm)

In [22]:
print(parsed_jd.model_dump_json(indent=2))

{
  "company_name": "Cerebral Technologies, Inc",
  "role_title": "Data Engineer",
  "employment_type": "Contract",
  "about_the_company": null,
  "job_responsibilities": [],
  "required_hard_skills": [
    "Oracle",
    "Datastage",
    "UNIX",
    "PLSQL",
    "SQL",
    "AWS",
    "Matillion",
    "Snowflake",
    "Data engineering",
    "SQL",
    "AWS",
    "Azure",
    "Google Cloud Platform",
    "Snowflake",
    "Google BigQuery",
    "Amazon Redshift",
    "Azure Synapse Analytics",
    "Matillion",
    "Glue",
    "Data Factory",
    "Git",
    "SVN"
  ],
  "required_soft_skills": [
    "Agile Principles"
  ],
  "required_language_proficiencies": [],
  "required_work_authorization": null,
  "job_location": "Remote"
}


### Parse 10 rows

In [33]:
df_subset = pd.concat([df[:5], df[-5:]])
df_subset

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit
6236,SummaryResults-driven Data Entry Clerk with ex...,"Hi,\nHope you are doing great today. Please fi...",Good Fit
6237,Professional SummaryWith the attitude of learn...,Job Title: DHT - Front End Software Engineer W...,Good Fit
6238,Summary• \nOver\nThree years of extensi...,LHH Recruitment Solutions is looking for a Sof...,Good Fit
6239,ProfileAbility to prioritize and multi-task in...,Our client is a growing Medical Device company...,Good Fit
6240,SummaryFull stack Software Engineer with 8+ ye...,Robert Half is looking for a Senior Full Stack...,Good Fit


In [None]:
for idx, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    try:
        # Process resume
        parsed_resume = parse_with_llm(resume_text, resume_prompt_template, resume_parser, llm)
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        resume_output_path = os.path.join('examples', 'resume', f"{idx}.json")
        with open(resume_output_path, "w") as f:
            json.dump(parsed_resume_dict, f, indent=2)

        # Process JD
        parsed_jd = parse_with_llm(jd_text, jd_prompt_template, jd_parser, llm)
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        jd_output_path = os.path.join('examples', 'jd', f"{idx}.json")
        with open(jd_output_path, "w") as f:
            json.dump(parsed_jd_dict, f, indent=2)
       
    except Exception as e:
        print(f"Error parsing row {idx}: {e}")

100%|██████████| 10/10 [00:20<00:00,  2.07s/it]


# SILVER

# GOLD

## Get scores

In [37]:
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", task_type="SEMANTIC_SIMILARITY"
)

In [39]:
embeddings_required_skills = embedding_model.embed_documents(parsed_jd.required_hard_skills)
embeddings_skills_owned = embedding_model.embed_documents(parsed_resume.hard_skills)

In [40]:
required_skills = np.array(embeddings_required_skills)
skills_owned = np.array(embeddings_skills_owned)

# Normalize embeddings to unit vectors (L2 norm)
required_skills = required_skills / np.linalg.norm(required_skills, axis=1, keepdims=True)
skills_owned = skills_owned / np.linalg.norm(skills_owned, axis=1, keepdims=True)

# Compute cosine similarity matrix by dot product
similarity_matrix = np.dot(required_skills, skills_owned.T)

In [42]:
best_matches = []

for i, req_skill in enumerate(parsed_jd.required_hard_skills):
    j = similarity_matrix[i].argmax()
    score = similarity_matrix[i, j]
    if score >= 0.6:
        best_matches.append((req_skill, parsed_resume.hard_skills[j], score))

# Print
for req_skill, own_skill, score in best_matches:
    print(f"Required: {req_skill}  <=> Best Owned: {own_skill}  | Similarity: {score:.2f}")

Required: PostgreSQL  <=> Best Owned: PostgreSQL  | Similarity: 1.00
Required: Express  <=> Best Owned: EF  | Similarity: 0.63
Required: React  <=> Best Owned: HTML5  | Similarity: 0.63
Required: NodeJS  <=> Best Owned: AngularJS  | Similarity: 0.73
Required: Redux  <=> Best Owned: Redmine  | Similarity: 0.63
Required: HTML  <=> Best Owned: HTML  | Similarity: 1.00
Required: CSS  <=> Best Owned: CSS  | Similarity: 1.00
Required: JavaScript  <=> Best Owned: jQuery  | Similarity: 0.86
Required: JSON  <=> Best Owned: JSON  | Similarity: 1.00
Required: Git  <=> Best Owned: GIT  | Similarity: 0.95
Required: REST  <=> Best Owned: REST  | Similarity: 1.00
Required: Firebase  <=> Best Owned: Hangfire  | Similarity: 0.62
Required: Material-UI  <=> Best Owned: AngularJS  | Similarity: 0.63
Required: D3js  <=> Best Owned: jQuery  | Similarity: 0.72
Required: Docker (Compose)  <=> Best Owned: Composer  | Similarity: 0.67
Required: AWS  <=> Best Owned: AWS EC2  | Similarity: 0.85


In [47]:
embeddings_role_name = embedding_model.embed_query(parsed_jd.role_title)
embeddings_experience_titles = embedding_model.embed_documents([exp.role for exp in parsed_resume.experience])

In [43]:
parsed_jd.role_title

'Senior Full Stack Engineer (PERN Stack)'

In [48]:
[exp.role for exp in parsed_resume.experience]

['Software Developer',
 'Software .Net Developer',
 'Software Engineer and Professor']

In [49]:
role_name = np.array(embeddings_role_name)
experiences = np.array(embeddings_experience_titles)

# Normalize embeddings to unit vectors (L2 norm)
role_name = role_name / np.linalg.norm(role_name)
experiences = experiences / np.linalg.norm(experiences, axis=1, keepdims=True)

# Compute cosine similarity matrix by dot product
similarity_matrix = np.dot(experiences, role_name.T)

In [50]:
similarity_matrix

array([0.65028087, 0.62905722, 0.6121288 ])