In [25]:
import pandas as pd
import os
import datetime
import dateparser
import numpy as np
from dotenv import load_dotenv
import json
from tqdm import tqdm

from mistralai import Mistral
from datasets import load_dataset
from huggingface_hub import login
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

from pydantic import BaseModel, Field
from typing import List, Optional
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

# SOURCE

In [26]:

# Login to Hugging Face using the API key from .env
login("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")

In [27]:
dataset = load_dataset("cnamuangtoun/resume-job-description-fit")
df = dataset["train"].to_pandas()


In [28]:
load_dotenv()
os.environ["MISTRAL_API_KEY"] = os.getenv("MISTRAL_API_KEY")

In [29]:
mistral = Mistral(api_key=os.environ.get("MISTRAL_API_KEY"))


# BRONZE TABLE

## Resume Feature Extraction

In [30]:
# Define models
    
class Experience(BaseModel):
    role: Optional[str] = Field(None, description="The job title or position held")
    company: Optional[str] = Field(None, description="The name of the company")
    date_start: Optional[datetime.datetime] = Field(None, description="The start date of the job in ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)")
    date_end: Optional[datetime.datetime] = Field(None, description="The end date of the job in ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)")
    role_description: Optional[str] = Field(None, description="A description of the responsibilities and achievements in the role")

class Education(BaseModel):
    degree: Optional[str] = Field(None, description="The academic degree obtained")
    institution: Optional[str] = Field(None, description="The name of the educational institution")
    date_start: Optional[datetime.datetime] = Field(None, description="The start date of the education program in ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)")
    date_end: Optional[datetime.datetime] = Field(None, description="The end date of the education program in ISO 8601 format ( YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)")
    grade: Optional[float] = Field(None, description="The GPA or final grade, if available")
    description: Optional[str] = Field(None, description="Additional details about the education")

class Resume(BaseModel):
    name: Optional[str] = Field(None, description="Full name of the person")
    location_preference: Optional[str] = Field(None, description="Preference for their work location / remote, if stated")
    work_authorizaton: Optional[str] = Field(None, description="Work authorization that the person holds, such as citizenship, if stated")
    employment_type_preference: Optional[str] = Field(
        None,
        description="Type of employment the resume is looking for such as Full-time, Part-time, Contract, Freelance, or Internship, if stated. It can also be a preference for remote work or on-site work"
    )
    hard_skills: List[str] = Field(default_factory=list, description="A list of hard or technical skills mentioned in the resume. For example, programming languages, software tools, or specific technologies. Keep it as keywwords")
    soft_skills: List[str] = Field(default_factory=list, description="A list of soft skills mentioned in the resume, such as communication, teamwork, or leadership. Keep it as keywwords. Exclude required languages")
    languages: List[str]= Field(default_factory=list, description="A list of language proficiencies mentioned in the resume. If the resume does not mention any languages, then fill this with the language that the resume is written in")
    experience: List[Experience] = Field(default_factory=list, description="A list of past work experiences")
    education: List[Education] = Field(default_factory=list, description="A list of educational qualifications")
    certifications: List[str] = Field(default_factory=list, description="A list of certifications or licenses related with hardskills or software tools mentioned in the resume, such as AWS Certified Solutions Architect, PMP, etc. Don't include work role")

# Create the parser
resume_parser = PydanticOutputParser(pydantic_object=Resume)
format_instructions = resume_parser.get_format_instructions()

## Job Desc Feature Extraction

In [58]:
# Define models for job desc

class JD(BaseModel):
    company_name: Optional[str] = Field(None, description="Name of the company posting the job")
    role_title: Optional[str] = Field(None, description="Job title or position being offered")
    application_deadline: Optional[datetime.datetime] = Field(None, description="The deadline for submitting applications in ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)")
    date_posted: Optional[datetime.datetime] = Field(None, description="The date when the job was posted in ISO 8601 format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)")
    employment_type: Optional[str] = Field(None, description="Type of employment, such as Full-time, Part-time, Contract, Freelance, or Internship. If not stated, it is assumed to be Full-time")
    about_the_company: Optional[str] = Field(None, description="A brief overview or description of the company")
    job_responsibilities: List[str] = Field(default_factory=list, description="A list of key duties, tasks, or responsibilities associated with the job")
    required_hard_skills: List[str] = Field(default_factory=list, description="A list of technical or hard skills required or preferred for the job. Keep it as keywords. This includes programming languages, software tools, or frameworks like Python, Java, SQL")
    required_soft_skills: List[str] = Field(default_factory=list, description="A list of soft skills or character required or preferred for the job. Keep it as keywords. This includes communication, teamwork, or leadership skills")   
    required_language_proficiencies: List[str] = Field(default_factory=list, description="A list of language proficiencies required for the job if stated. If the job description does not mention any languages, then fill this with the language that the job description is written in")
    required_education: Optional[str] = Field(None, description="The minimum educational qualification required for the job, such as a degree or certification")
    required_work_authorization: Optional[str] = Field(None, description="Work authorization required for the job")
    job_location: Optional[str] = Field(None, description="Location where the job is based, such as a city or remote")
    certifications: List[str] = Field(default_factory=list, description="A list of certifications or licenses related with hard skills, medical skills, and software tools mentioned in the resume. For example, AWS Certified Solutions Architect, PMP, etc. Don't include work role or job title. Do not include insurance-related licenses such as 'Life Insurance License' or similar regulatory licenses")

# Create the parser
jd_parser = PydanticOutputParser(pydantic_object=JD)
format_instructions = jd_parser.get_format_instructions()

## Parse

### Parse resume

In [50]:
# model options

models = mistral.models.list()
for m in models.data:
    print(m.id)


ministral-3b-2410
ministral-3b-latest
ministral-8b-2410
ministral-8b-latest
open-mistral-7b
mistral-tiny
mistral-tiny-2312
open-mistral-nemo
open-mistral-nemo-2407
mistral-tiny-2407
mistral-tiny-latest
open-mixtral-8x7b
mistral-small
mistral-small-2312
open-mixtral-8x22b
open-mixtral-8x22b-2404
mistral-small-2402
mistral-small-2409
mistral-medium-2312
mistral-large-2402
mistral-large-2407
mistral-large-2411
mistral-large-latest
pixtral-large-2411
pixtral-large-latest
mistral-large-pixtral-2411
codestral-2405
codestral-2501
codestral-latest
codestral-2412
codestral-2411-rc5
devstral-small-2505
devstral-small-latest
pixtral-12b-2409
pixtral-12b
pixtral-12b-latest
mistral-small-2501
mistral-small-2503
mistral-small-latest
mistral-saba-2502
mistral-saba-latest
mistral-medium-2505
mistral-medium-latest
mistral-medium
mistral-embed
codestral-embed
codestral-embed-2505
mistral-moderation-2411
mistral-moderation-latest
mistral-ocr-2503
mistral-ocr-2505
mistral-ocr-latest


In [51]:
import json

def parse_with_mistral(text: str, parser, format_instructions: str, label: str) -> BaseModel:

    prompt = (
    f"{format_instructions}\n\n"
    f"{label}:\n{text}"
)

    response = mistral.chat.complete(
        model="mistral-medium-latest",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=2048
    )
    raw = response.choices[0].message.content
    return parser.parse(raw)

In [52]:
resume_text = df["resume_text"].iloc[6236]
parsed_resume = parse_with_mistral(resume_text, resume_parser, resume_parser.get_format_instructions(), "Resume")


In [53]:
print(parsed_resume.model_dump_json(indent=2))

{
  "name": "Results-driven Data Entry Clerk",
  "location_preference": null,
  "work_authorizaton": null,
  "employment_type_preference": null,
  "hard_skills": [
    "Microsoft Excel",
    "Microsoft Outlook",
    "Adobe Software",
    "Microsoft Office Suite",
    "Microsoft Access",
    "Database Management",
    "Data Compilation",
    "Data Review",
    "Data Verification",
    "Document Management and Storage"
  ],
  "soft_skills": [
    "Decision Making",
    "Service-Oriented",
    "Self-Starter",
    "Workflow Management",
    "Attention to Detail",
    "Multitasking and Prioritization",
    "Time Management",
    "Team Player",
    "Communication"
  ],
  "languages": [
    "English"
  ],
  "experience": [
    {
      "role": "Data Entry Specialist",
      "company": "Sonic Healthcare Usa",
      "date_start": "2020-09-01T00:00:00",
      "date_end": null,
      "role_description": "Input client information into spreadsheets and company database to provide leaders with quick 

### Parse job desc

In [59]:
print(df['job_description_text'].iloc[6236])

Hi,
Hope you are doing great today. Please find the job description below. Let me know your job interest as soon as possible. I will highly appreciate it if you can refer somebody suitable for this position. 
Role: Data Engineer (Oracle and DataStage).Location: RemoteContract Position
Job Description:RoleResponsibilities:Skills: Oracle, Datastage, UNIX, PLSQL, SQL. Good to have: AWS, Matillion, Snowflake. Data engineering experience; expert level experience with SQL. Experience with the cloud (AWS, Azure andor Google Cloud Platform).  Experience in cloud-based data warehouses (Snowflake, Google BigQuery, Amazon Redshift, Azure Synapse Analytics).  Experience with cloud-based ETLELT tools (Matillion, Glue, Data Factory) and data modelling.  Experience with version control systems (Git, SVN).  Understanding of and willingness to embrace Agile Principles. 
Looking forward to your response . 
Shubhanshu Tripathishubhanshu.t@cblsolutions.com 469-947-7816 (Ext  209)Cerebral Technologies, Inc

In [60]:
parsed_jd = parse_with_mistral(df['job_description_text'].iloc[6236], jd_parser, jd_parser.get_format_instructions(), "Job Description")

In [61]:
print(parsed_jd.model_dump_json(indent=2))

{
  "company_name": "Cerebral Technologies, Inc (D.B.A CBLSolutions)",
  "role_title": "Data Engineer (Oracle and DataStage)",
  "application_deadline": null,
  "date_posted": null,
  "employment_type": "Contract",
  "about_the_company": null,
  "job_responsibilities": [
    "Data engineering tasks",
    "Expert level SQL development",
    "Cloud-based data warehouse management",
    "ETL/ELT processes using cloud-based tools",
    "Data modeling",
    "Version control management",
    "Embracing Agile Principles"
  ],
  "required_hard_skills": [
    "Oracle",
    "DataStage",
    "UNIX",
    "PLSQL",
    "SQL",
    "AWS",
    "Matillion",
    "Snowflake",
    "Google BigQuery",
    "Amazon Redshift",
    "Azure Synapse Analytics",
    "Glue",
    "Data Factory",
    "Git",
    "SVN"
  ],
  "required_soft_skills": [
    "Agile Principles"
  ],
  "required_language_proficiencies": [
    "English"
  ],
  "required_education": null,
  "required_work_authorization": null,
  "job_location":

### Parse 10 rows

In [62]:
df_subset = pd.concat([df[:5], df[-5:]])
df_subset

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit
6236,SummaryResults-driven Data Entry Clerk with ex...,"Hi,\nHope you are doing great today. Please fi...",Good Fit
6237,Professional SummaryWith the attitude of learn...,Job Title: DHT - Front End Software Engineer W...,Good Fit
6238,Summary• \nOver\nThree years of extensi...,LHH Recruitment Solutions is looking for a Sof...,Good Fit
6239,ProfileAbility to prioritize and multi-task in...,Our client is a growing Medical Device company...,Good Fit
6240,SummaryFull stack Software Engineer with 8+ ye...,Robert Half is looking for a Senior Full Stack...,Good Fit


In [None]:
for idx, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    
    try:
        # Process resume
        parsed_resume = parse_with_mistral(
            resume_text,
            resume_parser,
            resume_parser.get_format_instructions(),
            "Resume"
        )
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        resume_output_path = os.path.join('examples_mistral', 'resume', f"{idx}.json")
        os.makedirs(os.path.dirname(resume_output_path), exist_ok=True) 
        with open(resume_output_path, "w") as f:
            json.dump(parsed_resume_dict, f, indent=2)

        # Process JD
        parsed_jd = parse_with_mistral(
            jd_text,
            jd_parser,
            jd_parser.get_format_instructions(),
            "Job Description"
        )
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        jd_output_path = os.path.join('examples_mistral', 'jd', f"{idx}.json")
        os.makedirs(os.path.dirname(jd_output_path), exist_ok=True) 
        with open(jd_output_path, "w") as f:
            json.dump(parsed_jd_dict, f, indent=2)

    except Exception as e:
        print(f"Error parsing row {idx}: {e}")

 80%|████████  | 8/10 [02:02<00:29, 14.85s/it]

# Connecting to MongoDB

In [20]:
uri = os.environ.get("MONGO_DB_URI")

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id

spark = SparkSession.builder \
    .appName("MongoDBIntegration") \
    .config("spark.mongodb.read.connection.uri", os.environ.get("MONGO_DB_URI")) \
    .config("spark.mongodb.write.connection.uri", os.environ.get("MONGO_DB_URI")) \
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1") \
    .getOrCreate()

:: loading settings :: url = jar:file:/usr/local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2254e70e-cdd4-4729-8b3b-92eff4e3339f;1.0
	confs: [default]
	found org.mongodb.spark#mongo-spark-connector_2.12;10.1.1 in central
	found org.mongodb#mongodb-driver-sync;4.8.2 in central
	[4.8.2] org.mongodb#mongodb-driver-sync;[4.8.1,4.8.99)
	found org.mongodb#bson;4.8.2 in central
	found org.mongodb#mongodb-driver-core;4.8.2 in central
	found org.mongodb#bson-record-codec;4.8.2 in central
:: resolution report :: resolve 2821ms :: artifacts dl 7ms
	:: modules in use:
	org.mongodb#bson;4.8.2 from central in [default]
	org.mongodb#bson-record-codec;4.8.2 from central in [default]
	org.mongodb#mongodb-driver-core;4.8.2 from central in [default]
	org.mongodb#mongodb-driver-sync;4.8.2 from central in [default]
	org.mongodb.spark#mongo-spark-connector_2.12;10.1.1 fr

### Save in mongodb per item (incrementing table)

In [23]:
# Test 200 data entries 
df_test = pd.concat([df[:100], df[-100:]])

db = client["jobmirror_db"]
resume_collection = db["resumes"]
jd_collection = db["job_descriptions"]

for idx, row in tqdm(df_test.iterrows(), total=len(df)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    try:
        # Process resume
        parsed_resume = parse_with_mistral(
            resume_text,
            resume_parser,
            resume_parser.get_format_instructions(),
            "Resume"
        )
        parsed_resume_dict = parsed_resume.model_dump(mode="json")
        parsed_resume_dict["row_idx"] = idx  
        resume_collection.insert_one(parsed_resume_dict) 

        # Process JD
        parsed_jd = parse_with_mistral(
            jd_text,
            jd_parser,
            jd_parser.get_format_instructions(),
            "Job Description"
        )
        parsed_jd_dict = parsed_jd.model_dump(mode="json")
        parsed_jd_dict["row_idx"] = idx  
        jd_collection.insert_one(parsed_jd_dict)

    except Exception as e:
        print(f"Error parsing row {idx}: {e}")

  3%|▎         | 163/6241 [54:18<33:44:55, 19.99s/it]


KeyboardInterrupt: 

In [24]:
# Clear collections
resume_collection.delete_many({})
jd_collection.delete_many({})



DeleteResult({'n': 173, 'electionId': ObjectId('7fffffff00000000000001b7'), 'opTime': {'ts': Timestamp(1748621604, 41), 't': 439}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1748621604, 41), 'signature': {'hash': b'{~8#\xc7\xac\xf3\x80\xa0F\xe7m\x06\xc3\xb3 m\x0c\xc5+', 'keyId': 7450535577176244240}}, 'operationTime': Timestamp(1748621604, 41)}, acknowledged=True)

# SILVER

# GOLD

## Get scores

In [37]:
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", task_type="SEMANTIC_SIMILARITY"
)

In [39]:
embeddings_required_skills = embedding_model.embed_documents(parsed_jd.required_hard_skills)
embeddings_skills_owned = embedding_model.embed_documents(parsed_resume.hard_skills)

In [40]:
required_skills = np.array(embeddings_required_skills)
skills_owned = np.array(embeddings_skills_owned)

# Normalize embeddings to unit vectors (L2 norm)
required_skills = required_skills / np.linalg.norm(required_skills, axis=1, keepdims=True)
skills_owned = skills_owned / np.linalg.norm(skills_owned, axis=1, keepdims=True)

# Compute cosine similarity matrix by dot product
similarity_matrix = np.dot(required_skills, skills_owned.T)

In [42]:
best_matches = []

for i, req_skill in enumerate(parsed_jd.required_hard_skills):
    j = similarity_matrix[i].argmax()
    score = similarity_matrix[i, j]
    if score >= 0.6:
        best_matches.append((req_skill, parsed_resume.hard_skills[j], score))

# Print
for req_skill, own_skill, score in best_matches:
    print(f"Required: {req_skill}  <=> Best Owned: {own_skill}  | Similarity: {score:.2f}")

Required: PostgreSQL  <=> Best Owned: PostgreSQL  | Similarity: 1.00
Required: Express  <=> Best Owned: EF  | Similarity: 0.63
Required: React  <=> Best Owned: HTML5  | Similarity: 0.63
Required: NodeJS  <=> Best Owned: AngularJS  | Similarity: 0.73
Required: Redux  <=> Best Owned: Redmine  | Similarity: 0.63
Required: HTML  <=> Best Owned: HTML  | Similarity: 1.00
Required: CSS  <=> Best Owned: CSS  | Similarity: 1.00
Required: JavaScript  <=> Best Owned: jQuery  | Similarity: 0.86
Required: JSON  <=> Best Owned: JSON  | Similarity: 1.00
Required: Git  <=> Best Owned: GIT  | Similarity: 0.95
Required: REST  <=> Best Owned: REST  | Similarity: 1.00
Required: Firebase  <=> Best Owned: Hangfire  | Similarity: 0.62
Required: Material-UI  <=> Best Owned: AngularJS  | Similarity: 0.63
Required: D3js  <=> Best Owned: jQuery  | Similarity: 0.72
Required: Docker (Compose)  <=> Best Owned: Composer  | Similarity: 0.67
Required: AWS  <=> Best Owned: AWS EC2  | Similarity: 0.85


In [47]:
embeddings_role_name = embedding_model.embed_query(parsed_jd.role_title)
embeddings_experience_titles = embedding_model.embed_documents([exp.role for exp in parsed_resume.experience])

In [43]:
parsed_jd.role_title

'Senior Full Stack Engineer (PERN Stack)'

In [48]:
[exp.role for exp in parsed_resume.experience]

['Software Developer',
 'Software .Net Developer',
 'Software Engineer and Professor']

In [49]:
role_name = np.array(embeddings_role_name)
experiences = np.array(embeddings_experience_titles)

# Normalize embeddings to unit vectors (L2 norm)
role_name = role_name / np.linalg.norm(role_name)
experiences = experiences / np.linalg.norm(experiences, axis=1, keepdims=True)

# Compute cosine similarity matrix by dot product
similarity_matrix = np.dot(experiences, role_name.T)

In [50]:
similarity_matrix

array([0.65028087, 0.62905722, 0.6121288 ])