In [20]:
import pandas as pd
import os
import datetime
import dateparser
import numpy as np
from dotenv import load_dotenv
import json
from tqdm import tqdm
import string

from pydantic import BaseModel, Field
from typing import List, Optional, get_origin, get_args, Union
from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType, IntegerType, FloatType, BooleanType, TimestampType, StructField, StructType

# SOURCE

Import the dataset from Hugging Face

In [2]:
# Split name -> CSV file name inside the dataset repository.
splits = dict(train='train.csv', test='test.csv')

# Only the 'train' split is read here; the 'test' entry is not used in this cell.
df = pd.read_csv(
    "hf://datasets/cnamuangtoun/resume-job-description-fit/" + splits["train"]
)

  from .autonotebook import tqdm as notebook_tqdm


Generate random snapshot dates

In [70]:
# Seeded generator so the synthetic snapshot dates are reproducible across runs
rng = np.random.default_rng(seed=42)

# Sampling window; Timestamp.value is the nanosecond epoch used as the uniform bounds
start_date = pd.to_datetime('2024-01-01')
end_date = pd.to_datetime('2025-01-01')

# Draw one random timestamp per row of df between start_date and end_date
random_dates = pd.to_datetime(
    rng.uniform(start_date.value, end_date.value, size=len(df))
)

# Keep only the calendar date. The array is assigned positionally on purpose:
# wrapping it in a fresh pd.Series would align on a 0..n-1 index and silently
# produce NaN for any row of df whose index label is not in that range.
df['snapshot_date'] = random_dates.date

Generate random IDs

In [71]:
def generate_random_id(prefix: str, length=8, use_digits=True, use_letters=True, seed=42):
    """Build a pseudo-random identifier of the form ``prefix + <length random chars>``.

    The characters are drawn from a generator seeded with ``seed``, so the same
    arguments always yield the same identifier (including the default seed of 42).

    Args:
        prefix: Text placed in front of the random part.
        length: Number of random characters to append.
        use_digits: Include ``0-9`` in the alphabet.
        use_letters: Include ASCII letters (both cases) in the alphabet.
        seed: Seed for the NumPy generator.

    Returns:
        The prefixed identifier string.

    Raises:
        ValueError: If both ``use_digits`` and ``use_letters`` are False.
    """
    generator = np.random.default_rng(seed=seed)

    # Alphabet order matters for reproducibility: digits first, then letters.
    alphabet = (string.digits if use_digits else '') + (string.ascii_letters if use_letters else '')

    if len(alphabet) == 0:
        raise ValueError("At least one of 'use_digits' or 'use_letters' must be True.")

    # Sample `length` characters (with replacement) and glue them together.
    picked = generator.choice(list(alphabet), size=length)
    suffix = ''.join(picked)
    return prefix + suffix

In [None]:
# Derive a deterministic ID per row by seeding the generator with the row's index label.
# Iterating over df.index avoids building a Series for every row (df.apply(axis=1)) and
# also behaves sensibly on an empty frame.
# Both columns use the same seed, so a row's resume_id and job_id share the same random suffix.
df['resume_id'] = [generate_random_id('RES_', seed=label) for label in df.index]
df['job_id'] = [generate_random_id('JD_', seed=label) for label in df.index]

Set up environment & PySpark

In [36]:
# Load variables from a .env file (if present) into the process environment
load_dotenv()

# Fail fast with a readable message. The previous
# `os.environ[...] = os.getenv(...)` was a no-op when the key was present and raised
# an opaque "TypeError: str expected, not NoneType" when it was missing.
if not os.getenv("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in the environment or in a .env file."
    )

In [37]:
# Connection string for MongoDB, taken from the environment (None when MONGODB_URI is unset)
mongodb_uri = os.getenv("MONGODB_URI")

# Build (or reuse) a Spark session that uses the same URI for reads and writes
# and requests the MongoDB Spark connector package.
spark = (
    SparkSession.builder
    .appName("SaveJSONtoMongoDB")
    .config("spark.mongodb.read.connection.uri", mongodb_uri)
    .config("spark.mongodb.write.connection.uri", mongodb_uri)
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:10.5.0")
    .getOrCreate()
)

# BRONZE TABLE

In [38]:
# Define the chat model shared by the resume and job-description extraction calls.
# temperature is pinned to 0 (presumably to keep extractions as repeatable as possible);
# max_tokens and timeout are left unset (None); failed calls are retried up to 2 times.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

## Resume Feature Extraction

In [39]:
# Define models
    
# Schema for one work-experience entry of a resume; every field is optional and defaults to None.
# Documented with comments rather than a class docstring: pydantic presumably copies a docstring
# into the model's JSON schema, which would change the format instructions sent to the LLM.
class Experience(BaseModel):
    role: Optional[str] = Field(None, description="The job title or position held")
    company: Optional[str] = Field(None, description="The name of the company")
    # Dates are typed as datetime, so the LLM's value must be parseable as one
    date_start: Optional[datetime.datetime] = Field(None, description="The start date of the job")
    date_end: Optional[datetime.datetime] = Field(None, description="The end date of the job")
    role_description: Optional[str] = Field(None, description="A description of the responsibilities and achievements in the role")

# Schema for one education entry of a resume; every field is optional and defaults to None.
# Documented with comments rather than a class docstring: pydantic presumably copies a docstring
# into the model's JSON schema, which would change the format instructions sent to the LLM.
class Education(BaseModel):
    degree: Optional[str] = Field(None, description="The academic degree obtained")
    institution: Optional[str] = Field(None, description="The name of the educational institution")
    # Dates are typed as datetime, so the LLM's value must be parseable as one
    date_start: Optional[datetime.datetime] = Field(None, description="The start date of the education program")
    date_end: Optional[datetime.datetime] = Field(None, description="The end date of the education program")
    # Numeric only: a non-numeric grade would not validate as float
    grade: Optional[float] = Field(None, description="The GPA or final grade, if available")
    description: Optional[str] = Field(None, description="Additional details about the education")

# Top-level schema the LLM must fill in for a resume.
# Scalar preferences are optional (default None); the list fields are required (`...`),
# so the LLM has to return them, even if only as empty lists.
# Documented with comments rather than a class docstring so the JSON schema (and therefore the
# format instructions generated from it) contains only the field descriptions below.
class Resume(BaseModel):
    name: Optional[str] = Field(None, description="Full name of the person")
    location_preference: Optional[str] = Field(None, description="Preference for their work location or remote, if stated")
    work_authorization: Optional[str] = Field(None, description="Work authorization that the person holds, such as citizenship, if stated")
    employment_type_preference: Optional[str] = Field(
        None,
        description="Type of employment the resume is looking for such as Full-time, Part-time, Contract, Freelance, or Internship, if stated"
    )
    # Adjacent literals are concatenated verbatim, so each sentence carries its own trailing
    # space (previously the text read "...concise wording.Clean up tool names...").
    hard_skills: List[str] = Field(
        ...,
        description=(
            "A list of proficiencies in tools, technologies, frameworks, programming languages, platforms, methodologies, and key professional terms mentioned in the resume. "
            "Avoid duplicates and use concise wording. "
            "Clean up tool names and merge variations."
        ),
    )
    soft_skills: List[str] = Field(..., description="A list of soft skills mentioned in the resume, such as communication, teamwork, and leadership. Avoid duplication.")
    languages: List[str] = Field(..., description="A list of language proficiencies mentioned in the resume")
    experience: List[Experience] = Field(..., description="A list of past work experiences")
    education: List[Education] = Field(..., description="A list of educational qualifications")
    certifications: List[str] = Field(..., description="A list of certifications or licenses mentioned in the resume, such as AWS Certified Solutions Architect, PMP, etc.")

# Parser that supplies the format instructions and validates LLM output against Resume
resume_parser = PydanticOutputParser(pydantic_object=Resume)

In [40]:
# Prompt for resume extraction: a fixed system role plus a human message with two
# placeholders — {text} (the raw resume) and {format_instructions} (filled from the parser).
resume_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that extracts structured information from resumes."),
    ("human", "Extract the following information from the resume:\n\n{text}\n\n{format_instructions}")
])

## Job Desc Feature Extraction

In [41]:
# Define models for job desc

# Schema the LLM must fill in for a job description.
# Scalar attributes are optional (default None); the list fields are required (`...`).
# Documented with comments rather than a class docstring so the JSON schema (and therefore the
# format instructions generated from it) contains only the field descriptions below.
class JD(BaseModel):
    company_name: Optional[str] = Field(None, description="Name of the company posting the job")
    role_title: Optional[str] = Field(None, description="The title or name of the job role being offered")
    employment_type: Optional[str] = Field(None, description="Type of employment, such as Full-time, Part-time, Contract, Freelance, or Internship")
    about_the_company: Optional[str] = Field(None, description="A brief overview or description of the company")
    job_responsibilities: List[str] = Field(..., description="A list of key duties, tasks, or responsibilities associated with the job")
    # The skill lists ask for de-duplication, mirroring the Resume schema: without it the
    # recorded extraction repeated entries such as "AWS", "SQL" and "Snowflake".
    required_hard_skills: List[str] = Field(..., description="A list of technical or hard skills required or preferred for the job. Avoid duplicates.")
    required_soft_skills: List[str] = Field(..., description="A list of soft skills or character required or preferred for the job. Avoid duplicates.")
    required_language_proficiencies: List[str] = Field(..., description="A list of language proficiencies required for the job")
    required_work_authorization: Optional[str] = Field(None, description="Work authorization required for the job")
    required_education: Optional[str] = Field(None, description="The minimum educational qualification required for the job, such as a degree or certification")
    job_location: Optional[str] = Field(None, description="Location where the job is based, such as a city or remote")

# Parser that supplies the format instructions and validates LLM output against JD
jd_parser = PydanticOutputParser(pydantic_object=JD)

In [42]:
# Prompt for job-description extraction: a fixed system role plus a human message with two
# placeholders — {text} (the raw job description) and {format_instructions} (filled from the parser).
jd_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that extracts structured information from job descriptions."),
    ("human", "Extract the following information from the job description:\n\n{text}\n\n{format_instructions}")
])

## Parse

In [43]:
# Create a function to parse text with llm
def parse_with_llm(text, prompt_template, parser, llm):
    """Send ``text`` through the LLM and return the parser's structured result.

    Args:
        text: Raw document text substituted into the template's ``text`` slot.
        prompt_template: Object exposing ``format_messages(text=..., format_instructions=...)``.
        parser: Object exposing ``get_format_instructions()`` and ``parse(str)``.
        llm: Object exposing ``invoke(messages)`` whose result has a ``content`` attribute.

    Returns:
        Whatever ``parser.parse`` returns for the model's reply content.
        Errors raised by the LLM call or the parser are not caught here.
    """
    # The parser tells the model what output format it will accept.
    format_instructions = parser.get_format_instructions()
    messages = prompt_template.format_messages(
        text=text,
        format_instructions=format_instructions,
    )

    llm_reply = llm.invoke(messages)
    structured = parser.parse(llm_reply.content)
    return structured

### Parse resume

In [44]:
# Preview the raw resume text at position 6240
print(df['resume_text'].iloc[6240])

SummaryFull stack Software Engineer with 8+ years of experience in software industry. Professional, creative, responsible with proven analytical skills and knowledge of Microsoft .Net platform and PHP. Passionate about new technologies and software development. Fast learner, focused, problem solver, great Team Player, and lover of good software development practices including Systems Development Life Cycle (SDLC).
HighlightsC#, PHP.Technologies/Services: ASP.NET, ASP.NET MVC, ASP.NET Web API, EF, ADO.NET, T-SQL, .NET Framework, LINQ, Razor, Hangfire, SignalR, HTML, HTML5, XML, CSS, CSS3, JSON, AJAX, jQuery, AngularJS, Bootstrap, SOAP, REST, Alfresco ECM, Alfresco Activiti BPM, Alfresco OCR, OCR, Tesseract, AWS S3, AWS EC2, CodeIgniter, Composer, Google Maps API, Apache Web Server, Apache Tomcat, PL/pgSQL, LaTeX.Databases: MS SQL Server, PostgreSQL, MySQL.Source Code Control: SVN, GIT.Project Management Tool: Redmine, Gforge.IDEs: Visual Studio, PhpStorm, Netbeans, Intellij Idea, Zend S

In [45]:
# Run the resume extraction on a single row as a smoke test.
# NOTE(review): this parses position 6239, whereas the preview cell prints position 6240 —
# confirm which row is intended so the preview and the parsed output describe the same resume.
parsed_resume = parse_with_llm(df['resume_text'].iloc[6239], resume_prompt_template, resume_parser, llm)

In [46]:
# Show the parsed resume as indented JSON
print(parsed_resume.model_dump_json(indent=2))

{
  "name": null,
  "location_preference": null,
  "work_authorization": null,
  "employment_type_preference": null,
  "hard_skills": [
    "ATM",
    "Broadband",
    "C",
    "Cables",
    "Cisco Routers",
    "Citrix",
    "Dispatching",
    "Ethernet",
    "Flash",
    "Frame Relay",
    "GSM",
    "HP OpenView",
    "Inventory",
    "IP",
    "ISIS",
    "Office",
    "Word",
    "MSN",
    "Network",
    "Networks",
    "Quality",
    "Siemens",
    "Switches",
    "T1",
    "Troubleshooting",
    "UMTS",
    "Upgrades",
    "VPN",
    "T3",
    "OC3",
    "OC12",
    "OC48",
    "OC192",
    "FastEthernet",
    "GigE",
    "MPLS",
    "DIA",
    "Tellabs 5500",
    "5320",
    "532L",
    "Alcatel 1630",
    "1631",
    "1671",
    "1677",
    "Ericsson UMTS/GSM",
    "Lucent UMTS",
    "Nokia GSM",
    "DACS",
    "MUXES",
    "DDM2000",
    "DMX",
    "FLM150",
    "FT2000",
    "Fujitsu",
    "5E",
    "DMS",
    "WFA-C/DI/DO",
    "GRETA",
    "REACT",
    "INTAS",
    "NMA"

### Parse job desc

In [15]:
# Preview the raw job-description text at position 6236
print(df['job_description_text'].iloc[6236])

Hi,
Hope you are doing great today. Please find the job description below. Let me know your job interest as soon as possible. I will highly appreciate it if you can refer somebody suitable for this position. 
Role: Data Engineer (Oracle and DataStage).Location: RemoteContract Position
Job Description:RoleResponsibilities:Skills: Oracle, Datastage, UNIX, PLSQL, SQL. Good to have: AWS, Matillion, Snowflake. Data engineering experience; expert level experience with SQL. Experience with the cloud (AWS, Azure andor Google Cloud Platform).  Experience in cloud-based data warehouses (Snowflake, Google BigQuery, Amazon Redshift, Azure Synapse Analytics).  Experience with cloud-based ETLELT tools (Matillion, Glue, Data Factory) and data modelling.  Experience with version control systems (Git, SVN).  Understanding of and willingness to embrace Agile Principles. 
Looking forward to your response . 
Shubhanshu Tripathishubhanshu.t@cblsolutions.com 469-947-7816 (Ext  209)Cerebral Technologies, Inc

In [16]:
# Run the job-description extraction on position 6236 as a smoke test
parsed_jd = parse_with_llm(df['job_description_text'].iloc[6236], jd_prompt_template, jd_parser, llm)

In [17]:
# Show the parsed job description as indented JSON
print(parsed_jd.model_dump_json(indent=2))

{
  "company_name": "Cerebral Technologies, Inc",
  "role_title": "Data Engineer",
  "employment_type": "Contract",
  "about_the_company": null,
  "job_responsibilities": [],
  "required_hard_skills": [
    "Oracle",
    "Datastage",
    "UNIX",
    "PLSQL",
    "SQL",
    "AWS",
    "Matillion",
    "Snowflake",
    "AWS",
    "Azure",
    "Google Cloud Platform",
    "Snowflake",
    "Google BigQuery",
    "Amazon Redshift",
    "Azure Synapse Analytics",
    "Matillion",
    "Glue",
    "Data Factory",
    "Git",
    "SVN",
    "SQL"
  ],
  "required_soft_skills": [
    "Agile Principles"
  ],
  "required_language_proficiencies": [],
  "required_work_authorization": null,
  "required_education": null,
  "job_location": "Remote"
}


### Parse & save into DB

In [73]:
# Small working sample: the first five and the last five rows of df, original index labels kept
df_subset = pd.concat([df.head(5), df.tail(5)])
df_subset

Unnamed: 0,resume_text,job_description_text,label,snapshot_date,resume_id,job_id
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit,2024-10-10,RES_QDvgj241,JD_QDvgj241
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit,2024-06-09,RES_tvKW28PW,JD_tvKW28PW
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit,2024-11-10,RES_Pg6ipOr5,JD_Pg6ipOr5
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit,2024-09-12,RES_O5bebNRA,JD_O5bebNRA
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit,2024-02-04,RES_JWSvWYY5,JD_JWSvWYY5
6236,SummaryResults-driven Data Entry Clerk with ex...,"Hi,\nHope you are doing great today. Please fi...",Good Fit,2024-06-02,RES_vNEJ62Py,JD_vNEJ62Py
6237,Professional SummaryWith the attitude of learn...,Job Title: DHT - Front End Software Engineer W...,Good Fit,2024-09-01,RES_DPqh0lVb,JD_DPqh0lVb
6238,Summary• \nOver\nThree years of extensi...,LHH Recruitment Solutions is looking for a Sof...,Good Fit,2024-11-02,RES_1HWrRA5T,JD_1HWrRA5T
6239,ProfileAbility to prioritize and multi-task in...,Our client is a growing Medical Device company...,Good Fit,2024-07-26,RES_XdUNowSD,JD_XdUNowSD
6240,SummaryFull stack Software Engineer with 8+ ye...,Robert Half is looking for a Senior Full Stack...,Good Fit,2024-08-22,RES_2RPwzELC,JD_2RPwzELC


Parse resumes and JDs

In [74]:
# Parsed records, kept pairwise aligned: parsed_resumes[k] and parsed_jds[k] come from the same row.
parsed_resumes = []
parsed_jds = []
# Index labels of rows that were skipped because either extraction failed
failed_rows = []

for idx, row in tqdm(df_subset.iterrows(), total=len(df_subset)):
    resume_text = row['resume_text']
    jd_text = row['job_description_text']
    # The Spark schema declares snapshot_date as a string, so store it as text explicitly
    # instead of relying on how Spark renders a datetime.date object in a string column.
    snapshot_date = str(row['snapshot_date'])
    try:
        # Process resume
        # NOTE: parsed_resume / parsed_jd are deliberately left as notebook-level names;
        # the scoring cells further down read them after this loop has finished.
        parsed_resume = parse_with_llm(resume_text, resume_prompt_template, resume_parser, llm)
        resume_record = {**parsed_resume.model_dump(mode="json"),
                         'snapshot_date': snapshot_date,
                         'id': row['resume_id']}

        # Process JD
        parsed_jd = parse_with_llm(jd_text, jd_prompt_template, jd_parser, llm)
        jd_record = {**parsed_jd.model_dump(mode="json"),
                     'snapshot_date': snapshot_date,
                     'id': row['job_id']}
    except Exception as e:
        # Best effort: report and skip the whole row so a half-parsed pair is never stored
        failed_rows.append(idx)
        print(f"Error parsing row {idx}: {e}")
        continue

    # Append only once both sides succeeded, keeping the two lists in step
    parsed_resumes.append(resume_record)
    parsed_jds.append(jd_record)

100%|██████████| 10/10 [01:45<00:00, 10.51s/it]


Convert into PySpark Dataframe

In [84]:
def python_type_to_spark_type(annotation):
    """Map a Python / typing annotation to a Spark SQL data type.

    Handles ``Optional[X]`` / ``Union[...]`` (the first non-None member is used),
    ``List[X]`` (becomes ``ArrayType``), nested pydantic models (become a nested
    ``StructType`` without the record-level metadata fields) and the scalar types
    str, bool, int, float and datetime. Anything unrecognised falls back to ``StringType``.

    Args:
        annotation: The type annotation taken from a pydantic field.

    Returns:
        A Spark ``DataType`` instance.
    """
    origin = get_origin(annotation)

    if origin is Union:  # Handle Optional
        args = [arg for arg in get_args(annotation) if arg is not type(None)]
        return python_type_to_spark_type(args[0])

    if origin in (list, List):
        item_args = get_args(annotation)
        # A bare `List` carries no element type; treat its items as strings
        element_type = python_type_to_spark_type(item_args[0]) if item_args else StringType()
        return ArrayType(element_type)

    if isinstance(annotation, type):
        if issubclass(annotation, BaseModel):
            # Nested structs must not get snapshot_date / id: those belong to the top-level record
            return pydantic_to_spark_schema(annotation, include_metadata=False)
        if issubclass(annotation, str):
            return StringType()
        # bool is a subclass of int, so it has to be tested first or it would map to IntegerType
        if issubclass(annotation, bool):
            return BooleanType()
        if issubclass(annotation, int):
            return IntegerType()
        if issubclass(annotation, float):
            # NOTE(review): FloatType is single precision while Python floats are double — confirm this is enough
            return FloatType()
        if issubclass(annotation, datetime.datetime):
            # Records are produced with model_dump(mode="json"), which emits datetimes as strings
            return StringType()

    return StringType()

def pydantic_to_spark_schema(model: type, include_metadata: bool = True) -> StructType:
    """Derive a Spark ``StructType`` from a pydantic model's declared fields.

    Args:
        model: A pydantic ``BaseModel`` subclass.
        include_metadata: When True (the default, used for top-level records) the
            string columns ``snapshot_date`` and ``id`` are appended after the model's
            own fields. Nested models are converted with False.

    Returns:
        A ``StructType`` with one nullable field per model field, in declaration order.
    """
    fields = [
        StructField(name, python_type_to_spark_type(field.annotation), True)  # assume all nullable
        for name, field in model.model_fields.items()
    ]

    if include_metadata:
        fields.append(StructField('snapshot_date', StringType(), True))
        fields.append(StructField('id', StringType(), True))

    return StructType(fields)

In [85]:
# Build a Spark DataFrame from the parsed resume dicts using the schema derived from the
# Resume model, then repartition it by snapshot_date.
resume_df = spark.createDataFrame(parsed_resumes, schema=pydantic_to_spark_schema(Resume)).repartition("snapshot_date")

In [88]:
# Build a Spark DataFrame from the parsed job-description dicts using the schema derived from
# the JD model, then repartition it by snapshot_date.
jd_df = spark.createDataFrame(parsed_jds, schema=pydantic_to_spark_schema(JD)).repartition("snapshot_date")

Save into DB

In [87]:
# Persist the parsed resumes to MongoDB (database "jobmirror", collection "resume").
# NOTE(review): mode("overwrite") is requested rather than append — confirm that replacing
# existing data on every run is intended.
(
    resume_df.write
    .format("mongodb")
    .mode("overwrite")
    .option("database", "jobmirror")
    .option("collection", "resume")
    .partitionBy("snapshot_date")
    .save()
)

                                                                                

In [90]:
# Persist the parsed job descriptions to MongoDB (database "jobmirror", collection "jd").
# NOTE(review): mode("overwrite") is requested rather than append — confirm that replacing
# existing data on every run is intended.
(
    jd_df.write
    .format("mongodb")
    .mode("overwrite")
    .option("database", "jobmirror")
    .option("collection", "jd")
    .partitionBy("snapshot_date")
    .save()
)

                                                                                

# SILVER

# GOLD

## Get scores

In [37]:
# Embedding client used for all similarity scoring below; the task type is set to
# SEMANTIC_SIMILARITY on the "models/text-embedding-004" model.
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004", task_type="SEMANTIC_SIMILARITY"
)

In [39]:
# Embed each required hard skill of the JD and each hard skill of the resume (one vector per skill).
# NOTE(review): parsed_jd / parsed_resume are whatever those names were last bound to. The recorded
# scores below match the last row processed by the batch-parsing loop, not the single-row
# examples — consider binding the pair explicitly before scoring.
embeddings_required_skills = embedding_model.embed_documents(parsed_jd.required_hard_skills)
embeddings_skills_owned = embedding_model.embed_documents(parsed_resume.hard_skills)

In [40]:
# Stack the embedding lists into 2-D arrays (rows = skills)
required_skills = np.array(embeddings_required_skills)
skills_owned = np.array(embeddings_skills_owned)

# Scale every row to unit length so that a plain dot product equals cosine similarity
required_row_norms = np.linalg.norm(required_skills, axis=1, keepdims=True)
owned_row_norms = np.linalg.norm(skills_owned, axis=1, keepdims=True)
required_skills = required_skills / required_row_norms
skills_owned = skills_owned / owned_row_norms

# similarity_matrix[i, j] = cosine similarity of required skill i and owned skill j
similarity_matrix = required_skills @ skills_owned.T

In [42]:
# For every required skill keep the single most similar owned skill, if it scores at least 0.6.
# NOTE(review): at 0.6 the recorded output also pairs look-alike but unrelated names
# (e.g. Redux/Redmine, Firebase/Hangfire) — the cut-off may need to be raised.
best_matches = []

for row_idx, required in enumerate(parsed_jd.required_hard_skills):
    row_scores = similarity_matrix[row_idx]
    top_idx = row_scores.argmax()
    top_score = row_scores[top_idx]
    if top_score < 0.6:
        continue
    best_matches.append((required, parsed_resume.hard_skills[top_idx], top_score))

# Report the retained pairs
for req_skill, own_skill, score in best_matches:
    print(f"Required: {req_skill}  <=> Best Owned: {own_skill}  | Similarity: {score:.2f}")

Required: PostgreSQL  <=> Best Owned: PostgreSQL  | Similarity: 1.00
Required: Express  <=> Best Owned: EF  | Similarity: 0.63
Required: React  <=> Best Owned: HTML5  | Similarity: 0.63
Required: NodeJS  <=> Best Owned: AngularJS  | Similarity: 0.73
Required: Redux  <=> Best Owned: Redmine  | Similarity: 0.63
Required: HTML  <=> Best Owned: HTML  | Similarity: 1.00
Required: CSS  <=> Best Owned: CSS  | Similarity: 1.00
Required: JavaScript  <=> Best Owned: jQuery  | Similarity: 0.86
Required: JSON  <=> Best Owned: JSON  | Similarity: 1.00
Required: Git  <=> Best Owned: GIT  | Similarity: 0.95
Required: REST  <=> Best Owned: REST  | Similarity: 1.00
Required: Firebase  <=> Best Owned: Hangfire  | Similarity: 0.62
Required: Material-UI  <=> Best Owned: AngularJS  | Similarity: 0.63
Required: D3js  <=> Best Owned: jQuery  | Similarity: 0.72
Required: Docker (Compose)  <=> Best Owned: Composer  | Similarity: 0.67
Required: AWS  <=> Best Owned: AWS EC2  | Similarity: 0.85


In [47]:
# Embed the JD's role title (single query vector) and every experience title of the resume.
# NOTE(review): role_title and Experience.role are Optional in the schemas, so either may be
# None here — confirm how the embedding client handles that, or filter beforehand.
embeddings_role_name = embedding_model.embed_query(parsed_jd.role_title)
embeddings_experience_titles = embedding_model.embed_documents([exp.role for exp in parsed_resume.experience])

In [43]:
# Role title of the JD being scored
parsed_jd.role_title

'Senior Full Stack Engineer (PERN Stack)'

In [48]:
# Experience titles of the resume being scored, in the order they were embedded
[exp.role for exp in parsed_resume.experience]

['Software Developer',
 'Software .Net Developer',
 'Software Engineer and Professor']

In [49]:
# Role title -> single vector; experience titles -> one row per title
role_name = np.array(embeddings_role_name)
experiences = np.array(embeddings_experience_titles)

# Scale to unit length so the dot product below is a cosine similarity
role_norm = np.linalg.norm(role_name)
experience_row_norms = np.linalg.norm(experiences, axis=1, keepdims=True)
role_name = role_name / role_norm
experiences = experiences / experience_row_norms

# One score per experience title. Note this rebinds `similarity_matrix`, replacing the
# skills matrix computed earlier under the same name.
similarity_matrix = experiences @ role_name.T

In [50]:
# Despite the name this is a 1-D array here: one cosine score per experience title
similarity_matrix

array([0.65028087, 0.62905722, 0.6121288 ])