In [1]:
import json
import copy
import pandas as pd
import importlib
from datetime import datetime
from tqdm import tqdm

from dotenv import load_dotenv

from utils import read_csv_file, read_desired_skills, write_csv_file, write_json_file

In [2]:
load_dotenv()

True

In [3]:
# File paths used throughout the notebook
applicants_resume_analysis_df_path = './output/applicants_resume_analysis_df.csv' # path to the .csv file storing the resume analysis results for the applicants
pdf_files_contents_path = './data/pdf_files_contents.csv' # path to a .csv file containing the content of PDF resume documents
desired_skills_path = './desired_skills.txt' # path to desired skills defined by HR for this specific job posting
fine_grained_skill_grade_dict_path = './output/fine_grained_skill_grade_dict.json' # path to the .json file containing fine_grained_skill_grade_dict
desired_skills_summary_dict_path = './output/desired_skills_summary_dict.json' # path to the .json file containing candidate summaries for the list of desired skills

# 1. 📄 PDF Information Extraction
- Uses the Qwen 2.5 VLM to extract information from the resume documents while preserving their layout information.
- The implementation is in `./models/pdf_ocr/ocr_qwen25.ipynb`.

In [4]:
# read pdf content
pdf_files_contents = read_csv_file(pdf_files_contents_path)
print(pdf_files_contents.shape)
# display(pdf_files_contents.head(2))

(37, 3)


In [5]:
# Read the raw skills text; it is expected to hold one skill per line.
desired_skills_text = read_desired_skills(desired_skills_path)

# splitlines() handles both "\n" and "\r\n" line endings. Blank lines and surrounding
# whitespace are dropped so that a trailing newline or an empty line in the text cannot
# turn into an empty-string "skill" (which would later become a DataFrame column).
desired_skills_list = [line.strip() for line in desired_skills_text.splitlines() if line.strip()]
desired_skills_list

['object detection',
 'semantic segmentation',
 'image classification',
 'computer vision',
 'transformers',
 'vision transformers']

In [None]:
def create_applicants_resume_analysis_df_place_holder(pdf_files_contents, desired_skills_list):
    """
        Build the working table that later stages fill in with skill-grading results.

        The resume table is deep-copied and one extra column per desired skill is
        added, initialised to ``False`` for every row.

        Parameters
        ----------
        pdf_files_contents : pandas.DataFrame
            Table of extracted resume contents. It is copied, not modified.
        desired_skills_list : iterable
            Skill names; each one becomes a column label in the result.

        Returns
        -------
        pandas.DataFrame
            Independent copy of ``pdf_files_contents`` with one ``False``-filled
            column per entry of ``desired_skills_list``.
        """
    # Deep copy so that adding columns never touches the caller's frame
    placeholder_df = pdf_files_contents.copy(deep=True)

    # One flag column per desired skill, defaulting to "not present"
    for skill_column in desired_skills_list:
        placeholder_df[skill_column] = False

    return placeholder_df

applicants_resume_analysis_df = create_applicants_resume_analysis_df_place_holder(pdf_files_contents, desired_skills_list)

# display(applicants_resume_analysis_df.head(2))

# 2. 🧪 Skill Grader

In [7]:
# import the defined chain for Skill Grader
from chains.skill_grader import skill_grader as skill_grader_chain

In [None]:
def skill_grader(desired_skill, resume_content):
    """
        Run the Skill Grader chain for one skill against one resume.

        Parameters
        ----------
        desired_skill : str
            The skill to look for (e.g., "object detection").
        resume_content : str
            Content of the candidate's resume, passed to the chain as-is.

        Returns
        -------
        object
            Whatever ``skill_grader_chain.invoke`` returns, unchanged. The caller in
            this notebook reads its ``binary_score`` attribute.

        Notes
        -----
        Forms part of the 1st Screening Step (skill-based filtering) of the pipeline.
        """
    chain_inputs = {
        "desired_skill": desired_skill,
        "resume_content": resume_content,
    }
    return skill_grader_chain.invoke(chain_inputs)

In [None]:
def get_desired_skill_score_dict_for_one_resume(resume_content):
    """
        Grade a single resume against every desired skill.

        Parameters
        ----------
        resume_content : str
            Content of one candidate's resume.

        Returns
        -------
        dict
            Maps each entry of the notebook-level ``desired_skills_list`` to the
            ``binary_score`` attribute of the result returned by ``skill_grader``.

        Notes
        -----
        The skills are read from the global ``desired_skills_list``; they are not a
        parameter of this function. One ``skill_grader`` call is made per skill.
        """
    return {
        skill_name: skill_grader(skill_name, resume_content).binary_score
        for skill_name in desired_skills_list
    }

In [10]:
# desired_skill_score_dict

In [11]:
# limit the analysis to first 10 applicants
# .copy() makes the subset an independent frame, so the in-place writes performed by the
# following cells operate on their own data rather than on a slice of the full table.
applicants_resume_analysis_df = applicants_resume_analysis_df.iloc[:10].copy()

In [12]:
# display(applicants_resume_analysis_df)

In [None]:
def get_desired_skill_score_dict_for_all_resumes(applicants_resume_analysis_df):
    """
        Fill the skill columns of the analysis table for every applicant.

        For each row, the resume text in the ``pdf_content`` column is graded with
        ``get_desired_skill_score_dict_for_one_resume`` and the resulting scores are
        written into the columns named after the entries of the notebook-level
        ``desired_skills_list``.

        Parameters
        ----------
        applicants_resume_analysis_df : pandas.DataFrame
            Analysis table with a ``pdf_content`` column and one column per desired
            skill. It is modified in place.

        Returns
        -------
        pandas.DataFrame
            The same DataFrame object that was passed in, with the skill columns
            updated.

        Notes
        -----
        Produces the binary skill presence matrix used for initial candidate filtering.
        """
    # ``.at`` is label-based, so iterate over the real index labels; counting
    # 0..n-1 only works while the frame still has a default RangeIndex.
    for row_label in applicants_resume_analysis_df.index:
        pdf_content = applicants_resume_analysis_df.at[row_label, 'pdf_content']
        desired_skill_score_dict = get_desired_skill_score_dict_for_one_resume(pdf_content)
        for desired_skill in desired_skills_list:
            applicants_resume_analysis_df.at[row_label, desired_skill] = desired_skill_score_dict[desired_skill]

    return applicants_resume_analysis_df

applicants_resume_analysis_df = get_desired_skill_score_dict_for_all_resumes(applicants_resume_analysis_df)

In [14]:
# display(applicants_resume_analysis_df.head(10))

In [None]:
# store the results
write_csv_file(applicants_resume_analysis_df, applicants_resume_analysis_df_path)

# 4. 🧪 Fine-grained Skill Grader

In [16]:
# import defined chain for Fine-grained Skill Grader
from chains.fine_grained_skill_grader import fine_grained_skill_grader as fine_grained_skill_grader_chain

In [29]:
def get_present_date():
    """
        Describe the current month and year as text.

        Returns
        -------
        str
            A leading space, the full month name, a space and the four-digit year,
            e.g. ``" March 2025"``. The day of the month is not included.

        Notes
        -----
        The value is passed to the fine-grained skill grader chain as ``current_date``.
        """
    today = datetime.now()
    # The leading space is part of the returned text and is kept deliberately
    return f" {today:%B} {today.year}"

# Example usage
# print(get_present_date())

In [None]:
def fine_grained_skill_grader(desired_skill, resume_content):
    """
        Run the Fine-grained Skill Grader chain for one skill against one resume.

        Parameters
        ----------
        desired_skill : str
            Skill to evaluate.
        resume_content : str
            Content of the candidate's resume, passed to the chain as-is.

        Returns
        -------
        dict
            Result of calling ``model_dump()`` on the chain output. Other cells in
            this notebook read its ``summary``, ``key_experiences`` and
            ``months_of_experience`` entries.

        Notes
        -----
        The chain also receives ``current_date``, taken from ``get_present_date()``.
        """
    chain_inputs = {
        'desired_skill': desired_skill,
        'resume_content': resume_content,
        'current_date': get_present_date(),
    }
    grade = fine_grained_skill_grader_chain.invoke(chain_inputs)
    return grade.model_dump()

In [None]:
def process_desired_skills_summary_dict(desired_skills_summary_dict_for_one_candidate):
    """
        Add aggregate fields to each skill entry of one candidate.

        For every skill entry, two keys are added:

        - ``key_experiences_count``: number of items under ``key_experiences``.
        - ``months_of_experience_total``: sum of the values under
          ``months_of_experience``.

        Parameters
        ----------
        desired_skills_summary_dict_for_one_candidate : dict
            Maps a skill name to that skill's grading result (a dict).

        Returns
        -------
        dict
            Deep copy of the input with the two aggregate keys added to every skill
            entry. The input itself is left untouched.

        Notes
        -----
        A missing or ``None`` ``key_experiences`` / ``months_of_experience`` counts as
        empty, and ``None`` items inside ``months_of_experience`` are ignored, so an
        incomplete grading result yields 0 instead of raising. Values that are not
        dicts (for example an already attached summary string) are passed through
        unchanged.
        """
    # Work on a deep copy so the caller's dictionary is never modified
    processed_dict = copy.deepcopy(desired_skills_summary_dict_for_one_candidate)

    for skill_data in processed_dict.values():
        if not isinstance(skill_data, dict):
            # Not a per-skill result; nothing to aggregate
            continue

        # ``or []`` covers both a missing key and an explicit None value
        key_experiences = skill_data.get("key_experiences") or []
        skill_data["key_experiences_count"] = len(key_experiences)

        months_list = skill_data.get("months_of_experience") or []
        skill_data["months_of_experience_total"] = sum(
            months for months in months_list if months is not None
        )

    return processed_dict

In [20]:
# Temporary: reload the analysis table from the CSV written at the end of the Skill Grader section.
# NOTE(review): presumably this lets the section run without repeating the grading calls;
# confirm that the skill columns still hold booleans after the CSV round trip.
applicants_resume_analysis_df = read_csv_file(applicants_resume_analysis_df_path)
# display(applicants_resume_analysis_df.head(2))

In [None]:
def apply_fine_grained_skill_grader_to_all_resumes(applicants_resume_analysis_df):
    """
        Apply fine-grained skill grading to every applicant in the analysis table.

        For each row, every skill from the notebook-level ``desired_skills_list``
        whose column is truthy is graded with ``fine_grained_skill_grader``. The
        per-skill results are post-processed with
        ``process_desired_skills_summary_dict`` and their ``summary`` texts are
        joined with single spaces into one candidate summary.

        Parameters
        ----------
        applicants_resume_analysis_df : pandas.DataFrame
            Analysis table with ``pdf_content``, ``pdf_name`` and one column per
            desired skill. It is modified in place: the ``summary`` column is
            reset to ``''`` and then filled row by row.

        Returns
        -------
        tuple
            ``(fine_grained_skill_grade_dict, applicants_resume_analysis_df)``.
            The dict maps each ``pdf_name`` to that candidate's processed per-skill
            results plus a ``candidate_summary`` entry; the DataFrame is the same
            object that was passed in.

        Notes
        -----
        Results are keyed by ``pdf_name``, so rows sharing a name overwrite each other.
        """
    fine_grained_skill_grade_dict = dict()
    applicants_resume_analysis_df['summary'] = ''

    # ``.at`` is label-based, so iterate over the real index labels; counting
    # 0..n-1 only works while the frame still has a default RangeIndex.
    for idx in tqdm(applicants_resume_analysis_df.index):
        pdf_content = applicants_resume_analysis_df.at[idx, 'pdf_content']
        candidate_name = applicants_resume_analysis_df.at[idx, 'pdf_name']

        # Grade only the skills flagged as present by the first screening step
        fine_grained_skill_grade_dict_for_one_candidate = {
            desired_skill: fine_grained_skill_grader(desired_skill, pdf_content)
            for desired_skill in desired_skills_list
            if applicants_resume_analysis_df.at[idx, desired_skill]
        }

        # Add key_experiences_count and months_of_experience_total to every skill entry
        fine_grained_skill_grade_dict_for_one_candidate = process_desired_skills_summary_dict(
            fine_grained_skill_grade_dict_for_one_candidate
        )

        # Join the per-skill summaries BEFORE storing the combined text in the same
        # dict, so the comprehension only ever sees per-skill entries.
        candidate_summary = ' '.join(
            [details['summary'] for details in fine_grained_skill_grade_dict_for_one_candidate.values()]
        )
        fine_grained_skill_grade_dict_for_one_candidate['candidate_summary'] = candidate_summary

        fine_grained_skill_grade_dict[candidate_name] = fine_grained_skill_grade_dict_for_one_candidate
        applicants_resume_analysis_df.at[idx, 'summary'] = candidate_summary

    return fine_grained_skill_grade_dict, applicants_resume_analysis_df


fine_grained_skill_grade_dict, applicants_resume_analysis_df = apply_fine_grained_skill_grader_to_all_resumes(applicants_resume_analysis_df)

100%|██████████| 10/10 [02:55<00:00, 17.50s/it]


In [22]:
# store the results
write_csv_file(applicants_resume_analysis_df, applicants_resume_analysis_df_path)
write_json_file(fine_grained_skill_grade_dict, fine_grained_skill_grade_dict_path)

# 6. 🧠 Skill Summary Writer
### 6.1. Skill Summary Writer for one skill

In [23]:
# import the defined chain for Skill Summary Writer
from chains.skill_summary_writer import skill_summary_writer as skill_summary_writer_chain

In [None]:
def skill_summary_writer(desired_skill, resume_content):
    """
        Run the Skill Summary Writer chain for one skill against one resume.

        Parameters
        ----------
        desired_skill : str
            Skill to summarize.
        resume_content : str
            Content of the candidate's resume, passed to the chain as-is.

        Returns
        -------
        dict
            Result of calling ``model_dump()`` on the chain output. The caller in
            this notebook reads its ``summary`` entry.

        Notes
        -----
        Supports HR review by providing a written description per skill.
        """
    chain_inputs = {
        'desired_skill': desired_skill,
        'resume_content': resume_content,
    }
    return skill_summary_writer_chain.invoke(chain_inputs).model_dump()


In [None]:
def apply_skill_summary_writer_to_all_resumes(applicants_resume_analysis_df):
    """
        Generate skill summaries for every applicant in the analysis table.

        For each row, every skill from the notebook-level ``desired_skills_list``
        whose column is truthy is summarized with ``skill_summary_writer``. The
        ``summary`` texts of those skills are joined with single spaces and stored
        in the row's ``summary`` column.

        Parameters
        ----------
        applicants_resume_analysis_df : pandas.DataFrame
            Analysis table with ``pdf_content``, ``pdf_name`` and one column per
            desired skill. It is modified in place: the ``summary`` column is
            reset to ``''`` and then filled row by row.

        Returns
        -------
        tuple
            ``(desired_skills_summary_dict, applicants_resume_analysis_df)``.
            The dict maps each ``pdf_name`` to a dict of per-skill summaries; the
            DataFrame is the same object that was passed in.

        Notes
        -----
        Results are keyed by ``pdf_name``, so rows sharing a name overwrite each other.
        """
    desired_skills_summary_dict = dict()
    applicants_resume_analysis_df['summary'] = ''

    # ``.at`` is label-based, so iterate over the real index labels; counting
    # 0..n-1 only works while the frame still has a default RangeIndex.
    for idx in applicants_resume_analysis_df.index:
        pdf_content = applicants_resume_analysis_df.at[idx, 'pdf_content']
        candidate_name = applicants_resume_analysis_df.at[idx, 'pdf_name']

        # Summarize only the skills flagged as present for this applicant
        desired_skills_summary_dict_for_one_candidate = {
            desired_skill: skill_summary_writer(desired_skill, pdf_content)
            for desired_skill in desired_skills_list
            if applicants_resume_analysis_df.at[idx, desired_skill]
        }
        desired_skills_summary_dict[candidate_name] = desired_skills_summary_dict_for_one_candidate

        candidate_summary = ' '.join(
            [details['summary'] for details in desired_skills_summary_dict_for_one_candidate.values()]
        )
        applicants_resume_analysis_df.at[idx, 'summary'] = candidate_summary

    return desired_skills_summary_dict, applicants_resume_analysis_df


desired_skills_summary_dict, applicants_resume_analysis_df = apply_skill_summary_writer_to_all_resumes(applicants_resume_analysis_df)

In [26]:
# store the results
write_csv_file(applicants_resume_analysis_df, applicants_resume_analysis_df_path)
write_json_file(desired_skills_summary_dict, desired_skills_summary_dict_path)

### 6.2. Skill Summary Writer for all matching skills in the resume
Create one summary per candidate by joining the summaries of the skills found in their resume.

In [None]:
def get_candidate_summary(desired_skills_summary_dict):
    """
        Attach a combined summary to every candidate's skill summaries.

        Parameters
        ----------
        desired_skills_summary_dict : dict
            Maps a candidate name to a dict of per-skill results, each of which has
            a ``summary`` entry.

        Returns
        -------
        dict
            Deep copy of the input in which every candidate additionally has a
            ``candidate_summary`` entry: the per-skill ``summary`` texts joined with
            single spaces (an empty string when the candidate has no skills). The
            input itself is left untouched.

        Notes
        -----
        Only dict-valued entries are treated as skills. A ``candidate_summary``
        string left by an earlier call is therefore ignored and recomputed, which
        makes it safe to call this function on its own output.
        """
    # Work on a deep copy so the caller's dictionary is never modified
    summaries_by_candidate = copy.deepcopy(desired_skills_summary_dict)

    for details in summaries_by_candidate.values():
        skill_summaries = [
            entry['summary'] for entry in details.values() if isinstance(entry, dict)
        ]
        details['candidate_summary'] = ' '.join(skill_summaries)

    return summaries_by_candidate

desired_skills_summary_dict_ = get_candidate_summary(desired_skills_summary_dict)

In [28]:
# store the results
write_json_file(desired_skills_summary_dict_, desired_skills_summary_dict_path)

# 7. 📊 Resume Reranker
Uses Qwen3-Reranker-8B to sort shortlisted resumes against an ideal candidate profile.