In [0]:
%run /Workspace/Users/ajkayode@outlook.com/ehr-project/ehr-project-bundle/envsetup/Configs

In [0]:
import pandas as pd
from pyspark.sql.functions import pandas_udf
from typing import Iterator

@pandas_udf("string")
def pdf_parser(batch_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
    """Convert batches of raw PDF bytes into Markdown text.

    Iterator-of-Series pandas UDF: for every incoming batch it yields a
    Series of the same length in which each element is the Markdown
    rendering (via ``pymupdf4llm.to_markdown``) of the corresponding PDF.

    Args:
        batch_iter: iterator of pandas Series whose elements are the binary
            content of one PDF file each (or ``None`` for a null cell).

    Yields:
        One Series of Markdown strings per input batch; null inputs stay null.

    Raises:
        Whatever PyMuPDF / pymupdf4llm raise for content that cannot be opened
        or converted; such errors are not swallowed here.
    """
    # Imported inside the UDF body, presumably so the import is resolved in the
    # worker process that actually executes the batches.
    import pymupdf
    import pymupdf4llm

    def get_md_from_pdf(b_content):
        # A null binary cell has nothing to parse; propagate the null rather
        # than handing ``None`` to PyMuPDF as a stream.
        if b_content is None:
            return None

        pdf_doc = pymupdf.Document(stream=b_content, filetype="pdf")
        try:
            return pymupdf4llm.to_markdown(pdf_doc)
        finally:
            # Release the native document handle for every row; the worker
            # process is long-lived and would otherwise accumulate them.
            # Guarded because closing an already-closed document raises.
            if not pdf_doc.is_closed:
                pdf_doc.close()

    for x in batch_iter:
        yield x.apply(get_md_from_pdf)

In [0]:
class JDSilverLoad:
    """Incremental bronze-to-silver load for job-description (JD) PDFs.

    One run of :meth:`extract_JD`:

    1. reads the change data feed of the bronze JD table for the window
       between the last recorded execution time and the current timestamp,
    2. converts the PDF bytes to Markdown and overwrites a ``<silver>_stg``
       staging table with the result,
    3. asks the configured chat model (``ai_query``) to extract structured
       fields from the Markdown and appends the output to the silver table,
    4. inserts a new row for this run into the jobs metadata table.

    The class relies on the notebook-level names ``spark``, ``ats_configs``
    and ``pdf_parser``; ``spark`` and ``ats_configs`` are not defined in this
    notebook section (presumably provided by the runtime and the ``%run``
    of the Configs notebook -- TODO confirm).
    """

    # Format used for every timestamp string exchanged with Spark.
    _TS_FORMAT = "%Y-%m-%d %H:%M:%S"

    # Change-feed row types that represent the current state of a record.
    # ``update_preimage`` and ``delete`` rows carry stale/removed content and
    # must not be parsed and appended to the silver table.
    _INGESTED_CHANGE_TYPES = ("insert", "update_postimage")

    def __init__(self):
        # Allow change-feed reads whose timestamps fall outside the table's
        # commit range instead of failing (see the Delta change data feed docs).
        spark.conf.set(
            "spark.databricks.delta.changeDataFeed.timestampOutOfRange.enabled", "true"
        )

    def _query_formatted_time(self, query, column):
        """Run ``query`` and return ``column`` of its first row as a string.

        Args:
            query: SQL text to execute with ``spark.sql``.
            column: name of the date/timestamp column to read from the first row.

        Returns:
            The value formatted as ``YYYY-MM-DD HH:MM:SS``.

        Raises:
            ValueError: if the query returns no rows or the value is null
                (for example when the job has no metadata row yet).
        """
        row = spark.sql(query).first()
        if row is None:
            raise ValueError(
                f"Query for '{column}' returned no rows; "
                "a metadata row for this job may be missing."
            )

        value = row.asDict()[column]
        if value is None:
            raise ValueError(f"Query for '{column}' returned a null value.")

        return value.strftime(self._TS_FORMAT)

    def get_start_time(self):
        """Return the most recent ``execution_time`` recorded for this job.

        Used as the lower bound of the change-feed window.

        Raises:
            ValueError: if the metadata table has no usable row for the job.
        """
        return self._query_formatted_time(
            f"""
            select execution_time as start_time
            from {ats_configs.jobs_metadate_table_name}
            where job_name = '{ats_configs.jd_silver_job_name}'
            order by execution_time desc
            """,
            "start_time",
        )

    def update_metadata(self, end_time, load_date):
        """Insert a completion row for this job into the jobs metadata table.

        Args:
            end_time: formatted timestamp string; upper bound of the window
                that was just processed.
            load_date: formatted date string recorded for this run.

        The values are inserted positionally; presumably the table columns are
        (job_name, last_load_date, execution_time, status) -- TODO confirm
        against the table definition.
        """
        print(f"Updating metadata for {ats_configs.jd_silver_job_name}")
        spark.sql(
            f"""
            insert into {ats_configs.jobs_metadate_table_name}
            values('{ats_configs.jd_silver_job_name}', 
            '{load_date}', 
            '{end_time}', 
            'Job Execution Completed.' )""")
        print(f"Updated metadata for {ats_configs.jd_silver_job_name}")

    def get_load_date(self):
        """Return the day after the latest ``last_load_date`` recorded for this job.

        The value comes back from Spark as a date, so the time part of the
        formatted string is ``00:00:00``.

        Raises:
            ValueError: if the metadata table has no usable row for the job.
        """
        return self._query_formatted_time(
            f"""
            select date_add(last_load_date, 1) as load_date
            from {ats_configs.jobs_metadate_table_name}
            where job_name = '{ats_configs.jd_silver_job_name}'
            order by last_load_date desc
            """,
            "load_date",
        )

    def get_end_time(self):
        """Return Spark's ``current_timestamp()`` as a formatted string.

        Used as the upper bound of the change-feed window and stored as the
        execution time of this run.
        """
        return self._query_formatted_time(
            "select current_timestamp() as end_time",
            "end_time",
        )

    def get_prompt(self):
        """Return the instruction text that is prepended to each job description.

        The prompt asks the model for a single JSON object with the fields
        listed below; the Markdown of the job description is concatenated
        directly after the trailing ``Job Description:`` line.
        """
        prompt = """
        Extract the following fields from the provided job description and return them as a single JSON object:
        
        job_title
        
        required_skills (list)
        
        nice_to_have_skills (list)
        
        education (string; required degree(s) or field(s))
        
        experience_level (string; e.g., 2+ years, Entry Level, etc.)
        
        responsibilities (list)
        
        location (string; if available)
        
        Ignore sections about company overview, benefits, perks, application process, or any information not directly relevant to the candidates qualifications or job requirements.
        
        For skills, separate required skills (explicitly stated as required, must have, or essential) and nice-to-have skills (those listed as preferred, bonus, or optional)
        
        If a field is missing, set its value to null (for strings) or an empty array (for lists)
        Job Description:
        """
        return prompt

    def extract_JD(self):
        """Run one incremental load from the bronze JD table to the silver table.

        Side effects: overwrites ``<silver>_stg``, appends to the silver table
        and inserts one row into the jobs metadata table. The metadata row is
        written last, so a failure earlier in the run leaves the recorded
        window unchanged.

        Raises:
            ValueError: if the start time or load date cannot be determined
                from the metadata table.
        """
        from pyspark.sql.functions import col, concat, expr, lit

        # Resolve every time boundary up front so all steps use the same window.
        load_date = self.get_load_date()
        start_time = self.get_start_time()
        end_time = self.get_end_time()

        # Read the bronze change feed for the window and keep only rows that
        # describe the current state of a record.
        # NOTE(review): per the Delta docs both timestamp bounds are inclusive,
        # so a commit landing exactly on the recorded second could be read by
        # two consecutive runs -- confirm whether de-duplication is needed.
        bronze_jd_df = (
            spark.read
            .option("readChangeFeed", "true")
            .option("startingTimestamp", start_time)
            .option("endingTimestamp", end_time)
            .table(ats_configs.jd_bronze_table_name)
            .where(col("_change_type").isin(*self._INGESTED_CHANGE_TYPES))
        )

        # Parse the PDF binary to Markdown and materialise it in the staging
        # table. saveAsTable returns nothing, so the result is not assigned.
        stg_table_name = f"{ats_configs.jd_silver_table_name}_stg"
        (
            bronze_jd_df.withColumn("text_content", pdf_parser("content"))
            .selectExpr(
                "path as source",
                "text_content",
            )
            .write.mode("overwrite")
            .saveAsTable(stg_table_name)
        )

        prompt = self.get_prompt()

        # Build the model request with the DataFrame API rather than splicing
        # the prompt into a SQL string literal: a quote or backslash in the
        # prompt can then never break (or alter) the generated SQL.
        jd_extract_df = (
            spark.read.table(stg_table_name)
            .withColumn("_ai_request", concat(lit(prompt), col("text_content")))
            .withColumn(
                "json_context",
                expr(
                    f"ai_query(endpoint => '{ats_configs.chat_model_endpoint_name}', "
                    "request => _ai_request)"
                ),
            )
            .drop("_ai_request")
        )

        # Append the extracted rows to the silver table.
        jd_extract_df.write.mode("append").saveAsTable(ats_configs.jd_silver_table_name)

        # Record the processed window only after the silver write succeeded.
        self.update_metadata(end_time=end_time, load_date=load_date)


In [0]:
# Notebook entry point: build the loader (which sets the change-feed Spark
# conf in its constructor) and run one incremental bronze-to-silver load.
JDSilver = JDSilverLoad()
JDSilver.extract_JD()