# In [2]:
#import the necessary libraries
import os
import re

# In [3]:
#read the job description
def load_jd(filepath):
  """Read a job-description file and return its entire contents as text.

  The file is decoded as UTF-8 explicitly instead of with the platform's
  default encoding, so bullet glyphs such as "•" decode the same way on
  every machine. The "utf-8-sig" codec also drops a leading byte-order
  mark when one is present, so it cannot end up glued to the first line.

  Args:
    filepath: Path of the text file to read.

  Returns:
    The decoded file contents as a single string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  with open(filepath, "r", encoding="utf-8-sig") as f:
    return f.read()

#normalize and clean the text
def clean_jd_text(text):
  """Normalize raw job-description text into clean, non-empty lines.

  Steps, in order:
    1. Remove carriage returns and turn tabs into single spaces.
    2. Rewrite each bullet glyph (plus any spaces right after it) as "- ".
    3. Strip every line and discard the lines that end up empty.

  Args:
    text: Raw job-description text.

  Returns:
    The cleaned text, with the surviving lines joined by newline characters.
  """
  #step 1:
  #step 1a. - remove \r
  cleaned = text.replace("\r", "")
  #step 1b. - replace \t with a space
  cleaned = cleaned.replace("\t", " ")

  #step 2: replace the 3 main bullet point types with a - and a trailing space
  #Ex. • Built SQL pipelines becomes - Built SQL pipelines
  #The spaces following the glyph are consumed too, so the result has exactly
  #one space after the dash (a plain replace left two).
  #NOTE(review): the third bullet literal was an empty/invisible string, and
  #str.replace("", "- ") inserts "- " between every character. U+F0B7 (the
  #private-use bullet commonly produced by Word's Symbol font) is assumed to
  #be the intended glyph - TODO confirm against real inputs.
  cleaned = re.sub(r"[•●\uf0b7] *", "- ", cleaned)

  #TODO: update 1/2 need to be worked on
  #update 1: put email, phone and link on separate lines
  #cleaned = re.sub(r"\s*-\s*", "\n", cleaned)
  #update 2: put job titles and employment date ranges on separate lines
  #cleaned = re.sub(r"(\b\d{4}\b.*)", r"\n\1", cleaned)

  #step 3:
  #step 3a. - split it into lines and trim the whitespace around each one
  lines = [line.strip() for line in cleaned.split("\n")]
  #step 3b. - remove empty lines
  lines = [line for line in lines if line]

  return "\n".join(lines)

#grab the job title
def extract_job_title(text):
  """Return the first line of the text, stripped, as the job title.

  Args:
    text: Job-description text.

  Returns:
    Everything before the first newline with surrounding whitespace
    removed; an empty string when that first line is blank.
  """
  #partition yields the whole text as the head when there is no newline
  heading, _, _ = text.partition("\n")
  return heading.strip()

#identify the required and preferred skills - from my own personal experience
def extract_skills(text):
  """Return the required, preferred and combined skill lists.

  Placeholder implementation: the text is not inspected yet, so all three
  lists are always empty.

  Args:
    text: Job-description text (currently unused).

  Returns:
    A dict with the keys "required_skills", "preferred_skills" and
    "all_skills", where "all_skills" is the required list followed by the
    preferred list.
  """
  skills = {"required_skills": [], "preferred_skills": []}
  #concatenation builds a fresh list, independent of the two above
  skills["all_skills"] = skills["required_skills"] + skills["preferred_skills"]
  return skills

#grab the experience needed for the role - years and seniority
def extract_experience_requirements(text):
  """Return the experience requirements found in a job description.

  Placeholder implementation: the text is not inspected yet, so both
  values are always None.

  Args:
    text: Job-description text (currently unused).

  Returns:
    A dict with the keys "experience_years" and "seniority".
  """
  #dict.fromkeys fills every key with None by default
  return dict.fromkeys(("experience_years", "seniority"))

#extract job responsibilities
def extract_job_responsibilities(text):
  """Collect the bullet lines of a job description.

  A line counts as a bullet when its very first character is "-" or "*";
  lines with leading whitespace before the marker are not matched.

  Args:
    text: Job-description text.

  Returns:
    The matching lines, unchanged and in their original order.
  """
  bullet_markers = ("-", "*")
  return [entry for entry in text.split("\n") if entry.startswith(bullet_markers)]

#main function that combines the previous functions
def parse_jd(filepath):
    """Load a job-description file and return all of its parsed fields.

    The file is read with load_jd, normalized with clean_jd_text, and the
    cleaned text is then passed to each extractor.

    Args:
        filepath: Path of the job-description file.

    Returns:
        A dict with the keys "file_name", "raw_text", "clean_text",
        "job_title", "required_skills", "preferred_skills", "all_skills",
        "responsibilities", "min_experience_years" and "seniority".
    """
    source_text = load_jd(filepath)
    normalized = clean_jd_text(source_text)

    #run every extractor over the normalized text
    job_title = extract_job_title(normalized)
    skills = extract_skills(normalized)
    experience = extract_experience_requirements(normalized)
    duties = extract_job_responsibilities(normalized)

    parsed = {
        "file_name": os.path.basename(filepath),
        "raw_text": source_text,
        "clean_text": normalized,
        "job_title": job_title,
    }
    #copy over only the three known skill lists, in a fixed order
    for key in ("required_skills", "preferred_skills", "all_skills"):
        parsed[key] = skills[key]
    parsed["responsibilities"] = duties
    parsed["min_experience_years"] = experience["experience_years"]
    parsed["seniority"] = experience["seniority"]
    return parsed