In [74]:
#install the necessary packages
%pip install pdfplumber
%pip install python-docx



In [75]:
#import the necessary libraries
import os
#refer to the "create an ats scanner" notes
import pdfplumber
#python-docx
#Link: https://python-docx.readthedocs.io/en/latest/
from docx import Document
import re

In [76]:
#parser.py

In [77]:
def detect_file_type(filepath):
  """Map a resume path to a short file-type label based on its extension.

  The extension is compared case-insensitively.

  Returns:
    "pdf", "docx" or "txt" for supported extensions; otherwise a
    human-readable "unsupported format" message (no exception is raised).
  """
  known_types = {".pdf": "pdf", ".docx": "docx", ".txt": "txt"}
  _, suffix = os.path.splitext(filepath)
  return known_types.get(
      suffix.lower(),
      "Unsupported resume format. Use either a .PDF, .DOCX, or .TXT file.",
  )

def parse_pdf(filepath):
  """Extract the text of every page of a PDF, in page order.

  Args:
    filepath: path of the PDF to open with pdfplumber.

  Returns:
    The text of all pages that produced any text, joined with a newline so
    the last line of one page is not fused with the first line of the next.
    An empty string when no page yields text.
  """
  page_texts = []
  with pdfplumber.open(filepath) as pdf:
    for page in pdf.pages:
      page_text = page.extract_text()
      # Depending on the pdfplumber version, a page without a text layer
      # (e.g. a scanned image) yields None or ""; skip both so string
      # concatenation never sees None.
      if page_text:
        page_texts.append(page_text)
  return "\n".join(page_texts)

def parse_docx(filepath):
  """Extract the paragraph text of a .docx file, one paragraph per line.

  Args:
    filepath: path of the Word document to open with python-docx.

  Returns:
    The text of every entry in ``doc.paragraphs`` joined with newlines.
    Keeping the paragraph boundaries matters because the later cleaning
    step works line by line; without a separator the whole document
    collapses into a single line.
  """
  doc = Document(filepath)
  return "\n".join(paragraph.text for paragraph in doc.paragraphs)

def parse_txt(filepath):
  """Read a plain-text resume and return its contents as a string.

  Args:
    filepath: path of the text file.

  Returns:
    The decoded file contents.

  The file is decoded as UTF-8 regardless of the platform default so that
  bullet glyphs survive on every OS; "utf-8-sig" additionally drops a
  leading byte-order mark, and undecodable bytes are replaced instead of
  aborting the whole parse.
  """
  with open(filepath, "r", encoding="utf-8-sig", errors="replace") as f:
    # The contents must be returned: parse_resume feeds them to clean_text.
    return f.read()

'''
def clean_text(text):
  text = text.replace("\n", " ")
'''

def clean_text(text):
  """Normalize raw resume text into trimmed, non-empty lines.

  Args:
    text: raw text as returned by one of the parse_* functions.

  Returns:
    A string in which carriage returns are removed, tabs are spaces, bullet
    glyphs are rewritten as "- ", every line is stripped, and blank lines
    are dropped. Lines are joined with "\n".
  """
  #step 1:
  #step 1a. - remove \r
  cleaned = text.replace("\r", "")
  #step 1b. - replace \t with a space
  cleaned = cleaned.replace("\t", " ")

  #step 2: replace the 3 main bullet point types with a - and a trailing space
  #Ex. • Built SQL pipelines becomes - Built SQL pipelines
  #The third glyph is spelled as an explicit escape (U+F0B7, a private-use
  #bullet - assumed to be the character the original literal intended) so it
  #can never degrade into an empty-string replace, which would insert "- "
  #between every character. Spaces after the glyph are absorbed so the result
  #has exactly one space after the dash.
  cleaned = re.sub("[\u2022\u25cf\uf0b7] *", "- ", cleaned)

  #TODO - updates 1/2 still need to be worked on
  #update 1: put email, phone and link on separate lines
  #cleaned = re.sub(r"\s*-\s*", "\n", cleaned)
  #update 2: put job titles and employment date ranges on separate lines
  #cleaned = re.sub(r"(\b\d{4}\b.*)", r"\n\1", cleaned)

  #step 3:
  #step 3a. - split it into lines and trim each one
  lines = [line.strip() for line in cleaned.split("\n")]
  #step 3b. - drop the empty lines (keep only non-empty ones)
  lines = [line for line in lines if line]

  return "\n".join(lines)

#main function that combines the previous functions
def parse_resume(filepath):
  """Parse a resume file into its raw and cleaned text.

  Detects the file type, reads the file with the matching parser, then
  normalizes the text.

  Returns:
    A dict with the keys "file_name", "file_type", "raw_text" and
    "clean_text".

  Raises:
    ValueError: if the detected type is not pdf, docx or txt.
  """
  readers = {"pdf": parse_pdf, "docx": parse_docx, "txt": parse_txt}

  kind = detect_file_type(filepath)
  reader = readers.get(kind)
  if reader is None:
      # detect_file_type reports unknown extensions with a message string,
      # which never matches a key above.
      raise ValueError("Unsupported resume format. Use either a .PDF, .DOCX, or .TXT file.")

  raw_text = reader(filepath)
  return {
      "file_name": os.path.basename(filepath),
      "file_type": kind,
      "raw_text": raw_text,
      "clean_text": clean_text(raw_text),
  }

In [78]:
#detect_file_type("example.pdf")
#parse_pdf("example.pdf")
#clean_text(parse_pdf("example.pdf"))
#print(clean_text(parse_pdf("example.pdf")))
#parse_resume("example.pdf")

In [79]:
#jd_parser.py

In [80]:
#import the necessary libraries
import os
import re

In [81]:
#read the job description
def load_jd(filepath):
  """Read a job-description text file and return its contents.

  Args:
    filepath: path of the job-description file.

  Returns:
    The decoded file contents as a string.

  The file is decoded as UTF-8 instead of the platform default so bullet
  glyphs are read consistently on every OS; "utf-8-sig" also drops a leading
  byte-order mark (which would otherwise end up in the first line, i.e. the
  job title), and undecodable bytes are replaced rather than raising.
  """
  with open(filepath, "r", encoding="utf-8-sig", errors="replace") as f:
    return f.read()

#normalize and clean the text
def clean_jd_text(text):
  """Normalize raw job-description text into trimmed, non-empty lines.

  Args:
    text: raw job-description text (e.g. the result of load_jd).

  Returns:
    A string in which carriage returns are removed, tabs are spaces, bullet
    glyphs are rewritten as "- ", every line is stripped, and blank lines
    are dropped. Lines are joined with "\n".
  """
  #step 1:
  #step 1a. - remove \r
  cleaned = text.replace("\r", "")
  #step 1b. - replace \t with a space
  cleaned = cleaned.replace("\t", " ")

  #step 2: replace the 3 main bullet point types with a - and a trailing space
  #Ex. • Built SQL pipelines becomes - Built SQL pipelines
  #The third glyph is spelled as an explicit escape (U+F0B7, a private-use
  #bullet - assumed to be the character the original literal intended) so it
  #can never degrade into an empty-string replace, which would insert "- "
  #between every character. Spaces after the glyph are absorbed so the result
  #has exactly one space after the dash.
  cleaned = re.sub("[\u2022\u25cf\uf0b7] *", "- ", cleaned)

  #TODO - updates 1/2 still need to be worked on
  #update 1: put email, phone and link on separate lines
  #cleaned = re.sub(r"\s*-\s*", "\n", cleaned)
  #update 2: put job titles and employment date ranges on separate lines
  #cleaned = re.sub(r"(\b\d{4}\b.*)", r"\n\1", cleaned)

  #step 3:
  #step 3a. - split it into lines and trim each one
  lines = [line.strip() for line in cleaned.split("\n")]
  #step 3b. - drop the empty lines (keep only non-empty ones)
  lines = [line for line in lines if line]

  return "\n".join(lines)

#grab the job title
def extract_job_title(text):
  """Return the first line of ``text``, stripped of surrounding whitespace.

  The first line of the job description is treated as the job title.
  """
  heading, _, _ = text.partition("\n")
  return heading.strip()

#identify the required and preferred skills - from my own personal experience
def extract_skills(text):
  """Return the skill buckets for a job description.

  Placeholder: ``text`` is not inspected yet, so every bucket is empty.

  Returns:
    A dict with "required_skills", "preferred_skills" and "all_skills"
    (required followed by preferred), each a list.
  """
  skill_buckets = {"required_skills": [], "preferred_skills": []}
  skill_buckets["all_skills"] = (
      skill_buckets["required_skills"] + skill_buckets["preferred_skills"]
  )
  return skill_buckets

#grab the experience needed for the role - years and seniority
def extract_experience_requirements(text):
  """Return the experience requirements for a job description.

  Placeholder: ``text`` is not inspected yet, so both values are None.

  Returns:
    A dict with the keys "experience_years" and "seniority".
  """
  return dict.fromkeys(("experience_years", "seniority"))

#extract job responsibilities
def extract_job_responsibilities(text):
  """Collect every line of ``text`` that begins with "-" or "*".

  Lines are returned unchanged and in their original order; a line with
  leading whitespace before the marker does not count.
  """
  bullet_markers = ("-", "*")
  return [entry for entry in text.split("\n") if entry.startswith(bullet_markers)]

#main function that combines the previous functions
def parse_jd(filepath):
    """Load a job description and return everything extracted from it.

    Reads the file, normalizes the text, then runs each extractor on the
    normalized text.

    Returns:
        A dict with the file name, the raw and cleaned text, the job title,
        the three skill lists, the responsibilities, and the experience
        fields ("min_experience_years", "seniority").
    """
    raw_text = load_jd(filepath)
    cleaned = clean_jd_text(raw_text)

    title = extract_job_title(cleaned)
    skills = extract_skills(cleaned)
    experience = extract_experience_requirements(cleaned)
    duties = extract_job_responsibilities(cleaned)

    skill_keys = ("required_skills", "preferred_skills", "all_skills")
    return {
        "file_name": os.path.basename(filepath),
        "raw_text": raw_text,
        "clean_text": cleaned,
        "job_title": title,
        **{key: skills[key] for key in skill_keys},
        "responsibilities": duties,
        "min_experience_years": experience["experience_years"],
        "seniority": experience["seniority"],
    }

In [82]:
# Step-by-step demo of the job-description pipeline: read the sample file
# (relative path) as raw text.
text = load_jd("sample_jd_1.txt")
#print(text)

In [83]:
# Normalize the raw text (bullets, whitespace, blank lines) for the extractors below.
cleaned_text = clean_jd_text(text)
#cleaned_text

In [84]:
# The job title is taken from the first line of the cleaned text.
job_title = extract_job_title(cleaned_text)
#print(job_title)

In [85]:
# NOTE: extract_skills is still a placeholder, so all three skill lists are empty.
job_skills = extract_skills(cleaned_text)
#print(job_skills)

In [86]:
# NOTE: extract_experience_requirements is still a placeholder, so both values are None.
job_experience_requirements = extract_experience_requirements(cleaned_text)
#print(job_experience_requirements)

In [87]:
# Collect the lines that start with "-" or "*".
# NOTE(review): the variable name is misspelled ("resposibilities"); it is left
# unchanged here because other cells may reference it.
job_resposibilities = extract_job_responsibilities(cleaned_text)
#print(job_resposibilities)

In [88]:
# End-to-end run: parse_jd performs all of the steps above in a single call.
job_result = parse_jd("sample_jd_1.txt")
#print(job_result)