In [39]:
#install the necessary packages
%pip install pdfplumber
%pip install python-docx



In [40]:
#import the necessary libraries
import os
#refer to the "create an ats scanner" notes
import pdfplumber
#python-docx
#Link: https://python-docx.readthedocs.io/en/latest/
from docx import Document
import re

In [41]:
#parser.py

In [42]:
def detect_file_type(filepath):
  file_ext = os.path.splitext(filepath)[1].lower()
  #return file_ext
  if file_ext == ".pdf":
      return "pdf"
  elif file_ext == ".docx":
      return "docx"
  elif file_ext == ".txt":
      return "txt"
  else:
      return "Unsupported resume format. Use either a .PDF, .DOCX, or .TXT file."

def parse_pdf(filepath):
  text = ""
  with pdfplumber.open(filepath) as pdf:
    for page in pdf.pages:
      text += page.extract_text()
  return text

def parse_docx(filepath):
  text = ""
  doc = Document(filepath)
  for paragraph in doc.paragraphs:
    text += paragraph.text
  return text

def parse_txt(filepath):
  with open(filepath, "r") as f:
    text = f.read()

'''
def clean_text(text):
  text = text.replace("\n", " ")
'''

def clean_text(text):
  #step 1:
  #step 1a. - remove \r
  cleaned = text.replace("\r", "")
  #step 1b. - replace \t with a space
  cleaned = cleaned.replace("\t", " ")

  #step 2: replace the 3 main bullet point types with a - and a trailing space
  #Ex. • Built SQL pipelines becomes - Built SQL pipelines
  cleaned = cleaned.replace("•", "- ")
  cleaned = cleaned.replace("●", "- ")
  cleaned = cleaned.replace("", "- ")

  #update 1/2 need to be worked on
  #update 1: put email, phone and link on separate lines
  #cleaned = re.sub(r"\s*-\s*", "\n", cleaned)
  #update 2: put job titles and employment date ranges on separate lines
  #cleaned = re.sub(r"(\b\d{4}\b.*)", r"\n\1", cleaned)

  #step 3:
  #step 3a. - split it into lines
  lines = [line.strip() for line in cleaned.split("\n")]
  #step 3b. - remove non-empty lines
  lines = [line for line in lines if line]

  cleaned = "\n".join(lines)
  return cleaned

#main function that combines the previous functions
def parse_resume(filepath):
  #function 1
  file_type = detect_file_type(filepath)
  #functions 2,3,4
  if file_type == "pdf":
      raw_text = parse_pdf(filepath)
  elif file_type == "docx":
      raw_text = parse_docx(filepath)
  elif file_type == "txt":
      raw_text = parse_txt(filepath)
  else:
      raise ValueError("Unsupported resume format. Use either a .PDF, .DOCX, or .TXT file.")

  cleaned = clean_text(raw_text)

  return {
      "file_name": os.path.basename(filepath),
      "file_type": file_type,
      "raw_text": raw_text,
      "clean_text": cleaned
  }

In [43]:
#detect_file_type("example.pdf")
#parse_pdf("example.pdf")
#clean_text(parse_pdf("example.pdf"))
#print(clean_text(parse_pdf("example.pdf")))
#parse_resume("example.pdf")

In [44]:
#jd_parser.py

In [45]:
#import the necessary libraries
import os
import re

In [46]:
def load_jd(filepath):
  with open(filepath, "rb") as f:
    raw = f.read()

  try:
    return raw.decode("utf-8")
  except UnicodeDecodeError:
    pass

  try:
    return raw.decode("cp1252")
  except UnicodeDecodeError:
    pass

  return raw.decode("utf-8", errors="replace")

#normalize and clean the text
def clean_jd_text(text):
  #step 1:
  #step 1a. - remove \r
  cleaned = text.replace("\r", "")
  #step 1b. - replace \t with a space
  cleaned = cleaned.replace("\t", " ")

  #step 2: replace the 3 main bullet point types with a - and a trailing space
  #Ex. • Built SQL pipelines becomes - Built SQL pipelines
  cleaned = cleaned.replace("•", "- ")
  cleaned = cleaned.replace("●", "- ")
  cleaned = cleaned.replace("", "- ")

  #update 1/2 need to be worked on
  #update 1: put email, phone and link on separate lines
  #cleaned = re.sub(r"\s*-\s*", "\n", cleaned)
  #update 2: put job titles and employment date ranges on separate lines
  #cleaned = re.sub(r"(\b\d{4}\b.*)", r"\n\1", cleaned)

  #step 3:
  #step 3a. - split it into lines
  lines = [line.strip() for line in cleaned.split("\n")]
  #step 3b. - remove non-empty lines
  lines = [line for line in lines if line]

  cleaned = "\n".join(lines)
  return cleaned

#grab the job title
def extract_job_title(text):
  first_line = text.split("\n")[0]
  job_title = first_line.strip()
  return job_title

#identify the required and preferred skills - from my own personal experience
def extract_skills(text):
  required = []
  preferred = []
  return {
      "required_skills": required,
      "preferred_skills": preferred,
      "all_skills": required + preferred
  }

#grab the experience needed for the role - years and seniority
def extract_experience_requirements(text):
  experience_years = None
  seniority = None
  return {
      "experience_years": experience_years,
      "seniority": seniority
  }

#extract job responsibilities
def extract_job_responsibilities(text):
  responsibilities = []
  for line in text.split("\n"):
    if line.startswith("-") or line.startswith("*"):
      responsibilities.append(line)
  return responsibilities

#main function that combines the previous functions
def parse_jd(filepath):
    raw_text = load_jd(filepath)
    clean_text = clean_jd_text(raw_text)

    title = extract_job_title(clean_text)
    skill_data = extract_skills(clean_text)
    experience_data = extract_experience_requirements(clean_text)
    responsibilities = extract_job_responsibilities(clean_text)

    return {
        "file_name": os.path.basename(filepath),
        "raw_text": raw_text,
        "clean_text": clean_text,
        "job_title": title,
        "required_skills": skill_data["required_skills"],
        "preferred_skills": skill_data["preferred_skills"],
        "all_skills": skill_data["all_skills"],
        "responsibilities": responsibilities,
        "min_experience_years": experience_data["experience_years"],
        "seniority": experience_data["seniority"]
    }

In [47]:
text = load_jd("sample_jd.pdf")
#print(text)

In [48]:
cleaned_text = clean_jd_text(text)
#print(cleaned_text)

In [49]:
job_title = extract_job_title(cleaned_text)
#print(job_title)

In [50]:
job_skills = extract_skills(cleaned_text)
#print(job_skills)

In [51]:
job_experience_requirements = extract_experience_requirements(cleaned_text)
#print(job_experience_requirements)

In [52]:
job_resposibilities = extract_job_responsibilities(cleaned_text)
#print(job_resposibilities)

In [53]:
job_result = parse_jd("sample_jd.pdf")
#print(job_result)

In [54]:
#job description fixes

In [55]:
#grab the job title
def extract_job_title(text):
  first_line = text.split("\n")[0]
  job_title = first_line.strip()
  return job_title

In [56]:
#based on the sample_jd_1.txt file, the first line contains multiple pieces of information: job title, city, st ate, country, etc.
#the extract_job_title(text): assumes that the first line = job title
#we need to return only the part before the location/extra text
def extract_job_title(text):
  first_line = text.split("\n")[0]
  markers = [" Job Description"," Job Summary", " United States", ", IL", ", CA", ", TX", ", NY", " and "]
  for marker in markers:
    if marker in first_line:
      first_line = first_line.split(marker)[0]
  job_title = first_line.strip()
  return job_title

In [57]:
job_title = extract_job_title(cleaned_text)
print(job_title)

%PDF-1.4


In [58]:
#import the necessary libraries
#Link: https://course.spacy.io/en/
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
def extract_job_title_spacy(text):
  doc = nlp(text)
  matcher = Matcher(nlp.vocab)

  #add more patterns accordingly
  #pattern 1: adj. + noun + noun + proper noun
  #Ex. Senior Data Visualization Specialist
  pattern1 = [
      {"POS": "ADJ"},
      {"POS": "NOUN"},
      {"POS": "NOUN"},
      {"POS": "PROPN"}
  ]
  #pattern 2: Ex. Senior Machine Learning Engineer
  pattern2 = [
      {"POS": "ADJ"},
      {"POS": "PROPN"},
      {"POS": "PROPN"}
  ]
  #pattern 3: Ex. Data Visualization Specialist
  pattern3 = [
      {"POS": "NOUN"},
      {"POS": "NOUN"},
      {"POS": "PROPN"}
  ]
  #pattern 4
  pattern4 = [
      {"POS": "ADJ"},
      {"POS": "PROPN"},
      {"POS": "PROPN"},
      {"POS": "PROPN"}
  ]

  '''
  #combine the patterns into one
  pattern_flexible = [
      {"POS": {"IN": ["ADJ", "PROPN"]}, "OP": "?"},
      {"POS": {"IN": ["NOUN", "PROPN"]}},
      {"POS": {"IN": ["NOUN", "PROPN"]}, "OP": "*"}
    ]
  '''

  matcher.add("JOB_TITLE", [pattern1, pattern2, pattern3, pattern4])
  matches = matcher(doc)

  #matcher.add("JOB_TITLE", [pattern_flexible])
  #matches = matcher(doc)

  for match_id, start, end in matches:
    span = doc[start:end]
    return span.text.strip()

  return None

In [59]:
text_2 = "Senior Data Visualization Specialist Oak Brook, IL, United States and 2 more Job Description Job Summary:"
job_title_2 = extract_job_title_spacy(text_2)
print(job_title_2)

more Job Description


In [60]:
my_string = "Senior Data Visualization Specialist Oak Brook, IL, United States and 2 more Job Description Job Summary:"

words = my_string.split()
#first_four_words = words[:4]
first_four_words = " ".join(words[:4])
print(first_four_words)

Senior Data Visualization Specialist


In [61]:
#keyword_extractor.py

In [62]:
skills_library = {
    "programming": ["python", "java", "sql", "r"],
    "cloud": ["aws", "azure", "gcp"],
    "data_tools": ["power bi", "tableau", "excel", "snowflake"],
    "ml_tools": ["scikit-learn", "tensorflow", "pytorch"],
    "databases": ["postgresql", "mysql", "oracle"],
    "soft_skills": ["communication", "leadership", "collaboration"]
}

all_skills = []
for category, items in skills_library.items():
    all_skills.extend([skill.lower() for skill in items])
#all_skills[:20]

#normalize and clean the text
def normalize_text(text):
  text = text.lower()
  text = re.sub(r"[\n\r\t]", " ", text)
  text = re.sub(r"[^a-z0-9+\s\-\.]", " ", text)
  text = re.sub(r"\s+", " ", text)
  return text.strip()

#grab skills from resume
def find_skills_in_text(text, skills_list):
  found = []

  #clean_text  = "python sql aws services"
  #padded_text = " python sql aws services "
  clean_text = normalize_text(text)
  padded_text = f" {clean_text} "

  for skill in skills_list:
    if len(skill) == 1:
      if f" {skill} " in padded_text:
        found.append(skill)
      continue
    if skill in padded_text:
      found.append(skill)

  return list(set(found))

#grab the required skills from the resume
def extract_required_skills_jd(text, skills_list):
  required = []
  clean_text = normalize_text(text)

  required_sections = [
    r"requirements[:\-\s]+(.*?)(preferred|nice to have|bonus|$)",
    r"required skills[:\-\s]+(.*?)(preferred|nice to have|bonus|$)",
    r"must have[:\-\s]+(.*?)(preferred|nice to have|bonus|$)"
  ]

  for pattern in required_sections:
    #re.IGNORECASE - case insensitive
    #re.DOTALL - allows multi-line matching
    match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
    if match:
      section = match.group(1)
      required = find_skills_in_text(section, skills_list)
      break

  return required

#grab the preferred skills from the resume
def extract_preferred_skills_jd(text, skills_list):
  preferred = []
  clean_text = normalize_text(text)

  preferred_section = [
    r"preferred skills[:\-\s]+(.*?)(requirements|required|must have|$)",
    r"preferred qualifications[:\-\s]+(.*?)(requirements|required|must have|$)",
    r"nice to have[:\-\s]+(.*?)(requirements|required|must have|$)",
    r"bonus[:\-\s]+(.*?)(requirements|required|must have|$)"
  ]

  for pattern in preferred_section:
    match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
    if match:
      section = match.group(1)
      preferred = find_skills_in_text(section, skills_list)
      break

  return preferred

def extract_resume_skills(text, skills_list):
  clean_text = normalize_text(text)
  resume_skills = find_skills_in_text(clean_text, skills_list)
  return resume_skills

def compare_skills(required, preferred, resume):
  required_found = list(set(required) & set(resume))
  required_missing = list(set(required) - set(resume))
  preferred_found = list(set(preferred) & set(resume))
  preferred_missing = list(set(preferred) - set(resume))

  return {
    "required_found": required_found,
    "required_missing": required_missing,
    "preferred_found": preferred_found,
    "preferred_missing": preferred_missing
    }

#ATS match percentage based on required skills
def compute_skill_match(required, resume):
  if len(required) == 0:
    return None

  required_found = set(required) & set(resume)
  match_percent = (len(required_found) / len(required)) * 100

  return round(match_percent, 2)

#main function that combines the previous functions
def extract_keywords(jd_text, resume_text, skills_list):
  jd_clean = normalize_text(jd_text)
  resume_clean = normalize_text(resume_text)

  required = extract_required_skills_jd(jd_clean, skills_list)
  preferred = extract_preferred_skills_jd(jd_clean, skills_list)
  resume_skills = extract_resume_skills(resume_clean, skills_list)

  comparison = compare_skills(required, preferred, resume_skills)
  match_percent = compute_skill_match(required, resume_skills)

  return {
    "required_skills": required,
    "preferred_skills": preferred,
    "resume_skills": resume_skills,
    "required_found": comparison["required_found"],
    "required_missing": comparison["required_missing"],
    "preferred_found": comparison["preferred_found"],
    "preferred_missing": comparison["preferred_missing"],
    "skill_match_percent": match_percent
    }

In [63]:
resume_text = "Experienced in Python, SQL, Power BI, and cloud technologies such as AWS."
skills = find_skills_in_text(resume_text, all_skills)
#print(skills)

In [64]:
required = ["python", "sql", "aws"]
resume = ["python", "sql"]

#compute_skill_match(required, resume)

In [65]:
#scorer.py

In [66]:
def score_resume(jd_text, resume_text, skills_list):
    from keyword_extractor.ipnyb import extract_keywords

    results = extract_keywords(jd_text, resume_text, skills_list)

    final_score = results["skill_match_percent"]

    return {
        "required_skills": results["required_skills"],
        "preferred_skills": results["preferred_skills"],
        "resume_skills": results["resume_skills"],
        "required_found": results["required_found"],
        "required_missing": results["required_missing"],
        "preferred_found": results["preferred_found"],
        "preferred_missing": results["preferred_missing"],
        "skill_match_percent": final_score
    }