# In [3]:
#import the necessary libraries
import os
import re

# In [4]:
# Skill keywords grouped by category. Matching is done against lower-cased
# text, so entries are written in lower case.
skills_library = {
    "programming": ["python", "java", "sql", "r"],
    "cloud": ["aws", "azure", "gcp"],
    "data_tools": ["power bi", "tableau", "excel", "snowflake"],
    "ml_tools": ["scikit-learn", "tensorflow", "pytorch"],
    "databases": ["postgresql", "mysql", "oracle"],
    "soft_skills": ["communication", "leadership", "collaboration"],
}

# Flat, lower-cased list of every skill, in category order. Only the values
# are needed, so iterate .values() instead of unpacking unused category keys.
all_skills = [
    skill.lower()
    for category_skills in skills_library.values()
    for skill in category_skills
]
#all_skills[:20]

#lower-case the text and reduce it to a small, predictable character set
def normalize_text(text):
  """Return ``text`` lower-cased, stripped of most punctuation and squeezed.

  Only the characters a-z, 0-9, "+", "-", "." and whitespace survive; every
  other character is replaced by a space. Runs of whitespace are then
  collapsed to a single space and the ends are trimmed.
  """
  # Applied strictly in this order: (pattern, replacement)
  substitutions = (
    (r"[\n\r\t]", " "),           # line breaks and tabs become spaces
    (r"[^a-z0-9+\s\-\.]", " "),   # drop anything outside the allowed set
    (r"\s+", " "),                # squeeze repeated whitespace
  )

  cleaned = text.lower()
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)

  return cleaned.strip()

#find which of the known skills are mentioned in a piece of text
def find_skills_in_text(text, skills_list):
  """Return the skills from ``skills_list`` that are mentioned in ``text``.

  The text is cleaned with ``normalize_text`` and every skill is searched for
  as a whole term rather than as a raw substring, so "java" is not reported
  for "javascript", "sql" for "mysql", or "excel" for "excellent".

  Args:
    text: Free text to scan (a resume or a job-description section).
    skills_list: Iterable of skill strings. They are compared as given
      against the lower-cased text.

  Returns:
    A list of the matched skills without duplicates, in the order in which
    they appear in ``skills_list``.
  """
  clean_text = normalize_text(text)
  found = []

  for skill in skills_list:
    # An empty string is "contained" in every text; never report it.
    if not skill:
      continue

    if len(skill) == 1:
      # One-letter skills such as "r" must stand alone between whitespace
      # (or the text edges): "python sql r" matches, "r-squared" does not.
      before, after = r"(?<!\S)", r"(?!\S)"
    else:
      # Longer skills must not touch a letter or digit on either side.
      # The guard is skipped on an edge that is itself punctuation, so a
      # skill starting or ending with a symbol can still sit against a word.
      before = r"(?<![a-z0-9])" if skill[0].isalnum() else ""
      after = r"(?![a-z0-9])" if skill[-1].isalnum() else ""

    if re.search(before + re.escape(skill) + after, clean_text):
      found.append(skill)

  # dict.fromkeys removes duplicates while keeping a stable, repeatable order
  # (a plain set() would give a different order from run to run).
  return list(dict.fromkeys(found))

#grab the required skills from the job description
def extract_required_skills_jd(text, skills_list):
  """Return the skills listed in the "required" part of a job description.

  The normalized text is searched for a heading ("requirements",
  "required skills" or "must have", tried in that order). The section is the
  text after the first heading that matches, up to the next "preferred",
  "nice to have" or "bonus" marker, or the end of the text. Skills in that
  section are looked up with ``find_skills_in_text``.

  Returns an empty list when no heading is found.
  """
  jd_text = normalize_text(text)

  heading_patterns = [
    r"requirements[:\-\s]+(.*?)(preferred|nice to have|bonus|$)",
    r"required skills[:\-\s]+(.*?)(preferred|nice to have|bonus|$)",
    r"must have[:\-\s]+(.*?)(preferred|nice to have|bonus|$)"
  ]

  # IGNORECASE: case-insensitive, DOTALL: "." also spans line breaks
  search_flags = re.IGNORECASE | re.DOTALL

  for heading_pattern in heading_patterns:
    hit = re.search(heading_pattern, jd_text, search_flags)
    if hit is None:
      continue
    # Only the first heading that matches is used.
    return find_skills_in_text(hit.group(1), skills_list)

  return []

#grab the preferred skills from the job description
def extract_preferred_skills_jd(text, skills_list):
  """Return the skills listed in the "preferred" part of a job description.

  The normalized text is searched for a heading ("preferred skills",
  "preferred qualifications", "nice to have" or "bonus", tried in that
  order). The section is the text after the first heading that matches, up
  to the next "requirements", "required" or "must have" marker, or the end
  of the text. Skills in that section are looked up with
  ``find_skills_in_text``.

  Args:
    text: Job-description text.
    skills_list: Skill strings to look for.

  Returns:
    The list of preferred skills, or an empty list when the job description
    has no preferred-skills section.
  """
  clean_text = normalize_text(text)

  preferred_section = [
    r"preferred skills[:\-\s]+(.*?)(requirements|required|must have|$)",
    r"preferred qualifications[:\-\s]+(.*?)(requirements|required|must have|$)",
    r"nice to have[:\-\s]+(.*?)(requirements|required|must have|$)",
    r"bonus[:\-\s]+(.*?)(requirements|required|must have|$)"
  ]

  for pattern in preferred_section:
    #re.IGNORECASE - case insensitive
    #re.DOTALL - allows multi-line matching
    match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
    if match:
      # Only the first heading that matches is used.
      return find_skills_in_text(match.group(1), skills_list)

  # No preferred heading at all: report "nothing preferred" instead of
  # failing on a result variable that was never assigned.
  return []

def extract_resume_skills(text, skills_list):
  """Return the skills from ``skills_list`` found anywhere in the resume text.

  The text is normalized first and then scanned with ``find_skills_in_text``.
  """
  return find_skills_in_text(normalize_text(text), skills_list)

def compare_skills(required, preferred, resume):
  """Split required and preferred skills into found / missing in the resume.

  Args:
    required: Skills the job description requires.
    preferred: Skills the job description lists as preferred.
    resume: Skills detected in the resume.

  Returns:
    A dict with the keys "required_found", "required_missing",
    "preferred_found" and "preferred_missing". Each value is a list without
    duplicates that keeps the order of the corresponding input, so the
    result is identical from run to run.
  """
  # Build the lookup set once; membership tests are then O(1).
  resume_set = set(resume)

  def _partition(skills):
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique = list(dict.fromkeys(skills))
    found = [skill for skill in unique if skill in resume_set]
    missing = [skill for skill in unique if skill not in resume_set]
    return found, missing

  required_found, required_missing = _partition(required)
  preferred_found, preferred_missing = _partition(preferred)

  return {
    "required_found": required_found,
    "required_missing": required_missing,
    "preferred_found": preferred_found,
    "preferred_missing": preferred_missing
  }

#ATS match percentage based on required skills
def compute_skill_match(required, resume):
  """Return the percentage of required skills that the resume covers.

  Args:
    required: Skills the job description requires.
    resume: Skills detected in the resume.

  Returns:
    The share of distinct required skills present in ``resume`` as a
    percentage rounded to two decimals, or ``None`` when there are no
    required skills to compare against.
  """
  # Count each required skill once on both sides of the division, so a
  # repeated entry cannot drag the percentage down.
  required_set = set(required)
  if not required_set:
    return None

  matched = required_set & set(resume)
  return round(len(matched) / len(required_set) * 100, 2)

#end-to-end pipeline: job description + resume text in, skill report out
def extract_keywords(jd_text, resume_text, skills_list):
  """Compare a resume against a job description and return a skill report.

  Both texts are normalized, the required and preferred skills are pulled
  from the job description, the resume is scanned for skills, and the
  results are combined.

  Returns:
    A dict with the keys "required_skills", "preferred_skills",
    "resume_skills", "required_found", "required_missing",
    "preferred_found", "preferred_missing" and "skill_match_percent".
  """
  normalized_jd = normalize_text(jd_text)
  normalized_resume = normalize_text(resume_text)

  # The three extractions run in this order: required, preferred, resume.
  report = {
    "required_skills": extract_required_skills_jd(normalized_jd, skills_list),
    "preferred_skills": extract_preferred_skills_jd(normalized_jd, skills_list),
    "resume_skills": extract_resume_skills(normalized_resume, skills_list),
  }

  breakdown = compare_skills(
    report["required_skills"],
    report["preferred_skills"],
    report["resume_skills"],
  )
  breakdown_keys = (
    "required_found",
    "required_missing",
    "preferred_found",
    "preferred_missing",
  )
  for key in breakdown_keys:
    report[key] = breakdown[key]

  report["skill_match_percent"] = compute_skill_match(
    report["required_skills"],
    report["resume_skills"],
  )
  return report