Let's get a list of sample job descriptions to extract the skills from.

In [8]:
descriptions = [
    """
The ideal candidate must have expertise in Python, machine learning, and data analysis.
Experience with cloud platforms such as AWS or Azure is required.
Preferred skills include knowledge of Docker, Kubernetes, and CI/CD pipelines.
Having a background in cybersecurity is a plus. 3D printing is also a plus.
""",
    """
Minimum qualifications:
Currently pursuing a Bachelor’s degree in Electrical Engineering, Computer Engineering, Computer Science or a related field.
Experience in one or more of the areas: Computer Architecture, Circuit Design/Simulation, Design Verification, Digital Design, Embedded Systems, Hardware/Software, Signal and Power Integrity, EMC, Statistics, System Modeling, Networking/Security, Test/Measurement or Verilog.

Preferred qualifications:
Currently pursuing a Master’s or PhD in Electrical Engineering, Computer Engineering, Computer Science or a related field, and returning to a degree program after the internship ends.
Experience (e.g., research assistant, teaching assistant, personal projects outside the classroom, etc.) in Hardware, Electrical Engineering, Mechanical Engineering, Communication Engineering, Ocean Engineering, Optical Engineering, or other related fields.
Knowledge and experience in Test/Design/Manufacturing/Prototyping tools.
Excellent coding skills with the ability to use one coding language (e.g., Python, Matlab, C++).
About the job
As a Hardware Engineering Intern, you will design and build the systems for computing infrastructure. Your work has the potential to shape the machinery that goes into data centers affecting Google users. The teams you will work with design, develop, and deploy next-generation consumer hardware while ensuring that this equipment is reliable. You’ll work closely with engineers to improve our hardware to meet Google's standards of quality and reliability. Your work will have the potential to impact many of Google users.

Depending on your experience, you may have an opportunity to work on a project in Electrical Hardware Engineering, System Hardware Engineering, Power Testing or Networking Engineering.

Responsibilities
Perform specific responsibilities which may vary by project area.
    """
]

required_keywords = ["must", "required", "experience", "should", "expertise"]
preferred_keywords = ["preferred", "nice to have", "plus"]


# Notebook for skill extraction in python

define a couple keywords

Create the set and the model

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")


Function to "extract" the skills

In [None]:
def extract_skills(job_description: str):
  # create a document
  doc = nlp(job_description)

  # create the sets
  required_skills = set()
  preferred_skills = set()

  # loop through sentences to categorize skills
  for sent in doc.sents:
    context = "neutral"

    # Determine context based on keywords
    for word in sent:
      if word.text.lower() in required_keywords:
        context = "required"
      elif word.text.lower() in preferred_keywords:
        context = "preferred"

    # Extract noun phrases (potential skills)
    for chunk in sent.noun_chunks:
      if context == "required":
        required_skills.add(chunk.text)
      elif context == "preferred":
        preferred_skills.add(chunk.text)

  return required_skills, preferred_skills

Preprocess documents

In [None]:
def preprocess_document(document: str):
  return document.replace("\n", " ")

In [None]:
first_document = preprocess_document(descriptions[0])
second_document = preprocess_document(descriptions[1])

Test it on the first document

In [None]:
required_first, preferred_first = extract_skills(first_document)
print(required_first)
print(preferred_first)

{'data analysis', 'machine learning', 'expertise', 'AWS', 'Azure', 'Python', ' The ideal candidate', 'cloud platforms', 'Experience'}
{'a background', 'CI/CD pipelines', 'Preferred skills', 'knowledge', 'Docker', '3D printing', 'Kubernetes', 'cybersecurity', 'a plus'}


In [None]:
required_second, preferred_second = extract_skills(second_document)
print(required_second)
print(preferred_second)

{'personal projects', 'assistant', 'Circuit Design/Simulation', 'Knowledge', 'research assistant', 'Networking Engineering', 'Electrical Hardware Engineering', 'EMC', 'Experience', 'Networking/Security', 'Electrical Engineering', 'Communication Engineering', 'the classroom', 'Ocean Engineering', 'Embedded Systems', 'an opportunity', 'Power Testing', 'Power Integrity', 'Signal', 'you', 'Optical Engineering', 'Mechanical Engineering', 'a project', 'System Modeling', 'System Hardware Engineering', 'the areas', 'Hardware/Software', 'experience', 'Statistics', 'Verilog', 'Test/Design/Manufacturing/Prototyping tools', 'Hardware', 'your experience', 'Design Verification', 'Digital Design', 'Computer Architecture', 'Test/Measurement', 'other related fields'}
{'a related field', 'a degree program', 'the internship', 'PhD', 'a Master', 'Computer Engineering', 'Computer Science', 'Electrical Engineering', 'Preferred qualifications'}


Sanitize the list of skills to remove any words which come out weird for example: "the ideal candidate", "experience", "expertise", "a plus", "knowledge", "preferred skills", "a background".

In [None]:
def sanitize_skills(skills: set):
  sanitized_skills = set()

  for skill in skills:
    # Filter out short or overly generic phrases
    if len(skill.split()) < 2 and skill.islower():
      continue

    # Check for stopwords or generic phrases
    doc = nlp(skill)
    if any(token.is_stop for token in doc) or "skill" in skill.lower():
      continue

    # Retain proper nouns, nouns, or meaningful phrases
    if any(word.lower() in required_keywords for word in skill.split()) or any(word.lower() in preferred_keywords for word in skill.split()):
      continue

    if any(token.pos_ in {"NOUN", "PROPN"} for token in doc):
        sanitized_skills.add(skill.strip())

  return sanitized_skills

In [None]:
print(sanitize_skills(required_first))
print(sanitize_skills(preferred_first))
print(sanitize_skills(required_second))
print(sanitize_skills(preferred_second))

{'data analysis', 'machine learning', 'AWS', 'Python', 'cloud platforms'}
{'Docker', '3D printing', 'CI/CD pipelines', 'Kubernetes'}
{'personal projects', 'Circuit Design/Simulation', 'Knowledge', 'research assistant', 'Networking Engineering', 'Electrical Hardware Engineering', 'EMC', 'Electrical Engineering', 'Networking/Security', 'Communication Engineering', 'Ocean Engineering', 'Embedded Systems', 'Power Testing', 'Power Integrity', 'Signal', 'Optical Engineering', 'Mechanical Engineering', 'System Hardware Engineering', 'System Modeling', 'Hardware/Software', 'Statistics', 'Verilog', 'Test/Design/Manufacturing/Prototyping tools', 'Hardware', 'Design Verification', 'Digital Design', 'Computer Architecture', 'Test/Measurement'}
{'Computer Science', 'PhD', 'Electrical Engineering', 'Computer Engineering'}


Make this more robust in implementation

In [None]:
for description in descriptions:
  doc = preprocess_document(description)
  print("Preprocessed doc: ", doc)
  required, preferred = extract_skills(doc)
  print("Raw Required: ", required)
  print("Sanitized Required: ", sanitize_skills(required))
  print("Raw Preferred: ", preferred)
  print("Sanitized Preferred: ", sanitize_skills(preferred))

Preprocessed doc:   The ideal candidate must have expertise in Python, machine learning, and data analysis. Experience with cloud platforms such as AWS or Azure is required. Preferred skills include knowledge of Docker, Kubernetes, and CI/CD pipelines. Having a background in cybersecurity is a plus. 3D printing is also a plus. 
Raw Required:  {'data analysis', 'machine learning', 'expertise', 'AWS', 'Azure', 'Python', ' The ideal candidate', 'cloud platforms', 'Experience'}
Sanitized Required:  {'data analysis', 'machine learning', 'AWS', 'Python', 'cloud platforms'}
Raw Preferred:  {'a background', 'CI/CD pipelines', 'Preferred skills', 'knowledge', 'Docker', '3D printing', 'Kubernetes', 'cybersecurity', 'a plus'}
Sanitized Preferred:  {'Docker', '3D printing', 'CI/CD pipelines', 'Kubernetes'}
Preprocessed doc:   Minimum qualifications: Currently pursuing a Bachelor’s degree in Electrical Engineering, Computer Engineering, Computer Science or a related field. Experience in one or more

In [None]:
for description in descriptions:
  doc = nlp(description)
  for ent in doc.ents:
    print(ent.text, ent.label_)
  print("Done with desc")

Python GPE
AWS ORG
Docker GPE
Kubernetes ORG
CI PERSON
Done with desc
Bachelor ORG
Electrical Engineering ORG
Computer Engineering ORG
one CARDINAL
Computer Architecture, Circuit Design/Simulation, Design Verification ORG
Digital Design ORG
Embedded Systems ORG
Hardware/Software ORG
Signal ORG
EMC ORG
Networking/Security ORG
Verilog PERSON
a Master’s or PhD WORK_OF_ART
Electrical Engineering ORG
Computer Engineering ORG
Hardware, Electrical Engineering ORG
Mechanical Engineering ORG
Communication Engineering ORG
Ocean Engineering ORG
Optical Engineering ORG
Test/Design/Manufacturing/Prototyping ORG
one CARDINAL
Matlab ORG
C++ GPE
Google ORG
Google ORG
Google ORG
Electrical Hardware Engineering, System Hardware Engineering WORK_OF_ART
Done with desc


In [None]:
import re
for description in descriptions:

  for skill in re.findall(r"(?:experience with |proficient in |must have |required to have |expertise in |experience in )([a-zA-Z0-9,: ]+).", description, re.IGNORECASE):
    print(skill)
  print("Done")

expertise in Python, machine learning, and data analysis
cloud platforms such as AWS or Azure is required
Done
one or more of the areas: Computer Architecture, Circuit Design
Test
Done


# Google AI version

In [17]:
from google.colab import userdata
userdata.get('GOOGLE_API_KEY')

'AIzaSyAfwv2A9HWgr2QIdZpEgNCPbkq9MCppeXw'

In [25]:
from pydantic import BaseModel, Field

class SkillHolder(BaseModel):
  required: list[str]
  preferred: list[str]

In [26]:
SkillHolder.model_json_schema()

{'properties': {'required': {'items': {'type': 'string'},
   'title': 'Required',
   'type': 'array'},
  'preferred': {'items': {'type': 'string'},
   'title': 'Preferred',
   'type': 'array'}},
 'required': ['required', 'preferred'],
 'title': 'SkillHolder',
 'type': 'object'}

In [27]:
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

chat_template = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(
            "You are a candidate for a job whose description is provided. You'll need to extract all the required and preferred technical skills given the job description to see whether your profile is a good fit for the position or not. Your background does not matter as long as all the skills that are required and preferred in the job description are reported back as the response. Required skills go into the required_skills array. Preferred skills would go to the preferred_skills array. Degrees (Bachelors, Masters, PhD) are not skills."
        ),
        HumanMessagePromptTemplate.from_template("{input}"),
    ]
)

In [6]:
%pip install langchain_google_genai

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading langchain_google_genai-2.0.7-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Installing collected packages: filetype, langchain_google_genai
Successfully installed filetype-1.2.0 langchain_google_genai-2.0.7


In [28]:
from google.colab import userdata
import os

os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

In [29]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro", temperature=0.7, verbose=True
)  # Initialize Gemini model

In [30]:
structured_llm = llm.with_structured_output(SkillHolder)


In [31]:
# get the test resume from file
structured_llm.invoke(chat_template.format_prompt(input=descriptions[0]).to_messages())


SkillHolder(required=['Python', 'machine learning', 'data analysis', 'AWS or Azure'], preferred=['Docker', 'Kubernetes', 'CI/CD pipelines', 'cybersecurity', '3D printing'])

In [32]:
# get the test resume from file
structured_llm.invoke(chat_template.format_prompt(input=descriptions[1]).to_messages())


SkillHolder(required=['Computer Architecture', 'Circuit Design', 'Simulation', 'Design Verification', 'Digital Design', 'Embedded Systems', 'Hardware', 'Software', 'Signal', 'Power Integrity', 'EMC', 'Statistics', 'System Modeling', 'Networking', 'Security', 'Test', 'Measurement', 'Verilog'], preferred=['Hardware', 'Electrical Engineering', 'Mechanical Engineering', 'Communication Engineering', 'Ocean Engineering', 'Optical Engineering', 'Test', 'Design', 'Manufacturing', 'Prototyping', 'Python', 'Matlab', 'C++'])