# NER Based Skill Extraction

For skill extraction, we built a custom Skill-NER model using spaCy's PhraseMatcher and a curated dictionary of 450+ skills. This allowed us to define our own ‘SKILL’ entity type without requiring a large annotated dataset. In addition, spaCy’s statistical NER (ORG, GPE, DATE) was used to extract metadata from resumes and job postings. This combination of rule-based and statistical NER forms a hybrid NER approach aligned with Module 4 of the course.

### Read cleaned resume file

In [None]:
import pandas as pd

# Load the cleaned resumes and keep only the first row's `cleaned_text` value
# as the sample document used by the cells below.
text = pd.read_csv("Resume/cleaned_resume.csv")["cleaned_text"][0]


'SKILLS\n - Programming & Analytics: Python, R - Version Control: Git,\n - Databases & Data Management: PostgreSQL, MongoDB - Environment Control: Docker\n (NoSQL), Navicat, SAP ERP - Microsoft: Excel, PowerPoint, Word, Outlook\n\n PROFESSIONAL EXPERIENCE\nCEF SOLUTIONS, INC. CONSULTING Seoul, South Korea\nBusiness Enablement Associate Sep 2023 - Mar 2024\n - Conducted market research and workflow analysis for a leading Asian financial institution’s auto-loan initiative,\n synthesizing insights that informed strategic recommendations adopted by senior management.\n - Designed and delivered proposal decks and presentations for stakeholders, enhancing project clarity and accelerating\n stakeholder decision-making.\n - Optimized and standardized process workflows for new initiatives resulting in notable increases in operational efficiency.\n - Managed project timelines and deliverables, leading structured meetings and documentation to ensure alignment and\n continuous progress.\n\nSAMSUNG

### Custom SKILL dictionary

In [4]:
# Curated skill dictionary used to build the custom SKILL matcher.
# Entries are lowercase phrases grouped by domain; a phrase may appear under
# more than one heading (e.g. "problem solving"), so the list is de-duplicated
# right after the literal.
MASTER_SKILL_LIST = [

    # ------------------------------
    # TECHNICAL / DATA SKILLS
    # ------------------------------
    "python", "r", "java", "javascript", "typescript",
    "c++", "c#", "scala", "go", "matlab",
    "bash", "shell scripting",

    # Data Analytics
    "sql", "nosql", "postgresql", "mysql", "oracle", "sqlite",
    "mongodb", "snowflake", "redshift", "bigquery", "azure sql",
    "data analysis", "data analytics", "statistical analysis",

    # Data Tools
    "pandas", "numpy", "scipy", "matplotlib", "seaborn",
    "plotly", "pyspark", "spark", "hadoop", "hive", "mapreduce",

    # Machine Learning
    "machine learning", "deep learning", "neural networks",
    "logistic regression", "linear regression", "random forest",
    "xgboost", "lightgbm", "catboost",
    "svm", "knn", "decision trees", "pca", "kmeans",
    "gradient boosting", "model tuning", "feature engineering",

    # NLP
    "nlp", "natural language processing", "topic modeling",
    "lda", "lsa", "keyword extraction",
    "named entity recognition", "text classification",
    "sentiment analysis", "embeddings", "bert", "word2vec",

    # Cloud
    "aws", "azure", "gcp", "docker", "kubernetes",
    "lambda", "ec2", "s3", "athena", "dynamodb",
    "databricks", "airflow", "cloud functions",

    # BI Tools
    "tableau", "power bi", "metabase", "looker", "qlik",
    "data visualization", "dashboard development",

    # ETL / Pipelines
    "etl", "elt", "data pipeline", "data ingestion",
    "data cleaning", "data transformation", "data integration",

    # Version Control & DevOps
    "git", "github", "gitlab", "bitbucket",
    "ci/cd", "jenkins",

    # Enterprise Tools
    "sap", "sap erp", "salesforce", "salesforce crm",
    "hubspot", "hubspot crm", "airtable", "jira", "confluence", "notion",

    # ------------------------------
    # BUSINESS & ANALYTICS SKILLS
    # ------------------------------
    "business analysis", "requirements gathering",
    "market research", "competitive analysis",
    "financial analysis", "risk analysis", "cost analysis",
    "forecasting", "trend analysis", "variance analysis",
    "p&l management", "strategic planning",
    "business modeling", "stakeholder management",
    "reporting", "presentation development",
    "process improvement", "process optimization",
    "root cause analysis", "gap analysis",
    "workflow automation", "operational efficiency",
    "kpi analysis", "performance analysis",
    "customer segmentation", "persona development",
    "data-driven decision making",

    # Consulting skills
    "problem solving", "insights synthesis",
    "client communication", "proposal writing",
    "project scoping", "roadmap planning",
    "change management", "cross-functional collaboration",

    # ------------------------------
    # MARKETING / SALES / REVOPS SKILLS
    # ------------------------------
    "crm management", "lead generation", "pipeline management",
    "sales operations", "sales strategy", "sales forecasting",
    "revenue operations", "revops", "gtm strategy",
    "go-to-market", "account management",
    "client success", "customer retention",

    # Marketing
    "digital marketing", "content marketing",
    "seo", "sem", "ppc", "email marketing",
    "campaign optimization", "social media analytics",

    # Marketing tools
    "marketing automation", "google analytics",
    "google ads", "mailchimp", "marketo",
    "outreach", "gong", "zoominfo",

    # RevOps Processes
    "validation rules", "crm integrations",
    "funnel analysis", "data stamping",

    # ------------------------------
    # PRODUCT SKILLS
    # ------------------------------
    "product management", "product analytics",
    "a/b testing", "experiment design",
    "feature prioritization", "user research", "ux research",
    "user stories", "agile", "scrum", "kanban",
    "roadmap development", "user journey mapping",
    "requirements documentation",
    "market sizing", "competitive positioning",

    # ------------------------------
    # FINANCE & OPERATIONS SKILLS
    # ------------------------------
    "fp&a", "financial modeling", "budgeting",
    "scenario analysis", "invoice processing",
    "billing operations", "revenue analysis",
    "cost optimization",

    # Operations & Supply Chain
    "supply chain management", "inventory management",
    "logistics", "procurement", "vendor management",
    "operations management", "kpi reporting",

    # ------------------------------
    # SOFT SKILLS
    # ------------------------------
    "communication", "leadership", "teamwork",
    "collaboration", "critical thinking", "problem solving",
    "adaptability", "time management",
    "presentation skills", "negotiation",
    "public speaking", "project management",
    "detail oriented", "strategic thinking",
    "multitasking", "analytical thinking",
    "decision making", "organization skills"
]

# Drop duplicate phrases and sort, so the list has the same order on every run
# (a bare list(set(...)) order depends on per-process string hashing).
MASTER_SKILL_LIST = sorted(set(MASTER_SKILL_LIST))


In [5]:
import spacy
from spacy.matcher import PhraseMatcher

# Load spaCy's "en_core_web_sm" pipeline once; the resulting `nlp` object is
# shared by the matcher construction, skill extraction and visualization cells.
nlp = spacy.load("en_core_web_sm")


def build_skill_ner(skill_list):
    """
    Create a PhraseMatcher that tags every phrase in `skill_list` as "SKILL".

    The matcher compares the LOWER token attribute and shares the vocabulary
    of the module-level `nlp` pipeline. The populated matcher is returned.
    """
    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

    # Turn each skill phrase into a Doc pattern and register all of them
    # under the single match key "SKILL".
    phrase_matcher.add("SKILL", list(map(nlp.make_doc, skill_list)))

    return phrase_matcher


# Matcher built from the full curated dictionary; reused by the cells below.
skill_matcher = build_skill_ner(MASTER_SKILL_LIST)

In [6]:
def extract_skill_entities(text):
    """
    Run the SKILL PhraseMatcher over `text` and collect the matched phrases.

    Returns the distinct matched surface strings, lowercased, as a sorted list.
    """
    parsed = nlp(text)

    # A set comprehension de-duplicates repeated mentions of the same skill.
    unique_skills = {
        parsed[begin:stop].text.lower()
        for _, begin, stop in skill_matcher(parsed)
    }

    return sorted(unique_skills)


In [14]:
# Extract the skills from the loaded resume text and print them under a header.
skills_in_resume = extract_skill_entities(text)
print("Extracted Skills:", skills_in_resume, sep="\n")

Extracted Skills:
['communication', 'data analysis', 'docker', 'etl', 'forecasting', 'git', 'knn', 'lda', 'logistic regression', 'market research', 'metabase', 'mongodb', 'nosql', 'operational efficiency', 'pandas', 'postgresql', 'python', 'r', 'sap', 'sap erp', 'sql', 'topic modeling', 'trend analysis', 'xgboost']


### Custom SKILL recognizer using PhraseMatcher

In [18]:
import spacy
from spacy import displacy
from IPython.display import HTML
# Replace the `is_in_jupyter` name inside the displacy module's globals with a
# function that always returns False, i.e. switch off displaCy's Jupyter
# auto-detection for every later render() call in this session.
# NOTE(review): presumably done so render() returns the HTML string instead of
# displaying it itself; passing `jupyter=False` to render() is the supported
# per-call way to get that.
spacy.displacy.render.__globals__["is_in_jupyter"] = lambda: False


class HTMLVisualizer:
    """Thin wrapper that lets a notebook display a pre-rendered HTML string."""

    def __init__(self, html: str) -> None:
        # Markup to show; stored unchanged.
        self.html = html

    def _repr_html_(self) -> str:
        """Return the stored markup (the hook Jupyter uses for rich HTML output)."""
        return self.html


def visualize_skill_entities(text):
    """
    Render `text` with dictionary skills highlighted as SKILL entities.

    Args:
        text: Raw text to process with the shared `nlp` pipeline and
            `skill_matcher`.

    Returns:
        An HTMLVisualizer wrapping the displaCy entity markup.
    """
    # Function-scope import so this cell does not depend on other import cells.
    from spacy.util import filter_spans

    doc = nlp(text)

    # The matcher reports every dictionary hit, including nested ones such as
    # "sap" inside "sap erp". The entity renderer expects non-overlapping
    # spans, so keep only the longest match wherever matches overlap.
    skill_spans = filter_spans(
        [doc[start:end] for _, start, end in skill_matcher(doc)]
    )

    ents = [
        {"start": span.start_char, "end": span.end_char, "label": "SKILL"}
        for span in skill_spans
    ]

    doc_dict = {"text": text, "ents": ents}

    # jupyter=False asks render() to return the markup as a string rather than
    # relying on Jupyter auto-detection (or on any global override of it).
    html = displacy.render(doc_dict, style="ent", manual=True, jupyter=False)

    return HTMLVisualizer(html)

visualize_skill_entities(text)

In [19]:
from spacy import displacy
from IPython.display import HTML

def visualize_all_entities(text):
    """
    Visualize BOTH:
    - spaCy NER (ORG, DATE, GPE, etc.)
    - custom SKILL entities

    Args:
        text: Raw text to process with the shared `nlp` pipeline and
            `skill_matcher`.

    Returns:
        An IPython `HTML` object holding the displaCy entity markup.
    """
    # Function-scope imports so this cell does not depend on other import cells.
    from spacy.tokens import Span
    from spacy.util import filter_spans

    doc = nlp(text)

    # Custom SKILL matches as labelled spans.
    skill_spans = [
        Span(doc, start, end, label="SKILL")
        for _, start, end in skill_matcher(doc)
    ]

    # Statistical entities and dictionary matches can overlap each other
    # (and dictionary matches can nest, e.g. "sap" inside "sap erp"). The
    # entity renderer expects non-overlapping spans, so keep the longest span
    # wherever spans overlap. SKILL spans are listed first so that
    # filter_spans' "first longest span" rule favours them on exact ties.
    # The result comes back ordered by start position.
    merged_spans = filter_spans(skill_spans + list(doc.ents))

    ents = [
        {"start": span.start_char, "end": span.end_char, "label": span.label_}
        for span in merged_spans
    ]

    # Build manual entity rendering dict
    doc_dict = {"text": text, "ents": ents}

    # jupyter=False asks render() to return the markup as a string rather than
    # relying on Jupyter auto-detection (or on any global override of it).
    html = displacy.render(doc_dict, style="ent", manual=True, jupyter=False)

    return HTML(html)

visualize_all_entities(text)