# NER Based Skill Extraction

For skill extraction, we built a custom Skill-NER model using spaCy's PhraseMatcher and a curated dictionary of 450+ skills. This allowed us to define our own ‘SKILL’ entity type without requiring a large annotated dataset. In addition, spaCy’s statistical NER (ORG, GPE, DATE) was used to extract metadata from resumes and job postings. This combination of rule-based and statistical NER forms a hybrid NER approach aligned with Module 4 of the course.

### Read cleaned resume file

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
text= pd.read_csv("../Data_Cleaning/cleaned_resume.csv")
text = text['cleaned_text'][0]

### Custom SKILL dictionary

In [2]:
# first version of the Master Skill list (the group's input)

MASTER_SKILL_LIST = [

    # Techical / Technical role
    "python", "r", "java", "javascript", "typescript",
    "c++", "c#", "scala", "go", "matlab",
    "bash", "shell scripting",
    "software engineering",
    "software development",
    "full stack development",
    "frontend development",
    "backend development",
    "api design",
    "rest apis",
    "microservices",
    "distributed systems",
    "scalable systems",
    "cloud infrastructure",
    "cloud computing",
    "cloud native",
    "cloud platforms",

    # Data Analytics
    "sql", "nosql", "postgresql", "mysql", "oracle", "sqlite",
    "mongodb", "snowflake", "redshift", "bigquery", "azure sql",
    "data analysis", "data analytics", "statistical analysis",
    "business intelligence", "operational reporting",
    "process mapping",  "requirements analysis",
    "risk management", "financial reporting",

    # Data Tools
    "pandas", "numpy", "scipy", "matplotlib", "seaborn",
    "plotly", "pyspark", "spark", "hadoop", "hive", "mapreduce", "jira"

    # Machine Learning
    "machine learning", "deep learning", "neural networks",
    "logistic regression", "linear regression", "random forest",
    "xgboost", "lightgbm", "catboost",
    "svm", "knn", "decision trees", "pca", "kmeans",
    "gradient boosting", "model tuning", "feature engineering",

    # NLP
    "nlp", "natural language processing", "topic modeling",
    "lda", "lsa", "keyword extraction",
    "named entity recognition", "text classification",
    "sentiment analysis", "embeddings", "bert", "word2vec",

    # Cloud
    "aws", "azure", "gcp", "docker", "kubernetes",
    "lambda", "ec2", "s3", "athena", "dynamodb",
    "databricks", "airflow", "cloud functions",

    # BI Tools
    "tableau", "power bi", "metabase", "looker", "qlik",
    "data visualization", "dashboard development",

    # ETL / Pipelines
    "etl", "elt", "data pipeline", "data ingestion",
    "data cleaning", "data transformation", "data integration",

    # Version Control & DevOps
    "git", "github", "gitlab", "bitbucket",
    "ci/cd", "jenkins",

    # Enterprise Tools
    "sap", "sap erp", "salesforce", "salesforce crm",
    "hubspot", "hubspot crm", "airtable", "jira", "confluence", "notion",

    # BUSINESS & ANALYTICS SKILLS
    "business analysis", "requirements gathering",
    "market research", "competitive analysis",
    "financial analysis", "risk analysis", "cost analysis",
    "forecasting", "trend analysis", "variance analysis",
    "p&l management", "strategic planning",
    "business modeling", "stakeholder management",
    "reporting", "presentation development",
    "process improvement", "process optimization",
    "root cause analysis", "gap analysis",
    "workflow automation", "operational efficiency",
    "kpi analysis", "performance analysis",
    "customer segmentation", "persona development",
    "data-driven decision making",

    # Consulting skills
    "problem solving", "insights synthesis",
    "client communication", "proposal writing",
    "project scoping", "roadmap planning",
    "change management", "cross-functional collaboration",


    # Marketing/ Sales
    "crm management", "lead generation", "pipeline management",
    "sales operations", "sales strategy", "sales forecasting",
    "revenue operations", "revops", "gtm strategy",
    "go-to-market", "account management",
    "client success", "customer retention", "digital marketing",
    "content marketing", "seo", "sem", "ppc", "email marketing",
    "campaign optimization", "social media analytics",

    # Marketing tools
    "marketing automation", "google analytics",
    "google ads", "mailchimp", "marketo",
    "outreach", "gong", "zoominfo",

    # RevOps Processes
    "validation rules", "crm integrations",
    "funnel analysis", "data stamping",

    # PRODUCT SKILLS
    "product management", "product analytics",
    "a/b testing", "experiment design",
    "feature prioritization", "user research", "ux research",
    "user stories", "agile", "scrum", "kanban",
    "roadmap development", "user journey mapping",
    "requirements documentation",
    "market sizing", "competitive positioning",

    # FINANCE & OPERATIONS SKILLS
    "fp&a", "financial modeling", "budgeting",
    "scenario analysis", "invoice processing",
    "billing operations", "revenue analysis",
    "cost optimization",

    # Operations & Supply Chain
    "supply chain management", "inventory management",
    "logistics", "procurement", "vendor management",
    "operations management", "kpi reporting",

    # SOFT SKILLS

    "communication", "leadership", "teamwork",
    "collaboration", "critical thinking", "problem solving",
    "adaptability", "time management",
    "presentation skills", "negotiation",
    "public speaking", "project management",
    "detail oriented", "strategic thinking",
    "multitasking", "analytical thinking",
    "decision making", "organization skills",
    "attention to detail", "stakeholder communication",
    "conflict resolution", "problem-solving skills",
    "relationship building","coaching", "mentoring"
]

# ---------------------------------------
# Extra skills / Tools (multi-industry)
# ---------------------------------------
EXTRA_SKILLS = [


    # Programming/ tech
    "django", "flask", "fastapi",
    "react", "react native", "angular", "vue.js", "next.js",
    "node.js", "express.js",
    "php", "ruby", "ruby on rails",
    "swift", "kotlin", "objective-c",
    "c", "perl", "rust", "haskell",

    # Mobile / app
    "android development", "ios development",
    "xcode", "android studio",

    # Testing / QA
    "unit testing", "integration testing",
    "qa testing", "automation testing",
    "selenium", "cypress", "pytest", "junit",

    # Security / networking
    "network security", "firewall configuration",
    "penetration testing", "vulnerability assessment",
    "siem", "splunk", "wireshark",
    "ssl", "tls", "vpn",

    # Data analytics
    "excel", "microsoft excel",
    "vlookup", "pivot tables",
    "google sheets",
    "sql server", "db2",

    "sas", "stata", "spss",
    "power query", "power pivot",

    "mode analytics", "lookml",
    "amplitude", "mixpanel",
    "hex", "metabase",

    # Cloud/ DevOps/ Infra
    "terraform", "ansible", "chef", "puppet",
    "github actions", "circleci", "travis ci",

    "aws lambda", "aws rds", "aws ecs", "aws ecr",
    "aws glue", "aws athena", "aws redshift",
    "azure data factory", "azure databricks",
    "gcp pubsub", "gcp dataflow", "gcp dataproc",


    # Product/ Design/ UX
    "figma", "sketch", "adobe xd",
    "invision", "balsamiq",
    "user journey mapping", "service blueprinting",
    "design thinking", "wireframing", "prototyping",
    "usability testing", "user interviews", "heuristic evaluation",


    # Marketing / Growth
    "meta ads manager", "facebook ads", "instagram ads",
    "tiktok ads", "linkedin ads",
    "google tag manager", "google search console",
    "seo keyword research", "on-page seo", "technical seo",
    "crm campaigns", "lifecycle marketing",
    "marketing funnel analysis", "conversion rate optimization",
    "ab testing", "landing page optimization",

    # Email / automation
    "klaviyo", "hubspot marketing", "salesforce marketing cloud",
    "customer.io", "braze", "iterable",

    # E-commerce
    "shopify", "woocommerce", "bigcommerce", "magento",
    "product catalog management", "pricing optimization",
    "merchandising", "inventory planning",


    # Sales/ customer success/ RevOps
    "salesforce administration", "salesforce reporting",
    "salesforce dashboards", "salesforce flows",
    "cpq", "quote to cash",
    "salesforce service cloud", "salesforce sales cloud",
    "hubspot sales", "pipedrive", "zoho crm",
    "microsoft dynamics 365",
    "outreach", "salesloft", "apollo",
    "gong", "chorus", "zoominfo",
    "cold calling", "cold emailing",
    "account planning", "territory planning",
    "renewal management", "upsell strategy",
    "churn analysis",


    # Finance/ Accounting
    "accounts payable", "accounts receivable",
    "general ledger", "reconciliation",
    "month-end close", "year-end close",
    "cash flow forecasting", "variance analysis",
    "quickbooks", "xero", "netsuite",
    "sap fico", "oracle ebs", "oracle fusion",
    "financial statement analysis",
    "credit risk modeling", "valuation modeling",
    "discounted cash flow", "dcf modeling",
    "equity research", "portfolio analysis",


    # HR/ People
    "recruiting", "candidate screening",
    "interview scheduling", "offer negotiation",
    "onboarding", "offboarding",
    "performance review process",
    "succession planning",
    "compensation analysis", "benefits administration",
    "workday", "workday hcm",
    "sap successfactors", "oracle hcm",
    "bamboohr", "greenhouse", "lever", "jobvite",

    # Operation / Logistic
    "demand planning", "capacity planning",
    "production scheduling", "quality control",
    "lean manufacturing", "six sigma", "kaizen",
    "5s methodology", "root cause analysis",
    "warehouse management", "route optimization",
    "fleet management", "last mile delivery",
    "order fulfillment", "inventory forecasting",
    "sap mm", "sap sd", "sap pp",
    "oracle scm", "manhattan wms",

    # Healthcare
    "electronic medical records", "emr systems",
    "ehr systems", "epic systems", "cerner",
    "icd-10 coding", "cpt coding",
    "clinical trials", "gcp compliance", "good clinical practice",
    "fda regulations", "hipaa compliance",
    "lab information systems", "pharmacovigilance",


    # Education
    "curriculum development", "lesson planning",
    "classroom management", "learning management systems",
    "moodle", "canvas lms", "blackboard lms",
    "online course design", "instructional design",

    # Legal

    "contract review", "contract drafting",
    "regulatory compliance", "policy development",
    "risk assessment", "internal controls",
    "gdpr compliance", "sox compliance",
    "kyd", "kyc", "aml monitoring", "anti-money laundering",


    # Creative/ Media

    "adobe photoshop", "adobe illustrator",
    "adobe indesign", "adobe premiere pro", "after effects",
    "video editing", "photo editing",
    "storyboarding", "script writing",
    "content strategy", "content calendar",
    "social media content creation",


    # Hospitality/ Retail

    "pos systems", "reservation systems",
    "inventory counting", "food safety",
    "barista skills", "cash handling",
    "customer check-in", "front desk operations",
    "event planning", "banquet operations",


    # Construction/ Engineer
    "autocad", "revit", "solidworks",
    "project bidding", "site inspection",
    "blueprint reading", "quantity surveying",
    "building codes", "osha compliance",
    "pmp", "primavera p6", "ms project",

    # Daya privacy
    "itil framework", "incident management",
    "change management process", "service desk operations",
    "access control", "identity management",


    # Language
    "translation", "interpretation",
    "bilingual communication", "multilingual support"
]




MASTER_SKILL_LIST = list(set(MASTER_SKILL_LIST + EXTRA_SKILLS))

In [3]:
print("Base master skills:", len(MASTER_SKILL_LIST))

Base master skills: 530


##  Skill matcher

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.8 MB/s[0m  [33m0:00:00[0meta [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import numpy as np
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")


def build_skill_ner(skill_list):
    """
    Builds a spaCy PhraseMatcher for custom skill extraction.
    """
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

    # Create spaCy Doc patterns for each skill phrase
    patterns = [nlp.make_doc(skill) for skill in skill_list]
    matcher.add("SKILL", patterns)

    return matcher

skill_matcher = build_skill_ner(MASTER_SKILL_LIST)

In [6]:
def extract_skill_entities(text):
    """
    Extracts skill entities from text using the SKILL PhraseMatcher.
    Returns a unique list of skills (lowercased).
    """
    doc = nlp(text)
    matches = skill_matcher(doc)

    skills_found = set()

    for match_id, start, end in matches:
        span = doc[start:end]
        skills_found.add(span.text.lower())

    return sorted(list(skills_found))

In [7]:
skills_in_resume = extract_skill_entities(text)
print("Extracted Skills:")
print(skills_in_resume)

Extracted Skills:
['communication', 'data analysis', 'docker', 'etl', 'excel', 'forecasting', 'git', 'knn', 'lda', 'logistic regression', 'market research', 'metabase', 'mongodb', 'nosql', 'operational efficiency', 'pandas', 'postgresql', 'power query', 'python', 'r', 'sap', 'sap erp', 'sql', 'topic modeling', 'trend analysis', 'xgboost']


### Custom SKILL recognizer using PhraseMatcher

In [8]:
import spacy
from spacy import displacy
from IPython.display import HTML
spacy.displacy.render.__globals__["is_in_jupyter"] = lambda: False


class HTMLVisualizer:
    def __init__(self, html):
        self.html = html

    def _repr_html_(self):
        return self.html


def visualize_skill_entities(text):
    doc = nlp(text)
    matches = skill_matcher(doc)

    ents = []
    for match_id, start, end in matches:
        span = doc[start:end]
        ents.append({
            "start": span.start_char,
            "end": span.end_char,
            "label": "SKILL"
        })

    doc_dict = {"text": text, "ents": ents}

    html = displacy.render(doc_dict, style="ent", manual=True)

    return HTMLVisualizer(html)

visualize_skill_entities(text)

In [9]:
from spacy import displacy
from IPython.display import HTML

def visualize_all_entities(text):
    """
    Visualize BOTH:
    - spaCy NER (ORG, DATE, GPE, etc.)
    - custom SKILL entities
    """
    doc = nlp(text)

    # Default spaCy entities
    ents = [{
        "start": ent.start_char,
        "end": ent.end_char,
        "label": ent.label_
    } for ent in doc.ents]

    # Add custom SKILL entities
    matches = skill_matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        ents.append({
            "start": span.start_char,
            "end": span.end_char,
            "label": "SKILL"
        })

    # Sort entities by start index
    ents = sorted(ents, key=lambda x: x["start"])

    # Build manual entity rendering dict
    doc_dict = {"text": text, "ents": ents}

    # --- SAFE HTML rendering (spaCy will NOT import display now) ---
    html = displacy.render(doc_dict, style="ent", manual=True)

    return HTML(html)

visualize_all_entities(text)

## Read cleaned job description file

In [13]:
jobs_df = pd.read_csv("../Data_Cleaning/cleaned_job_data_dedup.csv")

JOB_TEXT_COL = "job_text_cleaned"

print("Shape:", jobs_df.shape)
print("Columns:", jobs_df.columns)

Shape: (14760, 2)
Columns: Index(['job_id', 'job_text_cleaned'], dtype='object')


### NER skill extractor on each job description

In [14]:
# randomly pick 100 jobs (use for testing purposes)
#jobs_df_sample = jobs_df.sample(n=100, random_state=64)

#JOB_TEXT_COL = "job_text_cleaned"

### PhraseMatcher for skills

In [15]:
# use the same extract_skill_entities function

In [16]:
texts = jobs_df[JOB_TEXT_COL].fillna("").tolist()

all_skills = []

for doc in nlp.pipe(texts, batch_size=50):
    doc_skills = []

    # use the same matcher
    matches = skill_matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        doc_skills.append(span.text.strip())

    # remove duplicates for this JD
    doc_skills = list(set(doc_skills))
    all_skills.append(doc_skills)

# attach to the sample dataframe
jobs_df["skills_extracted"] = all_skills
jobs_df["n_skills"] = jobs_df["skills_extracted"].apply(len)

In [17]:
jobs_df.head(5)

Unnamed: 0,job_id,job_text_cleaned,skills_extracted,n_skills
0,0,Job Title:\nDigital Marketing Specialist\nResp...,"[Digital Marketing, Social media analytics]",2
1,1,Job Title:\nWeb Developer\nResponsibilities:\n...,"[JavaScript, React, Angular]",3
2,2,Job Title:\nOperations Manager\nResponsibiliti...,"[Root cause analysis, quality control, Quality...",4
3,3,Job Title:\nNetwork Engineer\nResponsibilities...,[],0
4,4,Job Title:\nEvent Manager\nResponsibilities:\n...,"[budgeting, Event planning, logistics]",3


In [18]:
jobs_df['n_skills'].min()

np.int64(0)

In [19]:
jobs_df['n_skills'].max()

np.int64(52)

#### SKILL-only visualization

In [20]:
import spacy
from spacy import displacy
from IPython.display import HTML

# Make displacy work nicely inside Jupyter
spacy.displacy.render.__globals__["is_in_jupyter"] = lambda: True

class HTMLVisualizer:
    def __init__(self, html):
        self.html = html

    def _repr_html_(self):
        return self.html

def visualize_skill_entities(text):
    """
    Visualize the skills detected by skill_matcher
    inside a block of job description text.
    """

    doc = nlp(text)
    matches = skill_matcher(doc)

    ents = []
    for match_id, start, end in matches:
        span = doc[start:end]
        ents.append({
            "start": span.start_char,
            "end": span.end_char,
            "label": "SKILL"
        })

    # displaCy manual mode
    doc_dict = {
        "text": text,
        "ents": ents,
        "title": None
    }

    html = displacy.render(doc_dict, style="ent", manual=True)

    return HTMLVisualizer(html)

In [21]:
visualize_skill_entities(jobs_df["job_text_cleaned"].iloc[4])

<__main__.HTMLVisualizer at 0xffff29458590>

#### All Entities visualization

In [22]:
# just in case

In [23]:
import spacy
from spacy import displacy
from IPython.display import HTML

spacy.displacy.render.__globals__["is_in_jupyter"] = lambda: True

def visualize_job_entities(text):
    """
    Visualize BOTH:
    - spaCy NER entities (ORG, DATE, GPE, PERSON, MONEY, PRODUCT, NORP, etc.)
    - Custom SKILL entities from skill_matcher
    """

    doc = nlp(text)

    # ----- 1) spaCy default NER entities -----
    ents = [{
        "start": ent.start_char,
        "end": ent.end_char,
        "label": ent.label_
    } for ent in doc.ents]

    # ----- 2) Add custom SKILL entities -----
    matches = skill_matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        ents.append({
            "start": span.start_char,
            "end": span.end_char,
            "label": "SKILL"
        })

    # Sort entities by start index for clean display
    ents = sorted(ents, key=lambda x: x["start"])

    # ----- 3) Build displaCy structure -----
    doc_dict = {
        "text": text,
        "ents": ents,
        "title": None
    }

    html = displacy.render(doc_dict, style="ent", manual=True)
    return HTML(html)

In [24]:
visualize_job_entities(jobs_df["job_text_cleaned"].iloc[3])

<IPython.core.display.HTML object>

# SkillCore with Jaccard Similarity

In [25]:
def skill_jaccard_score(resume_skills, job_skills):
    """
    Jaccard similarity between resume skills and job skills.
    = overlap / union
    """

    resume_set = set(resume_skills)
    job_set = set(job_skills)

    # If both are empty, return 0
    union = resume_set | job_set
    if not union:
        return 0.0

    overlap = resume_set & job_set

    score = len(overlap) / len(union)
    return score


In [26]:
jobs_df["SkillScore"] = jobs_df["skills_extracted"].apply(
    lambda job_sk: skill_jaccard_score(skills_in_resume, job_sk)
)

# Add resume skills column (same for every row)
jobs_df["resume_skills"] = [skills_in_resume] * len(jobs_df)


top_matches = (
    jobs_df
    .sort_values("SkillScore", ascending=False)
    [[JOB_TEXT_COL, "skills_extracted", "resume_skills", "SkillScore"]]
    .head(10)
)

In [27]:
# pd.set_option('display.max_colwidth', None)

In [28]:
top_matches

Unnamed: 0,job_text_cleaned,skills_extracted,resume_skills,SkillScore
9830,Job Title:\nSenior DevOps Engineer with verifi...,"[software development, python, perl, bash, exc...","[communication, data analysis, docker, etl, ex...",0.125
2070,Job Title:\nSenior Analyst with verification\n...,"[collaboration, Workday, reporting, operationa...","[communication, data analysis, docker, etl, ex...",0.117647
9140,Job Title:\nJava Springboot Developer\nJob Des...,"[Git, tls, Kubernetes, TLS, aws, GCP, Microser...","[communication, data analysis, docker, etl, ex...",0.115385
991,Job Title:\nManufacturing Controller with veri...,"[attention to detail, reporting, data analysis...","[communication, data analysis, docker, etl, ex...",0.114286
8507,Job Title:\nData Engineer with verification\nJ...,"[collaboration, Snowflake, attention to detail...","[communication, data analysis, docker, etl, ex...",0.111111
5760,Job Title:\nData Engineer with verification\nJ...,"[collaboration, Snowflake, attention to detail...","[communication, data analysis, docker, etl, ex...",0.108108
13825,Job Title:\nIntern\nJob Description:\nAbout th...,"[data analysis, Teamwork, excel, communication...","[communication, data analysis, docker, etl, ex...",0.107143
10055,Job Title:\nFinancial Analyst\nJob Description...,"[reporting, operational efficiency, communicat...","[communication, data analysis, docker, etl, ex...",0.107143
8236,Job Title:\nCustomer Service Representative\nJ...,"[operational efficiency, communication, HubSpo...","[communication, data analysis, docker, etl, ex...",0.107143
12393,Job Title:\nMachine Learning Engineer\nJob Des...,"[Python, attention to detail, xgboost, pandas,...","[communication, data analysis, docker, etl, ex...",0.103448


In [29]:
bottom_matches = (
    jobs_df
    .sort_values("SkillScore", ascending=True)
    [[JOB_TEXT_COL, "skills_extracted", "resume_skills", "SkillScore"]]
    .head(10)
)

In [30]:
bottom_matches

Unnamed: 0,job_text_cleaned,skills_extracted,resume_skills,SkillScore
0,Job Title:\nDigital Marketing Specialist\nResp...,"[Digital Marketing, Social media analytics]","[communication, data analysis, docker, etl, ex...",0.0
12133,Job Title:\nFraiseur (F/H) with verification\n...,[],"[communication, data analysis, docker, etl, ex...",0.0
6566,Job Title:\nSoftware Engineer with verificatio...,"[Git, C#, C]","[communication, data analysis, docker, etl, ex...",0.0
12134,Job Title:\nResponsable de Comptes Régionaux ...,[],"[communication, data analysis, docker, etl, ex...",0.0
12135,Job Title:\nSpecjalista ds. Rozliczeń Energet...,[],"[communication, data analysis, docker, etl, ex...",0.0
6562,Job Title:\nNursing Assistant I\nJob Descripti...,[],"[communication, data analysis, docker, etl, ex...",0.0
6561,Job Title:\nEstrategista de mídias sociais\nJ...,[],"[communication, data analysis, docker, etl, ex...",0.0
12136,Job Title:\nSpezialist für IT-Sicherheit (m/w...,"[SIEM, Azure]","[communication, data analysis, docker, etl, ex...",0.0
6558,Job Title:\nSenior Software Engineer with veri...,"[Python, collaboration, attention to detail, R...","[communication, data analysis, docker, etl, ex...",0.0
6556,Job Title:\nProject Manager\nJob Description:\...,[],"[communication, data analysis, docker, etl, ex...",0.0


In [31]:
jobs_df_final = jobs_df[["job_text_cleaned", "SkillScore"]]

# Final df before normalize the score

In [32]:
jobs_df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14760 entries, 0 to 14759
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_text_cleaned  14760 non-null  object 
 1   SkillScore        14760 non-null  float64
dtypes: float64(1), object(1)
memory usage: 230.8+ KB


## Normalize the SkillScore

In [33]:
# min and max SkillScore
min_s = jobs_df_final["SkillScore"].min()
max_s = jobs_df_final["SkillScore"].max()

# Avoid division by zero
if max_s == min_s:
    jobs_df_final["SkillScore_normalized"] = 0.0
else:
    jobs_df_final["SkillScore_normalized"] = (
        (jobs_df_final["SkillScore"] - min_s) / (max_s - min_s)
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jobs_df_final["SkillScore_normalized"] = (


In [34]:
# New normalized dataframe
jobs_df_normalized = jobs_df_final[[
    "job_text_cleaned",
    "SkillScore_normalized"
]].copy()

jobs_df_normalized.head()

Unnamed: 0,job_text_cleaned,SkillScore_normalized
0,Job Title:\nDigital Marketing Specialist\nResp...,0.0
1,Job Title:\nWeb Developer\nResponsibilities:\n...,0.0
2,Job Title:\nOperations Manager\nResponsibiliti...,0.0
3,Job Title:\nNetwork Engineer\nResponsibilities...,0.0
4,Job Title:\nEvent Manager\nResponsibilities:\n...,0.0


In [35]:
top10_normalized = (
    jobs_df_normalized
    .sort_values("SkillScore_normalized", ascending=False)
    .head(10)
)

top10_normalized

Unnamed: 0,job_text_cleaned,SkillScore_normalized
9830,Job Title:\nSenior DevOps Engineer with verifi...,1.0
2070,Job Title:\nSenior Analyst with verification\n...,0.941176
9140,Job Title:\nJava Springboot Developer\nJob Des...,0.923077
991,Job Title:\nManufacturing Controller with veri...,0.914286
8507,Job Title:\nData Engineer with verification\nJ...,0.888889
5760,Job Title:\nData Engineer with verification\nJ...,0.864865
13825,Job Title:\nIntern\nJob Description:\nAbout th...,0.857143
10055,Job Title:\nFinancial Analyst\nJob Description...,0.857143
8236,Job Title:\nCustomer Service Representative\nJ...,0.857143
12393,Job Title:\nMachine Learning Engineer\nJob Des...,0.827586


# Final Normalized SkillScore df

In [36]:
jobs_df_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14760 entries, 0 to 14759
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   job_text_cleaned       14760 non-null  object 
 1   SkillScore_normalized  14760 non-null  float64
dtypes: float64(1), object(1)
memory usage: 230.8+ KB
