In [6]:
import pandas as pd

# Load the token-level NER annotations (one row per token).
# keep_default_na=False stops pandas from converting tokens that are literally
# spelled "NA", "null", "N/A", ... into float NaN; such values would later
# break string operations like " ".join(tokens) and the seqeval metrics.
df = pd.read_csv(
    "../data/annotated/job_ner_annotations_full_20_jds.csv",
    keep_default_na=False,
)
df.head()


Unnamed: 0,sentence_id,token,label
0,1,We,O
1,1,are,O
2,1,looking,O
3,1,for,O
4,1,a,O


In [7]:
# Build one (tokens, labels) pair per sentence_id, then split the pairs into
# two parallel lists that share the same sentence order.
per_sentence = [
    (rows["token"].tolist(), rows["label"].tolist())
    for _, rows in df.groupby("sentence_id")
]
sentences_tokens = [sent_tokens for sent_tokens, _ in per_sentence]
sentences_labels = [sent_labels for _, sent_labels in per_sentence]


In [8]:
# Positional hold-out split: the first 16 sentences are used for training and
# everything after them for testing.
split_at = 16

train_tokens, test_tokens = sentences_tokens[:split_at], sentences_tokens[split_at:]
train_labels, test_labels = sentences_labels[:split_at], sentences_labels[split_at:]


In [9]:
from seqeval.metrics import classification_report

# Trivial baseline: predict "O" for every token of every test sentence.
pred_all_o = [["O"] * len(seq) for seq in test_tokens]

# This baseline predicts no entities at all, so precision is 0/0 for every
# class. zero_division=0 reports those scores as 0.00 on purpose instead of
# emitting an undefined-metric warning for each averaged score.
print(classification_report(test_labels, pred_all_o, zero_division=0))


                      precision    recall  f1-score   support

             COMPANY       0.00      0.00      0.00         4
        DEGREE_MAJOR       0.00      0.00      0.00         1
     EDUCATION_LEVEL       0.00      0.00      0.00         1
    EMPLOYEMENT_TYPE       0.00      0.00      0.00         4
           FRAMEWORK       0.00      0.00      0.00         3
           JOB_TITLE       0.00      0.00      0.00         4
            LOCATION       0.00      0.00      0.00         4
PROGRAMMING_LANGUAGE       0.00      0.00      0.00         1
          SKILL_TECH       0.00      0.00      0.00         1
                TOOL       0.00      0.00      0.00         3

           micro avg       0.00      0.00      0.00        26
           macro avg       0.00      0.00      0.00        26
        weighted avg       0.00      0.00      0.00        26



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
import os

def _read_jd(i):
    """Return the full UTF-8 text of raw job description number ``i``."""
    fname = f"../data/raw/jd_{i:03d}.txt"
    with open(fname, "r", encoding="utf-8") as f:
        return f.read()


# Job descriptions are stored as jd_001.txt ... jd_020.txt; keep them in
# numeric order.
raw_texts = [_read_jd(i) for i in range(1, 21)]


In [13]:
import spacy
# Load the small English pipeline and run it over the raw texts from index 16
# onward (the same offset used for the test split of the annotations).
nlp = spacy.load("en_core_web_sm")

held_out_texts = raw_texts[16:]
docs = list(map(nlp, held_out_texts))


In [14]:
def spacy_to_bio(doc, tokens):
    """Project spaCy entities onto an existing token sequence as BIO tags.

    Only entities labelled ORG (-> COMPANY) and GPE / LOC (-> LOCATION) are
    kept; every other entity label is ignored.

    Each kept entity's text is split on whitespace and searched for, left to
    right, as a contiguous run inside ``tokens``. The entity is written to the
    first matching run whose positions are all still "O", so a name mentioned
    several times is assigned to successive occurrences instead of every
    mention landing on the first one.

    Parameters
    ----------
    doc : object exposing ``ents``; each entity must provide ``label_`` and
        ``text`` (e.g. a spaCy ``Doc``).
    tokens : list of str
        The reference tokenisation the tags must align with.

    Returns
    -------
    list of str
        One BIO tag per element of ``tokens``.
    """
    label_map = {"ORG": "COMPANY", "GPE": "LOCATION", "LOC": "LOCATION"}
    bio = ["O"] * len(tokens)

    for ent in doc.ents:
        mapped = label_map.get(ent.label_)
        if mapped is None:
            continue

        ent_tokens = ent.text.split()
        width = len(ent_tokens)
        if width == 0:
            # Whitespace-only entity text: an empty slice would "match" at
            # position 0 and tag an unrelated token.
            continue

        # Only start positions where the whole entity still fits.
        for start in range(len(tokens) - width + 1):
            end = start + width
            if tokens[start:end] != ent_tokens:
                continue
            if any(tag != "O" for tag in bio[start:end]):
                # Already claimed by an earlier entity; try the next occurrence.
                continue
            bio[start] = f"B-{mapped}"
            bio[start + 1:end] = [f"I-{mapped}"] * (width - 1)
            break

    return bio


In [15]:
# Project spaCy's entities for each held-out document onto the matching
# annotated token sequence.
pred_spacy = [
    spacy_to_bio(doc, tok_seq)
    for doc, tok_seq in zip(docs, test_tokens)
]

# spacy_to_bio only ever emits COMPANY and LOCATION, so the other classes
# have no predictions and their precision is 0/0. zero_division=0 reports
# those scores as 0.00 on purpose instead of emitting an undefined-metric
# warning.
print(classification_report(test_labels, pred_spacy, zero_division=0))


                      precision    recall  f1-score   support

             COMPANY       0.33      0.25      0.29         4
        DEGREE_MAJOR       0.00      0.00      0.00         1
     EDUCATION_LEVEL       0.00      0.00      0.00         1
    EMPLOYEMENT_TYPE       0.00      0.00      0.00         4
           FRAMEWORK       0.00      0.00      0.00         3
           JOB_TITLE       0.00      0.00      0.00         4
            LOCATION       0.00      0.00      0.00         4
PROGRAMMING_LANGUAGE       0.00      0.00      0.00         1
          SKILL_TECH       0.00      0.00      0.00         1
                TOOL       0.00      0.00      0.00         3

           micro avg       0.17      0.04      0.06        26
           macro avg       0.03      0.03      0.03        26
        weighted avg       0.05      0.04      0.04        26



  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# === Rule-based NER: gazetteers / keyword lists ===
# One set of lower-cased phrases per entity type, listed alphabetically.
# Multi-word phrases are written with single spaces between words.

job_titles = {
    "android developer", "backend developer", "business analyst",
    "cloud engineer", "cybersecurity analyst", "data analyst",
    "data engineer", "data scientist", "database administrator",
    "devops engineer", "frontend developer", "full stack developer",
    "machine learning engineer", "mobile app developer", "network engineer",
    "product manager", "research intern", "software engineer",
    "software tester", "ui/ux designer",
}

companies = {
    "accenture", "adobe", "amazon", "cisco", "deloitte",
    "ey", "flipkart", "google", "hcl", "ibm",
    "infosys", "microsoft", "oracle", "paytm", "samsung",
    "swiggy", "tcs", "wipro", "zoho",
}

locations = {
    "bangalore", "chennai", "delhi", "gurgaon", "hyderabad",
    "mumbai", "noida", "pune", "remote",
}

employment_types = {"contract", "full-time", "internship", "part-time"}

programming_languages = {"dart", "java", "javascript", "kotlin", "python"}

frameworks = {
    "flutter", "node.js", "pytorch",
    "react", "spring framework", "tensorflow",
}

tools = {
    "adobe xd", "android studio", "aws", "azure", "ccna", "css",
    "docker", "excel", "figma", "firebase", "git", "html",
    "jenkins", "kubernetes", "linux", "mongodb", "pl/sql", "power bi",
    "selenium", "siem", "sql", "tableau",
}

skill_tech = {
    "analytical skills",
    "communication skills",
    "machine learning",
    "market analysis",
    "network security",
    "prototyping tools",
    "routing/switching",
}

education_levels = {"bachelor's"}

degree_majors = {"computer science"}


In [19]:
# Lookup table: lower-cased phrase -> entity label
phrase2label = {}


def add_phrases(phrases, label):
    """Register every phrase in ``phrases`` (lower-cased) under ``label``.

    A phrase that is already present is overwritten with the new label.
    """
    phrase2label.update({phrase.lower(): label for phrase in phrases})


# Registration order matters only if two gazetteers share a phrase: the later
# one wins.
for gazetteer, entity_label in (
    (job_titles, "JOB_TITLE"),
    (companies, "COMPANY"),
    (locations, "LOCATION"),
    (employment_types, "EMPLOYEMENT_TYPE"),  # spelling kept to match the annotation labels
    (programming_languages, "PROGRAMMING_LANGUAGE"),
    (frameworks, "FRAMEWORK"),
    (tools, "TOOL"),
    (skill_tech, "SKILL_TECH"),
    (education_levels, "EDUCATION_LEVEL"),
    (degree_majors, "DEGREE_MAJOR"),
):
    add_phrases(gazetteer, entity_label)

# Longest phrase measured in whitespace-separated tokens; the tagger uses it
# as the widest window to try for longest-match-first lookup.
max_phrase_len = max(map(lambda phrase: len(phrase.split()), phrase2label))
print("Max phrase length:", max_phrase_len)


Max phrase length: 3


In [20]:
def rule_based_tag(tokens, lexicon=None, max_len=None):
    """
    Very simple longest-match-first gazetteer tagger.

    Scans ``tokens`` left to right. At each position it tries the widest
    window first, lower-cases the space-joined window and looks it up in the
    lexicon. The first hit is tagged B-/I- and the scan jumps past it, so
    matches never overlap.

    tokens:  list of strings
    lexicon: optional dict mapping lower-cased phrase -> entity label.
             Defaults to the notebook-level ``phrase2label``.
    max_len: optional widest window (in tokens) to try. Defaults to the
             notebook-level ``max_phrase_len`` when ``lexicon`` is omitted,
             otherwise to the longest phrase in ``lexicon``.
    returns: list of BIO labels, one per token
    """
    if lexicon is None:
        # Original behaviour: rely on the tables built earlier in the notebook.
        lexicon = phrase2label
        if max_len is None:
            max_len = max_phrase_len
    elif max_len is None:
        # Derive the window from the supplied lexicon so it can never be stale.
        max_len = max((len(phrase.split()) for phrase in lexicon), default=0)

    n = len(tokens)
    labels = ["O"] * n
    i = 0

    while i < n:
        matched = False

        # Try longest span first, clipped to what is left of the sentence.
        for span_len in range(min(max_len, n - i), 0, -1):
            span_text = " ".join(tokens[i:i + span_len]).lower()
            ent_label = lexicon.get(span_text)
            if ent_label is None:
                continue

            labels[i] = f"B-{ent_label}"
            for j in range(i + 1, i + span_len):
                labels[j] = f"I-{ent_label}"
            i += span_len
            matched = True
            break

        if not matched:
            # no entity starting here
            i += 1

    return labels


In [21]:
# Smoke test of the gazetteer tagger on one hand-written sentence.
example = [
    "Google", "is", "hiring", "a",
    "Machine", "Learning", "Engineer",
    "skilled", "in", "Python", "and", "TensorFlow", ".",
]
example_tags = rule_based_tag(example)

print(example)
print(example_tags)


['Google', 'is', 'hiring', 'a', 'Machine', 'Learning', 'Engineer', 'skilled', 'in', 'Python', 'and', 'TensorFlow', '.']
['B-COMPANY', 'O', 'O', 'O', 'B-JOB_TITLE', 'I-JOB_TITLE', 'I-JOB_TITLE', 'O', 'O', 'B-PROGRAMMING_LANGUAGE', 'O', 'B-FRAMEWORK', 'O']


In [22]:
from seqeval.metrics import classification_report

# Tag every held-out sentence with the gazetteer tagger and score the result
# against the gold labels.
# NOTE(review): if the gazetteers were compiled from all 20 job descriptions
# (including these test sentences), the scores below would be optimistic —
# confirm.
rb_preds = list(map(rule_based_tag, test_tokens))

rb_report = classification_report(test_labels, rb_preds, digits=4)
print(rb_report)


                      precision    recall  f1-score   support

             COMPANY     1.0000    1.0000    1.0000         4
        DEGREE_MAJOR     1.0000    1.0000    1.0000         1
     EDUCATION_LEVEL     1.0000    1.0000    1.0000         1
    EMPLOYEMENT_TYPE     1.0000    1.0000    1.0000         4
           FRAMEWORK     1.0000    1.0000    1.0000         3
           JOB_TITLE     1.0000    1.0000    1.0000         4
            LOCATION     1.0000    1.0000    1.0000         4
PROGRAMMING_LANGUAGE     1.0000    1.0000    1.0000         1
          SKILL_TECH     1.0000    1.0000    1.0000         1
                TOOL     1.0000    1.0000    1.0000         3

           micro avg     1.0000    1.0000    1.0000        26
           macro avg     1.0000    1.0000    1.0000        26
        weighted avg     1.0000    1.0000    1.0000        26

