In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset mount and echo the full path of every file found beneath it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ner-training-dataset/traindata.json
/kaggle/input/ner-training-dataset/testdata.json
/kaggle/input/job-recom-dataset/datascientist.json
/kaggle/input/job-recom-dataset/dataengineer.json
/kaggle/input/job-recom-dataset/phpdeveloper.json
/kaggle/input/job-recom-dataset/javadeveloper.json
/kaggle/input/job-recom-dataset/backenddeveloper.json


In [2]:
import spacy
import json
import random
import logging
from spacy.training import Example
import re


In [3]:
import spacy
from spacy.training import Example
import json
import random
import re

# --- CONFIGURATION ---
# JSON-lines annotation file that convert_dataturks_to_spacy() reads.
TRAIN_DATA_PATH = "/kaggle/input/ner-training-dataset/traindata.json"
# Directory the trained pipeline is written to with nlp_model.to_disk().
OUTPUT_DIR = "/kaggle/working/nlp_ner_model"
# Number of passes over the training data handed to train_spacy_v3().
ITERATIONS = 15

# --- 1. CONVERSION FUNCTION ---
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a JSON-lines annotation export into spaCy-style training tuples.

    Every non-blank line of the file is parsed as one JSON object holding a
    'content' string and an optional 'annotation' list. For each annotation
    only the first entry of its 'points' list is used, and one entity tuple
    is produced per label attached to that annotation.

    Args:
        dataturks_JSON_FilePath: Path of the JSON-lines file to read (UTF-8).

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per non-blank line, where ``end`` is ``point['end'] + 1``.
        An empty list is returned (after printing the error) when the file
        cannot be read.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'content', or an annotation lacks 'label'.
    """
    try:
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"Error reading file: {e}")
        return []

    training_data = []
    for line in lines:
        # Blank lines (typically a trailing newline at end of file) are not
        # JSON documents; json.loads would raise on them.
        if not line.strip():
            continue

        data = json.loads(line)
        text = data['content']
        entities = []
        # 'annotation' may be null or missing for documents with no entities.
        for annotation in data.get('annotation') or []:
            points = annotation.get('points')
            if not points:
                # Nothing to anchor the label to; skip instead of IndexError.
                continue
            point = points[0]
            labels = annotation['label']
            # A label may be given as a single value or as a list of values.
            if not isinstance(labels, list):
                labels = [labels]
            for label in labels:
                # NOTE(review): the +1 presumably converts an inclusive end
                # offset into spaCy's exclusive end — confirm against the
                # annotation tool's export format.
                entities.append((point['start'], point['end'] + 1, label))

        training_data.append((text, {"entities": entities}))
    return training_data

# --- 2. AUTHOR'S CLEANING (Fixes Whitespace) ---
def trim_entity_spans(data: list):
    """Strip leading and trailing whitespace from every entity span.

    Args:
        data: List of ``(text, {"entities": [(start, end, label), ...]})``
            tuples, with ``end`` exclusive.

    Returns:
        A new list of the same shape in which each span has been clamped to
        the bounds of its text and shrunk so that it neither starts nor ends
        on a whitespace character. Spans that are empty after trimming
        (for example, spans covering only whitespace) are dropped, because a
        zero-length or inverted span cannot describe an entity.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            # Clamp first so that an offset past the end of the text cannot
            # raise IndexError while probing characters below.
            valid_start = max(start, 0)
            valid_end = min(end, len(text))
            # Both scans stop as soon as the two edges meet, so the result
            # can never be an inverted (start > end) span.
            while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            if valid_start < valid_end:
                valid_entities.append((valid_start, valid_end, label))
        cleaned_data.append((text, {'entities': valid_entities}))

    return cleaned_data

# --- 3. CRITICAL FIX: REMOVE OVERLAPS (Fixes Crash) ---
def remove_overlaps(data: list):
    """
    Drops entity spans that overlap a span already kept for the same text.

    Spans are visited in order of start offset; among spans sharing a start
    offset the longest is visited first. A span is kept only when it begins
    at or after the end of the most recently kept span, so of any group of
    conflicting spans the earliest-starting one survives.
    Needed for SpaCy v3 to prevent ValueError: [E103].
    """
    result = []
    for text, annotations in data:
        # Earliest start first; ties broken in favour of the longer span.
        ordered = list(annotations['entities'])
        ordered.sort(key=lambda span: (span[0], span[0] - span[1]))

        kept = []
        boundary = -1
        for span_start, span_end, span_label in ordered:
            if span_start < boundary:
                # Begins inside the previously kept span: discard it.
                continue
            kept.append((span_start, span_end, span_label))
            boundary = span_end

        result.append((text, {'entities': kept}))
    return result

# --- 4. PREPARE PIPELINE ---
# Stage 1: parse the annotation file at TRAIN_DATA_PATH into (text, {"entities": [...]}) tuples.
print("Loading data...")
raw_data = convert_dataturks_to_spacy(TRAIN_DATA_PATH)

# Stage 2: shrink each span so it does not start or end on whitespace.
print("Cleaning whitespace (Author's method)...")
trimmed_data = trim_entity_spans(raw_data)

# Stage 3: discard spans that overlap an already-kept span; the result is the
# list handed to train_spacy_v3() below.
print("Removing overlaps (Fix for Error E103)...")
TRAIN_DATA = remove_overlaps(trimmed_data)

# One tuple per parsed input line, so this is the number of documents available for training.
print(f"Data ready: {len(TRAIN_DATA)} resumes.")

# --- 5. TRAIN SPACY V3 ---
def train_spacy_v3(data, iterations):
    """Train a blank English spaCy pipeline whose only trained component is NER.

    Args:
        data: List of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. The list itself is left untouched; shuffling is done on
            an internal list of examples.
        iterations: Number of passes to make over the training examples.

    Returns:
        The trained ``nlp`` pipeline object.

    Documents that cannot be converted into a training example, and examples
    whose update step raises, are skipped; each kind of skip is reported on
    stdout rather than silently ignored.
    """
    nlp = spacy.blank("en")
    # Bind `ner` on both paths so the label loop below never hits an
    # unbound name.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.add_pipe("ner", last=True)

    # Register every label that occurs in the data before initialization.
    for _, annotations in data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Build the Example objects once: the texts and annotations do not change
    # between iterations, so rebuilding them on every pass is wasted work.
    examples = []
    unconvertible = 0
    for text, annotations in data:
        try:
            examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        except Exception as err:
            unconvertible += 1
            print(f"Skipping document (cannot build training example): {type(err).__name__}: {str(err)[:200]}")
    if unconvertible:
        print(f"Skipped {unconvertible} of {len(data)} documents before training.")

    # Only the NER component is updated; any other pipe is disabled for the
    # duration of training and restored afterwards.
    with nlp.select_pipes(enable=["ner"]):
        optimizer = nlp.initialize()
        print(f"Starting training for {iterations} iterations...")

        for itn in range(iterations):
            random.shuffle(examples)
            losses = {}
            failed_updates = 0
            for example in examples:
                try:
                    nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
                except Exception:
                    # Best effort: one bad example must not abort the run,
                    # but the count is surfaced below.
                    failed_updates += 1
            print(f"Iteration {itn+1}/{iterations} - Loss: {losses.get('ner', 0.0):.2f}")
            if failed_updates:
                print(f"  ({failed_updates} examples raised during update and were skipped)")

    return nlp

# --- EXECUTE ---
# Train on the prepared data for ITERATIONS passes, then serialize the
# resulting pipeline to OUTPUT_DIR.
nlp_model = train_spacy_v3(TRAIN_DATA, ITERATIONS)
nlp_model.to_disk(OUTPUT_DIR)
print(f"Model saved successfully to {OUTPUT_DIR}")


Loading data...
Cleaning whitespace (Author's method)...
Removing overlaps (Fix for Error E103)...
Data ready: 200 resumes.
Starting training for 15 iterations...


6+ Exp in banking operations and cre..." with entities "[(0, 13, 'Name'), (14, 16, 'Years of Experience'),...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist
Server Support Engineer

Gurgaon, Har..." with entities "[(0, 12, 'Name'), (13, 36, 'Designation'), (38, 45...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Arabic Language supporter (Content Analyst..." with entities "[(0, 7, 'Name'), (8, 51, 'Designation'), (53, 62, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Azure Developer with 9 Yrs 8 months ..." with entities "[(0, 13, 'Name'), (14, 29, 'Designation'), (35, 48...". Use `spacy.tr

Iteration 1/15 - Loss: 12853.97
Iteration 2/15 - Loss: 5053.75
Iteration 3/15 - Loss: 4200.78
Iteration 4/15 - Loss: 3622.21
Iteration 5/15 - Loss: 3185.43
Iteration 6/15 - Loss: 3133.80
Iteration 7/15 - Loss: 2907.77
Iteration 8/15 - Loss: 2731.66
Iteration 9/15 - Loss: 2576.94
Iteration 10/15 - Loss: 2450.56
Iteration 11/15 - Loss: 2522.48
Iteration 12/15 - Loss: 2367.35
Iteration 13/15 - Loss: 2184.88
Iteration 14/15 - Loss: 2096.53
Iteration 15/15 - Loss: 1939.97
Model saved successfully to /kaggle/working/nlp_ner_model
