In [3]:
import json
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy import displacy
import random
import os

In [4]:
# Read one JSON file from disk and hand back its parsed content
def load_json_data(filepath):
    """Parse the UTF-8 encoded JSON file at ``filepath`` and return the result."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [5]:
# Helper: tighten one entity span so it does not begin or end on whitespace
def _trim_span(text, start, end):
    """Return ``(start, end)`` with surrounding whitespace excluded.

    Spans that are out of range for ``text``, empty, or made up entirely of
    whitespace are returned unchanged so that spaCy still reports them.
    """
    if not (0 <= start < end <= len(text)):
        return start, end
    new_start, new_end = start, end
    while new_start < new_end and text[new_start].isspace():
        new_start += 1
    while new_end > new_start and text[new_end - 1].isspace():
        new_end -= 1
    if new_start == new_end:
        # Whitespace-only span: nothing sensible to trim to, keep the original.
        return start, end
    return new_start, new_end


# Function to convert data to spaCy format
def convert_data(data):
    """Convert one loaded annotation file into spaCy-style training tuples.

    Args:
        data: Mapping with an "annotations" key. Each annotation is a pair
            ``(text, {"entities": [[start, end, label], ...]})``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples.
        Character offsets are trimmed so that no entity starts or ends on
        whitespace; offsets that were already clean are returned unchanged.

    Raises:
        KeyError: if "annotations" or an item's "entities" key is missing.
    """
    training_data = []
    for item in data["annotations"]:
        text, annotations = item
        entities = []
        for start, end, label in annotations["entities"]:
            # Spans that include surrounding whitespace are a frequent cause of
            # spaCy's "misaligned entities" warning, after which the entity is
            # ignored during training, so tighten them here.
            start, end = _trim_span(text, start, end)
            entities.append((start, end, label))
        training_data.append((text, {"entities": entities}))
    return training_data

# Directory containing JSON files.
# Set the NER_DATA_DIR environment variable to point the notebook at another
# location (e.g. on a different machine); the original path stays the default.
data_directory = os.environ.get(
    'NER_DATA_DIR',
    '/home/hp/Documents/Mini_Project/Labelled/json',
)


In [6]:
# Collect all training data
# os.listdir() returns names in arbitrary order, so sort them to keep the
# order of all_training_data reproducible between runs and machines.
all_training_data = []
for filename in sorted(os.listdir(data_directory)):
    if not filename.endswith('.json'):
        continue  # skip anything that is not a JSON annotation file
    file_path = os.path.join(data_directory, filename)
    data = load_json_data(file_path)
    training_data = convert_data(data)
    all_training_data.extend(training_data)


In [7]:
# Create a blank spaCy model
nlp = spacy.blank("en")

# Create the NER component and add it to the pipeline
ner = nlp.add_pipe("ner")

# Add new labels to the NER component.
# Default to an empty list so an item without an "entities" key is skipped
# instead of failing with "TypeError: 'NoneType' object is not iterable".
for _, annotations in all_training_data:
    for ent in annotations.get("entities", []):
        # Each entity is (start, end, label); only the label is registered.
        ner.add_label(ent[2])

# Disable other pipeline components (if any) so that only NER is trained
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [8]:
# Training the NER model
# Build the Example objects once: tokenisation and entity alignment depend only
# on the text and its annotations, so redoing them every epoch is wasted work
# and re-emits the same alignment warnings each time.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in all_training_data
]

# select_pipes() replaces the deprecated disable_pipes(); the listed components
# are restored when the block exits.
with nlp.select_pipes(disable=unaffected_pipes):
    # initialize() replaces the deprecated begin_training() and returns the optimizer.
    optimizer = nlp.initialize()
    for iteration in range(100):
        # Shuffle the prepared examples rather than all_training_data, so the
        # source list keeps its order for later cells.
        random.shuffle(train_examples)
        losses = {}  # accumulated per epoch by nlp.update
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            # Pass the optimizer explicitly instead of relying on the one
            # stored implicitly on the pipeline.
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
        print(f"Iteration {iteration + 1}, Losses: {losses}")

# Save the trained model
nlp.to_disk("trained_model")

Appeal by special le..." with entities "[(5, 28, 'CASE_NUMBER'), (93, 113, 'DATE'), (122, ...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal from a judgm..." with entities "[(6, 29, 'CASE_NUMBER'), (71, 86, 'DATE'), (95, 11...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal from a Judgme..." with entities "[(0, 28, 'CASE_NUMBER'), (71, 105, 'COURT'), (107,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.
Appeal by special leave fr..." with entities "[(4, 22, 'CASE_NUMBER'), (129, 148, 'DATE'), (203,...". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored duri

Iteration 1, Losses: {'ner': 218695.3405227661}
Iteration 2, Losses: {'ner': 27181.264691093296}
Iteration 3, Losses: {'ner': 11783.444394826889}
Iteration 4, Losses: {'ner': 9489.377210952342}
Iteration 5, Losses: {'ner': 10007.452535922639}
Iteration 6, Losses: {'ner': 9304.552739662468}
Iteration 7, Losses: {'ner': 8776.316722715273}
Iteration 8, Losses: {'ner': 14884.365037078038}
Iteration 9, Losses: {'ner': 11016.443701148033}
Iteration 10, Losses: {'ner': 9704.507415354252}
Iteration 11, Losses: {'ner': 8254.53370265942}
Iteration 12, Losses: {'ner': 8003.219293301692}
Iteration 13, Losses: {'ner': 10082.144146884792}
Iteration 14, Losses: {'ner': 8224.54743104428}
Iteration 15, Losses: {'ner': 7880.639371372759}
Iteration 16, Losses: {'ner': 7706.974409360439}
Iteration 17, Losses: {'ner': 7618.056745496986}
Iteration 18, Losses: {'ner': 7310.4077295024035}
Iteration 19, Losses: {'ner': 6885.059986386448}
Iteration 20, Losses: {'ner': 6655.625033980003}
Iteration 21, Losses: {'

In [4]:
# Try out the trained model

# Reload the pipeline that the training step wrote to disk
nlp = spacy.load("trained_model")


# Helper that renders the entities found in a piece of text
def visualize_ner(text):
    """Run ``nlp`` on ``text`` and render its entities with displaCy (style "ent")."""
    displacy.render(nlp(text), style="ent", jupyter=True)



In [6]:
# Location of the text file to run the model on
file_path = "/home/hp/Documents/Mini_Project/dataset/IN-Abs/test-data/judgement/6276.txt"

# Load the whole file into memory as UTF-8 text
with open(file_path, mode='r', encoding='utf-8') as judgment_file:
    text = judgment_file.read()

# Render the entities the model finds in it
visualize_ner(text)