In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.4-py3-none-any.whl.metadata (6.1 k

In [2]:
from datasets import load_dataset

ds = load_dataset("AjayMukundS/Indian_Legal_NER_Dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'entities'],
        num_rows: 10995
    })
    validation: Dataset({
        features: ['text', 'entities'],
        num_rows: 1074
    })
})

In [16]:
import re
from datasets import DatasetDict, Dataset

# Define the cleaning function
def clean_text(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove extra newlines and whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove unwanted special characters (keep legal punctuation like .,!? etc.)
    text = re.sub(r'[^\w\s.,!?()\-&]', '', text)
    
    return text

# Apply the cleaning function to the dataset
def clean_dataset(example):
    example["text"] = clean_text(example["text"])
    return example

# Assuming your dataset is already loaded into `ds`
# Example: ds = load_dataset("your_dataset_name")

# Clean the dataset
cleaned_ds = ds.map(clean_dataset)

# Verify the cleaning
print("Original Dataset:")
print(ds["train"][0])
print("\nCleaned Dataset:")
print(cleaned_ds["train"][0])

Original Dataset:
{'text': "\n\n(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessee's paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.", 'entities': [{'end': 103, 'label': 'ORG', 'start': 90}, {'end': 278, 'label': 'ORG', 'start': 267}]}

Cleaned Dataset:
{'text': '(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessees paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.', 'entities': [{'end': 103, 'label': 'ORG', 'start': 90}, {'end': 278, 'label': 'ORG', 'start': 267}]}


In [17]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [20]:
cleaned_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'entities'],
        num_rows: 10995
    })
    validation: Dataset({
        features: ['text', 'entities'],
        num_rows: 1074
    })
})

In [30]:
cleaned_ds["train"][0]

{'text': '(7) On specific query by the Bench about an entry of Rs. 1,31,37,500 on deposit side of Hongkong Bank account of which a photo copy is appearing at p. 40 of assessees paper book, learned authorised representative submitted that it was related to loan from broker, Rahul & Co. on the basis of his submission a necessary mark is put by us on that photo copy.',
 'entities': [{'end': 103, 'label': 'ORG', 'start': 90},
  {'end': 278, 'label': 'ORG', 'start': 267}]}

In [36]:
# Load the dataset
dataset = load_dataset("AjayMukundS/Indian_Legal_NER_Dataset")
train_data = dataset['train']

In [37]:
import spacy
from datasets import load_dataset
from spacy.training.example import Example
import random

# Initialize a blank English model (or use an existing pre-trained model like 'en_core_web_trf')
nlp = spacy.blank("en")

# Create a NER pipeline if it doesn't exist
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")


In [38]:
# Add labels to the NER component
for entry in train_data:
    for entity in entry['entities']:
        ner.add_label(entity['label'])

In [39]:
# Function to create training data in spaCy format
def create_training_data(data):
    training_data = []
    for entry in data:
        text = entry["text"]
        annotations = {"entities": []}
        for entity in entry["entities"]:
            start = entity["start"]
            end = entity["end"]
            label = entity["label"]
            annotations["entities"].append((start, end, label))
        training_data.append((text, annotations))
    return training_data

# Create the training data
train_examples = create_training_data(train_data)

In [40]:
# Convert to spaCy's Example format
train_examples_spacy = []
for text, annotations in train_examples:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    train_examples_spacy.append(example)


In [49]:
# Fine-tune the model
# Define a training loop (in this case, we'll use 10 iterations)
optimizer = nlp.begin_training()

# Training loop
for epoch in range(30):
    random.shuffle(train_examples_spacy)
    losses = {}
    for batch in spacy.util.minibatch(train_examples_spacy, size=8):
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Epoch {epoch+1} - Losses: {losses}")

# Save the trained model
nlp.to_disk("ner_model")



Epoch 1 - Losses: {'ner': np.float32(57768.973)}
Epoch 2 - Losses: {'ner': np.float32(44596.484)}
Epoch 3 - Losses: {'ner': np.float32(38824.99)}
Epoch 4 - Losses: {'ner': np.float32(36528.383)}
Epoch 5 - Losses: {'ner': np.float32(34120.152)}
Epoch 6 - Losses: {'ner': np.float32(32129.707)}
Epoch 7 - Losses: {'ner': np.float32(30420.629)}
Epoch 8 - Losses: {'ner': np.float32(29578.082)}
Epoch 9 - Losses: {'ner': np.float32(27796.457)}
Epoch 10 - Losses: {'ner': np.float32(27156.62)}
Epoch 11 - Losses: {'ner': np.float32(25820.252)}
Epoch 12 - Losses: {'ner': np.float32(25728.92)}
Epoch 13 - Losses: {'ner': np.float32(24376.182)}
Epoch 14 - Losses: {'ner': np.float32(23690.197)}
Epoch 15 - Losses: {'ner': np.float32(23352.732)}
Epoch 16 - Losses: {'ner': np.float32(23082.639)}
Epoch 17 - Losses: {'ner': np.float32(22681.576)}
Epoch 18 - Losses: {'ner': np.float32(21757.172)}
Epoch 19 - Losses: {'ner': np.float32(21626.717)}
Epoch 20 - Losses: {'ner': np.float32(21442.006)}
Epoch 21 - L

In [51]:
import spacy
from spacy import displacy

# Load the trained NER model
trained_ner_model = spacy.load("ner_model")

# Test text
test_text = "On 15th August 2023, the Supreme Court ruled in Case No. 567/2023 that, as per Section 302 of the IPC, the petitioner, represented by Lawyer Mr. Arvind Rao, had the right to appeal against the respondent, the Home Department of Tamil Nadu."

doc = trained_ner_model(test_text)

# Define custom colors for entity labels
custom_colors = {
    "ORG": "linear-gradient(90deg, #FF5733, #C70039)",  # Red-Orange
    "LAWYER": "linear-gradient(90deg, #6A0DAD, #9400D3)",  # Purple shades
    "DATE": "linear-gradient(90deg, #FFD700, #FFA500)",  # Gold-Orange
    "CASE_NUMBER": "linear-gradient(90deg, #0000FF, #1E90FF)",  # Blue
    "JUDGE": "linear-gradient(90deg, #32CD32, #008000)",  # Green
    "STATUTE": "linear-gradient(90deg, #FFA07A, #FF4500)",  # Light Coral to Orange Red
    "COURT": "linear-gradient(90deg, #20B2AA, #008B8B)",  # Teal
    "RESPONDENT": "linear-gradient(90deg, #800000, #FF0000)",  # Dark Red to Bright Red
    "PRECEDENT": "linear-gradient(90deg, #FF1493, #C71585)",  # Deep Pink to Medium Violet Red
    "WITNESS": "linear-gradient(90deg, #708090, #2F4F4F)",  # Slate Grey to Dark Slate Grey
    "OTHER_PERSON": "linear-gradient(90deg, #8B4513, #D2691E)",  # Saddle Brown to Chocolate
    "GPE": "linear-gradient(90deg, #4682B4, #5F9EA0)",  # Steel Blue to Cadet Blue
    "PROVISION": "linear-gradient(90deg, #9400D3, #8A2BE2)",  # Dark Violet to Blue Violet
    "PETITIONER": "linear-gradient(90deg, #556B2F, #6B8E23)"  # Dark Olive Green to Olive Drab
}

options = {"ents": list(custom_colors.keys()), "colors": custom_colors}

# Render the highlighted entities in Jupyter Notebook
displacy.render(doc, style="ent", jupyter=True, options=options)
