## Install Dependencies
Run the following commands to install the required Python libraries before proceeding, unless you have already installed them from requirements.txt.

In [1]:
!pip install pymongo pymupdf transformers google

Collecting pymongo
  Downloading pymongo-4.13.2-cp312-cp312-win_amd64.whl.metadata (22 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting google
  Downloading google-3.0.0-py2.py3-none-any.whl.metadata (627 bytes)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)

In [2]:
!pip install PyMuPDF torch

Collecting torch
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Using cached typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.7.1-cp312-cp312-win_amd64.whl (216.1 MB)
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.3/216.1 MB ? eta -:--:--
   ---------------------------------------- 0.5/216.1 MB 1.0 MB/s eta 0:03:26
   ---------------------------------------- 0.8/216.1 MB 1.1 MB/s eta 0:03:13
   ---------------------------------------- 1

In [4]:
!pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.176.0-py3-none-any.whl.metadata (7.0 kB)
Collecting httplib2<1.0.0,>=0.19.0 (from google-api-python-client->google-generativeai)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client->google-generativeai)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client->google-generativeai)
  Downloading uritemplate-4.2.0-py3-none-any.whl.metadata (2.6 kB)
Downloading google_generativeai-0.8.5-py3-none-any.whl (155 kB)
Downloading google_ai_generativelanguage

## Imports and Setup 
Import the necessary Python modules for PDF text extraction, API interaction, and data processing.

In [2]:
import fitz  # PyMuPDF
import json
import os
import time
import google.generativeai as genai
import torch
import numpy as np
import matplotlib.pyplot as plt
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output, in page
        order. A document without pages yields an empty string.
    """
    # The context manager closes the document (releasing the underlying file
    # handle) even when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-allocating a growing
        # string on every page.
        return "".join(page.get_text() for page in doc)

In [24]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook: a key stored in a saved or shared notebook is visible to every reader.
# NOTE(review): the key that used to be hardcoded on this line should be treated
# as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

# Step 2: Define your function
def extract_order_info_with_gemini(document_text, model_name="gemini-2.5-flash"):
    """Ask a Gemini model to extract structured fields from an insider-trading order.

    Args:
        document_text: Full text of the order; embedded verbatim in the prompt.
        model_name: Model identifier handed to ``genai.GenerativeModel``.
            Defaults to ``"gemini-2.5-flash"``, the model this function
            previously hard-coded.

    Returns:
        The reply exactly as exposed by ``response.text``. The prompt asks for
        JSON, but the reply is returned unparsed and unvalidated.
    """
    prompt = f"""
    Read the following orders on insider trading and return a json object with the following keys:
    Date of Order: Date on which the order was passed. (A single date in the formal DD/MM/YYYY)
    Date of Action: Date on which the offending action(actual violation or not) was considered to have taken place. (A single date in the formal DD/MM/YYYY)
    Order Type: Type of order (Restrict to only 4 types: Settlement, Adjudication, Final or Other)
    Case Name: Name of the case or the parties.
    Monetary Penalty Imposed: If any monetary penalty imposed. (Return the result in numeric format only)
    Non-monetary Penalty: Any other penalty besides monetary, such as barred from trading or non-monetory settlement terms.
    Judgment Criteria: The criteria used by the officer to impose the penalty, that is the reasoning offered at the time of judgment.
    Penalty Criteria: The reason/calculation used for imposition of the said penalty.
    Contextual Metadata: Brief description of case (15-20 words). Focussing on question law and not case specific facts
    Provisions: A list of any PIT-related provisions which were violated
    Type of Insider Trading: If violation of disclosure norms, trading on UPSI etc.
    Case Summary: Summary of the case in 25-30 words.
    PIT Version: Whether violation of PIT, 1992 or PIT, 2015

    If multiple people or entities have been punished create a separate json output for each of them (only those on whom a penalty is imposed)

    Also try to standardise the results so that further analysis becomes easier, that is try to use similar language and categories while retaining complexities and intricacies of each case.

    Document:
    \"\"\"{document_text}\"\"\"
    """

    # The model is selected per call so a caller can switch models without
    # editing this function.
    model = genai.GenerativeModel(model_name=model_name)

    response = model.generate_content(prompt)
    return response.text

In [25]:
def clean_json_output(llm_output):
    """Strip a surrounding Markdown code fence from an LLM reply.

    Args:
        llm_output: Raw reply text, optionally wrapped in a ```` ``` ```` or
            ```` ```json ```` fence (the ``json`` tag is matched
            case-insensitively).

    Returns:
        The text with one leading and one trailing fence removed and the
        surrounding whitespace trimmed. Text without fences is only trimmed.
    """
    # Trim first: both patterns are anchored to the very start/end of the
    # string, so a fence preceded by a blank line or followed by trailing
    # spaces would otherwise be left in place.
    cleaned = llm_output.strip()
    cleaned = re.sub(r"^```(?:json)?\s*", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\s*```$", "", cleaned)
    return cleaned.strip()

In [30]:
# Define the folders
pdf_folder = r"SEBI_9"
output_folder = r"SEBI_9_Output"

# Pause lengths in seconds: one before every file, a longer one after a failure.
# NOTE(review): presumably chosen to stay within API rate limits — confirm.
REQUEST_DELAY_SECONDS = 10
ERROR_BACKOFF_SECONDS = 26

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

# Gather PDF files from the pdf_folder. The extension check is case-insensitive
# so "X.PDF" is not silently skipped, and the list is sorted because
# os.listdir() returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
counter = 0

for pdf_file in pdf_files:
    time.sleep(REQUEST_DELAY_SECONDS)  # Delay between processing files.
    pdf_path = os.path.join(pdf_folder, pdf_file)

    try:
        # Text extraction is inside the try so that one unreadable PDF is
        # reported and skipped instead of aborting the whole batch.
        pdf_text = extract_text_from_pdf(pdf_path)

        # Query the model and strip any Markdown fence from its reply.
        llm_output = extract_order_info_with_gemini(pdf_text)
        cleaned_output = clean_json_output(llm_output)
        print(f"PDF File: {pdf_file}")
        print("LLM Output:")
        print(cleaned_output)
        print("\n")
        counter += 1
        print(f"Counter: {counter}")

        # Parse the reply; if it is not valid JSON, keep the raw reply so
        # nothing is lost and the file can be repaired later.
        try:
            json_data = json.loads(cleaned_output)
        except json.JSONDecodeError as json_err:
            print(f"Warning: Could not parse JSON for {pdf_file}: {json_err}")
            json_data = {"raw_output": llm_output}

        # Generate an output filename by replacing .pdf with .json.
        output_filename = os.path.splitext(pdf_file)[0] + ".json"
        output_path = os.path.join(output_folder, output_filename)

        # Save the JSON data to the output file.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False, indent=4)
        print(f"Saved {output_filename}")

    except Exception as e:
        # Deliberate best-effort: log the failure, back off, and move on to
        # the next file.
        print(f"Error processing {pdf_file}: {e}")
        time.sleep(ERROR_BACKOFF_SECONDS)


PDF File: 1624359612868_5.pdf
LLM Output:
[
  {
    "Date of Order": "21/06/2021",
    "Date of Action": "09/11/2011",
    "Order Type": "Adjudication",
    "Case Name": "Kasturi Overseas Private Limited in the matter of Nouveau Global Ventures Limited",
    "Monetary Penalty Imposed": 300000,
    "Non-monetary Penalty": "None",
    "Judgment Criteria": "Violation of mandatory disclosure obligations under SEBI regulations. The purpose of disclosures is transparency and market monitoring. The default was repetitive in nature, making the Noticee liable for penalty under Section 15A(b) of SEBI Act. Factors under Section 15J of SEBI Act were considered.",
    "Penalty Criteria": "Inability to quantify disproportionate gain or loss to investors. Repetitive nature of disclosure defaults. Impugned transactions were old (over 8 years), which might be a mitigating factor for quantum. Penalty deemed commensurate with lapses/omissions.",
    "Contextual Metadata": "Penalty imposed on a promoter f