In [None]:
!pip install pymongo pymupdf transformers google

In [None]:
import fitz  # PyMuPDF
import json
import os
import time
from google import genai
import torch
import numpy as np
import matplotlib.pyplot as plt
import re

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        str: The result of ``page.get_text()`` for each page, joined in page
        order with no separator. Empty string when no page yields any text.
    """
    # The context manager closes the document even if reading a page raises,
    # so a batch run over many PDFs does not accumulate open file handles.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

In [None]:
# Prefer a key supplied through the GEMINI_API_KEY environment variable so the
# secret does not have to be saved inside the notebook. The original placeholder
# is kept as a fallback for users who paste their key in directly.
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY") or "Enter your API key here")

def extract_order_info_with_gemini(document_text):
    """Ask Gemini to pull structured order details out of ``document_text``.

    The text is interpolated verbatim into a fixed instruction prompt that
    requests a JSON object with a set of named keys, and the prompt is sent to
    the ``gemini-2.5-flash`` model through the module-level ``client``.

    Args:
        document_text: Text of the order to analyse.

    Returns:
        The ``text`` attribute of the model response, returned unmodified.
    """
    instructions = f"""
    Read the following orders on insider trading and return a json object with the following keys:
    Date of Order: Date on which the order was passed. (A single date in the formal DD/MM/YYYY)
    Date of Action: Date on which the offending action(actual violation or not) was considered to have taken place. (A single date in the formal DD/MM/YYYY)
    Order Type: Type of order (Restrict to only 4 types: Settlement, Adjudication, Final or Other)
    Case Name: Name of the case or the parties.
    Monetary Penalty Imposed: If any monetary penalty imposed. (Return the result in numeric format only)
    Non-monetary Penalty: Any other penalty besides monetary, such as barred from trading or non-monetory settlement terms.
    Judgment Criteria: The criteria used by the officer to impose the penalty, that is the reasoning offered at the time of judgment.
    Penalty Criteria: The reason/calculation used for imposition of the said penalty.
    Contextual Metadeta: Brief description of case (15-20 words). Focussing on question law and not case specific facts
    Provisions: A list of any PIT-related provisions which were violated
    Type of Insider Trading: If violation of disclosure norms, trading on UPSI etc.
    Case Summary: Summary of the case in 25-30 words.
    PIT Version: Whether violation of PIT, 1992 or PIT, 2015

    If multiple people or entities have been punished create a separate json output for each of them (only those on whom a penalty is imposed)

    Also try to standardise the results so that further analysis becomes easier, that is try to use similar language and categories while retaining complexities and intracacies of each case.

    Document:
    \"\"\"
    {document_text}
    \"\"\"
    """
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=instructions,
    )
    return reply.text

In [None]:
def clean_json_output(llm_output):
    """Strip a surrounding Markdown code fence from an LLM reply.

    Removes one opening fence (three backticks, optionally followed by
    ``json`` in any letter case) and one closing fence, then trims whitespace.
    Whitespace before the opening fence or after the closing fence is
    tolerated, so a reply that starts with a newline is still unwrapped.
    Text that is not fenced is returned stripped but otherwise unchanged.

    Args:
        llm_output: Raw text returned by the model.

    Returns:
        str: The reply without the enclosing fence and surrounding whitespace.

    Raises:
        TypeError: If ``llm_output`` is not a string (raised by ``re.sub``).
    """
    # Leading \s* lets the opening fence match even when the model emits
    # whitespace or a blank line before it.
    cleaned = re.sub(r"\A\s*```(?:json)?\s*", "", llm_output, flags=re.IGNORECASE)
    # Trailing \s* likewise tolerates whitespace after the closing fence.
    cleaned = re.sub(r"\s*```\s*\Z", "", cleaned)
    return cleaned.strip()

In [None]:
# Define the folders
pdf_folder = r"Enter the PDF Folder Path Here"
output_folder = r"Enter the Output Folder Path Here"

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

# Gather PDF files from the pdf_folder. The extension check is case-insensitive
# so files such as "ORDER.PDF" are not silently skipped, and the list is sorted
# because os.listdir returns entries in arbitrary order.
pdf_files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf'))
counter = 0  # Number of files for which an LLM response has been obtained.

# Process every PDF: extract its text, ask the LLM for structured data, and
# write the result to <output_folder>/<pdf name>.json.
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)

    # Text extraction has its own guard: a corrupt or unreadable PDF is
    # reported and skipped instead of aborting the whole batch.
    try:
        pdf_text = extract_text_from_pdf(pdf_path)
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        continue

    # A PDF with no extractable text gives the model nothing to read, so
    # skip it rather than spend a request on an empty document.
    if not pdf_text.strip():
        print(f"Warning: No extractable text in {pdf_file}; skipping.")
        continue

    time.sleep(10)  # Delay between LLM requests.

    try:
        # Generate the LLM output and strip any Markdown code fence around it.
        llm_output = extract_order_info_with_gemini(pdf_text)
        cleaned_output = clean_json_output(llm_output)
        print(f"PDF File: {pdf_file}")
        print("LLM Output:")
        print(cleaned_output)
        print("\n")
        counter += 1
        print(f"Counter: {counter}")

        # Try to convert the LLM output to a Python object.
        try:
            json_data = json.loads(cleaned_output)
        except json.JSONDecodeError as json_err:
            print(f"Warning: Could not parse JSON for {pdf_file}: {json_err}")
            # Keep the unparsed reply so it can be inspected or repaired later.
            json_data = {"raw_output": llm_output}

        # Generate an output filename by replacing .pdf with .json.
        output_filename = os.path.splitext(pdf_file)[0] + ".json"
        output_path = os.path.join(output_folder, output_filename)

        # Save the JSON data to the output file.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, ensure_ascii=False, indent=4)
        print(f"Saved {output_filename}")

    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")
        time.sleep(26)  # Wait before trying the next file if an error occurs.
        continue
