In [1]:
import pandas as pd
!pip install transformers
import json
from transformers import AutoTokenizer



In [54]:
import json

# Load the JSONL dataset
data_file = "/content/super_scotus_sample_clean.jsonl"
cases = {}

with open(data_file, "r") as f:
    for line in f:
        try:
            case = json.loads(line)
        except json.JSONDecodeError as e:
                print(f"Skipping malformed line: {e}")
                continue
        case_id = case.get("case_id") or case.get("id")
        justia_sections = case.get("justia_sections", {}) #judgments
        convos = case.get("convos", {})

        # Begin parsing the convos
        utterances = []
        speaker_map = convos.get("speaker", {}) #oral transcripts

        if isinstance(convos, dict) and "utterances" in convos:
            outer = convos["utterances"]
            if (isinstance(outer, list) and len(outer) > 0 and isinstance(outer[0], list)
                    and all(isinstance(u, dict) for u in outer[0])):

                for utt in outer[0]:  # Flatten and parse
                    utt_id = utt.get("id")
                    speaker_id = utt.get("speaker_id")
                    text = utt.get("text", "")

                    speaker_meta = speaker_map.get(speaker_id, {}) if speaker_id else {}
                    speaker_type = speaker_meta.get("type")

                    # Debug this
                    if speaker_type is None:
                        print(f"[DEBUG] Missing speaker_type for: speaker_id={speaker_id}, utt_text={text[:50]}")

                    utterances.append({
                         "id": utt_id,
                         "text": text,
                         "speaker": speaker_id,
                         "side": speaker_meta.get("side"),
                         "speaker_type": speaker_type
                    })

            else:
                print(f"Warning: Unexpected 'utterances' format in case {case_id}. Skipping it.")


        # Store the parsed case
        cases[case_id] = {
            "justia_sections": justia_sections,
            "convos": utterances
        }

print(f"Loaded {len(cases)} cases from Super-SCOTUS dataset.")

# Example: Check structure for one case
example_id, example_case = next(iter(cases.items()))
print("Case ID:", example_id)
print("Justia sections:", list(example_case["justia_sections"].keys()))
print("First 2 transcript utterances:", example_case["convos"][:2])


Loaded 6733 cases from Super-SCOTUS dataset.
Case ID: 1955_71
Justia sections: ['Syllabus', 'Case']
First 2 transcript utterances: [{'id': None, 'text': 'Number 71, Lonnie Affronti versus United States of America.\nMr. Murphy.', 'speaker': 'j__earl_warren', 'side': None, 'speaker_type': 'J'}, {'id': None, 'text': 'May it please the Court.\nWe are here by writ of certiorari to the Eighth Circuit.\nThere is one question to be decided in this case, decided carefully.\nUpon sentence to consecutive sentences or terms by a District Court.\nThe defending pattern started the service of a first sentence.\nThus, the District Court thereafter have jurisdiction to suspend the execution of the remaining sentences and place the defendant on probation.', 'speaker': 'harry_f_murphy', 'side': 1, 'speaker_type': 'A'}]


In [50]:
# Install necessary packages (if not already installed)
!pip install langchain tiktoken

import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Custom splitter that prefers paragraph-level breaks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "!", "?", " "]
)

justia_chunks = []

for case_id, case_data in cases.items():
    sections = case_data.get("justia_sections", {})
    if not sections:
        continue

    for section_name, section_text in sections.items():
        #Clean citations like [12]
        section_text = re.sub(r'\[\d+\]', '', section_text)

        #Extract justice name with regex
        sec_type = section_name
        justice = None
        match = re.search(r'Justice (\w+)', section_name)
        if match:
            justice = match.group(1)
        elif "Justice" in section_text:
            #try from body
            match_body = re.search(r'Justice (\w+)', section_text)
            if match_body:
                justice = match_body.group(1)

        #Normalize section type (e.g., Concurring / Dissenting)
        if "Concurring" in section_name:
            sec_type = "Concurring"
        elif "Dissent" in section_name:
            sec_type = "Dissenting"
        elif "Opinion" in section_name:
            sec_type = "Opinion"
        elif "Syllabus" in section_name:
            sec_type = "Syllabus"

        #Use chunker to split text into coherent pieces
        for chunk in text_splitter.split_text(section_text):
            doc = Document(
                page_content=chunk.strip(),
                metadata={
                    "case_id": case_id,
                    "section_type": sec_type,
                    "justice": justice
                }
            )
            justia_chunks.append(doc)

# Print summary
print(f"Total Justia chunks: {len(justia_chunks)} (from {len(cases)} cases)")
print("Sample chunk metadata:", justia_chunks[0].metadata)
print("Sample chunk text snippet:", justia_chunks[0], "...")

Total Justia chunks: 354239 (from 6733 cases)
Sample chunk metadata: {'case_id': '1955_71', 'section_type': 'Syllabus', 'justice': None}
Sample chunk text snippet: page_content='U.S. Supreme CourtAffronti v. United States, 350 U.S. 79 (1955)Affronti v. United StatesNo. 71Argued November 15, 1955Decided December 5, 1955350 U.S. 79Syllabus

Under 18 U.S.C. § 3651, after a sentence of consecutive terms on multiple counts of an indictment has been imposed and service of sentence for the first such term has commenced, a federal district court may not suspend sentence and grant probation as to the remaining term or terms.  United States v. Murray, 275 U. S. 347.  Pp.  350 U. S. 79-84.
(a) The legislative history of this section does not require a different result.  Pp.  350 U. S. 81-82,  350 U. S. 84.
(b) The probationary power ceases with respect to all of the sentences composing a single cumulative sentence immediately upon imprisonment for any part of the cumulative sentence.  Pp.  350 U.

In [51]:
!pip install langchain sentence-transformers scikit-learn --quiet

import re
import numpy as np
from langchain.schema import Document
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load semantic model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Token-aware splitter
splitter = SentenceTransformersTokenTextSplitter(
    chunk_size=128,
    chunk_overlap=20,
    model_name="all-MiniLM-L6-v2"
)

# Parameters
MIN_CLUSTER_LEN = 3

# Phase-Level Collection Using Side Clustering
phase_blocks = []

for case_id, case_data in cases.items():
    utterances = case_data.get("convos", [])
    if not utterances:
        continue

    current_cluster = []
    current_side = None

    for utt in utterances:
        side = utt.get("side")
        text = utt.get("text", "").strip()
        if not text:
            continue

        if current_side is None:
            current_side = side

        if side != current_side and len(current_cluster) >= MIN_CLUSTER_LEN:
            full_text = " ".join(u["text"] for u in current_cluster)
            phase_blocks.append(Document(
                page_content=full_text,
                metadata={"case_id": case_id, "side": current_side}
            ))
            current_cluster = []
            current_side = side

        current_cluster.append(utt)

    if len(current_cluster) >= MIN_CLUSTER_LEN:
        full_text = " ".join(u["text"] for u in current_cluster)
        phase_blocks.append(Document(
            page_content=full_text,
            metadata={"case_id": case_id, "side": current_side}
        ))

# Token-aware semantic chunking using LangChain
def semantically_chunk_sentences(text):
    return splitter.split_text(text)

# Final chunking pass
convos_chunks = []

for doc in phase_blocks:
    sem_chunks = semantically_chunk_sentences(doc.page_content)
    for i, chunk in enumerate(sem_chunks):
        convos_chunks.append(Document(
            page_content=chunk.strip(),
            metadata={**doc.metadata, "semantic_chunk_id": i}
        ))

# Report
print(f"\nTotal transcript chunks: {len(convos_chunks)}")
if convos_chunks:
    print("Sample transcript chunk metadata:", convos_chunks[0].metadata)
    print("Sample chunk text snippet:", convos_chunks[0], "...")
else:
    print("⚠️ No transcript chunks were generated.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:01<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Total transcript chunks: 517861
Sample transcript chunk metadata: {'case_id': '1955_71', 'side': 1, 'semantic_chunk_id': 0}
Sample chunk text snippet: page_content='number 71, lonnie affronti versus united states of america. mr. murphy. may it please the court. we are here by writ of certiorari to the eighth circuit. there is one question to be decided in this case, decided carefully. upon sentence to consecutive sentences or terms by a district court. the defending pattern started the service of a first sentence. thus, the district court thereafter have jurisdiction to suspend the execution of the remaining sentences and place the defendant on probation. consecutive sentences. consecutive sentences. in this case, the defendant, affronti, was indicted in 1932 by a grand jury in the western district of missouri. charged in an indictment in ten counts with the illegal sale of narcotics. i ' ve mentioned the dates because they, if not of importance, will be of interest. in 1944, the defe

In [52]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm import tqdm

# Merge chunked datasets
all_chunks = justia_chunks + convos_chunks  # Combine chunks
texts = [doc.page_content for doc in all_chunks]

# Load semantic model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed in batches
batch_size = 256
embeddings = []

for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
    batch = texts[i:i+batch_size]
    batch_embeddings = embedder.encode(batch)
    embeddings.extend(batch_embeddings)

# Convert to NumPy array
embeddings_np = np.array(embeddings)

# Build FAISS index
dimension = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)  # Euclidean distance
index.add(embeddings_np)

# Retain document references
indexed_docs = all_chunks

In [None]:
# Save FAISS index to disk
faiss.write_index(index, "justia_convos.index")
print("✅ FAISS index saved as justia_convos.index")