In [None]:
!pip install sentence-transformers pinecone tqdm

In [None]:
import json
import os

from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# --- Input data and target index -------------------------------------------
# JSONL file with one judgment chunk per line.
INPUT_FILE = "/content/drive/MyDrive/judgments_chunks.jsonl"
INDEX_NAME = "legal-landmark-cases"

# Only chunks whose judgment year appears in this list are embedded and uploaded.
# The order is kept as-is because the year-wise report is printed in this order.
LANDMARK_YEARS = [1973, 1978, 2018, 2024, 1992, 1994, 1997, 2014, 2017, 2023]

# --- Pinecone settings -----------------------------------------------------
# Never paste the key into the notebook: read it from the environment instead.
# Falls back to an empty string (the previous placeholder value) when unset.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
# NOTE(review): not referenced by the cells visible here (the index is created
# with a ServerlessSpec); kept in case other code still reads it.
PINECONE_ENV = "gcp-starter"

# Vector size of the index; must match the embedding model's output size.
DIMENSION = 384
# Number of vectors sent per upsert request.
BATCH_SIZE = 100

In [None]:
# Sentence-embedding model used to vectorise each chunk.
# Its output size is expected to equal DIMENSION from the config cell.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Pinecone client authenticated with the configured API key.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Names of the indexes that already exist in this project.
existing_index_names = {existing.name for existing in pc.list_indexes()}

# Create the serverless index only on first run; later runs reuse it.
if INDEX_NAME not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIMENSION,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by the following cells for stats and upserts.
index = pc.Index(INDEX_NAME)

In [None]:
# Re-open the index handle so this cell can be run on its own, then show its current stats.
index = pc.Index(INDEX_NAME)
index_stats = index.describe_index_stats()
print("Index stats:", index_stats)

In [None]:
def _parse_year(case):
    """Return the judgment year of `case` as an int, or None if it cannot be parsed.

    The year is read from the first four characters of `date_of_judgment`.
    Missing, null or non-numeric dates yield None instead of raising.
    """
    raw_date = case.get("date_of_judgment") or ""
    try:
        return int(str(raw_date)[:4])
    except ValueError:
        return None


def _build_vector(case, year, embedding):
    """Build the Pinecone upsert record for one chunk.

    Explicit JSON nulls are replaced by empty defaults (`dict.get(key, default)`
    only covers *missing* keys), since Pinecone metadata does not accept null values.
    """
    bench = case.get("bench") or []
    # A plain string would otherwise be joined character by character.
    if isinstance(bench, str):
        bench_text = bench
    else:
        bench_text = ", ".join(str(judge) for judge in bench)

    return {
        "id": case["chunk_id"],
        "values": embedding,
        "metadata": {
            "case_title": case.get("case_title") or "",
            "year": year,
            "citation": case.get("citation") or "",
            "bench": bench_text,
            "bench_strength": case.get("bench_strength") or 0,
            "article_references": case.get("article_references") or [],
            "source_pdf": case.get("source_pdf") or "",
        },
    }


def _upsert_batch(batch, counts):
    """Upsert `batch` into the index and credit `counts` per year.

    Counts are updated only after the upsert succeeded, so they always agree
    with the number of vectors actually uploaded. Returns the batch size.
    Any exception raised by the upsert propagates to the caller.
    """
    index.upsert(vectors=batch)
    for vector in batch:
        counts[vector["metadata"]["year"]] += 1
    return len(batch)


vectors = []            # pending batch, flushed every BATCH_SIZE records
total_uploaded = 0      # vectors successfully upserted so far
year_counts = {year: 0 for year in LANDMARK_YEARS}  # uploaded chunks per year
upload_failed = False
skipped_lines = 0       # lines that could not be used (bad JSON / missing fields)
landmark_years = set(LANDMARK_YEARS)  # O(1) membership test inside the loop

# First pass: count lines so the progress bar has a total.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    total_lines = sum(1 for _ in f)

print(f"Processing {total_lines} lines from JSONL...")

# Second pass: embed and upload the chunks from landmark years.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=total_lines, desc="Uploading chunks"):
        if not line.strip():
            continue

        # A single malformed line should not abort the whole upload.
        try:
            case = json.loads(line)
        except json.JSONDecodeError:
            skipped_lines += 1
            continue
        if not isinstance(case, dict):
            skipped_lines += 1
            continue

        year = _parse_year(case)
        if year is None:
            skipped_lines += 1
            continue
        if year not in landmark_years:
            continue

        # Both fields are required to build a vector; skip incomplete records.
        text = case.get("chunk_text")
        if case.get("chunk_id") is None or text is None:
            skipped_lines += 1
            continue

        embedding = model.encode(text).tolist()
        vectors.append(_build_vector(case, year, embedding))

        # Upload in batches
        if len(vectors) >= BATCH_SIZE:
            try:
                total_uploaded += _upsert_batch(vectors, year_counts)
                vectors = []
            except Exception as e:
                print(f"\n❌ Upload failed: {e}")
                print("Stopping further uploads (likely Pinecone limit reached).")
                upload_failed = True
                break

# Upload any remaining vectors (skipped if an earlier batch already failed)
if not upload_failed and vectors:
    try:
        total_uploaded += _upsert_batch(vectors, year_counts)
        vectors = []
    except Exception as e:
        print(f"\n❌ Final upload failed: {e}")
        upload_failed = True

# Report the outcome honestly: success and partial upload get different messages.
if upload_failed:
    print(f"\n⚠️ Upload stopped early. Total uploaded before error: {total_uploaded}")
else:
    print(f"\n✅ Upload complete! Total uploaded: {total_uploaded}")
if skipped_lines:
    print(f"Skipped {skipped_lines} malformed or incomplete lines.")

print("\nYear-wise counts:")
for y, count in year_counts.items():
    print(f"{y}: {count} chunks")