In [None]:
import zipfile
import pandas
import requests
import json
import shutil
import time
import io
import gzip
from pathlib import Path

In [None]:
# Download the ZIP archive for one reporter volume (e.g. cal-app-4th)
# and combine the JSON case files inside it into a single JSONL file per volume
def process_volume_zip(vol: int, reporter: str, out_root="/content",
                       delay: float = 2.0, timeout: float = 60.0):
    """Download one reporter volume ZIP and flatten its cases into a JSONL file.

    The archive is fetched from ``https://static.case.law/{reporter}/{vol}.zip``.
    Every ``json/*.json`` member is parsed and written as one line of
    ``<out_root>/<reporter>-<vol:02d>/cases.jsonl``.

    Args:
        vol: Volume number to download.
        reporter: Reporter slug used in the URL, e.g. ``"cal-app-4th"``.
        out_root: Directory under which the per-volume folder is created.
        delay: Seconds to sleep after the request, to avoid hammering the server.
        timeout: Seconds ``requests`` waits on the connection before giving up.

    Returns:
        Path of the per-volume output directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded payload is not a valid ZIP.
    """
    base_url = f"https://static.case.law/{reporter}/{vol}"
    out_dir = Path(out_root) / f"{reporter}-{vol:02d}"
    out_dir.mkdir(parents=True, exist_ok=True)

    zip_url = f"{base_url}.zip"
    print(f"Downloading ZIP from {zip_url}")
    # The archive is read fully into memory below, so stream=True bought nothing.
    # Without a timeout a stalled connection would hang the notebook forever.
    resp = requests.get(zip_url, timeout=timeout)
    # Pause between requests to be polite to the server.
    time.sleep(delay)
    resp.raise_for_status()

    jsonl_path = out_dir / "cases.jsonl"
    # Write to a temporary name first so a failure part-way through never
    # leaves a truncated cases.jsonl for later cells to pick up.
    tmp_path = out_dir / "cases.jsonl.tmp"
    count = 0
    try:
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            # Case files live in a json/ folder inside the archive.
            case_files = [
                name for name in zf.namelist()
                if name.startswith("json/") and name.endswith(".json")
            ]
            print(f" Extracting {len(case_files)} case JSON files...")

            # Stream each case straight to disk instead of holding the whole
            # volume in a list first.
            with open(tmp_path, "w", encoding="utf-8") as out:
                for name in case_files:
                    with zf.open(name) as case_file:
                        json.dump(json.load(case_file), out)
                    out.write("\n")
                    count += 1
        tmp_path.replace(jsonl_path)
    finally:
        # Only present if something failed before the rename.
        if tmp_path.exists():
            tmp_path.unlink()

    print(f"Wrote {count} cases to {jsonl_path}")
    return out_dir

In [None]:
# Download every volume of each reporter and convert it to a per-volume JSONL file.
reporter_volumes = {
    "cal-app-4th": 248,  # get these from looking at website
    "cal-app-5th": 11
}

# (reporter, volume) pairs that could not be downloaded or parsed.
failed_volumes = []

for reporter, max_vol in reporter_volumes.items():
    for vol in range(1, max_vol + 1):
        try:
            print(f"Processing volume {vol} of {reporter}")
            process_volume_zip(vol, reporter)
        except Exception as e:
            # Keep going: a single transient error must not silently skip
            # every remaining volume of this reporter.
            print(f"Error processing volume {vol} of {reporter}: {e}")
            failed_volumes.append((reporter, vol))

# Surface the gaps so an incomplete corpus is not merged unnoticed.
if failed_volumes:
    print(f"{len(failed_volumes)} volume(s) failed: {failed_volumes}")

In [None]:
# Put all jsonl files together into one giant jsonl file, in volume order
BASE_PATH = Path("/content")


def _volume_sort_key(path):
    """Return (reporter, volume number, dir name) for a <reporter>-<vol>/cases.jsonl path."""
    dir_name = path.parent.name
    reporter_part, _, vol_part = dir_name.rpartition("-")
    # Directory names are zero-padded to only two digits, so a plain string
    # sort would place volume 100 before volume 11; compare numerically.
    vol_number = int(vol_part) if vol_part.isdigit() else -1
    return (reporter_part, vol_number, dir_name)


jsonl_paths_4th = list(BASE_PATH.glob("cal-app-4th-*/cases.jsonl"))
jsonl_paths_5th = list(BASE_PATH.glob("cal-app-5th-*/cases.jsonl"))

jsonl_paths = sorted(jsonl_paths_4th + jsonl_paths_5th, key=_volume_sort_key)

print(f"Found {len(jsonl_paths)} JSONL files")


output_path = BASE_PATH / "cal-app-4th-5th-all.jsonl"

total = 0
with open(output_path, "w", encoding="utf-8") as out_file:
    for path in jsonl_paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                out_file.write(line)
                # Guard against a final line with no newline, which would
                # otherwise fuse with the first record of the next volume.
                if not line.endswith("\n"):
                    out_file.write("\n")
                total += 1

print(f"Wrote {total} combined cases to {output_path}")

In [None]:
# Compress the combined JSONL file into a .jsonl.gz next to it
input_path = Path("/content") / "cal-app-4th-5th-all.jsonl"
output_path = input_path.with_name(input_path.name + ".gz")

# Open the source first so a missing input fails before any output is created.
with open(input_path, "rb") as raw_in:
    with gzip.open(output_path, "wb") as gz_out:
        shutil.copyfileobj(raw_in, gz_out)