In [1]:
# ============================================================
# Build cpmc_small.csv
#
# Purpose:
# Create a lightweight CPMC lookup table used to:
#   - detect MDJ → CPMC case transfers
#   - define the END of the pretrial window for transferred cases
#
# Design:
#   - Only TWO columns are required:
#       1) originatingdocketnumber
#       2) offensedispositiondate
#
# Why:
#   - Full CPMC data is extremely large
#   - Loading all columns is unnecessary and expensive
#   - This file is reused across multiple downstream notebooks
#
# IMPORTANT:
#   Do NOT add extra columns unless downstream logic changes.
# ============================================================

import pandas as pd
import os

In [2]:
# ============================================================
# Load ONLY the columns needed from the raw CPMC dataset
#
# The docket number is an identifier, not a quantity, so it is
# read as text. Letting pandas infer the dtype can parse
# numeric-looking dockets as int/float, which drops leading zeros
# or appends ".0" once the column is converted back to strings.
# Missing cells are still read as NaN with dtype=str.
# ============================================================

CPMC_RAW_PATH = "csv data/CMU AOPC CP_MC Filings 2015-2018.csv"

use_cols = [
    "originatingdocketnumber",
    "offensedispositiondate",
]

cpmc = pd.read_csv(
    CPMC_RAW_PATH,
    usecols=use_cols,
    dtype={"originatingdocketnumber": str},
)

In [3]:
# ============================================================
# Normalize originating docket numbers
# - cast to string
# - strip whitespace
# - uppercase
# - keep missing / blank values as NaN
#
# Why the last step matters:
#   .astype(str) turns NaN into the literal text "nan" ("NAN" after
#   upper-casing). Without masking, a later dropna() on this column
#   removes nothing and every missing docket is grouped under one
#   bogus "NAN" key.
# ============================================================

docket_raw = cpmc["originatingdocketnumber"]
docket_norm = docket_raw.astype(str).str.strip().str.upper()

# .where() keeps the normalized text only where the original value was
# present and non-blank; everything else becomes NaN again.
cpmc["originatingdocketnumber"] = docket_norm.where(
    docket_raw.notna() & docket_norm.ne("")
)

In [4]:
# ============================================================
# Parse the disposition date column into pandas datetimes
# errors="coerce": values that cannot be parsed become NaT
# instead of raising
# ============================================================

cpmc = cpmc.assign(
    offensedispositiondate=pd.to_datetime(
        cpmc["offensedispositiondate"], errors="coerce"
    )
)

In [5]:
# ============================================================
# Keep only rows usable for pretrial window logic:
# both the docket number and the disposition date must be present
# ============================================================

cpmc = cpmc.loc[
    cpmc[["originatingdocketnumber", "offensedispositiondate"]]
    .notna()
    .all(axis=1)
]

In [6]:
# ============================================================
# Collapse to one row per originating docket
#
# A docket may appear on several CPMC rows; the LATEST
# disposition date is kept so the pretrial window covers
# the whole case
# ============================================================

cpmc_small = (
    cpmc
    .groupby("originatingdocketnumber", as_index=False)["offensedispositiondate"]
    .max()
)

In [7]:
# ============================================================
# Sanity checks
# One row per docket is expected, so the row count should equal
# the number of unique dockets
# ============================================================

print("Shape:", (len(cpmc_small.index), len(cpmc_small.columns)))
print("Columns:", list(cpmc_small.columns))
print("Unique dockets:", cpmc_small["originatingdocketnumber"].nunique(dropna=True))

cpmc_small.head(n=5)

Shape: (568163, 2)
Columns: ['originatingdocketnumber', 'offensedispositiondate']
Unique dockets: 568163


Unnamed: 0,originatingdocketnumber,offensedispositiondate
0,321010492291,2017-06-19 16:00:00
1,321020589481,1982-07-12 00:00:00
2,321040758388,2017-09-25 16:07:00
3,321060407492,2016-09-06 09:52:00
4,321150195292,2016-09-06 10:18:00


In [8]:
# Persist the lookup table for downstream notebooks.
# index=False keeps the file to the two data columns only.
cpmc_small.to_csv(path_or_buf="csv data/cpmc_small.csv", index=False)

print("Saved cpmc_small.csv")

Saved cpmc_small.csv
