In [1]:
import pandas as pd
import numpy as np

### PSP & CPMC Datasets

1. As in the PSP and MDJ data merge, we dropped the PSP columns with over 90% missing values to reduce the file size so that the data could be uploaded in full.

These columns included: ['death_yeardeath_month', 'death_day', 'sor_status', 'release_year', 'release_month', 'release_day', 'inchoate_charge', 'final_charge', 'consec_charge', 'disp3', 'disp4', 'disp5', 'disp6', 'disp7', 'disp8', 'disp9', 'disp10', 'min_sent_year', 'max_sent_year']

2. We also dropped the CPMC columns with over 90% missing values.

These columns included: ['defendantcounty', 'fineadjustment', 'restitutionadjustment', 'rrristatus', 'sentencestartdate', 'confinementlocation']

3. After finding the OTN and ID pairs common to both datasets, we sorted the pairs by OTN and then by ID and split them into 5 "chunks" for faster processing in later steps. For each chunk, we then did an inner join of the two datasets on OTN and ID.

In [2]:
# Load the raw PSP extract.
# The original run emitted a warning on this line (captured output is truncated;
# presumably pandas' DtypeWarning about mixed-type columns -- confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings.
psp = pd.read_csv('CMU PSP Data.csv', low_memory=False)

  psp = pd.read_csv('CMU PSP Data.csv')


In [3]:
# Load the raw CP/MC filings extract.
# The original run emitted a warning on this line (captured output is truncated;
# presumably pandas' DtypeWarning about mixed-type columns -- confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of
# numbers and strings.
cpmc = pd.read_csv('CMU AOPC CP_MC Filings 2015-2018.csv', low_memory=False)

  cpmc = pd.read_csv('CMU AOPC CP_MC Filings 2015-2018.csv')


In [4]:
# Share of missing (NaN) entries in every CPMC column, expressed in percent.
missing_percent = cpmc.isna().mean().mul(100)

# Names of the CPMC columns that are more than 90% missing.
cols_over_90_missing = [col for col, pct in missing_percent.items() if pct > 90]
print("Columns with >90% missing values:", cols_over_90_missing)

Columns with >90% missing values: ['defendantcounty', 'fineadjustment', 'restitutionadjustment', 'rrristatus', 'sentencestartdate', 'confinementlocation']


In [5]:
# Remove the mostly-empty (>90% missing) columns from both datasets.
# The raw frames `psp` and `cpmc` are left untouched; the trimmed copies are
# what the later filter/merge cells use.
# NOTE(review): 'death_yeardeath_month' looks like two column names fused
# together ('death_year', 'death_month'), but the recorded run of this cell
# completed, so confirm against the PSP header before changing it.
psp_sparse_cols = [
    'death_yeardeath_month', 'death_day', 'sor_status', 'release_year',
    'release_month', 'release_day', 'inchoate_charge', 'final_charge',
    'consec_charge', 'disp3', 'disp4', 'disp5', 'disp6', 'disp7', 'disp8',
    'disp9', 'disp10', 'min_sent_year', 'max_sent_year',
]
cpmc_sparse_cols = [
    'defendantcounty', 'fineadjustment', 'restitutionadjustment',
    'rrristatus', 'sentencestartdate', 'confinementlocation',
]

psp_dropped = psp.drop(columns=psp_sparse_cols)
cpmc_dropped = cpmc.drop(columns=cpmc_sparse_cols)

# Show (rows, columns) of each trimmed frame, PSP first.
for trimmed in (psp_dropped, cpmc_dropped):
    print(trimmed.shape)

(12753994, 41)
(4167282, 51)


In [9]:
# Distinct (otn, id) key pairs present in each raw dataset, as sets of tuples.
cpmc_pairs = set(zip(cpmc['otn'], cpmc['id']))
psp_pairs = set(zip(psp['otn'], psp['id']))

# Key pairs that occur in both datasets.
common_pairs = cpmc_pairs.intersection(psp_pairs)

In [13]:
# Turn the shared (otn, id) pairs into a DataFrame sorted by otn, then id,
# with a fresh 0..n-1 index.
# The frame is built from `cpmc_pairs & psp_pairs` rather than from
# `common_pairs` itself: the original code rebound `common_pairs` from a set to
# a DataFrame, so running the cell a second time fed the DataFrame back into
# list(...) and failed. Built this way the cell can be re-run safely.
common_pairs = (
    pd.DataFrame(list(cpmc_pairs & psp_pairs), columns=['otn', 'id'])
    .sort_values(['otn', 'id'])
    .reset_index(drop=True)
)

In [14]:
# Split the sorted key pairs into contiguous, nearly equal chunks.
# The split is computed on row positions and applied with .iloc instead of
# calling np.array_split on the DataFrame directly, which raises a
# FutureWarning in recent pandas (it relies on the deprecated
# DataFrame.swapaxes). Chunk sizes and contents are the same as before.
N_CHUNKS = 5
row_positions = np.array_split(np.arange(len(common_pairs)), N_CHUNKS)
chunks = [common_pairs.iloc[positions] for positions in row_positions]

# Report the first and last key pair and the size of every chunk.
for i, chunk in enumerate(chunks, 1):
    if chunk.empty:
        # Only possible when there are fewer pairs than chunks.
        print(f"Chunk {i}: empty, size 0")
        continue
    start = chunk.iloc[0].to_dict()
    end = chunk.iloc[-1].to_dict()
    print(f"Chunk {i}: start {start}, end {end}, size {len(chunk)}")

  return bound(*args, **kwds)


Chunk 1: start {'otn': 'B1760603', 'id': 99812093}, end {'otn': 'N9625416', 'id': 99786818}, size 106904
Chunk 2: start {'otn': 'N9625442', 'id': 99605301}, end {'otn': 'T8070731', 'id': 99631961}, size 106903
Chunk 3: start {'otn': 'T8070786', 'id': 99880670}, end {'otn': 'U0701540', 'id': 99582544}, size 106903
Chunk 4: start {'otn': 'U0701551', 'id': 99633727}, end {'otn': 'X0182265', 'id': 99837421}, size 106903
Chunk 5: start {'otn': 'X0182280', 'id': 99755480}, end {'otn': 'Z1763370', 'id': 99904288}, size 106903


In [16]:
# Chunk 2 of the shared key pairs (chunks is 0-based, hence index 1).
subset_chunk = chunks[1]

# MultiIndex of this chunk's (otn, id) keys, built once and used for both datasets.
key_cols = ['otn', 'id']
chunk_keys = subset_chunk.set_index(key_cols).index

# Keep only the rows whose (otn, id) key belongs to the chunk, then turn the
# key levels back into ordinary columns.
cpmc_filtered = cpmc_dropped.set_index(key_cols).loc[chunk_keys].reset_index()
psp_filtered = psp_dropped.set_index(key_cols).loc[chunk_keys].reset_index()

print(cpmc_filtered.shape)
print(psp_filtered.shape)

(809728, 51)
(623069, 41)


In [17]:
# Inner-join the chunk-2 CPMC rows with the chunk-2 PSP rows on (otn, id).
# NOTE(review): a key can appear on several rows in both inputs, so every
# CPMC row is paired with every PSP row sharing its key and the result is much
# larger than either input -- confirm that this many-to-many pairing is intended.
cpmc_1 = pd.merge(cpmc_filtered, psp_filtered, how='inner', on=['otn','id'])
print(cpmc_1.shape)

(7693731, 90)


In [18]:
# Preview the first five rows of the merged chunk.
cpmc_1.head(n=5)

Unnamed: 0,otn,id,docketnumber,originatingdocketnumber,citationcomplaintnumber,citytownboro,countyofoffense,casestatus,filingdate,offensedate,...,conv_flag,offense_year,offense_month,offense_day,offense_date,disp_date,ofn_title,ofn_section,ofn_subsection,citation
0,N9625442,99605301,MC-51-CR-0014291-2015,MC-51-CR-0014291-2015,1515043307-0014291,Philadelphia City,Philadelphia,Closed,2015-05-10 12:17:00,2015-05-09,...,N,2015.0,5.0,9.0,09may2015,31jul2015,35.0,780-113,(a)(31),35-780-113 (a)(31)
1,N9625442,99605301,MC-51-CR-0014291-2015,MC-51-CR-0014291-2015,1515043307-0014291,Philadelphia City,Philadelphia,Closed,2015-05-10 12:17:00,2015-05-09,...,,2015.0,5.0,10.0,10may2015,,35.0,780-113,(a)(16),35-780-113 (a)(16)
2,N9625442,99605301,MC-51-CR-0014291-2015,MC-51-CR-0014291-2015,1515043307-0014291,Philadelphia City,Philadelphia,Closed,2015-05-10 12:17:00,2015-05-09,...,,2015.0,5.0,10.0,10may2015,,35.0,780-113,(a)(31),35-780-113 (a)(31)
3,N9625442,99605301,MC-51-CR-0014291-2015,MC-51-CR-0014291-2015,1515043307-0014291,Philadelphia City,Philadelphia,Closed,2015-05-10 12:17:00,2015-05-09,...,,2015.0,5.0,10.0,10may2015,,35.0,780-113,(a)(19),35-780-113 (a)(19)
4,N9625442,99605301,MC-51-CR-0014291-2015,MC-51-CR-0014291-2015,1515043307-0014291,Philadelphia City,Philadelphia,Closed,2015-05-10 12:17:00,2015-05-09,...,N,2015.0,5.0,9.0,09may2015,31jul2015,35.0,780-113,(a)(31),35-780-113 (a)(31)


In [19]:
# Save the merged chunk to disk; index=False keeps the row index out of the file.
merged_path = 'cpmc_1.csv'
cpmc_1.to_csv(merged_path, index=False)

In [22]:
# Chunk 3 of the shared key pairs (chunks[2]); the merged result is named cpmc_2.
subset_chunk2 = chunks[2]

# MultiIndex of this chunk's (otn, id) keys, built once and used for both datasets.
key_cols = ['otn', 'id']
chunk_keys = subset_chunk2.set_index(key_cols).index

# Keep only the rows whose key belongs to the chunk; keys return as columns.
cpmc_filtered2 = cpmc_dropped.set_index(key_cols).loc[chunk_keys].reset_index()
psp_filtered2 = psp_dropped.set_index(key_cols).loc[chunk_keys].reset_index()

# Inner join of the two filtered frames on (otn, id).
cpmc_2 = pd.merge(cpmc_filtered2, psp_filtered2, how='inner', on=['otn','id'])
print(cpmc_2.shape)

(7002379, 90)


In [23]:
# Save the merged chunk to disk; index=False keeps the row index out of the file.
merged_path = 'cpmc_2.csv'
cpmc_2.to_csv(merged_path, index=False)

In [24]:
# Chunk 4 of the shared key pairs (chunks[3]); the merged result is named cpmc_3.
subset_chunk3 = chunks[3]

# MultiIndex of this chunk's (otn, id) keys, built once and used for both datasets.
key_cols = ['otn', 'id']
chunk_keys = subset_chunk3.set_index(key_cols).index

# Keep only the rows whose key belongs to the chunk; keys return as columns.
cpmc_filtered3 = cpmc_dropped.set_index(key_cols).loc[chunk_keys].reset_index()
psp_filtered3 = psp_dropped.set_index(key_cols).loc[chunk_keys].reset_index()

# Inner join of the two filtered frames on (otn, id).
cpmc_3 = pd.merge(cpmc_filtered3, psp_filtered3, how='inner', on=['otn','id'])
print(cpmc_3.shape)

(6817378, 90)


In [25]:
# Save the merged chunk to disk; index=False keeps the row index out of the file.
merged_path = 'cpmc_3.csv'
cpmc_3.to_csv(merged_path, index=False)

In [26]:
# Chunk 5 of the shared key pairs (chunks[4]); the merged result is named cpmc_4.
subset_chunk4 = chunks[4]

# MultiIndex of this chunk's (otn, id) keys, built once and used for both datasets.
key_cols = ['otn', 'id']
chunk_keys = subset_chunk4.set_index(key_cols).index

# Keep only the rows whose key belongs to the chunk; keys return as columns.
cpmc_filtered4 = cpmc_dropped.set_index(key_cols).loc[chunk_keys].reset_index()
psp_filtered4 = psp_dropped.set_index(key_cols).loc[chunk_keys].reset_index()

# Inner join of the two filtered frames on (otn, id).
cpmc_4 = pd.merge(cpmc_filtered4, psp_filtered4, how='inner', on=['otn','id'])
print(cpmc_4.shape)

(5865874, 90)


In [27]:
# Save the merged chunk to disk; index=False keeps the row index out of the file.
merged_path = 'cpmc_4.csv'
cpmc_4.to_csv(merged_path, index=False)

In [29]:
# Chunk 1 of the shared key pairs (chunks[0]).
# Note the naming: this first chunk is processed last, so its variables carry
# the suffix 5 and the merged result is named cpmc_5.
subset_chunk5 = chunks[0]

# MultiIndex of this chunk's (otn, id) keys, built once and used for both datasets.
key_cols = ['otn', 'id']
chunk_keys = subset_chunk5.set_index(key_cols).index

# Keep only the rows whose key belongs to the chunk; keys return as columns.
cpmc_filtered5 = cpmc_dropped.set_index(key_cols).loc[chunk_keys].reset_index()
psp_filtered5 = psp_dropped.set_index(key_cols).loc[chunk_keys].reset_index()

# Inner join of the two filtered frames on (otn, id).
cpmc_5 = pd.merge(cpmc_filtered5, psp_filtered5, how='inner', on=['otn','id'])
print(cpmc_5.shape)

(6155856, 90)


In [32]:
# Save the merged chunk to disk; index=False keeps the row index out of the file.
merged_path = 'cpmc_5.csv'
cpmc_5.to_csv(merged_path, index=False)