# Combine district court cases with their superior court counterparts

In [1]:
import pandas as pd
import numpy as np
import datetime
import csv

In [2]:
charge_data = pd.read_excel('../data/steps/Step4Output_merged.xlsx')
charge_data.head()

Unnamed: 0,arresting_pd,case_id,case_no,charge_filing_date,charge_id,charge_number,disposition,disposition_date,dob,first_name,judge,last_name,offense_date,offense_description,sentence,sentence_conditions,sentence_date,sentence_terms,unique_identifier
0,MIDDLETOWN POLICE DEPARTMENT,7422794,21-2001-01905,2001-08-04,3909041,1,,NaT,1965-10-27,SCOTT,,DRISCOLL,2001-08-03,Larceny < 500 Person 65+\n,,,NaT,,4232652
1,TIVERTON POLICE DEPARTMENT,7423292,21-2001-02507,2001-09-25,3909791,1,Plea of Nolo Contendere,2001-10-18,1971-07-04,NED,"Pirraglia (Retired), Robert K.",FISHER,2001-09-24,Larceny < 500 Person 65+\n,Criminal Sentence,Probation 1Y Active: 10/18/2001; Court Cost...,2001-10-18,,4234254
2,Newport Police Department,7424561,21-2002-00802,2002-03-27,3911694,2,Dismissed 48A,2003-02-25,1983-04-28,JOSEPH,"Pirraglia (Retired), Robert K.",FERREIRA,2002-03-27,FAILURE TO APPEAR FOR SUMMONS,,,NaT,,4650281
3,Newport Police Department,7424561,21-2002-00802,2002-03-27,3911693,1,Dismissed 48A,2003-02-25,1983-04-28,JOSEPH,"Pirraglia (Retired), Robert K.",FERREIRA,2002-02-28,Larceny < 500 Person 65+\n,,,NaT,,4650281
4,Newport Police Department,7425369,21-2002-01767,2002-07-24,3913116,1,Plea of Nolo Contendere,2002-07-24,1976-03-25,DION,"Pirraglia (Retired), Robert K.",ARDREY,2002-07-11,Larceny < 500 Person 65+\n,Criminal Sentence,Credit for Time Served 10H Active: 07/24/20...,2002-07-24,,4651070


## Sub-step 1: Combine DC/Superior linkage files to create case mappings

In [3]:
dc_to_sup = {}
sup_to_dc = {}
dc_to_dc = {}
sup_to_sup = {}

In [4]:
def add_to_set_map(case_map, key, value):
    if key in case_map:
        case_map[key].add(value)
    else:
        case_map[key] = set()
        case_map[key].add(value)

In [5]:
with open('../data/inputs/dc_links.csv') as dc_links, open('../data/inputs/sup_links.csv') as sup_links:
    dc_reader = csv.reader(dc_links)
    sup_reader = csv.reader(sup_links)
    next(dc_reader)
    next(sup_reader)
    for line in dc_reader:
        if line[1] == line[3]:
            add_to_set_map(dc_to_dc, line[0], line[2])
            add_to_set_map(dc_to_dc, line[0], line[0]) # Always map an item to itself for dc to dc
        elif line[2] != 'NULL':
            add_to_set_map(dc_to_sup, line[0], line[2])
    for line in sup_reader:
        if line[1] == line[3]:
            add_to_set_map(sup_to_sup, line[0], line[2])
            add_to_set_map(sup_to_sup, line[0], line[0]) # Always map an item to itself for sup to sup
        elif line[2] != 'NULL':
            add_to_set_map(sup_to_dc, line[0], line[2])

## Sub-step 2: Get a list of all of our case numbers (and make a mapping of case number -> list of charges while we're at it)

In [6]:
def add_to_list_map(case_map, key, value):
    if key in case_map:
        case_map[key].append(value)
    else:
        case_map[key] = []
        case_map[key].append(value)

In [7]:
charge_map = {}
for row in charge_data.iterrows():
    case_number = row[1]['case_no']
    add_to_list_map(charge_map, case_number, row[1])

## Sub-step 3: Create unique versions of `dc_to_dc` and `sup_to_sup` that are specific to our cases, and can be merged easily

In [8]:
unique_sup_to_sup = []
unique_dc_to_dc = []

sup_handled = set()
dc_handled = set()
for key in charge_map:
    if key in sup_to_sup and not key in sup_handled:
        value = [v for v in sup_to_sup[key] if v in charge_map]
        if len(value) <= 1:
            continue
        unique_sup_to_sup.append(value)
        for v in value:
            sup_handled.add(v)
    if key in dc_to_dc and not key in dc_handled:
        value = [v for v in dc_to_dc[key] if v in charge_map]
        if len(value) <= 1:
            continue
        unique_dc_to_dc.append(value)
        for v in value:
            dc_handled.add(v)
print(len(unique_sup_to_sup))

19


#### These now contain lists of groups of case numbers that are related. Some are actually dc->sup mappings, so we'll handle those

In [9]:
def get_dc_to_sup(group):
    # Returns dc, sup if True
    if len(group) != 2:
        return None, None
    item1 = group[0]
    item2 = group[1]
    if item1[0].isalpha() and not item2[0].isalpha():
        return item2, item1
    if not item1[0].isalpha() and item2[0].isalpha():
        return item1, item2
    return None, None

In [10]:
for group in unique_sup_to_sup:
    dc, sup = get_dc_to_sup(group)
    if dc and sup:
        unique_sup_to_sup.remove(group)
        add_to_set_map(dc_to_sup, dc, sup)
print(len(unique_sup_to_sup))

14


#### Now we've just got weird case number mappings. So for these, pick the one with the higher charge date, and remove the other from our data mapping -- they should have the same data anyway

In [11]:
def get_max(group, field, full=False): # Update this to check vals
    max_field = None
    max_case_num = None
    
    sorted(
        group,
        key=lambda x:max(charge_map[x],key=lambda x:x[field])[field],
        reverse=True
    )
    if full:
        return group
    return group[0]

In [12]:
for group in unique_sup_to_sup:
    max_case_num = get_max(group, 'charge_filing_date')
    for ele in group:
        if ele != max_case_num:
            del charge_map[ele]
print(len(charge_map))

1379


#### Do the same for dc_to_dc

In [13]:
for group in unique_dc_to_dc:
    max_case_num = get_max(group, 'charge_filing_date')
    for ele in group:
        if ele != max_case_num:
            del charge_map[ele]
print(len(charge_map))

1378


## Sub-step 4: use `dc_to_sup` to combine dc and sup data

In [14]:
def check_linked_case_num(linked_case_number):
    linked_case_num = next(iter(linked_case_number))
    if linked_case_num in charge_map:
        return linked_case_num
    return None

In [15]:
def filter_multiple_linked_cases(current_case_number, linked_case_nums):
    if current_case_number[0].isalpha():        
        return get_max(
            [
                case_number for case_number in linked_case_nums
                if case_number in charge_map and not case_number[0].isalpha()
            ], 'charge_filing_date', full=True
        )
    else:
        return get_max(
            [
                case_number for case_number in linked_case_nums
                if case_number in charge_map and case_number[0].isalpha()
            ], 'charge_filing_date', full=True
        )

In [16]:
approved_cases = {}
ignore = set()
deleted = 0
ignoring = 0
case_nums = set()
for row in charge_data.iterrows():
    case_number = row[1]['case_no']
    if case_number not in charge_map:
        deleted +=1
        continue
    if case_number in ignore:
        ignoring += 1
        continue
    case_nums.add(case_number)
    
    linked_case_num = None
    if case_number in dc_to_sup or case_number in sup_to_dc:
        if case_number[0].isalpha():
            linked_case_numbers = sup_to_dc[case_number]
        else:
            linked_case_numbers = dc_to_sup[case_number]

        if len(linked_case_numbers) == 1:
            linked_case_num = check_linked_case_num(linked_case_numbers)
        else:
            ordered_linked_case_nums = filter_multiple_linked_cases(case_number, linked_case_numbers)
            if len(ordered_linked_case_nums):
                linked_case_num = ordered_linked_case_nums[0]
            else:
                linked_case_num = None
            for k in ordered_linked_case_nums[1:]:
                ignore.add(k)

    if linked_case_num is None:
        approved_cases[case_number] = None
    else:
        ordered = get_max([case_number, linked_case_num], 'charge_filing_date', full=True)
        ignore.add(ordered[1])
        if ordered[0] == case_number:
            approved_cases[case_number] = ordered[1]
print(deleted, ignoring)

122 795


In [17]:
to_write = []
for row in charge_data.iterrows():
    charge_record = row[1]
    case_number = charge_record['case_no']

    if case_number in approved_cases:
        if case_number[0].isalpha():
            charge_record['sup_case'] = case_number
            charge_record['dc_case'] = approved_cases[case_number]
        else:
            charge_record['sup_case'] = approved_cases[case_number]
            charge_record['dc_case'] = case_number
        del row[1]['case_no']
        to_write.append(charge_record)

In [18]:
print(len(to_write))

2411


## Sub-step 5: Write data to `steps` folder