Data Extraction

In [1]:
# import packages
import pandas as pd
import numpy as np
import os, sys 
import re

sys.path.append(os.getcwd())

In [2]:
# import datasets

# load state dataset: Arkansas
file = "../../orig/AR_Washington_Charges.csv"
dfArkansas = pd.read_csv(os.path.join(os.path.dirname('__file__'),file),index_col=0)

# load state dataset: Louisiana
file = "../../orig/LA_East_Baton_Rouge_Charges.csv"
dfLouisiana = pd.read_csv(os.path.join(os.path.dirname('__file__'),file),index_col=0)

# load state dataset: Michigan
file = "../../orig/MI_Wayne_Charges.csv"
dfMichigan = pd.read_csv(os.path.join(os.path.dirname('__file__'),file),index_col=0)

# load state dataset: New York
file = "../../orig/2_4_data_NYC.xlsx"
dfNY = pd.read_excel(os.path.join(os.path.dirname('__file__'),file),index_col=0)

In [3]:
# general state level statistics

# 1. Arkansas
uniqueRaceAR = dfArkansas.groupby(['Race'])['Race'].count()
uniqueSexAR = dfArkansas.groupby(['Sex'])['Sex'].count()
print('ARKANSAS')
print(f'Total: {sum(uniqueRaceAR)}: {uniqueRaceAR}')
print(f'Total: {sum(uniqueSexAR)}: {uniqueSexAR}')
print('',sep='\n\n')

# 2. Louisiana
uniqueRaceLA = dfLouisiana.groupby(['Race'])['Race'].count()
uniqueSexLA = dfLouisiana.groupby(['Sex'])['Sex'].count()
print('LOUISIANA')
print(f'Total: {sum(uniqueRaceLA)}: {uniqueRaceLA}')
print(f'Total: {sum(uniqueSexLA)}: {uniqueSexLA}')
print('',sep='\n\n')

# 3. Michigan
print('MICHIGAN: N/A')
print('',sep='\n\n')

# 4. New York
uniqueRaceNY = dfNY.groupby(['race'])['race'].count()
uniqueSexNY = dfNY.groupby(['gender'])['gender'].count()
print('NEW YORK')
print(f'Total: {sum(uniqueRaceNY)}: {uniqueRaceNY}')
print(f'Total: {sum(uniqueSexNY)}: {uniqueSexNY}')
print('',sep='\n\n')

ARKANSAS
Total: 49960: Race
Asian         284
Black        8764
Hispanic     5752
Indian        183
None          691
White       34055
p             231
Name: Race, dtype: int64
Total: 49960: Sex
F        8467
M       41441
None       52
Name: Sex, dtype: int64

LOUISIANA
Total: 280240: Race
Asian          290
Black       222211
Hispanic      5636
None          2285
White        49818
Name: Race, dtype: int64
Total: 280240: Sex
F        38098
M       242067
None        75
Name: Sex, dtype: int64

MICHIGAN: N/A

NEW YORK
Total: 5182: race
Asian        83
Black      3068
Indian       12
Other      1426
Unknown      13
White       580
Name: race, dtype: int64
Total: 5182: gender
Female     299
Male      4883
Name: gender, dtype: int64



In [4]:
dfArkansas.groupby(['Race','Sex'])['Age'].count()
dfLouisiana.groupby(['Race','Sex'])['Sex'].count()
dfNY.groupby(['race','gender'])['race'].count()

race     gender
Asian    Female       9
         Male        74
Black    Female     161
         Male      2907
Indian   Female       2
         Male        10
Other    Female      67
         Male      1359
Unknown  Female       1
         Male        12
White    Female      59
         Male       521
Name: race, dtype: int64

In [5]:
def swap_columns(df, c1, c2):
    
    df['temp'] = df.iloc[:, c1]
    df.iloc[:, c1] = df.iloc[:, c2]
    df.iloc[:, c2] = df['temp']
    df.drop(columns=['temp'], inplace=True)

In [6]:
# preprocess each state before mapping by BJS codes


from datetime import *

# 1. Arkansas
temp = dfArkansas.copy()

# no duplicates across all columns
print("Arkansas")
print('Before dropping duplicates:',len(temp))
temp = temp.drop_duplicates(keep='first', inplace=False)
print('After dropping duplicates:',len(temp))

# remove unncessary columns for computation time
temp.drop(['Address','Release_Time','Inmate_ID','Eyes','Hair','Height',\
           'Weight','Booking_Time','Arresting_Agency','Charge_End',\
           'Charge_Start','Court','Date','Department','Time'], \
          axis=1, inplace=True)
# append time spent in jail column
durationJailAR = []
for book, release in zip(list(temp['Booking_Date']), list(temp['Release_Date'])):
    book_tokens = book.split('/')
    book_yy = int(book_tokens[2])
    book_mm = int(book_tokens[0])
    book_dd = int(book_tokens[1])
    bookdate = date(book_yy, book_mm, book_dd)
    
    release_tokens = release.split('/')
    if release_tokens[0] == 'None':
        duration = -1
    else:
        release_yy = int(release_tokens[2])
        release_mm = int(release_tokens[0])
        release_dd = int(release_tokens[1])
        releasedate = date(release_yy, release_mm, release_dd)
        duration = int((releasedate - bookdate).days)
    durationJailAR.append(duration)
temp.drop(['Booking_Date','Release_Date'], axis=1, inplace=True)
temp['Duration_Jail'] = durationJailAR

temp.reset_index(drop=True,inplace=True)
temp['Bond'] = list(temp['Bond'].map(lambda x: float(str(x).lstrip('$').rstrip(' ').replace(',',''))))

# swap columns for reaability
swap_columns(temp,3,4)
temp.columns = temp.columns.str.replace('Bond','tmp')
temp.columns = temp.columns.str.replace('Charge','Bond')
temp.columns = temp.columns.str.replace('tmp','Charge')

# substitute back the filtered dataframe
dfArkansasCleaned = temp.copy()
dfArkansasCleaned

Arkansas
Before dropping duplicates: 49960
After dropping duplicates: 8480


Unnamed: 0,Age,Race,Sex,Charge,Bond,Duration_Jail
0,21,Black,F,FAILURE TO APPEAR,50000.0,-1
1,21,Black,F,RULE 8.1 HEARING,0.0,-1
2,21,Black,F,HOLD FOR OTHER DEPT,0.0,-1
3,21,Black,F,FAILURE TO APPEAR,0.0,-1
4,21,Black,F,RULE 8.1 HEARING,0.0,-1
...,...,...,...,...,...,...
8475,31,Hispanic,F,ASSAULT ON FAMILY MEMBER 3RD,0.0,-1
8476,31,Hispanic,F,ENDANG. WELFARE OF A MINOR 3RD,0.0,-1
8477,31,Hispanic,F,RULE 8.1 HEARING,0.0,-1
8478,37,Black,M,INTERFERENCE W CUSTODY,0.0,-1


In [7]:
# 2. Louisiana

temp = dfLouisiana.copy()

# no duplicates across all columns
print("Louisiana")
print('Before dropping duplicates:',len(temp))
temp = temp.drop_duplicates(keep='first', inplace=False)
print('After dropping duplicates:',len(temp))

temp.drop(['Jail_ID','Bond_Date','Court', 'File_Number', 'Name','Statute'], axis=1, inplace=True)
durationJailLA = []
for book, release, status in zip(list(temp['Booking_Date']), list(temp['Release_Date']), \
                                 list(temp['Status'])):
    book_tokens = book.split('-')
    release_tokens = release.split('-')
    if book_tokens[0] == 'None' or release_tokens[0] == 'None':
        duration = -1;  
    else:
        book_yy = int(book_tokens[0])
        book_mm = int(book_tokens[1])
        book_dd = int(book_tokens[2])
        bookdate = date(book_yy, book_mm, book_dd)
        
        release_yy = int(release_tokens[0])
        release_mm = int(release_tokens[1])
        release_dd = int(release_tokens[2])
        releasedate = date(release_yy, release_mm, release_dd)
        duration = int((releasedate - bookdate).days)
        
    durationJailLA.append(duration)

temp.drop(['Booking_Date','Release_Date','Status'], axis=1, inplace=True)
temp['Duration_Jail'] = durationJailLA

# swap columns for reaability
swap_columns(temp,2,3)
temp.columns = temp.columns.str.replace('Bond_Amount','tmp')
temp.columns = temp.columns.str.replace('Charge','Bond')
temp.columns = temp.columns.str.replace('tmp','Charge')

# substitute back the filtered dataframe
dfLouisianaCleaned = temp.copy()
dfLouisianaCleaned

Louisiana
Before dropping duplicates: 280240
After dropping duplicates: 50528


Unnamed: 0,Sex,Race,Charge,Bond,Duration_Jail
0,M,Black,POSS SCH 1 DRUGS,20000.0,1
1,M,Black,DIST/MANF SCH1 DRUG,0.0,-1
2,M,Black,CRUELTY TO ANIMALS,0.0,16
3,F,Black,CRUELTY TO JUVENILES,2500.0,-1
4,F,Black,WARRANT/BENCH WARRAN,0.0,-1
...,...,...,...,...,...
278852,M,Black,ISS WORTHLESS CHECKS,0.0,-1
278853,M,Black,BATT/SIMPLE/CC,0.0,-1
278970,M,Black,POSS SCH 2 DRUGS,96000.0,-1
279356,M,White,MONETARY INSTRUMENT,15000.0,-1


In [8]:
# 3. Michigan
# For Michigan, we process this differently than others because it does not have 
# demographic info or time already spent in jail info
temp = dfMichigan.copy()

# no duplicates across all columns
print("Michigan")
print('Before dropping duplicates:',len(temp))
temp = temp.drop_duplicates(keep='first', inplace=False)
print('After dropping duplicates:',len(temp))

# Michigan has booking date, offense date, sentence date, but not release date
# however, it has sentence length, which is expected time that will be spent in jail (meaningful metric)
temp.drop(['Booking_Date','Booking_Time','County','Court_Date',\
           'DOB','Disposition', 'Docket_Number','Location','Offense_Date',\
           'Offense_Time','Sentence_Date'], axis=1, inplace=True)
temp.reset_index(inplace=True)

# mark NaN sentence lengths as 01
temp['Sentence_Length'].replace(np.nan, -1, inplace=True)

# expected Duration_Jail
print('processing sentence times...')
sentence_times = []
for index,row in enumerate(temp.iterrows()):
    # sentence time
    sentence_time = 0
    sentence_entry = row[1].Sentence_Length
    
    if sentence_entry == -1:
        sentence_time = 0
    else:
        sentence_entry_tokens = sentence_entry.split()
        if sentence_entry_tokens[1] == 'day' or sentence_entry_tokens[1] == 'days':
            sentence_time = int(sentence_entry_tokens[0])
        elif sentence_entry_tokens[1] == 'month' or sentence_entry_tokens[1] == 'months':
            sentence_time = (30 * int(sentence_entry_tokens[0]))
            if len(sentence_entry_tokens) > 2:
                if sentence_entry_tokens[3] == 'day' or sentence_entry_tokens[3] == 'days':
                    sentence_time += int(sentence_entry_tokens[2])
        elif sentence_entry_tokens[1] == 'year' or sentence_entry_tokens[1] == 'years':
            sentence_time = (365 * int(sentence_entry_tokens[0]))
    sentence_times.append(sentence_time)

# insert expected sentence lengths
temp['Duration_JailExp'] = sentence_times
temp.drop(['Sentence_Length'],axis=1,inplace=True)

# Bond amounts
print('processing bond amounts...')
# mark empty or NaN bond entries as -1
temp['Bond'].replace(np.nan, -1, inplace=True)
temp['Bond'].replace('', -1, inplace=True)
previous_person = ''
personIdx = -1
found_duplicate_person = False
bond_amt_list = []
keyword = 'Bond_Amount'
for index, row in temp.iterrows():    
    if row.Bond == -1: # NaN or empty Bond information
        bond_amt_list.append(-1) # -1
    elif row.Bond == 'None' or len(row.Bond) == 0:
        bond_amt_list.append(-1) # 0
    else:
        # strip bond info's into chunks
        bond_info = [item+'}' if item[0] == '}' else item for item in row.Bond.strip('][').split('}, ')]
        # extract all bond amount values for that person
        bond_amounts = []
        for bond in bond_info:
            # if 'None' or empty, set value to 0
            if bond == 'None' or len(bond) == 0:
                val = -1
            else:
                startidx = bond.index(keyword)
                val = bond[startidx + len(keyword) + 3:-1].strip("}'")
            bond_amounts.append(val)
        
        # keep person count
        if previous_person != row.Name:
            personIdx = 0   # start new count if new person
        elif previous_person == row.Name:
            personIdx += 1  # increment if same person
#         print('personIdx:',personIdx, 'length of bond info:',len(bond_info))
        if len(bond_info) < (personIdx+1):
            # add $0 if insufficient information is provided
            bond_amt_list.append(-1)
        else:
            # otherwise, append the bond amount for the current case index
            bond_amt_list.append(bond_amounts[personIdx])
#         print('added bond value:',bond_amt_list[-1])
    previous_person = row.Name

# insert bond amounts into each case (charge)
temp['Bond_'] = bond_amt_list
temp['Bond_'] = pd.to_numeric(temp['Bond_'], errors='coerce')
temp.drop(['Bond'],axis=1,inplace=True)
temp['Bond_'].replace(-1, np.nan, inplace=True)
temp.dropna(subset=['Bond_'], inplace=True)

# swap columns for reaability
swap_columns(temp,3,4)
temp.columns = temp.columns.str.replace('Bond_','tmp')
temp.columns = temp.columns.str.replace('Duration_JailExp','Bond')
temp.columns = temp.columns.str.replace('tmp','Duration_JailExp')

dfMichiganCleaned = temp.copy()
dfMichiganCleaned


Michigan
Before dropping duplicates: 198973
After dropping duplicates: 44396
processing sentence times...
processing bond amounts...


Unnamed: 0,Name,Charge,Charge_Classification,Bond,Duration_JailExp
0,MOHAMED-ALI NABIL ABDALLAH,ASSAULT OR ASSAULT AND BATTERY,Misdemeanor,500.0,0
1,MOHAMED-ALI NABIL ABDALLAH,DOMESTIC VIOLENCE,Misdemeanor,500.0,0
3,THOMAS PATRICK ABRAHAM,OPERATING - OUIL/PER SE/OWI - 3RD OFFENSE NOTICE,Felony,250.0,0
4,ALEXANDER ACEVAL,CONTROLLED SUBSTANCE-DELIVERY/MANUFACTURE (SCH...,Felony,750000.0,0
5,ALEXANDER ACEVAL,CONTROLLED SUBSTANCE-DEL/MFG-450-999 GRAMS,Felony,500000.0,0
...,...,...,...,...,...
44391,AVERY THOMPSON,Assault with a Dangerous Weapon (Felonious Ass...,Felony,1500.0,0
44392,JUSTIN WESLEY,CONTROLLED SUBSTANCE-DEL/MFG LESS THAN 50 GRAMS,Felony,300.0,0
44393,JUSTIN WESLEY,ORDINANCE VIOLATION,Local Ordinance,300.0,0
44394,JUSTIN WESLEY,ORDINANCE VIOLATION,Local Ordinance,5000.0,0


In [9]:
# New York
temp = dfNY.copy()

# no duplicates across all columns
print("New York")
print('Before dropping duplicates:',len(temp))
temp = temp.drop_duplicates(keep='first', inplace=False)
print('After dropping duplicates:',len(temp))

# remove rows without bond information
replace_values = {'released' : np.nan, 'remanded' : np.nan, 'sentenced':np.nan, 'unknown' : np.nan}  
temp['bond_info'].replace(replace_values, inplace=True)
temp.dropna(subset=['bond_info'], inplace=True)
orig_removed_str_bond_total_count = len(list(temp.bond_info))
print("Now: {} cases without non-numeric bond information".format(orig_removed_str_bond_total_count))

# remove $ bonds (those <= 10 dollars)
replace_values = {0 : np.nan, 1 : np.nan, 2 : np.nan, 3: np.nan, 4: np.nan, 5: np.nan, \
                  6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan, 10: np.nan}   
temp['bond_info'].replace(replace_values, inplace=True)
temp.dropna(subset=['bond_info'], inplace=True)
orig_removed_zero_bond_total_count = len(list(temp.bond_info))
print("Now: {} cases without placeholder bond information".format(orig_removed_zero_bond_total_count))

# sort by NYSID
temp.sort_values(by=['nysid'], ascending=True, na_position='first', inplace=True)
# convert bond info into float
bond_amts = [float(item) for item in list(temp.bond_info)]
temp['bond_info'] = bond_amts

temp.reset_index(inplace=True)
temp.drop(['nysid','warrants','arrest_date','next_court_date','housing_facility','docket_numbers','is_new','Unnamed: 11'],\
          axis = 1, inplace=True)

# extract only first state statute
statute_list = []
for iter, row in temp.iterrows():
    charges = row.charges
    statute = [int(re.sub("[^0-9]","",charges[0:7].replace('.','').replace('-',''))) \
                     if charges[0:7][0].isdigit() else 0]
    statute_list.append(statute[0])
temp['first_statute'] = statute_list

dfNYCleaned = temp.copy()


New York
Before dropping duplicates: 5393
After dropping duplicates: 4996
Now: 1906 cases without non-numeric bond information
Now: 1548 cases without placeholder bond information


In [10]:
# save to csv
dfArkansasCleaned.to_csv('./cleaned/arkansas.csv', index=False)
dfLouisianaCleaned.to_csv('./cleaned/louisiana.csv', index=False)
dfMichiganCleaned.to_csv('./cleaned/michigan.csv', index=False)
dfNYCleaned.to_csv('./cleaned/newyork.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: './cleaned/arkansas.csv'

In [11]:
dfArkansasCleaned

Unnamed: 0,Age,Race,Sex,Charge,Bond,Duration_Jail
0,21,Black,F,FAILURE TO APPEAR,50000.0,-1
1,21,Black,F,RULE 8.1 HEARING,0.0,-1
2,21,Black,F,HOLD FOR OTHER DEPT,0.0,-1
3,21,Black,F,FAILURE TO APPEAR,0.0,-1
4,21,Black,F,RULE 8.1 HEARING,0.0,-1
...,...,...,...,...,...,...
8475,31,Hispanic,F,ASSAULT ON FAMILY MEMBER 3RD,0.0,-1
8476,31,Hispanic,F,ENDANG. WELFARE OF A MINOR 3RD,0.0,-1
8477,31,Hispanic,F,RULE 8.1 HEARING,0.0,-1
8478,37,Black,M,INTERFERENCE W CUSTODY,0.0,-1


In [12]:
dfLouisianaCleaned

Unnamed: 0,Sex,Race,Charge,Bond,Duration_Jail
0,M,Black,POSS SCH 1 DRUGS,20000.0,1
1,M,Black,DIST/MANF SCH1 DRUG,0.0,-1
2,M,Black,CRUELTY TO ANIMALS,0.0,16
3,F,Black,CRUELTY TO JUVENILES,2500.0,-1
4,F,Black,WARRANT/BENCH WARRAN,0.0,-1
...,...,...,...,...,...
278852,M,Black,ISS WORTHLESS CHECKS,0.0,-1
278853,M,Black,BATT/SIMPLE/CC,0.0,-1
278970,M,Black,POSS SCH 2 DRUGS,96000.0,-1
279356,M,White,MONETARY INSTRUMENT,15000.0,-1


In [13]:
dfMichiganCleaned

Unnamed: 0,Name,Charge,Charge_Classification,Bond,Duration_JailExp
0,MOHAMED-ALI NABIL ABDALLAH,ASSAULT OR ASSAULT AND BATTERY,Misdemeanor,500.0,0
1,MOHAMED-ALI NABIL ABDALLAH,DOMESTIC VIOLENCE,Misdemeanor,500.0,0
3,THOMAS PATRICK ABRAHAM,OPERATING - OUIL/PER SE/OWI - 3RD OFFENSE NOTICE,Felony,250.0,0
4,ALEXANDER ACEVAL,CONTROLLED SUBSTANCE-DELIVERY/MANUFACTURE (SCH...,Felony,750000.0,0
5,ALEXANDER ACEVAL,CONTROLLED SUBSTANCE-DEL/MFG-450-999 GRAMS,Felony,500000.0,0
...,...,...,...,...,...
44391,AVERY THOMPSON,Assault with a Dangerous Weapon (Felonious Ass...,Felony,1500.0,0
44392,JUSTIN WESLEY,CONTROLLED SUBSTANCE-DEL/MFG LESS THAN 50 GRAMS,Felony,300.0,0
44393,JUSTIN WESLEY,ORDINANCE VIOLATION,Local Ordinance,300.0,0
44394,JUSTIN WESLEY,ORDINANCE VIOLATION,Local Ordinance,5000.0,0


In [14]:
dfNYCleaned

Unnamed: 0,bond_info,charges,race,gender,first_statute
0,50000.0,110-120.10 FC (Attempted ASSAULT-1ST C Felony),Other,Male,110120
1,50000.0,110-125.25 FB (Attempted MURDER B Felony),Black,Male,110125
2,1000000.0,110-125.25 FB (Attempted MURDER B Felony),Black,Male,110125
3,15000.0,120.05 FD (ASSAULT -2ND D Felony),Black,Male,12005
4,500000.0,230.34 FB ( B Felony),Black,Male,23034
...,...,...,...,...,...
1543,25000.0,125.25 FA (MURDER A Felony),Black,Male,12525
1544,50000.0,265.03 FC (CRIM POSS WEAPON- 2ND DEGREE C Felony),Black,Male,26503
1545,100000.0,130.35 FB (RAPE-1ST B Felony),Other,Male,13035
1546,5000.0,130.65 FD (SEXUAL ABUSE-1ST D Felony),Other,Male,13065


In [15]:
# for manual mapping of charge descriptions to national BJS code
charge_list_AR = list(map(lambda x: str(x),dfArkansasCleaned.Charge.unique()))
charge_list_LA = list(dfLouisianaCleaned.Charge.unique())
charge_list_MI = list(dfMichiganCleaned.Charge.unique())
charge_list_NY = list(dfNYCleaned.charges.unique())

with open('./cleaned/charge_list_AR.txt', 'w') as filehandle:
    for charge in charge_list_AR:
        filehandle.write('%s\n' % charge)
with open('./cleaned/charge_list_LA.txt', 'w') as filehandle:
    for charge in charge_list_LA:
        filehandle.write('%s\n' % charge)
with open('./cleaned/charge_list_MI.txt', 'w') as filehandle:
    for charge in charge_list_MI:
        filehandle.write('%s\n' % charge)
with open('./cleaned/charge_list_NY.txt', 'w') as filehandle:
    for charge in charge_list_NY:
        filehandle.write('%s\n' % charge)

FileNotFoundError: [Errno 2] No such file or directory: './cleaned/charge_list_AR.txt'

In [16]:
# load penal code: Bureau of Justice Statistics penal codes
file = "../../penal_codes/BJS_states.xlsx"
dfBJS = pd.read_excel(os.path.join(os.path.dirname('__file__'),file))

In [17]:
# BJS code to broad category dict
# key: bjs code, value: bjs broad category
bjs_dict = pd.Series(dfBJS.iloc[1:,3].str.lower().values,\
                    index=dfBJS.iloc[1:,0].str.lower()\
                     .replace(np.nan,0,inplace=False).astype(int))\
                    .dropna().rename({'None':'other'}).to_dict()
# print(bjs_dict)
print("BJS Broad Categories:",sorted( list(np.unique([categ for categ in bjs_dict.values()])), reverse=False) )

BJS Broad Categories: ['drug', 'other', 'property', 'public order', 'violent']


In [18]:
# mapping of state-level unique charges to BJS codes

# 1. Arkansas
file = "../../code_maps/arkansas_map.csv"
AR_code_map = pd.read_csv(os.path.join(os.path.dirname('__file__'),file), \
                                     index_col=0, header=None, usecols=[1])

AR_charge_BJS_code_map = list(AR_code_map.index[1:])

# enter BJS code as values
charge_list_AR = list(map(lambda x: str(x),dfArkansasCleaned.Charge.unique()))
AR_charge_code = dict() # state charge desc to BJS code
for charge, label in zip(charge_list_AR, AR_charge_BJS_code_map):
    if label == '0':
        label = 999 # other
    if isinstance(charge,str):
        charge = charge.lower()
    AR_charge_code[charge] = int(label)

# run through actual data (cleaned) to map to code
data = dfArkansasCleaned.Charge.str.lower()
AR_code_categ = dict()
count = 0

# BJS Broad Category dicts to store row indices
BJS_broad_categ_dict = dict()
for idx,charge in enumerate(data):
    if charge in list(AR_charge_code.keys()):
        count += 1
        foundIdx = list(AR_charge_code.keys()).index(charge)
        AR_code =  list(AR_charge_code.values())[foundIdx]
        AR_categ = bjs_dict.get(AR_code)
        if AR_categ is None:
            AR_categ = 'other'
        # keep counter
        # and store indices to the BJS Broad Category dicts
        if AR_categ in AR_code_categ:
            AR_code_categ[AR_categ] += 1
        else:
            AR_code_categ[AR_categ] = 1
            BJS_broad_categ_dict[AR_categ] = []
        BJS_broad_categ_dict[AR_categ].append(idx)
            
# print('ex:',BJS_broad_categ_dict['violent'][0:5])
print("Arkansas stats: %s" % AR_code_categ)

# groups = ['violent','public order','drug','property','other']
for i, group in enumerate(AR_code_categ):
    print('\n\n====GROUP====')
    idx = BJS_broad_categ_dict[group]
    dfARgroup = dfArkansasCleaned.iloc[idx,:]
    dfARgroup.to_csv('./cleaned/AR_'+group+'.csv', index=False)
    # statistics by each BJS broad category for AR
    avgAge = dfARgroup['Age'].mean()
    races = dfARgroup['Race'].value_counts().sort_index()
    sexes = dfARgroup['Sex'].value_counts().sort_index()
    bonds = dfARgroup['Bond'].mean()
    print(f'\n\ngroup: {group} \n avg age: \n\t{avgAge:.1f}, \n races: \n\t{races}, \nsexes: \n\t{sexes}\n avg bonds: \n\t{bonds:.2f}')
    
    # break into placeholder and nontrivial dataframes
    mask = dfARgroup['Bond'] == 0
    placeholder_df = dfARgroup[mask]        
    avgAge = placeholder_df['Age'].mean()
    races = placeholder_df['Race'].value_counts().sort_index()
    sexes = placeholder_df['Sex'].value_counts().sort_index()
    bonds = placeholder_df['Bond'].mean()
    print(f'\n\nplaceholder: \n age: \n\t{avgAge:.1f}, \n races: \n\t{races}, \n sexes: \n\t{sexes} \n avg bonds: \n\t{bonds:.2f}')
    
    nontrivial_df = dfARgroup[~mask]   
    avgAge = nontrivial_df['Age'].mean()
    races = nontrivial_df['Race'].value_counts().sort_index()
    sexes = nontrivial_df['Sex'].value_counts().sort_index()
    bonds = nontrivial_df['Bond'].mean()
    print(f'\n\nmeaningful: \n\t\t age: \n\t{avgAge:.1f}, \n races: \n\t{races}, \n sexes: \n\t{sexes} \n avg bonds: \n\t{bonds:.2f}')
        
    # Bond breakdown on nontrivial df
    mask = nontrivial_df['Bond'] <= 1000
    bondGroupAR1 = nontrivial_df[mask]
    bondGroupAR2 = nontrivial_df[~mask]

    
    for i, dfARgroup in enumerate([bondGroupAR1, bondGroupAR2]):
        print('\n\n====BOND====')
        if i == 0:
            group = 'Bond <= $1,000'
        elif i == 1:
            group = 'Bond > $1,000'
        avgAge = dfARgroup['Age'].mean()
        races = dfARgroup['Race'].value_counts().sort_index()
        sexes = dfARgroup['Sex'].value_counts().sort_index()
        bonds = dfARgroup['Bond'].mean()
        times = dfARgroup['Duration_Jail'].value_counts().sort_index()
        print(f'\nbond group: {group} \n avg age: \n\t{avgAge:.1f}, \n races: \n{races}, \n sexes:\n{sexes} \n avg bonds: \n\t{bonds:.2f}, \ntimes:{times}')
        
        # bond and time
        print('\n\n====BOND AND TIME====')
        mask1 = dfARgroup['Duration_Jail'] != -1
        mask = dfARgroup['Duration_Jail'] <= 20 & mask1
        dfARgroupBondTime1 = dfARgroup[mask]
        avgAge = dfARgroupBondTime1['Age'].mean()
        races = dfARgroupBondTime1['Race'].value_counts().sort_index()
        sexes = dfARgroupBondTime1['Sex'].value_counts().sort_index()
        bonds = dfARgroupBondTime1['Bond'].mean()
        times = dfARgroupBondTime1['Duration_Jail'].value_counts().sort_index()
        print(f'\nbond: {group}, time: <= 20  \n avg age: \n\t{avgAge:.1f}, \n races: \n\t{races}, \n sexes:\n\t{sexes} \n avg bonds: \n\t{bonds:.2f}, \ntimes: \n\t{times}')
        
        dfARgroupBondTime2 = dfARgroup[~mask & mask1]
        avgAge = dfARgroupBondTime2['Age'].mean()
        races = dfARgroupBondTime2['Race'].value_counts().sort_index()
        sexes = dfARgroupBondTime2['Sex'].value_counts().sort_index()
        bonds = dfARgroupBondTime2['Bond'].mean()
        times = dfARgroupBondTime2['Duration_Jail'].value_counts().sort_index()
        print(f'\nbond: {group}, time: > 20  \n avg age: \n\t{avgAge:.1f}, \n races: \n\t{races}, \n sexes:\n\t{sexes} \n avg bonds: \n\t{bonds:.2f}, \ntimes: \n\t{times}')
    
    # Time breakdown on nontrivial df
    dfArkansasCleanedTime = nontrivial_df[nontrivial_df['Duration_Jail'] != -1]
    mask = dfArkansasCleanedTime['Duration_Jail'] <= 20
    timeGroupAR1 = dfArkansasCleanedTime[mask]
    timeGroupAR2 = dfArkansasCleanedTime[~mask]

    for i, dfARgroup in enumerate([timeGroupAR1, timeGroupAR2]):
        print('\n\n====TIME ONLY====')
        if i == 0:
            group = 'time <= 20 days'
        elif i == 1:
            group = 'time > 20 days'

        avgAge = dfARgroup['Age'].mean()
        races = dfARgroup['Race'].value_counts().sort_index()
        sexes = dfARgroup['Sex'].value_counts().sort_index()
        bonds = dfARgroup['Bond'].mean()
        times = dfARgroup['Duration_Jail'].value_counts().sort_index()
        print(f'\ntime group: {group} \n avg age: \n\t{avgAge:.1f}, \
                \n races: \n\t{races}, \n sexes:\n\t{sexes} \
                \n avg bonds: \n\t{bonds:.2f}')

Arkansas stats: {'public order': 5133, 'other': 1178, 'drug': 938, 'property': 562, 'violent': 658}


====GROUP====


FileNotFoundError: [Errno 2] No such file or directory: './cleaned/AR_public order.csv'

In [None]:
# mapping of state-level unique charges to BJS codes
# 2. Louisiana
file = "../../code_maps/louisiana_map.csv"
LA_code_map = pd.read_csv(os.path.join(os.path.dirname('__file__'),file), \
                                     index_col=0, header=None, usecols=[1])

LA_charge_BJS_code_map = list(LA_code_map.index[1:])

# enter BJS code as values
charge_list_LA = list(map(lambda x: str(x),dfLouisianaCleaned.Charge.unique()))
LA_charge_code = dict() # state charge desc to BJS code
for charge, label in zip(charge_list_LA, LA_charge_BJS_code_map):
    if label == '0':
        label = 999 # other
    if isinstance(charge,str):
        charge = charge.lower()
    LA_charge_code[charge] = int(label)

# run through actual data (cleaned) to map to code
data = dfLouisianaCleaned.Charge.str.lower()
LA_code_categ = dict()
count = 0

# BJS Broad Category dicts to store row indices
BJS_broad_categ_dict = dict()
for idx,charge in enumerate(data):
    if charge in list(LA_charge_code.keys()):
        count += 1
        foundIdx = list(LA_charge_code.keys()).index(charge)
        LA_code =  list(LA_charge_code.values())[foundIdx]
        LA_categ = bjs_dict.get(LA_code)
        if LA_categ is None:
            LA_categ = 'other'
        # keep counter
        # and store indices to the BJS Broad Category dicts
        if LA_categ in LA_code_categ:
            LA_code_categ[LA_categ] += 1
        else:
            LA_code_categ[LA_categ] = 1
            BJS_broad_categ_dict[LA_categ] = []
        BJS_broad_categ_dict[LA_categ].append(idx)
            
# print('ex:',BJS_broad_categ_dict['violent'][0:5])
print("Louisiana stats: %s" % LA_code_categ)

# groups = ['violent','public order','drug','property','other']
for i, group in enumerate(LA_code_categ):
    print('\n\n====GROUP====')
    idx = BJS_broad_categ_dict[group]
    dfLouisiana = dfLouisianaCleaned.iloc[idx,:]
    dfLouisiana.to_csv('./cleaned/LA_'+group+'.csv', index=False)
    # statistics by each BJS broad category for AR
    races = dfLouisiana['Race'].value_counts().sort_index()
    sexes = dfLouisiana['Sex'].value_counts().sort_index()
    bonds = dfLouisiana['Bond'].mean()
    print(f'\n\ngroup: {group} \n races: \n\t{races}, \n sexes: \n\t{sexes}'+ \
         f'\n avg bonds: \n\t{bonds:.2f}')
    
    # break into placeholder and nontrivial dataframes
    mask = dfLouisiana['Bond'] == 0
    placeholder_df = dfLouisiana[mask]
    races = placeholder_df['Race'].value_counts().sort_index()
    sexes = placeholder_df['Sex'].value_counts().sort_index()
    bonds = placeholder_df['Bond'].mean()
    print(f'\n\nplaceholder: \n races: \n\t{races}, \n sexes: \n\t{sexes}'+ \
         f'\n avg bonds: \n\t{bonds:.2f}')
    
    nontrivial_df = dfLouisiana[~mask]
    races = nontrivial_df['Race'].value_counts().sort_index()
    sexes = nontrivial_df['Sex'].value_counts().sort_index()
    bonds = nontrivial_df['Bond'].mean()
    print(f'\n\nmeaningful: \n races: {races}, \n sexes: \n\t{sexes}'+ \
         f'\n avg bonds: \n\t{bonds:.2f}')
    
    # Bond breakdown on nontrivial df
    mask = nontrivial_df['Bond'] <= 1000
    bondGroupLA1 = nontrivial_df[mask]
    bondGroupLA2 = nontrivial_df[~mask]


    for i, dfLAgroup in enumerate([bondGroupLA1, bondGroupLA2]):
        print('\n\n====BOND====')
        if i == 0:
            group = 'Bond <= $1,000'
        elif i == 1:
            group = 'Bond > $1,000'
        races = dfLAgroup['Race'].value_counts().sort_index()
        sexes = dfLAgroup['Sex'].value_counts().sort_index()
        bonds = dfLAgroup['Bond'].mean()
        times = dfLAgroup['Duration_Jail'].value_counts().sort_index()
        print(f'\nbond group: {group} \n races: \n\t{races}, \n sexes:\n\t{sexes} \
                \n\ avg bonds: \n\t{bonds:.2f}, \ntimes:\n\t{times}')
        
        # bond and time
        print('\n\n====BOND AND TIME====')
        mask1 = dfLAgroup['Duration_Jail'] != -1
        mask = (dfLAgroup['Duration_Jail'] <= 20 ) & mask1
        
        dfLAgroupBondTime1 = dfLAgroup[mask]
        races = dfLAgroupBondTime1['Race'].value_counts().sort_index()
        sexes = dfLAgroupBondTime1['Sex'].value_counts().sort_index()
        bonds = dfLAgroupBondTime1['Bond'].mean()
        times = dfLAgroupBondTime1['Duration_Jail'].value_counts()
        print(f'\nbond: {group}, time: <= 20 \n races: \n\t{races}, \n sexes:\n\t{sexes} \
                \n avg bonds: \n\t{bonds:.2f}, \ntimes:\n\t{times}')
        
        dfLAgroupBondTime2 = dfLAgroup[~mask & mask1]
        races = dfLAgroupBondTime2['Race'].value_counts().sort_index()
        sexes = dfLAgroupBondTime2['Sex'].value_counts().sort_index()
        bonds = dfLAgroupBondTime2['Bond'].mean()
        times = dfLAgroupBondTime2['Duration_Jail'].value_counts().sort_index()
        print(f'\nbond: {group}, time: > 20 \n races: \n\t{races}, \n sexes:\n\t{sexes} \
                \n avg bonds: \n\t{bonds:.2f}, \ntimes:\n\t{times}')
        
    # Time breakdown on nontrivial df
    print('\n\n====TIME ONLY====')
    mask1 = nontrivial_df['Duration_Jail'] != -1
    dfLouisianaCleanedTime = nontrivial_df[mask1]
    
    mask = dfLouisianaCleanedTime['Duration_Jail'] <= 20
    timeGroupLA1 = dfLouisianaCleanedTime[mask]
    timeGroupLA2 = dfLouisianaCleanedTime[~mask]

    for i, dfLAgroup in enumerate([timeGroupLA1, timeGroupLA2]):
        if i == 0:
            group = 'time <= 20 days'
        elif i == 1:
            group = 'time > 20 days'

        races = dfLAgroup['Race'].value_counts()
        sexes = dfLAgroup['Sex'].value_counts()
        bonds = dfLAgroup['Bond'].mean()
        times = dfLAgroup['Duration_Jail'].value_counts()

        print(f'\ntime group: {group} \n\n races: \n\t{races}, \
                \n\n sexes:\n\t{sexes} \n\n avg bonds: \n\t{bonds:.2f}')

In [24]:
# mapping of state-level unique charges to BJS codes
# 3. Michigan
file = "../../code_maps/michigan_map.csv"
MI_code_map = pd.read_csv(os.path.join(os.path.dirname('__file__'),file), \
                                     index_col=0, header=None, usecols=[1])
# if duration_jail expected == 0, it is empty value
MI_charge_BJS_code_map = list(MI_code_map.index[1:])

# enter BJS code as values
charge_list_MI = list(map(lambda x: str(x),dfMichiganCleaned.Charge.unique()))
MI_charge_code = dict() # state charge desc to BJS code
for charge, label in zip(charge_list_MI, MI_charge_BJS_code_map):
    if label == '0':
        label = 999 # other
    if isinstance(charge,str):
        charge = charge.lower()
    MI_charge_code[charge] = int(label)

# run through actual data (cleaned) to map to code
data = dfMichiganCleaned.Charge.str.lower()
MI_code_categ = dict()
count = 0

# BJS Broad Category dicts to store row indices
BJS_broad_categ_dict = dict()
for idx,charge in enumerate(data):
    if charge in list(MI_charge_code.keys()):
        count += 1
        foundIdx = list(MI_charge_code.keys()).index(charge)
        MI_code =  list(MI_charge_code.values())[foundIdx]
        MI_categ = bjs_dict.get(MI_code)
        if MI_categ is None:
            MI_categ = 'other'
        # keep counter
        # and store indices to the BJS Broad Category dicts
        if MI_categ in MI_code_categ:
            MI_code_categ[MI_categ] += 1
        else:
            MI_code_categ[MI_categ] = 1
            BJS_broad_categ_dict[MI_categ] = []
        BJS_broad_categ_dict[MI_categ].append(idx)
            
# print('ex:',BJS_broad_categ_dict['violent'][0:5])
print("Michigan stats: %s" % MI_code_categ)

# groups = ['violent','public order','drug','property','other']
for i, group in enumerate(MI_code_categ):
    print('\n\n====GROUP====')
    idx = BJS_broad_categ_dict[group]
    dfMichigan = dfMichiganCleaned.iloc[idx,:]
    print(group, dfMichigan.shape)
#     dfMichigan.to_csv('./cleaned/MI_'+group+'.csv', index=False)
    # statistics by each BJS broad category for AR
    
    bonds = dfMichigan['Bond'].mean()
    print(f'\n\ngroup: {group} \n avg bonds: \n\t{bonds:.2f}')
    
    # break into placeholder and nontrivial dataframes
    mask = dfMichigan['Bond'] == 0
    placeholder_df = dfMichigan[mask]
    print("placeholder size:",placeholder_df.shape)
    bonds = placeholder_df['Bond'].mean()
    print(f'\n\placeholder: \n avg bonds: \n\t{bonds:.2f}')
    
    nontrivial_df = dfMichigan[~mask]
    print("nontrivial_df size:",nontrivial_df.shape)
    bonds = nontrivial_df['Bond'].mean()
    print(f'\n\nmeaningful: \n avg bonds: \n\t{bonds:.2f}')
    
    # Bond breakdown on nontrivial df
    mask = nontrivial_df['Bond'] <= 1000
    bondGroupMI1 = nontrivial_df[mask]
    bondGroupMI2 = nontrivial_df[~mask]
    print("<= 1000 bond group:",bondGroupMI1.shape)
    print("> 1000 bond group:",bondGroupMI2.shape)

    for i, dfMIgroup in enumerate([bondGroupMI1, bondGroupMI2]):
        print('\n\n====BOND====')
        if i == 0:
            group = 'Bond <= $1,000'
        elif i == 1:
            group = 'Bond > $1,000'
        bonds = dfMIgroup['Bond'].mean()
        times = dfMIgroup['Duration_JailExp'].value_counts().sort_index()
        print(f'\nbond group: {group} \n avg bonds: \n\t{bonds:.2f}, \ntimes: \n\t{times}')
        
        # bond and time
        print('\n\n====BOND AND TIME====')
        mask1 = dfMIgroup['Duration_JailExp'] != -1
        mask = (dfMIgroup['Duration_JailExp'] <= 20 ) & mask1
        
        dfMIgroupBondTime1 = dfMIgroup[mask]
        print("bond group:",group," and time in jail expected <= 20:",dfMIgroupBondTime1.shape)
        bonds = dfMIgroupBondTime1['Bond'].mean()
        times = dfMIgroupBondTime1['Duration_JailExp'].value_counts().sort_index()
        print(f'\nbond: {group}, time: <= 20 \n avg bonds: \n\t{bonds:.2f}, \ntimes:\n\t{times}')
        
        dfMIgroupBondTime2 = dfMIgroup[~mask & mask1]
        print("bond group:",group," and time in jail expected > 20:",dfMIgroupBondTime2.shape)
        bonds = dfMIgroupBondTime2['Bond'].mean()
        times = dfMIgroupBondTime2['Duration_JailExp'].value_counts().sort_index()
        print(f'\nbond: {group}, time: > 20 \n avg bonds: \n\t{bonds:.2f}, \ntimes: \n\t{times}')
        
    # Time breakdown on nontrivial df
    print('\n\n====TIME ONLY (Expected)====')
    mask1 = nontrivial_df['Duration_JailExp'] != -1
    dfMichiganCleanedTime = nontrivial_df[mask1]
    
    mask = dfMichiganCleanedTime['Duration_JailExp'] <= 20
    timeGroupMI1 = dfMichiganCleanedTime[mask]
    timeGroupMI2 = dfMichiganCleanedTime[~mask]
    print("<= 20 time group:",timeGroupMI1.shape)
    print("> 20 time group:",timeGroupMI2.shape)

    for i, dfMIgroup in enumerate([timeGroupMI1, timeGroupMI2]):
        if i == 0:
            group = 'time <= 20 days'
        elif i == 1:
            group = 'time > 20 days'

        bonds = dfMIgroup['Bond'].mean()
        times = dfMIgroup['Duration_JailExp'].value_counts().sort_index()

        print(f'\ntime group: {group} \n\n avg bonds: \n\t{bonds:.2f}, \ntimes: \n\t{times}')

Michigan stats: {'violent': 6525, 'other': 4680, 'drug': 1611, 'public order': 10275, 'property': 3875}


====GROUP====
violent (6525, 5)


group: violent 
 avg bonds: 
	47860.36
placeholder size: (1399, 5)

\placeholder: 
 avg bonds: 
	0.00
nontrivial_df size: (5126, 5)


meaningful: 
 avg bonds: 
	60922.53
<= 1000 bond group: (1689, 5)
> 1000 bond group: (3437, 5)


====BOND====

bond group: Bond <= $1,000 
 avg bonds: 
	515.84, 
times: 
	0      1611
5         9
20        2
25        4
28        4
30        1
40        1
45        2
60        6
90        5
93        5
120       7
150       4
180       3
240       2
270       4
300       4
360      10
365       5
Name: Duration_JailExp, dtype: int64


====BOND AND TIME====
bond group: Bond <= $1,000  and time in jail expected <= 20: (1622, 5)

bond: Bond <= $1,000, time: <= 20 
 avg bonds: 
	507.25, 
times:
	0     1611
5        9
20       2
Name: Duration_JailExp, dtype: int64
bond group: Bond <= $1,000  and time in jail expected > 20

In [None]:
# mapping of state-level unique charges to BJS codes
# 4. New York
file = "../../code_maps/newyork_map.csv"
NY_code_map = pd.read_csv(os.path.join(os.path.dirname('__file__'),file), \
                                     index_col=0, header=None, usecols=[1])

NY_charge_BJS_code_map = list(NY_code_map.index[1:])

# enter BJS code as values
charge_list_NY = list(map(lambda x: str(x),dfNYCleaned['charges'].unique()))
NY_charge_code = dict() # state charge desc to BJS code
for charge, label in zip(charge_list_NY, NY_charge_BJS_code_map):
    if label == '0':
        label = 999 # other
    if isinstance(charge,str):
        charge = charge.lower()
    NY_charge_code[charge] = int(label)

# run through actual data (cleaned) to map to code
data = dfNYCleaned['charges'].str.lower()
NY_code_categ = dict()
count = 0

# BJS Broad Category dicts to store row indices
BJS_broad_categ_dict = dict()
for idx,charge in enumerate(data):
    if charge in list(NY_charge_code.keys()):
        count += 1
        foundIdx = list(NY_charge_code.keys()).index(charge)
        NY_code =  list(NY_charge_code.values())[foundIdx]
        NY_categ = bjs_dict.get(NY_code)
        if NY_categ is None:
            NY_categ = 'other'
        # keep counter
        # and store indices to the BJS Broad Category dicts
        if NY_categ in NY_code_categ:
            NY_code_categ[NY_categ] += 1
        else:
            NY_code_categ[NY_categ] = 1
            BJS_broad_categ_dict[NY_categ] = []
        BJS_broad_categ_dict[NY_categ].append(idx)
            
# print('ex:',BJS_broad_categ_dict['violent'][0:5])
print("New York stats: %s" % NY_code_categ)

# groups = ['violent','public order','drug','property','other']
for i, group in enumerate(NY_code_categ):
    print('\n\n====GROUP====')
    idx = BJS_broad_categ_dict[group]
    dfNewYork = dfNYCleaned.iloc[idx,:]
    dfNewYork.to_csv('./cleaned/NY_'+group+'.csv', index=False)
    # statistics by each BJS broad category for AR
    races = dfNewYork['race'].value_counts().sort_index()
    sexes = dfNewYork['gender'].value_counts().sort_index()
    bonds = dfNewYork['bond_info'].mean()
    print(f'\n\ngroup: {group} \n races: \n\t{races}, \n sexes: \n\t{sexes}'+ \
         f'\n avg bonds: \n\t{bonds:.2f}')
    
    
    # break into placeholder and nontrivial dataframes
    # $10: max threshold
    mask = dfNewYork['bond_info'] <= 10
    placeholder_df = dfNewYork[mask]
    races = placeholder_df['race'].value_counts().sort_index()
    sexes = placeholder_df['gender'].value_counts().sort_index()
    bonds = placeholder_df['bond_info'].mean()
    print(f'placeholder (bond <= $10): \n races: \n\t{races}, \n sexes: \n\t{sexes}'+ \
         f'\n avg bonds: \n\t{bonds:.2f}')
    
    nontrivial_df = dfNewYork[~mask]
    races = nontrivial_df['race'].value_counts().sort_index()
    sexes = nontrivial_df['gender'].value_counts().sort_index()
    bonds = nontrivial_df['bond_info'].mean()
    print(f'\n\nmeaningful (bond > $10): \n races: \n\t{races}, \n sexes: \n\t{sexes}'+ \
         f'\n avg bonds: \n\t{bonds:.2f}')
    
    # Bond breakdown on nontrivial df
    mask = nontrivial_df['bond_info'] <= 1000
    bondGroupNY1 = nontrivial_df[mask]
    bondGroupNY2 = nontrivial_df[~mask]

    for i, dfNYgroup in enumerate([bondGroupNY1, bondGroupNY2]):
        print('\n\n====BOND====')
        if i == 0:
            group = '$10 < Bond <= $1,000'
        elif i == 1:
            group = 'Bond > $1,000'
        races = dfNYgroup['race'].value_counts().sort_index()
        sexes = dfNYgroup['gender'].value_counts().sort_index()
        bonds = dfNYgroup['bond_info'].mean()
        print(f'\nbond group: {group} \n races: \n\t{races}, \
                \n sexes:\n\t{sexes} \n avg bonds: \n\t{bonds:.2f}')


In [None]:
# bond group analysis module
mask = dfArkansasCleaned['Bond'] <= 1000
bondGroupAR1 = dfArkansasCleaned[mask]
bondGroupAR2 = dfArkansasCleaned[~mask]

print('====ARKANSAS====')
for i, dfARgroup in enumerate([bondGroupAR1, bondGroupAR2]):
    if i == 0:
        group = 'Bond <= $1,000'
    elif i == 1:
        group = 'Bond > $1,000'
    avgAge = dfARgroup['Age'].mean()
    races = dfARgroup['Race'].value_counts()
    sexes = dfARgroup['Sex'].value_counts()
    bonds = dfARgroup['Bond'].mean()
    print(f'\nbond group: {group} \n avg age: \n\t{avgAge:.1f}, \n races: \n{races}, \
            \n sexes:\n{sexes} \n avg bonds: \n\t{bonds:.2f}')

# 2. Louisiana
mask = dfLouisianaCleaned['Bond'] <= 1000
bondGroupLA1 = dfLouisianaCleaned[mask]
bondGroupLA2 = dfLouisianaCleaned[~mask]

# general statistics on these bond groups
print('\n\n====LOUISIANA====')
for i, dfLAgroup in enumerate([bondGroupLA1, bondGroupLA2]):
    if i == 0:
        group = 'Bond <= $1,000'
    elif i == 1:
        group = 'Bond > $1,000'
    races = dfLAgroup['Race'].value_counts()
    sexes = dfLAgroup['Sex'].value_counts()
    bonds = dfLAgroup['Bond'].mean()
    print(f'\nbond group: {group} \n races: \n{races}, \
            \n sexes:\n{sexes} \n avg bonds: \n\t{bonds:.2f}')

In [None]:
# time analysis module

# 1. Arkansas
dfArkansasCleanedTime = dfArkansasCleaned[dfArkansasCleaned['Duration_Jail'] != -1]
mask = dfArkansasCleanedTime['Duration_Jail'] <= 90
timeGroupAR1 = dfArkansasCleanedTime[mask]
timeGroupAR2 = dfArkansasCleanedTime[~mask]

print('\n\n====ARKANSAS====')
for i, dfARgroup in enumerate([timeGroupAR1, timeGroupAR2]):
    if i == 0:
        group = 'time <= 90 days'
    elif i == 1:
        group = 'time > 90 days'

    avgAge = dfARgroup['Age'].mean()
    races = dfARgroup['Race'].value_counts()
    sexes = dfARgroup['Sex'].value_counts()
    bonds = dfARgroup['Bond'].mean()
    times = dfARgroup['Duration_Jail'].value_counts()
    print(f'\ntime group: {group} \n\n avg age: \n\t{avgAge:.1f}, \n races: \n{races}, \
            \n sexes:\n{sexes} \n avg bonds: \n\t{bonds:.2f}')

# 2. Louisiana
dfLouisianaCleanedTime = dfLouisianaCleaned[dfLouisianaCleaned['Duration_Jail'] != -1]
mask = dfLouisianaCleanedTime['Duration_Jail'] <= 90
timeGroupLA1 = dfLouisianaCleanedTime[mask]
timeGroupLA2 = dfLouisianaCleanedTime[~mask]

print('\n\n====LOUISIANA====')
for i, dfLAgroup in enumerate([timeGroupLA1, timeGroupLA2]):
    if i == 0:
        group = 'time <= 90 days'
    elif i == 1:
        group = 'time > 90 days'

    races = dfLAgroup['Race'].value_counts()
    sexes = dfLAgroup['Sex'].value_counts()
    bonds = dfLAgroup['Bond'].mean()
    times = dfLAgroup['Duration_Jail'].value_counts()
    
    print(f'\ntime group: {group} \n\n races: \n{races}, \
            \n\n sexes:\n{sexes} \n\n avg bonds: \n\t{bonds:.2f}')