In [1]:
import pandas as pd
from sodapy import Socrata
import numpy as np
import sqlite3

# Get information from Cook County Sentencing Data Database

In [2]:
# Create the Socrata API client for the Cook County data catalog.
# No app token is supplied (second argument is None), so requests are unauthenticated.
client = Socrata("datacatalog.cookcountyil.gov", None)



In [3]:
# Retrieve data
# "tg8v-tm6u" is the dataset identifier requested from the portal; `limit` caps the
# number of records returned by this single request at 300000.
results = client.get("tg8v-tm6u", limit=300000)

In [4]:
# Load the fetched records into a DataFrame (one row per record)
original_data = pd.DataFrame.from_records(results)

# List the column names that came back from the API
original_data.columns.tolist()

['case_id',
 'case_participant_id',
 'received_date',
 'offense_category',
 'primary_charge',
 'charge_id',
 'charge_version_id',
 'disposition_charged_offense_title',
 'charge_count',
 'disposition_date',
 'disposition_charged_chapter',
 'disposition_charged_class',
 'disposition_charged_aoic',
 'charge_disposition',
 'sentence_judge',
 'court_name',
 'court_facility',
 'sentence_phase',
 'sentence_date',
 'sentence_type',
 'current_sentence',
 'commitment_type',
 'commitment_term',
 'commitment_unit',
 'length_of_case_in_days',
 'age_at_incident',
 'race',
 'gender',
 'incident_begin_date',
 'law_enforcement_agency',
 'arrest_date',
 'felony_review_date',
 'felony_review_result',
 'arraignment_date',
 'updated_offense_category',
 'incident_end_date',
 'disposition_charged_act',
 'disposition_charged_section',
 'charge_disposition_reason',
 'incident_city',
 'unit']

In [5]:
data = original_data.copy()

In [6]:
# Explore values in columns
data.count()

case_id                              241659
case_participant_id                  241659
received_date                        241659
offense_category                     241659
primary_charge                       241659
charge_id                            241659
charge_version_id                    241659
disposition_charged_offense_title    241659
charge_count                         241659
disposition_date                     241659
disposition_charged_chapter          241659
disposition_charged_class            241640
disposition_charged_aoic             241635
charge_disposition                   241659
sentence_judge                       240917
court_name                           240274
court_facility                       239777
sentence_phase                       241659
sentence_date                        241659
sentence_type                        241659
current_sentence                     241659
commitment_type                      240026
commitment_term                 

In [7]:
# Columns that are not used in the rest of the notebook
unused_columns = [
    'charge_disposition_reason',
    'unit',
    'disposition_charged_chapter',
    'disposition_charged_act',
    'disposition_charged_section',
    'incident_end_date',
    'law_enforcement_agency',
    'disposition_charged_aoic',
    'incident_city',
    'arraignment_date',
    'felony_review_date',
    'felony_review_result',
    'received_date',
    'disposition_date',
    'charge_id',
    'charge_version_id',
    'sentence_judge',
    'sentence_phase',
    'offense_category',
    'commitment_type',
]
data = data.drop(columns=unused_columns)

In [8]:
# The original offense_category column was dropped above; the updated one takes over its name
data = data.rename({'updated_offense_category': 'offense_category'}, axis='columns')

In [9]:
data.count()

case_id                              241659
case_participant_id                  241659
primary_charge                       241659
disposition_charged_offense_title    241659
charge_count                         241659
disposition_charged_class            241640
charge_disposition                   241659
court_name                           240274
court_facility                       239777
sentence_date                        241659
sentence_type                        241659
current_sentence                     241659
commitment_term                      239997
commitment_unit                      239997
length_of_case_in_days               222753
age_at_incident                      232210
race                                 240404
gender                               240855
incident_begin_date                  232963
arrest_date                          236619
offense_category                     241659
dtype: int64

# Cleaning Process

## Filter data from 2010 through 2020

In [10]:
# Create a copy of the Sentence Date to filter it
data['date_year'] = data['sentence_date'].copy()

In [11]:
# Select only the year
# Strip the leading "month/day/" part and the trailing " hh:mm:ss AM/PM" part so that
# only the year digits remain. The patterns are raw strings so the regex backslashes
# are not parsed as (invalid) Python string escape sequences.
data['date_year'] = data['date_year'].replace(regex=[r'[0-9]*\/[0-9]*\/', r'\s[0-9]*\:[0-9]*\:[0-9]*\s[A-Z]*'], value='')

In [12]:
# Change value type
data['date_year'] = data['date_year'].astype(int)

In [13]:
# Confirm value type change
data.dtypes

case_id                              object
case_participant_id                  object
primary_charge                         bool
disposition_charged_offense_title    object
charge_count                         object
disposition_charged_class            object
charge_disposition                   object
court_name                           object
court_facility                       object
sentence_date                        object
sentence_type                        object
current_sentence                       bool
commitment_term                      object
commitment_unit                      object
length_of_case_in_days               object
age_at_incident                      object
race                                 object
gender                               object
incident_begin_date                  object
arrest_date                          object
offense_category                     object
date_year                             int32
dtype: object

In [14]:
# Keep only sentences dated from 2010 through 2020 (both ends included)
data = data.loc[data['date_year'].between(2010, 2020)]

In [15]:
# Confirm values
data.describe()

Unnamed: 0,date_year
count,241360.0
mean,2014.728567
std,2.543026
min,2010.0
25%,2013.0
50%,2015.0
75%,2017.0
max,2020.0


## Filter only current sentence and primary charge

In [16]:
# Keep only rows flagged as both the current sentence and the primary charge
is_current_primary = data['current_sentence'].eq(True) & data['primary_charge'].eq(True)
data = data.loc[is_current_primary]

In [17]:
data.count()

case_id                              164729
case_participant_id                  164729
primary_charge                       164729
disposition_charged_offense_title    164729
charge_count                         164729
disposition_charged_class            164719
charge_disposition                   164729
court_name                           164104
court_facility                       163798
sentence_date                        164729
sentence_type                        164729
current_sentence                     164729
commitment_term                      163492
commitment_unit                      163492
length_of_case_in_days               152488
age_at_incident                      158511
race                                 163786
gender                               164125
incident_begin_date                  158857
arrest_date                          161148
offense_category                     164729
date_year                            164729
dtype: int64

## Analyze Offense Category and consolidate into related categories

In [18]:
# Review all the information
data.offense_category.value_counts()

Narcotics                                 47630
Aggravated DUI                            14161
Retail Theft                              12990
UUW - Unlawful Use of Weapon              12793
Burglary                                   9772
                                          ...  
DUI                                           8
Pandering                                     8
Arson and Attempt Arson                       4
Violation of Sex Offender Registration        3
Tampering                                     1
Name: offense_category, Length: 78, dtype: int64

In [19]:
data.offense_category.unique()

array(['PROMIS Conversion', 'Domestic Battery', 'Narcotics', 'Homicide',
       'UUW - Unlawful Use of Weapon', 'Sex Crimes', 'Burglary',
       'Aggravated DUI', 'Driving With Suspended Or Revoked License',
       'Retail Theft', 'Identity Theft', 'Attempt Homicide',
       'Home Invasion', 'Forgery', 'Theft',
       'Aggravated Fleeing and Eluding', 'DUI', 'Robbery',
       'Possession of Stolen Motor Vehicle',
       'Aggravated Battery With A Firearm', 'Fraudulent ID',
       'Criminal Damage to Property', 'Armed Robbery',
       'Escape - Failure to Return', 'Other Offense',
       'Failure to Register as a Sex Offender', 'Aggravated Battery',
       'Aggravated Discharge Firearm', 'Credit Card Cases', 'Kidnapping',
       'Residential Burglary', 'Aggravated Battery Police Officer',
       'Deceptive Practice', 'Attempt Armed Robbery', 'Arson',
       'Aggravated Identity Theft', 'Aggravated Robbery',
       'Possession of Contraband in Penal Institution', 'Bribery',
       'Dog F

In [20]:
# Fold attempted and reckless homicide into the Homicide category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Attempt Homicide',
        'Reckless Homicide',
    ],
    value='Homicide',
)

In [21]:
# Group every offense involving police officers under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Aggravated Assault Police Officer',
        'Aggravated Assault Police Officer Firearm',
        'Police Shooting',
        'Impersonating Police Officer',
        'Aggravated Battery Police Officer',
        'Aggravated Battery Police Officer Firearm',
        'Disarming Police Officer',
    ],
    value='Offense Against Police Officers',
)

In [22]:
# Group the remaining battery variants under Battery
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Aggravated Battery',
        'Aggravated Battery With A Firearm',
        'Domestic Battery',
    ],
    value='Battery',
)

In [23]:
# Group robbery, burglary and theft offenses under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Aggravated Identity Theft',
        'Aggravated Robbery',
        'Aggravated Robbery BB Gun',
        'Armed Robbery',
        'Attempt Armed Robbery',
        'Burglary',
        'Identity Theft',
        'Residential Burglary',
        'Retail Theft',
        'Robbery',
        'Theft',
        'Theft by Deception',
        'Possession Of Burglary Tools',
    ],
    value='Robbery/Burglary/Theft',
)

In [24]:
# Fold the attempted-arson labels into Arson
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Arson and Attempt Arson',
        'Attempt Arson',
    ],
    value='Arson',
)

In [25]:
# Group weapon, firearm and explosive offenses under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Aggravated Discharge Firearm',
        'Armed Violence',
        'Gun Running',
        'Possession of Explosives',
        'UUW - Unlawful Use of Weapon',
        'Gun - Non UUW',
        'Bomb Threat',
        'Reckless Discharge of Firearm',
    ],
    value='Firearms and Explosives',
)

In [26]:
# Group driving and vehicle-related offenses under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Aggravated DUI',
        'Attempt Vehicular Hijacking',
        'DUI',
        'Driving With Suspended Or Revoked License',
        'Major Accidents',
        'Possession of Stolen Motor Vehicle',
        'Vehicular Hijacking',
        'Vehicular Invasion',
    ],
    value='Motor Vehicle Offenses',
)

In [27]:
# Group offenses against the judicial process under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Communicating With Witness',
        'Escape - Failure to Return',
        'Obstructing Justice',
        'Perjury',
        'Tampering',
        'Violate Bail Bond',
        'Violation Order Of Protection',
    ],
    value='Judicial Process Violations',
)

In [28]:
# Group sex-related offenses under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Attempt Sex Crimes',
        'Child Pornography',
        'Failure to Register as a Sex Offender',
        'Pandering',
        'Prostitution',
        'Sex Crimes',
        'Violation of Sex Offender Registration',
    ],
    value='Sex Offenses',
)

In [29]:
# Group trafficking, unlawful detention and kidnapping offenses under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Child Abduction',
        'Human Trafficking',
        'Kidnapping',
        'Unlawful Restraint',
    ],
    value='Human Trafficking/Detention/Kidnapping',
)

In [30]:
# Group home invasion and residential trespass under Trespassing
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Home Invasion',
        'Criminal Trespass To Residence',
    ],
    value='Trespassing',
)

In [31]:
# Group fraud and deception offenses under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Credit Card Cases',
        'Deceptive Practice',
        'Forgery',
        'Fraud',
        'Fraudulent ID',
    ],
    value='Fraud/Deception',
)

In [32]:
# Group intimidation, official misconduct and bribery under Corruption
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Intimidation',
        'Official Misconduct',
        'Bribery',
    ],
    value='Corruption',
)

In [33]:
# Group possession offenses committed in penal institutions under one category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Possession of Contraband in Penal Institution',
        'Possession of Shank in Penal Institution',
    ],
    value='Inside Penal Institutions',
)

In [34]:
# Fold the listed miscellaneous categories into the existing Other Offense category
data['offense_category'] = data['offense_category'].replace(
    to_replace=[
        'Dog Fighting',
        'Gambling',
        'Failure To Pay Child Support',
        'Compelling Gang Membership',
    ],
    value='Other Offense',
)

In [35]:
data.offense_category.value_counts()

Narcotics                                 47630
Robbery/Burglary/Theft                    45019
Motor Vehicle Offenses                    26700
Firearms and Explosives                   13502
Sex Offenses                               5310
Battery                                    4423
Offense Against Police Officers            4385
Fraud/Deception                            3713
Judicial Process Violations                3122
Other Offense                              2526
PROMIS Conversion                          2183
Aggravated Fleeing and Eluding             1726
Criminal Damage to Property                1712
Homicide                                   1124
Trespassing                                 524
Arson                                       291
Corruption                                  263
Human Trafficking/Detention/Kidnapping      214
Inside Penal Institutions                   179
Stalking                                    162
Hate Crimes                             

## Cleaning Race columns

In [36]:
# Review data
data.race.value_counts()

Black                               106854
White [Hispanic or Latino]           25770
White                                24758
HISPANIC                              4236
Asian                                  940
White/Black [Hispanic or Latino]       840
Unknown                                235
American Indian                         84
ASIAN                                   41
Biracial                                28
Name: race, dtype: int64

In [37]:
# Merge the upper-case spelling into the 'Asian' label
data['race'] = data['race'].replace({'ASIAN': 'Asian'})

In [38]:
# Merge every Hispanic or Latino label into a single 'Hispanic/Latino' value
data['race'] = data['race'].replace(
    to_replace=[
        'White [Hispanic or Latino]',
        'HISPANIC',
        'White/Black [Hispanic or Latino]',
    ],
    value='Hispanic/Latino',
)

In [39]:
data.race.value_counts()

Black              106854
Hispanic/Latino     30846
White               24758
Asian                 981
Unknown               235
American Indian        84
Biracial               28
Name: race, dtype: int64

## Cleaning Sentence Type

In [40]:
# Review data
data.sentence_type.value_counts()

Prison                                   83764
Probation                                65798
Jail                                      6161
Conditional Discharge                     2836
Supervision                               2083
2nd Chance Probation                      1638
Cook County Boot Camp                     1469
Probation Terminated Unsatisfactorily      665
Conditional Release                         74
Probation Terminated Instanter              74
Inpatient Mental Health Services            60
Probation Terminated Satisfactorily         52
Death                                       47
Conversion                                   8
Name: sentence_type, dtype: int64

In [41]:
# Merge all probation variants and supervision into one sentence type
data['sentence_type'] = data['sentence_type'].replace(
    to_replace=[
        'Probation',
        '2nd Chance Probation',
        'Supervision',
        'Probation Terminated Unsatisfactorily',
        'Probation Terminated Instanter',
        'Probation Terminated Satisfactorily',
    ],
    value='Probation/Supervision',
)

In [42]:
# Treat Conditional Release as Conditional Discharge
data['sentence_type'] = data['sentence_type'].replace({'Conditional Release': 'Conditional Discharge'})

In [43]:
# Merge jail and prison sentences into Incarceration
data['sentence_type'] = data['sentence_type'].replace(
    to_replace=[
        'Jail',
        'Prison',
    ],
    value='Incarceration',
)

In [44]:
data.sentence_type.value_counts()

Incarceration                       89925
Probation/Supervision               70310
Conditional Discharge                2910
Cook County Boot Camp                1469
Inpatient Mental Health Services       60
Death                                  47
Conversion                              8
Name: sentence_type, dtype: int64

## Cleaning Gender

In [45]:
data.gender.value_counts()

Male                          143275
Female                         20839
Unknown                            5
Male name, no gender given         3
Unknown Gender                     3
Name: gender, dtype: int64

In [46]:
# Keep only the Male and Female rows; the other gender values cover very few records
data = data.loc[data['gender'].isin(["Male", "Female"])]

In [47]:
data.gender.value_counts()

Male      143275
Female     20839
Name: gender, dtype: int64

## Cleaning Commitment Unit

In [48]:
# Data Exploration
data.commitment_unit.value_counts()

Year(s)         116649
Months           39289
Days              5557
Term              1206
Natural Life       112
Dollars             56
Hours               11
Weeks               11
Pounds               2
Kilos                1
Ounces               1
Name: commitment_unit, dtype: int64

In [49]:
# Collapse the individual weight units into a single 'Weight' unit
data['commitment_unit'] = data['commitment_unit'].replace(
    to_replace=[
        'Pounds',
        'Kilos',
        'Ounces',
    ],
    value='Weight',
)

In [50]:
# Clean commitment_term and convert it to float.
# 1. The spelled-out value 'two' is mapped to the number 2 (the only word handled explicitly).
# 2. Lower-case letters, commas and backticks are stripped from the remaining strings.
#    The patterns are raw strings so the backslashes are not parsed as (invalid)
#    Python string escape sequences.
# 3. The column is cast to float; astype raises if any value is still not numeric.
data.commitment_term = data.commitment_term.replace(to_replace='two',value=2)
data.commitment_term = data.commitment_term.replace(regex=[r'[a-z]*', r'\,', r'\`'], value='')
data.commitment_term = data.commitment_term.astype('float')

In [51]:
# Review commitment unit values
data.commitment_unit.value_counts()

Year(s)         116649
Months           39289
Days              5557
Term              1206
Natural Life       112
Dollars             56
Hours               11
Weeks               11
Weight               4
Name: commitment_unit, dtype: int64

In [52]:
# Cap every term above 129 years at 130, the value this notebook uses to mark natural life
exceeds_cap = (data['commitment_unit'] == 'Year(s)') & (data['commitment_term'] > 129)
data.loc[exceeds_cap, 'commitment_term'] = 130

In [53]:
# Express Cook County Boot Camp sentences recorded in 'Term' units as months
boot_camp_term = (data['sentence_type'] == 'Cook County Boot Camp') & (data['commitment_unit'] == 'Term')

# A term of 1 becomes 12 (months)
data.loc[boot_camp_term & (data['commitment_term'] == 1), 'commitment_term'] = 12

# Rows whose term is now 12 or 18 are relabelled as Months; other 'Term' rows stay as they are
data.loc[boot_camp_term & data['commitment_term'].isin([12, 18]), 'commitment_unit'] = 'Months'

In [54]:
# Record death sentences as 130 years
is_death = data['sentence_type'] == 'Death'
data.loc[is_death, 'commitment_unit'] = 'Year(s)'
data.loc[is_death, 'commitment_term'] = 130

In [55]:
# Create column with all values in months
def month_convert(row):
    """Return the commitment of one row expressed in months, rounded to 2 decimals.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'commitment_unit'; 'commitment_term' is read only for the
        units that are converted.

    Returns
    -------
    float
        'Months' -> term, 'Year(s)' -> term * 12, 'Weeks' -> term / 4,
        'Days' -> term / 30, 'Natural Life' -> 1560 (130 years),
        any other unit -> 0. A missing (NaN) term yields NaN.
    """
    unit = row['commitment_unit']
    if unit == 'Months':
        return round(float(row['commitment_term']), 2)
    if unit == 'Year(s)':
        # float (not int) so fractional years are kept and a NaN term does not raise
        return round(float(row['commitment_term']) * 12.0, 2)
    if unit == 'Weeks':
        # approximation: 4 weeks per month
        return round(float(row['commitment_term']) / 4, 2)
    if unit == 'Days':
        # approximation: 30 days per month
        return round(float(row['commitment_term']) / 30, 2)
    if unit == 'Natural Life':
        return 1560.
    # units that are not converted to a duration (and missing units) map to 0
    return 0.

# Apply the converter row by row to build the 'month' column
data['month'] = data.apply(month_convert, axis=1)

In [56]:
# Create column with all values in years
def year_convert(row):
    """Return the commitment of one row expressed in years, rounded to 2 decimals.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'commitment_unit'; 'commitment_term' is read only for the
        units that are converted.

    Returns
    -------
    float
        'Year(s)' -> term, 'Months' -> term / 12, 'Weeks' -> term / 52,
        'Days' -> term / 365, 'Natural Life' -> 130,
        any other unit -> 0. A missing (NaN) term yields NaN.
    """
    unit = row['commitment_unit']
    if unit == 'Year(s)':
        return round(float(row['commitment_term']), 2)
    if unit == 'Months':
        # float (not int) so fractional months are kept and a NaN term does not raise
        return round(float(row['commitment_term']) / 12.0, 2)
    if unit == 'Weeks':
        return round(float(row['commitment_term']) / 52, 2)
    if unit == 'Days':
        return round(float(row['commitment_term']) / 365, 2)
    if unit == 'Natural Life':
        return 130.
    # units that are not converted to a duration (and missing units) map to 0
    return 0.

# Apply the converter row by row to build the 'year' column
data['year'] = data.apply(year_convert, axis=1)

## Create bins for age

In [57]:
# Remove 'PROMIS Conversion' data: cells matching the regex 'PROMIS*' are set to NaN
# (the pattern is applied to every column, not only offense_category), then every row
# containing a NaN is dropped.
# Note: dropna() with no arguments also removes rows that were already missing a value
# in any column, not just the PROMIS rows.
data = data.replace(regex=['PROMIS*'], value=np.nan).dropna()

In [58]:
# Change age dtype from string to integer 
data.age_at_incident = data.age_at_incident.astype(int)

In [59]:
# Bins to group age
# pd.cut treats each edge as the inclusive upper bound of its bin (right=True), so every
# edge below is the last age that belongs to the bin:
#   0-17 -> "<18", 18-24, 25-29, 30-39 -> "30s", 40-49 -> "40s", 50-59 -> "50s", 60-137 -> "60+"
# (an upper edge of 18 would label 18-year-olds "<18" and leave them out of "18-24").
bins_ranges = [0,17,24,29,39,49,59,137]
bins_names = ["<18", '18-24', '25-29', '30s', '40s', '50s', '60+']

# include_lowest=True keeps an age of exactly 0 inside the first bin instead of yielding NaN
data['age_bins'] = pd.cut(data.age_at_incident,bins_ranges,labels=bins_names,include_lowest=True)

In [60]:
# Remove fully duplicated rows and renumber the index from 0
data = data.drop_duplicates().reset_index(drop=True)

In [61]:
data

Unnamed: 0,case_id,case_participant_id,primary_charge,disposition_charged_offense_title,charge_count,disposition_charged_class,charge_disposition,court_name,court_facility,sentence_date,...,age_at_incident,race,gender,incident_begin_date,arrest_date,offense_category,date_year,month,year,age_bins
0,388421850391,126135811747,True,ATTEMPT FIRST DEGREE MURDER,1,X,Finding Guilty,District 6 - Markham,Markham Courthouse,6/6/2011 12:00:00 AM,...,23,Black,Male,5/5/2007 12:00:00 AM,5/5/2007 2:07:00 AM,Battery,2011,312.0,26.0,18-24
1,388507957919,131947840553,True,UNLWFL USE FIREARMS/FELON,1,3,Plea Of Guilty,District 6 - Markham,Markham Courthouse,1/12/2012 12:00:00 AM,...,25,Black,Male,8/8/2008 12:00:00 AM,8/15/2008 11:15:00 AM,Narcotics,2012,60.0,5.0,25-29
2,388614373825,132056841488,True,AGG UNLAWFUL USE OF WEAPON/VEH,1,4,Plea Of Guilty,District 6 - Markham,Markham Courthouse,3/9/2011 12:00:00 AM,...,21,Black,Male,9/17/2008 12:00:00 AM,9/17/2008 8:17:00 PM,Firearms and Explosives,2011,24.0,2.0,18-24
3,389093922351,132098416129,True,BURGLARY,1,2,Plea Of Guilty,District 6 - Markham,Markham Courthouse,3/2/2012 12:00:00 AM,...,49,Black,Male,8/19/2008 12:00:00 AM,8/19/2008 7:49:00 PM,Robbery/Burglary/Theft,2012,72.0,6.0,40s
4,390800367756,127089025222,True,PRED CRIM SEX ASLT (EFF 5-29-1996),1,X,Finding Guilty,District 6 - Markham,Markham Courthouse,6/24/2013 12:00:00 AM,...,34,Black,Male,6/25/2006 12:00:00 AM,12/22/2006 12:00:00 AM,Sex Offenses,2013,300.0,25.0,30s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141571,814823348343,557047778586,True,AGGRAVATED BATTERY,1,3,Plea Of Guilty,District 1 - Chicago,26TH Street,6/17/2020 12:00:00 AM,...,27,Hispanic/Latino,Male,3/9/2020 12:00:00 AM,3/19/2020 11:45:00 AM,Battery,2020,24.0,2.0,25-29
141572,814953592748,557210308344,True,BURGLARY,1,1,Plea Of Guilty,District 1 - Chicago,26TH Street,6/29/2020 12:00:00 AM,...,57,Black,Male,3/31/2020 12:00:00 AM,3/31/2020 8:55:00 PM,Robbery/Burglary/Theft,2020,72.0,6.0,50s
141573,814953863526,557210661670,True,BURGLARY,1,2,Plea Of Guilty,District 1 - Chicago,26TH Street,6/29/2020 12:00:00 AM,...,57,Black,Male,3/7/2020 12:00:00 AM,3/31/2020 8:55:00 PM,Robbery/Burglary/Theft,2020,72.0,6.0,50s
141574,814954946640,557212251635,True,BURGLARY,1,2,Plea Of Guilty,District 1 - Chicago,26TH Street,6/29/2020 12:00:00 AM,...,57,Black,Male,3/29/2020 12:00:00 AM,3/31/2020 12:00:00 AM,Robbery/Burglary/Theft,2020,72.0,6.0,50s


## Cleaning of Charge Disposition

In [62]:
data.charge_disposition.value_counts()

Plea Of Guilty                      133770
Finding Guilty                        6238
Verdict Guilty                         905
Case Dismissed                         397
Nolle Prosecution                      110
FNG Reason Insanity                     24
Finding Not Not Guilty                  22
BFW                                     21
Charge Vacated                          18
Plea of Guilty - Amended Charge         16
FNG                                     15
Plea of Guilty But Mentally Ill         15
Finding Guilty - Lesser Included         7
Death Suggested-Cause Abated             7
Finding Guilty But Mentally Ill          5
SOLW                                     1
WOWI                                     1
FNPC                                     1
Superseded by Indictment                 1
Sexually Dangerous Person                1
Plea of Guilty - Lesser Included         1
Name: charge_disposition, dtype: int64

In [63]:
# Drop the rows whose charge disposition is one of the following values
excluded_dispositions = [
    'WOWI',
    'Superseded by Indictment',
    'Death Suggested-Cause Abated',
    'Sexually Dangerous Person',
]
data = data[~data['charge_disposition'].isin(excluded_dispositions)]

In [64]:
# Merge the dispositions without a guilty finding into 'No Guilty Finding'
no_guilty_labels = [
    'Nolle Prosecution',
    'Case Dismissed',
    'FNG',
    'FNG Reason Insanity',
    'FNPC',
    'SOLW',
    'Charge Vacated',
]
data['charge_disposition'] = data['charge_disposition'].replace(no_guilty_labels, 'No Guilty Finding')

In [65]:
# Merge the 'Finding Guilty' variants into 'Finding Guilty'
finding_guilty_labels = [
    'Finding Guilty - Lesser Included',
    'Finding Guilty But Mentally Ill',
]
data['charge_disposition'] = data['charge_disposition'].replace(finding_guilty_labels, 'Finding Guilty')

In [66]:
# Merge the 'Plea of Guilty' variants into 'Plea Of Guilty'
plea_guilty_labels = [
    'Plea of Guilty - Amended Charge',
    'Plea of Guilty But Mentally Ill',
    'Plea of Guilty - Lesser Included',
]
data['charge_disposition'] = data['charge_disposition'].replace(plea_guilty_labels, 'Plea Of Guilty')

In [67]:
data.charge_disposition.value_counts()

Plea Of Guilty            133802
Finding Guilty              6250
Verdict Guilty               905
No Guilty Finding            566
Finding Not Not Guilty        22
BFW                           21
Name: charge_disposition, dtype: int64

# Analyze the new dataframe to create the database diagram

In [68]:
data.columns

Index(['case_id', 'case_participant_id', 'primary_charge',
       'disposition_charged_offense_title', 'charge_count',
       'disposition_charged_class', 'charge_disposition', 'court_name',
       'court_facility', 'sentence_date', 'sentence_type', 'current_sentence',
       'commitment_term', 'commitment_unit', 'length_of_case_in_days',
       'age_at_incident', 'race', 'gender', 'incident_begin_date',
       'arrest_date', 'offense_category', 'date_year', 'month', 'year',
       'age_bins'],
      dtype='object')

![DBD-from-quickDBD.png](../images/data_model.png)

# Create dataframe tables that will populate the database

In [69]:
# Build the participants table: the distinct combinations of the participant columns
participant_columns = ['case_participant_id', 'age_at_incident', 'gender', 'race', 'age_bins']
participants = data.loc[:, participant_columns].drop_duplicates()

In [70]:
# Preview the distinct (court_facility, court_name) pairs that the next cell labels with
# court ids. (`courts` itself is only defined in that cell, so it cannot be displayed here.)
data[['court_facility', 'court_name']].drop_duplicates().reset_index(drop=True)

In [71]:
# Create courts df table
# One row per distinct (court_facility, court_name) pair, in order of first appearance in `data`.
courts = data[['court_facility', 'court_name']].drop_duplicates().reset_index(drop=True)
# The ids are assigned purely by position: the n-th id goes to the n-th distinct pair, and the
# assignment raises if the number of pairs is not exactly 13.
# NOTE(review): this depends on the row order of `data`. The outputs rendered in this notebook
# appear to show a 'District 6 - Markham' row receiving court_id '1-26' — verify the mapping
# against the distinct pairs before relying on court_id.
courts['court_id']=['1-26','6','2','5','4','3','1-4','1-DV','1-1','1-3','1-2','1-5','1-RJCC']
# Put the id column first
courts = courts[['court_id', 'court_facility', 'court_name']]

In [72]:
# Build the offenses table: one row per distinct category, with a 1-based id as first column
offenses = data[['offense_category']].drop_duplicates().reset_index(drop=True)
offenses.insert(0, 'offense_id', offenses.index + 1)

In [73]:
# Build the sentences table: one row per distinct sentence description,
# with a 1-based id as first column
sentences = (
    data[['sentence_type', 'commitment_term', 'commitment_unit', 'month', 'year']]
    .drop_duplicates()
    .reset_index(drop=True)
)
sentences.insert(0, 'sentence_id', sentences.index + 1)

In [74]:
# Build the results table: attach the sentence, offense and court ids to every row of
# `data`, then keep only the id columns and the per-case fields
results = (
    data
    .merge(sentences, on=['sentence_type', 'commitment_term', 'commitment_unit', 'month', 'year'])
    .merge(offenses, on='offense_category')
    .merge(courts, on=['court_facility', 'court_name'])
)
results = results[[
    'case_participant_id',
    'court_id',
    'offense_id',
    'sentence_id',
    'case_id',
    'charge_disposition',
    'length_of_case_in_days',
    'disposition_charged_offense_title',
    'charge_count',
    'disposition_charged_class',
    'sentence_date',
    'incident_begin_date',
    'arrest_date',
]]

# Create database and create the tables defined before

In [75]:
# Create connection to database
# Connects to the SQLite file "sentencing.db" and creates the cursor used for the
# DROP/CREATE statements in the following cells.
# NOTE(review): no conn.close() appears in the visible notebook — confirm the connection
# is closed once the data has been written.
conn = sqlite3.connect("sentencing.db")
c = conn.cursor()

In [76]:
# Reset the database: switch foreign-key checks off, then drop each table if it exists
reset_statements = [
    """PRAGMA foreign_keys = OFF;""",
    """DROP TABLE IF EXISTS results""",
    """DROP TABLE IF EXISTS sentences""",
    """DROP TABLE IF EXISTS offenses""",
    """DROP TABLE IF EXISTS courts""",
    """DROP TABLE IF EXISTS participants""",
]
for statement in reset_statements:
    c.execute(statement)

In [77]:
# DDL for the participants table: keyed by case_participant_id, with the same columns
# as the `participants` DataFrame.
table_participants = """
CREATE TABLE "participants" (
    "case_participant_id" bigint,
    "age_at_incident" integer,
    "gender" varchar,
    "race" varchar,
    "age_bins" varchar,
CONSTRAINT "pk_participants" PRIMARY KEY ("case_participant_id")
);
"""
c.execute(table_participants)

<sqlite3.Cursor at 0x12ca4cec490>

In [78]:
# DDL for the courts table: keyed by the text court_id, with the same columns as the
# `courts` DataFrame.
table_courts = """
CREATE TABLE "courts" (
    "court_id" varchar,
    "court_facility" varchar,
    "court_name" varchar,
CONSTRAINT "pk_courts" PRIMARY KEY ("court_id")
);
"""
c.execute(table_courts)

<sqlite3.Cursor at 0x12ca4cec490>

In [79]:
# DDL for the offenses table: keyed by the integer offense_id, with the same columns as
# the `offenses` DataFrame.
table_offenses = """
CREATE TABLE "offenses" (
 "offense_id" integer,
 "offense_category" varchar,
 CONSTRAINT "pk_offenses" PRIMARY KEY (
 "offense_id"
 )
);
"""
c.execute(table_offenses)

<sqlite3.Cursor at 0x12ca4cec490>

In [80]:
# DDL for the sentences table: keyed by the integer sentence_id, with the same columns as
# the `sentences` DataFrame (original term and unit plus the month and year conversions).
table_sentences = """
CREATE TABLE "sentences" (
    "sentence_id" integer,
    "sentence_type" varchar,
    "commitment_term" float,
    "commitment_unit" varchar,
    "month" float,
    "year" float,
 CONSTRAINT "pk_sentences" PRIMARY KEY ("sentence_id")
);
"""
c.execute(table_sentences)

<sqlite3.Cursor at 0x12ca4cec490>

In [81]:
# DDL for the results table: one row per row of the `results` DataFrame, referencing the
# participants, courts, offenses and sentences tables by id. No primary key is declared.
# NOTE(review): SQLite enforces FOREIGN KEY clauses only while PRAGMA foreign_keys is ON;
# the cleanup cell sets it OFF on this connection, so these references are presumably not
# enforced — confirm whether enforcement is wanted.
# NOTE(review): the three date columns are filled from the DataFrame as-is; the displayed
# query output shows strings such as '6/6/2011 12:00:00 AM' — confirm that format is intended.
table_results = """
CREATE TABLE "results" (
    "case_participant_id" bigint,
    "court_id" varchar,
    "offense_id" integer,
    "sentence_id" integer,
    "case_id" bigint,
    "charge_disposition" varchar,
    "length_of_case_in_days" bigint,
    "disposition_charged_offense_title" varchar,
    "charge_count" integer,
    "disposition_charged_class" varchar,
    "sentence_date" date,
    "incident_begin_date" date,
    "arrest_date" date,
    FOREIGN KEY(case_participant_id) REFERENCES participants (case_participant_id),
    FOREIGN KEY(court_id) REFERENCES courts (court_id),
    FOREIGN KEY(offense_id) REFERENCES offenses (offense_id),
    FOREIGN KEY(sentence_id) REFERENCES sentences (sentence_id)
);
"""
c.execute(table_results)

<sqlite3.Cursor at 0x12ca4cec490>

# Add data to database

In [82]:
# Append each DataFrame to the table created for it; the referenced tables are loaded
# before `results`
to_sql_options = dict(con=conn, if_exists='append', index=False)
participants.to_sql(name='participants', **to_sql_options)
courts.to_sql(name='courts', **to_sql_options)
offenses.to_sql(name='offenses', **to_sql_options)
sentences.to_sql(name='sentences', **to_sql_options)
results.to_sql(name='results', **to_sql_options)

In [83]:
pd.read_sql_query('Select * from results', con=conn)

Unnamed: 0,case_participant_id,court_id,offense_id,sentence_id,case_id,charge_disposition,length_of_case_in_days,disposition_charged_offense_title,charge_count,disposition_charged_class,sentence_date,incident_begin_date,arrest_date
0,126135811747,1-26,1,1,388421850391,Finding Guilty,1455,ATTEMPT FIRST DEGREE MURDER,1,X,6/6/2011 12:00:00 AM,5/5/2007 12:00:00 AM,5/5/2007 2:07:00 AM
1,130458985529,1-26,1,1,423065508204,Plea Of Guilty,1430,ARMED ROBBERY,1,X,8/9/2012 12:00:00 AM,7/20/2008 12:00:00 AM,7/28/2008 7:13:00 PM
2,130479360636,1-26,1,1,423065778982,Plea Of Guilty,1430,ARMED ROBBERY,1,X,8/9/2012 12:00:00 AM,7/26/2008 12:00:00 AM,7/29/2008 7:13:00 PM
3,130459044416,1-26,1,1,423066049761,Plea Of Guilty,1430,ARMED ROBBERY,1,X,8/9/2012 12:00:00 AM,7/25/2008 12:00:00 AM,7/28/2008 7:13:00 PM
4,464682176135,1-26,1,2,727573677532,Finding Guilty,913,AGGRAVATED DOMESTIC BATTERY,1,2,4/20/2015 12:00:00 AM,8/25/2012 12:00:00 AM,9/4/2012 12:16:00 PM
...,...,...,...,...,...,...,...,...,...,...,...,...,...
141561,540718248938,1-2,2,6,799332929067,Plea Of Guilty,263,POSSESSION OF A CONTROLLED SUBSTANCE,1,4,10/21/2019 12:00:00 AM,10/31/2018 12:00:00 AM,10/31/2018 10:06:00 AM
141562,465201329158,1-2,2,25,728101424612,Plea Of Guilty,102,POSSESSION OF LOOK-ALIKE SUBSTANCE WITH INTENT...,1,3,1/28/2013 12:00:00 AM,9/4/2012 12:00:00 AM,9/14/2012 9:02:00 AM
141563,467863696143,1-2,2,59,730819227298,Plea Of Guilty,0,POSSESSION OF A CONTROLLED SUBSTANCE,1,4,3/5/2013 12:00:00 AM,11/29/2012 12:00:00 AM,11/29/2012 6:25:00 PM
141564,510605724731,1-5,2,457,771416760319,Plea Of Guilty,19,MFG/DEL CANNABIS/2.5-10 GRAMS,1,A,5/3/2016 12:00:00 AM,3/13/2016 12:00:00 AM,3/13/2016 8:45:00 PM


In [84]:
data.to_json('data.json',orient='records')