In [47]:
# importing libraries
import os
import numpy as np
import pandas as pd
from sodapy import Socrata
import matplotlib.pyplot as plt

In [96]:
import datetime

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
# creating API connection environment variables
socrata_domain = 'data.cityofchicago.org'
socrata_dataset_identifier = '6zsd-86xi'
# If you choose to use a token, run the following command on the terminal (or add it to your .bashrc)
# to get an API visit 
# $ export SODAPY_APPTOKEN=<token>
socrata_token = os.environ.get("SODAPY_APPTOKEN")

In [50]:
# Connecting to the database API
client = Socrata(socrata_domain, socrata_token)
print("Domain: {domain:}\nSession: {session:}\nURI Prefix: {uri_prefix:}".format(**client.__dict__))

Domain: data.cityofchicago.org
Session: <requests.sessions.Session object at 0x118cc55f8>
URI Prefix: https://


In [51]:
# Fetching data pages 50,000 records at a time ordered by date
results1 = client.get(socrata_dataset_identifier, limit=50000, offset = 0, order='date DESC')
df1 = pd.DataFrame.from_dict(results1)
df1.head(3)

Unnamed: 0,arrest,beat,block,case_number,community_area,date,description,district,domestic,fbi_code,...,latitude,location,location_description,longitude,primary_type,updated_on,ward,x_coordinate,y_coordinate,year
0,False,2424,069XX N WOLCOTT AVE,JB552555,1,2018-12-13T23:54:00.000,VIOLATE ORDER OF PROTECTION,24,True,26,...,42.008296907,"{'type': 'Point', 'coordinates': [-87.67803761...",OTHER,-87.678037618,OTHER OFFENSE,2018-12-20T16:12:36.000,49,1162345,1946295,2018
1,True,225,0000X E GARFIELD BLVD,JB552587,40,2018-12-13T23:52:00.000,OBSTRUCTING IDENTIFICATION,2,False,24,...,41.794670043,"{'type': 'Point', 'coordinates': [-87.62368457...",STREET,-87.623684577,INTERFERENCE WITH PUBLIC OFFICER,2018-12-20T16:12:36.000,3,1177761,1868567,2018
2,True,423,086XX S MARQUETTE AVE,JB552450,46,2018-12-13T23:51:00.000,VIOLATE ORDER OF PROTECTION,4,False,26,...,41.738299859,"{'type': 'Point', 'coordinates': [-87.55873836...",RESIDENTIAL YARD (FRONT/BACK),-87.558738361,OTHER OFFENSE,2018-12-20T16:12:36.000,7,1195656,1848179,2018


In [52]:
# page 2
results2 = client.get(socrata_dataset_identifier, limit = 50000, offset = 50000, order='date DESC')
df2 = pd.DataFrame.from_dict(results2)
df2.shape

(50000, 22)

In [53]:
# page 3
results3 = client.get(socrata_dataset_identifier, limit = 50000, offset = 100000, order='date DESC')
df3 = pd.DataFrame.from_dict(results3)
df3.shape

(50000, 22)

In [54]:
# getting more data since min max date are in 2018. Getting at least one year of data.
# page 4
results4 = client.get(socrata_dataset_identifier, limit = 50000, offset = 150000, order='date DESC')
df4 = pd.DataFrame.from_dict(results4)
df4.shape

(50000, 22)

In [55]:
# getting more data since min max date are in 2018. Getting at least one year of data.
# page 5
results5 = client.get(socrata_dataset_identifier, limit = 50000, offset = 200000, order='date DESC')
df5 = pd.DataFrame.from_dict(results5)
df5.shape

(50000, 22)

In [56]:
results6 = client.get(socrata_dataset_identifier, limit = 50000, offset = 250000, order='date DESC')
df6 = pd.DataFrame.from_dict(results6)
df6.shape

(50000, 22)

In [57]:
# Combining pages 
df = df1.append(df2, ignore_index=True)
df = df.append(df3, ignore_index=True)
df = df.append(df4, ignore_index=True)
df = df.append(df5, ignore_index=True)
df = df.append(df6, ignore_index=True)

In [58]:
df.shape

(300000, 22)

In [127]:
df.head(5)

Unnamed: 0,arrest,beat,block,case_number,community_area,date,description,district,domestic,fbi_code,...,primary_type,updated_on,ward,x_coordinate,y_coordinate,year,hour,minute,month,weekday
0,False,2424,069XX N WOLCOTT AVE,JB552555,1,2018-12-13 23:54:00,VIOLATE ORDER OF PROTECTION,24,True,26,...,OTHER OFFENSE,2018-12-20T16:12:36.000,49,1162345,1946295,2018,23,45,12,Thursday
1,True,225,0000X E GARFIELD BLVD,JB552587,40,2018-12-13 23:52:00,OBSTRUCTING IDENTIFICATION,2,False,24,...,INTERFERENCE WITH PUBLIC OFFICER,2018-12-20T16:12:36.000,3,1177761,1868567,2018,23,45,12,Thursday
2,True,423,086XX S MARQUETTE AVE,JB552450,46,2018-12-13 23:51:00,VIOLATE ORDER OF PROTECTION,4,False,26,...,OTHER OFFENSE,2018-12-20T16:12:36.000,7,1195656,1848179,2018,23,45,12,Thursday
3,False,2521,050XX W WOLFRAM ST,JB552435,19,2018-12-13 23:48:00,RECKLESS FIREARM DISCHARGE,25,False,15,...,WEAPONS VIOLATION,2018-12-20T16:12:36.000,31,1142191,1918478,2018,23,45,12,Thursday
4,True,1634,033XX N LAMON AVE,JB552485,15,2018-12-13 23:47:00,MANU/DEL:CANNABIS OVER 10 GMS,16,False,18,...,NARCOTICS,2018-12-20T16:12:36.000,31,1143109,1921908,2018,23,45,12,Thursday


In [59]:
# Total misiing values per column
df.isna().sum()

arrest                     0
beat                       0
block                      0
case_number                0
community_area             2
date                       0
description                0
district                   1
domestic                   0
fbi_code                   0
id                         0
iucr                       0
latitude                2429
location                2429
location_description     832
longitude               2429
primary_type               0
updated_on                 0
ward                       3
x_coordinate            2429
y_coordinate            2429
year                       0
dtype: int64

In [61]:
# removing missing values
df = df.dropna()

In [95]:
df.dtypes

arrest                            bool
beat                            object
block                           object
case_number                     object
community_area                  object
date                    datetime64[ns]
description                     object
district                        object
domestic                          bool
fbi_code                        object
id                              object
iucr                            object
latitude                        object
location                        object
location_description            object
longitude                       object
primary_type                    object
updated_on                      object
ward                            object
x_coordinate                    object
y_coordinate                    object
year                            object
hour                             int64
minute                           int64
month                            int64
dtype: object

In [63]:
df.max()

arrest                                                            True
beat                                                              2535
block                                               137XX S LEYDEN AVE
case_number                                                   XX424158
community_area                                                       9
date                                           2018-12-13T23:54:00.000
description             VIOLENT OFFENDER: FAIL TO REGISTER NEW ADDRESS
district                                                           031
domestic                                                          True
fbi_code                                                            26
id                                                               24346
iucr                                                              5132
latitude                                                  42.022671246
location_description                                              YMCA
longit

In [73]:
df.min()

arrest                                    False
beat                                       0111
block                          0000X E 100TH PL
case_number                           .JB299184
community_area                                1
date                        2017-10-24 14:30:00
description                      $500 AND UNDER
district                                    001
domestic                                  False
fbi_code                                    01A
id                                     11128328
iucr                                       0110
latitude                           41.644600433
location_description         ABANDONED BUILDING
longitude                         -87.524529378
primary_type                              ARSON
updated_on              2017-10-31T15:55:52.000
ward                                          1
x_coordinate                            1092706
y_coordinate                            1813910
year                                    

In [144]:
# Reading Illinois Uniform crime reporting codes
iucrc = pd.read_csv('Chicago_Police_Department_-_Illinois_Uniform_Crime_Reporting__IUCR__Codes.csv')

In [24]:
%matplotlib inline

In [94]:
# adding new columns for date before one-hot Encoding
df['month'] = df['date'].dt.month
df['minute'] = df['date'].dt.minute
df['hour'] = df['date'].dt.hour
df['weekday'] = pd.to_datetime(df['date'].dt.date).dt.weekday_name

In [137]:
# some more preprocessing
df['arrest'] = df['arrest']*1

In [160]:
dummies = pd.get_dummies(data=df, columns=['beat', 'community_area', 'district', 'ward', 'hour', 'minute',
                                           'weekday', 'month'])
dummies.head(5)

Unnamed: 0,arrest,block,case_number,date,description,domestic,fbi_code,id,iucr,latitude,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,0,069XX N WOLCOTT AVE,JB552555,2018-12-13 23:54:00,VIOLATE ORDER OF PROTECTION,True,26,11534876,4387,42.008296907,...,0,0,0,0,0,0,0,0,0,1
1,1,0000X E GARFIELD BLVD,JB552587,2018-12-13 23:52:00,OBSTRUCTING IDENTIFICATION,False,24,11534824,3731,41.794670043,...,0,0,0,0,0,0,0,0,0,1
2,1,086XX S MARQUETTE AVE,JB552450,2018-12-13 23:51:00,VIOLATE ORDER OF PROTECTION,False,26,11534731,4387,41.738299859,...,0,0,0,0,0,0,0,0,0,1
3,0,050XX W WOLFRAM ST,JB552435,2018-12-13 23:48:00,RECKLESS FIREARM DISCHARGE,False,15,11534688,1477,41.932364167,...,0,0,0,0,0,0,0,0,0,1
4,1,033XX N LAMON AVE,JB552485,2018-12-13 23:47:00,MANU/DEL:CANNABIS OVER 10 GMS,False,18,11534780,1822,41.941759312,...,0,0,0,0,0,0,0,0,0,1


In [153]:
y = df['iucr']

In [164]:
dummies = dummies.drop(columns=['block', 'case_number', 'date', 'description', 'domestic', 'fbi_code',
                                'id', 'iucr', 'location', 'location_description', 'primary_type',
                                'updated_on', 'year'])

In [186]:
dummies.dtypes

arrest                 int64
latitude             float64
longitude            float64
x_coordinate           int64
y_coordinate           int64
beat_0111              uint8
beat_0112              uint8
beat_0113              uint8
beat_0114              uint8
beat_0121              uint8
beat_0122              uint8
beat_0123              uint8
beat_0124              uint8
beat_0131              uint8
beat_0132              uint8
beat_0133              uint8
beat_0211              uint8
beat_0212              uint8
beat_0213              uint8
beat_0214              uint8
beat_0215              uint8
beat_0221              uint8
beat_0222              uint8
beat_0223              uint8
beat_0224              uint8
beat_0225              uint8
beat_0231              uint8
beat_0232              uint8
beat_0233              uint8
beat_0234              uint8
                      ...   
hour_17                uint8
hour_18                uint8
hour_19                uint8
hour_20       

In [174]:
dummies = dummies.apply(pd.to_numeric)

In [181]:
feature_list = list(dummies.columns)
features = np.array(dummies)
x_train, x_test, y_train, y_test = train_test_split(features, y, test_size = 0.25, random_state = 42)

In [183]:
print('Training Features Shape:', x_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', x_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (222659, 476)
Training Labels Shape: (222659,)
Testing Features Shape: (74220, 476)
Testing Labels Shape: (74220,)


In [185]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(x_train, y_train);

ValueError: could not convert string to float: '143A'