# Chicago Crimes
This examples shows an exploratory data analysis (EDA)  of crimes in Chicago. 

Original example can be found [here](https://medium.com/@ahsanzafar222/chicago-crime-data-cleaning-and-eda-a744c687a291) and [here](https://www.kaggle.com/fahd09/eda-of-crime-in-chicago-2005-2016).


In [1]:
import pandas as pd
import time
import bodo

## Load Crimes Data in Chicago

In [3]:
@bodo.jit(cache=True)
def load_chicago_crimes():
    t1 = time.time()
    crimes = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2012_to_2017.csv')
    crimes = crimes.sort_values(by="ID")
    print("Reading time: ", (time.time() - t1), " (s)")    
    return crimes

crimes1 = load_chicago_crimes()
print(crimes1.head())

Reading time:  24.456013999999982  (s)
         Unnamed: 0     ID Case Number                    Date  \
1267592     4105311  20224    HV101396  01/02/2012 02:22:00 AM   
1267593     4105388  20225    HV102221  01/02/2012 05:58:00 PM   
1267594     4105463  20226    HV102145  01/02/2012 05:53:00 PM   
1267595     4105549  20227    HV101433  01/02/2012 05:15:00 AM   
1267596     4105635  20228    HV102986  01/03/2012 12:07:00 PM   

                             Block  IUCR Primary Type          Description  \
1267592       030XX W LAWRENCE AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1267593            024XX E 78TH ST  0110     HOMICIDE  FIRST DEGREE MURDER   
1267594        066XX S WOLCOTT AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1267595  107XX S COTTAGE GROVE AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1267596         010XX N PULASKI RD  0110     HOMICIDE  FIRST DEGREE MURDER   

        Location Description Arrest  ...  Ward  Community Area  FBI Code  \
1267592            

## Preprocessing and Cleaning
 1. Drop duplicated cases, filter unused columns, and add day of week and date of the crime.
 2. Keep only the most frequent crime type categories.


In [7]:
@bodo.jit(cache=True)
def data_cleanup(crimes):
    t1 = time.time()    
    crimes = crimes.drop_duplicates()    
    crimes = crimes.drop(['Unnamed: 0', 'Case Number', 'IUCR','Updated On','Year', 'FBI Code', 'Beat','Ward','Community Area', 'Location'], axis=1)
    crimes = crimes.assign(Date=pd.to_datetime(crimes.Date, format='%m/%d/%Y %I:%M:%S %p'))
    crimes["dow"] = crimes["Date"].dt.dayofweek
    crimes["date only"] = crimes["Date"].dt.floor('D')
    crimes = crimes.sort_values(by="ID")    
    print("Data cleanup time: ", (time.time() - t1), " (s)")
    return crimes

crimes = data_cleanup(crimes1)
print(crimes.head())

Data cleanup time:  1.39205800000002  (s)
            ID                Date                      Block Primary Type  \
1267592  20224 2012-01-02 02:22:00       030XX W LAWRENCE AVE     HOMICIDE   
1267593  20225 2012-01-02 17:58:00            024XX E 78TH ST     HOMICIDE   
1267594  20226 2012-01-02 17:53:00        066XX S WOLCOTT AVE     HOMICIDE   
1267595  20227 2012-01-02 05:15:00  107XX S COTTAGE GROVE AVE     HOMICIDE   
1267596  20228 2012-01-03 12:07:00         010XX N PULASKI RD     HOMICIDE   

                 Description Location Description Arrest Domestic  District  \
1267592  FIRST DEGREE MURDER                 AUTO  False    False      17.0   
1267593  FIRST DEGREE MURDER               STREET  False    False       4.0   
1267594  FIRST DEGREE MURDER                HOUSE  False    False       7.0   
1267595  FIRST DEGREE MURDER               STREET   True    False       5.0   
1267596  FIRST DEGREE MURDER               STREET  False    False      11.0   

         X Coo

In [8]:
@bodo.jit(cache=True)
def get_top_crime_types(crimes):
    t1 = time.time()
    top_crime_types = crimes['Primary Type'].value_counts().index[0:10]
    print("Getting top crimes Time: ", (time.time() - t1), " (s)")
    return top_crime_types

top_crime_types = get_top_crime_types(crimes)
print(top_crime_types)

Getting top crimes Time:  0.2646310000000085  (s)
Index(['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT',
       'OTHER OFFENSE', 'BURGLARY', 'DECEPTIVE PRACTICE',
       'MOTOR VEHICLE THEFT', 'ROBBERY'],
      dtype='string', name='Primary Type')


In [9]:
@bodo.jit(cache=True)
def filter_crimes(crimes, top_crime_types):
    t1 = time.time()
    top_crimes = crimes[crimes['Primary Type'].isin(top_crime_types)]
    print("Filtering crimes Time: ", (time.time() - t1), " (s)")
    return top_crimes

crimes = filter_crimes(crimes, top_crime_types)
print(crimes.head())

Filtering crimes Time:  0.1708210000000463  (s)
            ID                Date                                Block  \
77270  8421394 2012-01-01 00:15:00                  004XX E ILLINOIS ST   
77272  8421398 2012-01-01 00:23:00                   033XX N HALSTED ST   
77273  8421402 2012-01-01 00:30:00  092XX S DR MARTIN LUTHER KING JR DR   
77274  8421404 2012-01-01 00:23:00                     002XX W 118TH ST   
77276  8421408 2012-01-01 00:40:00                      008XX E 79TH ST   

          Primary Type                     Description  \
77270          BATTERY                          SIMPLE   
77272          ASSAULT  AGGRAVATED:KNIFE/CUTTING INSTR   
77273          BATTERY   AGGRAVATED: OTHER DANG WEAPON   
77274  CRIMINAL DAMAGE     TO CITY OF CHICAGO PROPERTY   
77276        NARCOTICS    POSS: CANNABIS 30GMS OR LESS   

                 Location Description Arrest Domestic  District  X Coordinate  \
77270                           OTHER  False    False      18.0     117

## Crime Analysis

### Find Pattern of each crime over the years



In [10]:
@bodo.jit(cache=True)
def get_crimes_count_date(crimes):
    t1 = time.time()
    crimes_count_date = crimes.pivot_table(index='date only', columns='Primary Type', values='ID', aggfunc="count")
    print("Computing Crime Pattern Time: ", (time.time() - t1), " (s)")
    return crimes_count_date

crimes_count_date = get_crimes_count_date(crimes)

Computing Crime Pattern Time:  0.4228880000000572  (s)


  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)


In [11]:
@bodo.jit
def get_crimes_type_date(crimes_count_date):
    t1 = time.time()
    crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)
    result = crimes_count_date.fillna(0).rolling(365).sum()
    result = result.sort_index(ascending=False)
    print("Computing Crime Pattern Time: ", (time.time() - t1), " (s)")
    return result

get_crimes_type_date = get_crimes_type_date(crimes_count_date)
print(get_crimes_type_date.head())

Computing Crime Pattern Time:  0.4815959999999677  (s)
            ROBBERY    THEFT  OTHER OFFENSE  ASSAULT  BATTERY  NARCOTICS  \
2017-01-18  11125.0  63588.0        17553.0  17849.0  50975.0    25565.0   
2017-01-17  11205.0  63111.0        17120.0  17652.0  50433.0    23925.0   
2017-01-16  10919.0  63563.0        17215.0  18165.0  51560.0    25033.0   
2017-01-15  11208.0  63090.0        17115.0  17623.0  50457.0    23961.0   
2017-01-14  10915.0  63573.0        17213.0  18164.0  51606.0    25101.0   

            DECEPTIVE PRACTICE  CRIMINAL DAMAGE  MOTOR VEHICLE THEFT  BURGLARY  
2017-01-18             15237.0          30372.0              11988.0   15874.0  
2017-01-17             15439.0          29895.0              11640.0   15391.0  
2017-01-16             15125.0          30568.0              11528.0   15784.0  
2017-01-15             15431.0          29872.0              11660.0   15387.0  
2017-01-14             15136.0          30586.0              11524.0   15776.0  


  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gat

## A general view of crime records by time, type and location

### Determining the pattern on daily basis

In [12]:
@bodo.jit(cache=True)
def get_crimes_by_days(crimes):
    t1 = time.time()
    crimes_days = crimes.groupby('dow', as_index=False)['ID'].count().sort_values(by='dow')
    print("Group by days Time: ", (time.time() - t1), " (s)")
    return crimes_days
    
crimes_days = get_crimes_by_days(crimes)
print(crimes_days.head())

Group by days Time:  0.011236000000053536  (s)
   dow      ID
4    0  190485
1    1  189223
2    2  191247
3    3  189308
0    4  200886


### Determining the pattern on monthly basis

In [13]:
@bodo.jit(cache=True)
def get_crimes_by_months(crimes):
    t1 = time.time()
    crimes['month'] = crimes["Date"].dt.month
    crimes_months = crimes.groupby('month', as_index=False)['ID'].count().sort_values(by='month')
    print("Group by days Time: ", (time.time() - t1), " (s)")
    return crimes_months
    
crimes_months = get_crimes_by_months(crimes)
print(crimes_months.head())

Group by days Time:  0.013942999999926542  (s)
    month      ID
6       1  113675
7       2   90123
8       3  109104
3       4  108457
10      5  119081


### Determining the pattern by crime type

In [14]:
@bodo.jit(cache=True)
def get_crimes_by_type(crimes):
    t1 = time.time()
    crimes_type = crimes.groupby('Primary Type', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    print("Group by days Time: ", (time.time() - t1), " (s)")
    return crimes_type
    
crimes_type = get_crimes_by_type(crimes)
print(crimes_type.head())

Group by days Time:  0.024769999999989523  (s)
      Primary Type      ID
1            THEFT  329460
4          BATTERY  263700
7  CRIMINAL DAMAGE  155455
5        NARCOTICS  135240
2          ASSAULT   91289


### Determining the pattern by location

In [15]:
@bodo.jit(cache=True)
def get_crimes_by_location(crimes):
    t1 = time.time()
    crimes_location = crimes.groupby('Location Description', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    print("Group by days Time: ", (time.time() - t1), " (s)")
    return crimes_location
    
crimes_location = get_crimes_by_location(crimes)
print(crimes_location.head())

Group by days Time:  0.4604380000000674  (s)
   Location Description      ID
31               STREET  306860
39            RESIDENCE  216611
32            APARTMENT  173373
74             SIDEWALK  147414
95                OTHER   51854
