# Chicago Crimes
This example shows an exploratory data analysis (EDA) of crimes in Chicago.

The original examples can be found [here](https://medium.com/@ahsanzafar222/chicago-crime-data-cleaning-and-eda-a744c687a291) and [here](https://www.kaggle.com/fahd09/eda-of-crime-in-chicago-2005-2016).


In [2]:
import pandas as pd
import time
import bodo

In [3]:
@bodo.jit
def hello():
    """Print a greeting built from bodo.get_rank() and bodo.get_size()."""
    rank = bodo.get_rank()
    total_ranks = bodo.get_size()
    print("Hello World from rank", rank, " Total ranks =", total_ranks)

# Smoke test: invoke the jitted function once.
hello()

    conda install openjdk=11 -c conda-forge
and then reactivate your environment via
    conda deactivate && conda activate /Users/scottroutledge/miniforge3


Hello World from rank 0  Total ranks = 10
Hello World from rank 2  Total ranks = 10
Hello World from rank 8  Total ranks = 10
Hello World from rank 7  Total ranks = 10
Hello World from rank 6  Total ranks = 10
Hello World from rank 4  Total ranks = 10
Hello World from rank 5  Total ranks = 10
Hello World from rank 1  Total ranks = 10
Hello World from rank 9  Total ranks = 10
Hello World from rank 3  Total ranks = 10


## Load Crimes Data in Chicago 2005 - 2017

In [4]:
@bodo.jit(cache=True)
def load_chicago_crimes():
    """Read the three Chicago crimes CSV files from S3 into one DataFrame.

    The three frames are concatenated row-wise with their original row
    labels kept (``ignore_index=False``) and then sorted by the "ID" column.
    The elapsed wall-clock time is printed in milliseconds.

    Returns:
        The combined DataFrame, sorted by "ID".
    """
    start = time.time()
    # Paths are kept as inline string literals, one read per file.
    part_2005_2007 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2005_to_2007.csv')
    part_2008_2011 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2008_to_2011.csv')
    part_2012_2017 = pd.read_csv('s3://bodo-example-data/chicago-crimes/Chicago_Crimes_2012_to_2017.csv')
    combined = pd.concat(
        [part_2005_2007, part_2008_2011, part_2012_2017],
        ignore_index=False,
        axis=0,
    ).sort_values(by="ID")
    print("Reading time: ", ((time.time() - start) * 1000), " (ms)")
    return combined

# Load the raw data once; later cells read it through `crimes1`.
crimes1 = load_chicago_crimes()

print(crimes1.head())

Reading time:  92526.16999999998  (ms)
         Unnamed: 0    ID Case Number                    Date  \
1324003     4897380  3012    HL101040  01/01/2005 01:15:00 PM   
1324004     4898204  3013    HK826899  01/02/2005 09:45:00 PM   
1324005     4898986  3014    HL106602  01/04/2005 04:39:00 PM   
1324006     4899770  3015    HL107444  01/05/2005 04:07:00 AM   
1324007     4900593  3016    HL112637  01/08/2005 03:15:00 AM   

                         Block  IUCR Primary Type          Description  \
1324003  076XX S GREENWOOD AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1324004        029XX E 82ND ST  0110     HOMICIDE  FIRST DEGREE MURDER   
1324005  070XX S CONSTANCE AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1324006     095XX S COLFAX AVE  0110     HOMICIDE  FIRST DEGREE MURDER   
1324007      015XX N DAYTON ST  0110     HOMICIDE  FIRST DEGREE MURDER   

        Location Description Arrest  ...  Ward  Community Area  FBI Code  \
1324003           VACANT LOT   True  ...   8.0   

## Preprocessing and Cleaning
 1. Drop duplicate cases, filter out unused columns, and add the day of week and the date of each crime.
 2. Keep only the most frequent crime type categories.


In [5]:
@bodo.jit(cache=True)
def data_cleanup(crimes):
    """Deduplicate the crimes frame, drop unused columns and add date features.

    Steps, in order:
      1. Remove fully duplicated rows.
      2. Drop ten columns that the later analysis cells do not read.
      3. Parse "Date" with the ``%m/%d/%Y %I:%M:%S %p`` format.
      4. Add "dow" (``dt.dayofweek``) and "date only" (timestamp floored to
         the day).
      5. Sort by "ID".

    The elapsed wall-clock time is printed in milliseconds.

    Args:
        crimes: DataFrame as returned by ``load_chicago_crimes``.

    Returns:
        The cleaned DataFrame, sorted by "ID".
    """
    start = time.time()
    cleaned = crimes.drop_duplicates()
    # `cleaned` is already a fresh frame, so reassigning matches an in-place drop.
    cleaned = cleaned.drop(
        ['Unnamed: 0', 'Case Number', 'IUCR', 'Updated On', 'Year', 'FBI Code', 'Beat', 'Ward', 'Community Area', 'Location'],
        axis=1,
    )
    cleaned["Date"] = pd.to_datetime(cleaned["Date"], format='%m/%d/%Y %I:%M:%S %p')
    cleaned["dow"] = cleaned["Date"].dt.dayofweek
    cleaned["date only"] = cleaned["Date"].dt.floor('D')
    cleaned = cleaned.sort_values(by="ID")
    print("Data cleanup time: ", ((time.time() - start) * 1000), " (ms)")
    return cleaned

# Clean the raw frame; later cells work on `crimes`.
crimes = data_cleanup(crimes1)

print(crimes.head())

Data cleanup time:  3806.2260000001515  (ms)
           ID                Date                  Block Primary Type  \
1324003  3012 2005-01-01 13:15:00  076XX S GREENWOOD AVE     HOMICIDE   
1324004  3013 2005-01-02 21:45:00        029XX E 82ND ST     HOMICIDE   
1324005  3014 2005-01-04 16:39:00  070XX S CONSTANCE AVE     HOMICIDE   
1324006  3015 2005-01-05 04:07:00     095XX S COLFAX AVE     HOMICIDE   
1324007  3016 2005-01-08 03:15:00      015XX N DAYTON ST     HOMICIDE   

                 Description Location Description Arrest Domestic  District  \
1324003  FIRST DEGREE MURDER           VACANT LOT   True    False       6.0   
1324004  FIRST DEGREE MURDER               STREET   True    False       4.0   
1324005  FIRST DEGREE MURDER               STREET  False    False       3.0   
1324006  FIRST DEGREE MURDER                 AUTO  False    False       4.0   
1324007  FIRST DEGREE MURDER                 CLUB   True    False      18.0   

         X Coordinate  Y Coordinate   Lat

In [6]:
@bodo.jit(cache=True)
def get_top_crime_types(crimes):
    """Return the first ten labels of the "Primary Type" value counts.

    ``value_counts`` orders its result by descending count by default, so
    these are the ten most frequent crime types. The elapsed wall-clock time
    is printed in milliseconds.

    Args:
        crimes: DataFrame with a "Primary Type" column.

    Returns:
        An Index holding up to ten "Primary Type" values.
    """
    start = time.time()
    type_counts = crimes['Primary Type'].value_counts()
    most_frequent = type_counts.index[0:10]
    print("Getting top crimes Time: ", ((time.time() - start) * 1000), " (ms)")
    return most_frequent

# Used by the filtering cell below.
top_crime_types = get_top_crime_types(crimes)

print(top_crime_types)

Getting top crimes Time:  50.731000000041604  (ms)
Index(['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'OTHER OFFENSE',
       'BURGLARY', 'ASSAULT', 'MOTOR VEHICLE THEFT', 'DECEPTIVE PRACTICE',
       'ROBBERY'],
      dtype='string', name='Primary Type')


In [7]:
@bodo.jit(cache=True)
def filter_crimes(crimes, top_crime_types):
    """Keep only the rows whose "Primary Type" is in ``top_crime_types``.

    The elapsed wall-clock time is printed in milliseconds.

    Args:
        crimes: DataFrame with a "Primary Type" column.
        top_crime_types: Collection of crime-type labels to keep.

    Returns:
        The filtered DataFrame.
    """
    start = time.time()
    is_top_type = crimes['Primary Type'].isin(top_crime_types)
    kept = crimes[is_top_type]
    print("Filtering crimes Time: ", ((time.time() - start) * 1000), " (ms)")
    return kept

# From here on `crimes` holds only the most frequent crime types.
crimes = filter_crimes(crimes, top_crime_types)
print(crimes.head())

Filtering crimes Time:  773.866999999882  (ms)
              ID                Date                          Block  \
1325007  3730318 2005-01-01 00:04:00            031XX W HARRISON ST   
1325010  3730326 2005-01-01 00:05:00  012XX N LUIS MUNOZ MARIN DR W   
1325012  3730338 2005-01-01 01:13:00              019XX N DRAKE AVE   
1325013  3730341 2005-01-01 01:30:00               002XX N CANAL ST   
1325015  3730348 2005-01-01 00:00:00              006XX E GRAND AVE   

            Primary Type                  Description  \
1325007  CRIMINAL DAMAGE  TO CITY OF CHICAGO PROPERTY   
1325010  CRIMINAL DAMAGE  TO CITY OF CHICAGO PROPERTY   
1325012  CRIMINAL DAMAGE                   TO VEHICLE   
1325013            THEFT               POCKET-PICKING   
1325015            THEFT               POCKET-PICKING   

                    Location Description Arrest Domestic  District  \
1325007  POLICE FACILITY/VEH PARKING LOT  False    False      11.0   
1325010                           STREET  F

## Crime Analysis

### Find the pattern of each crime type over the years



In [8]:
@bodo.jit(cache=True)
def get_crimes_count_date(crimes):
    """Count crimes per calendar date and crime type.

    Builds a pivot table with one row per "date only" value, one column per
    "Primary Type" value, and the count of "ID" entries in each cell. The
    elapsed wall-clock time is printed in milliseconds.

    Args:
        crimes: DataFrame with "date only", "Primary Type" and "ID" columns.

    Returns:
        The pivoted DataFrame of counts.
    """
    start = time.time()
    counts_by_date = crimes.pivot_table(
        index='date only',
        columns='Primary Type',
        values='ID',
        aggfunc="count",
    )
    print("Computing Crime Pattern Time: ", ((time.time() - start) * 1000), " (ms)")
    return counts_by_date

# Input for the rolling-sum cell below.
crimes_count_date = get_crimes_count_date(crimes)

Computing Crime Pattern Time:  297.29999999995016  (ms)


  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)


In [9]:
@bodo.jit
def get_crimes_type_date(crimes_count_date):
    """Compute rolling crime totals per type, newest date first.

    Missing counts are replaced by 0, then each column is summed over a
    rolling window of 365 rows (an integer window counts rows, not calendar
    days). The result is sorted by index in descending order. The elapsed
    wall-clock time is printed in milliseconds.

    Args:
        crimes_count_date: Date-indexed DataFrame of per-type counts, as
            returned by ``get_crimes_count_date``. Its index is reassigned
            to a ``pd.DatetimeIndex`` inside this function.
            NOTE(review): whether that reassignment is visible to the caller
            under ``bodo.jit`` is not shown here — confirm before relying on it.

    Returns:
        DataFrame of rolling sums, sorted by index descending.
    """
    t1 = time.time()
    crimes_count_date.index = pd.DatetimeIndex(crimes_count_date.index)
    result = crimes_count_date.fillna(0).rolling(365).sum()
    result = result.sort_index(ascending=False)
    print("Computing Crime Pattern Time: ", ((time.time() - t1) * 1000), " (ms)")
    return result

# Bind the result to its own name. Reusing `get_crimes_type_date` here would
# replace the function with a DataFrame and break any re-run of this cell.
crimes_type_date = get_crimes_type_date(crimes_count_date)
print(crimes_type_date.head())

Computing Crime Pattern Time:  218.49900000006528  (ms)
            ASSAULT  DECEPTIVE PRACTICE  BURGLARY  OTHER OFFENSE  ROBBERY  \
2017-01-18  20674.0             14518.0   21618.0        21413.0  13580.0   
2017-01-17  20993.0             14076.0   21608.0        20826.0  13456.0   
2017-01-16  21002.0             14011.0   21571.0        20972.0  13176.0   
2017-01-15  20651.0             14545.0   21124.0        20762.0  13087.0   
2017-01-14  21029.0             14016.0   21593.0        21013.0  13185.0   

            NARCOTICS    THEFT  BATTERY  MOTOR VEHICLE THEFT  CRIMINAL DAMAGE  
2017-01-18    36592.0  74123.0  61146.0              15841.0          39269.0  
2017-01-17    36543.0  73490.0  60887.0              15281.0          38725.0  
2017-01-16    37105.0  73003.0  62734.0              15472.0          39620.0  
2017-01-15    35255.0  72906.0  60667.0              15208.0          37882.0  
2017-01-14    37265.0  73073.0  62834.0              15504.0          39729.0  


  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gatherv_impl_wrapper(data, allgather, warn_if_rep, root, comm_ptr)
  return gat

## A general view of crime records by time, type and location

### Determining the pattern on a daily basis

In [10]:
@bodo.jit(cache=True)
def get_crimes_by_days(crimes):
    """Count crimes per day of week.

    Groups by the "dow" column, counts the "ID" entries in each group and
    sorts the result by "dow". The elapsed wall-clock time is printed in
    milliseconds.

    Args:
        crimes: DataFrame with "dow" and "ID" columns.

    Returns:
        DataFrame with columns "dow" and "ID" (the count), sorted by "dow".
    """
    start = time.time()
    per_day = crimes.groupby('dow', as_index=False)['ID'].count()
    per_day = per_day.sort_values(by='dow')
    print("Group by days Time: ", ((time.time() - start) * 1000), " (ms)")
    return per_day

crimes_days = get_crimes_by_days(crimes)
print(crimes_days.head())

Group by days Time:  137.69699999988916  (ms)
   dow      ID
1    0  562811
3    1  568747
0    2  572240
6    3  566206
4    4  599764


### Determining the pattern on a monthly basis

In [11]:
@bodo.jit(cache=True)
def get_crimes_by_months(crimes):
    """Count crimes per calendar month.

    Adds a "month" column taken from ``crimes["Date"].dt.month``, groups by
    it, counts the "ID" entries in each group and sorts the result by
    "month". The elapsed wall-clock time is printed in milliseconds.

    Args:
        crimes: DataFrame with a datetime "Date" column and an "ID" column.
            A "month" column is assigned on this frame inside the function.
            NOTE(review): whether that column shows up on the caller's frame
            under ``bodo.jit`` is not shown here — confirm.

    Returns:
        DataFrame with columns "month" and "ID" (the count), sorted by
        "month".
    """
    t1 = time.time()
    crimes['month'] = crimes["Date"].dt.month
    crimes_months = crimes.groupby('month', as_index=False)['ID'].count().sort_values(by='month')
    # The label names this grouping; it previously said "days" (copy-paste).
    print("Group by months Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_months

crimes_months = get_crimes_by_months(crimes)
print(crimes_months.head())

Group by days Time:  53.773000000092  (ms)
    month      ID
3       1  317796
1       2  267986
9       3  327381
6       4  328439
10      5  355734


### Determining the pattern by crime type

In [12]:
@bodo.jit(cache=True)
def get_crimes_by_type(crimes):
    """Count crimes per crime type, most frequent first.

    Groups by "Primary Type", counts the "ID" entries in each group and
    sorts the result by that count in descending order. The elapsed
    wall-clock time is printed in milliseconds.

    Args:
        crimes: DataFrame with "Primary Type" and "ID" columns.

    Returns:
        DataFrame with columns "Primary Type" and "ID" (the count), sorted
        by "ID" descending.
    """
    t1 = time.time()
    crimes_type = crimes.groupby('Primary Type', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # The label names this grouping; it previously said "days" (copy-paste).
    print("Group by type Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_type

crimes_type = get_crimes_by_type(crimes)
print(crimes_type.head())

Group by days Time:  333.7589999998727  (ms)
      Primary Type      ID
7            THEFT  907831
8          BATTERY  778164
6  CRIMINAL DAMAGE  499426
4        NARCOTICS  473790
3    OTHER OFFENSE  264200


### Determining the pattern by location

In [13]:
@bodo.jit(cache=True)
def get_crimes_by_location(crimes):
    """Count crimes per location description, most frequent first.

    Groups by "Location Description", counts the "ID" entries in each group
    and sorts the result by that count in descending order. The elapsed
    wall-clock time is printed in milliseconds.

    Args:
        crimes: DataFrame with "Location Description" and "ID" columns.

    Returns:
        DataFrame with columns "Location Description" and "ID" (the count),
        sorted by "ID" descending.
    """
    t1 = time.time()
    crimes_location = crimes.groupby('Location Description', as_index=False)['ID'].count().sort_values(by='ID', ascending=False)
    # The label names this grouping; it previously said "days" (copy-paste).
    print("Group by location Time: ", ((time.time() - t1) * 1000), " (ms)")
    return crimes_location

crimes_location = get_crimes_by_location(crimes)
print(crimes_location.head())

Group by days Time:  31.35999999994965  (ms)
   Location Description       ID
0                STREET  1001415
34            RESIDENCE   662907
89            APARTMENT   458007
88             SIDEWALK   443551
10                OTHER   145402
