In [131]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely.wkt
import pandas as pd
from sklearn import preprocessing
from shapely.geometry import Point

In [44]:
crime_data = pd.read_csv('Crimes.csv')
weather_data = pd.read_csv('chicago_weather_data.csv')
weather_data = weather_data[weather_data['NAME'] == 'CHICAGO OHARE INTERNATIONAL AIRPORT, IL US']

In [132]:
weather_data = weather_data[['STATION', 'NAME', 'DATE', 'TAVG', 'TMAX', 'TMIN']]

In [155]:
crime_types = ['THEFT', 'HOMICIDE', 'ROBBERY']
small_crimes = crime_data[crime_data['Primary Type'].isin(crime_types)]

Import block group & join with crimes

In [139]:
def process_blocks():
    df = pd.read_csv('blocks.csv')
    df['GEOID10'] = df['GEOID10'].astype(str)
    df["block_group"] = df["GEOID10"].apply(lambda block: block[:12])
    df["the_geom"] = df["the_geom"].apply(shapely.wkt.loads)
    gdf = gpd.GeoDataFrame(df).set_geometry("the_geom").drop(columns=df.columns.difference(["block_group", "the_geom"]))
    gdf = gpd.GeoDataFrame(gdf).set_geometry('the_geom')
    return gdf

In [156]:
blocks_df = process_blocks()

In [157]:
small_crimes = small_crimes[small_crimes['Latitude'].notnull()]
small_crimes['the_geom'] = small_crimes.apply(lambda row: Point(float(row["Longitude"]), float(row["Latitude"])), axis=1)
crimes_gdf = gpd.GeoDataFrame(small_crimes).set_geometry('the_geom')

In [158]:
joined = gpd.sjoin(crimes_gdf, blocks_df, how="left", op='intersects')
joined.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,the_geom,index_right,block_group
1,11709914,JC290748,05/31/2019 12:00:00 AM,014XX N WELLS ST,0890,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,...,1174441.0,1909982.0,2019,06/30/2019 03:56:27 PM,41.90839,-87.634623,"(41.908390488, -87.634623176)",POINT (-87.63462317600001 41.908390488),11298.0,170310803004
10,11705821,JC285616,05/30/2019 11:56:00 PM,054XX S PAULINA ST,031A,ROBBERY,ARMED: HANDGUN,ALLEY,False,False,...,1165934.0,1868706.0,2019,06/30/2019 03:56:27 PM,41.795311,-87.66705,"(41.795311203, -87.66705)",POINT (-87.66705 41.795311203),16898.0,170316118001
11,11705819,JC285634,05/30/2019 11:55:00 PM,015XX E MIDWAY PLAISANCE,031A,ROBBERY,ARMED: HANDGUN,PARK PROPERTY,False,False,...,1187448.0,1866114.0,2019,06/30/2019 03:56:27 PM,41.787714,-87.588241,"(41.787713898, -87.588240934)",POINT (-87.588240934 41.787713898),15006.0,170314111002
18,11705762,JC285613,05/30/2019 11:37:00 PM,111XX S WESTERN AVE,0326,ROBBERY,AGGRAVATED VEHICULAR HIJACKING,PARKING LOT/GARAGE(NON.RESID.),False,False,...,1162389.0,1830454.0,2019,06/30/2019 03:56:27 PM,41.690417,-87.681113,"(41.690416519, -87.681112787)",POINT (-87.681112787 41.690416519),34183.0,170317504001
26,11705925,JC285831,05/30/2019 11:30:00 PM,064XX S TRIPP AVE,0820,THEFT,$500 AND UNDER,STREET,False,False,...,1149147.0,1861558.0,2019,06/30/2019 03:56:27 PM,41.776037,-87.728793,"(41.776036508, -87.7287932)",POINT (-87.7287932 41.776036508),37289.0,170316503013


In [159]:
joined['Date'] = pd.to_datetime(joined['Date'], infer_datetime_format=True)
weather_data['Date'] = pd.to_datetime(weather_data['DATE'], infer_datetime_format=True)

In [166]:
combined = pd.merge(joined, weather_data, on='Date', how='inner')
combined.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Location,the_geom,index_right,block_group,STATION,NAME,DATE,TAVG,TMAX,TMIN
0,11709914,JC290748,2019-05-31,014XX N WELLS ST,890,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,...,"(41.908390488, -87.634623176)",POINT (-87.63462317600001 41.908390488),11298.0,170310803004,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2019-05-31,71.0,81.0,60.0
1,11707744,JC287808,2019-05-30,100XX S WENTWORTH AVE,810,THEFT,OVER $500,RESIDENTIAL YARD (FRONT/BACK),False,False,...,"(41.712076597, -87.628439412)",POINT (-87.62843941200001 41.712076597),13636.0,170314907003,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2019-05-30,66.0,78.0,60.0
2,11705595,JC285462,2019-05-30,048XX N RIDGEWAY AVE,820,THEFT,$500 AND UNDER,RESIDENTIAL YARD (FRONT/BACK),False,False,...,"(41.969247745, -87.721943053)",POINT (-87.72194305299999 41.969247745),31998.0,170311403011,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2019-05-30,66.0,78.0,60.0
3,11725644,JC310143,2019-05-30,023XX W AUGUSTA BLVD,820,THEFT,$500 AND UNDER,STREET,False,False,...,"(41.899402154, -87.685473714)",POINT (-87.685473714 41.899402154),37801.0,170312424002,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2019-05-30,66.0,78.0,60.0
4,11702647,JC281879,2019-05-28,043XX S GREENWOOD AVE,810,THEFT,OVER $500,RESIDENTIAL YARD (FRONT/BACK),False,False,...,"(41.816161569, -87.599400806)",POINT (-87.59940080600001 41.816161569),36071.0,170313902002,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2019-05-28,65.0,70.0,52.0


Data Cleaning & Feature Generation

In [192]:
dfs_to_concat = [combined]
for col in ['Primary Type', 'Arrest', 'Domestic']:
    dummy_df = pd.get_dummies(combined[col], prefix=col)
    dfs_to_concat.append(dummy_df)
dataframe = pd.concat(dfs_to_concat, axis=1)

In [194]:
dataframe.columns

Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location', 'the_geom', 'index_right', 'block_group', 'STATION', 'NAME',
       'DATE', 'TAVG', 'TMAX', 'TMIN', 'Primary Type_HOMICIDE',
       'Primary Type_ROBBERY', 'Primary Type_THEFT', 'Arrest_False',
       'Arrest_True', 'Domestic_False', 'Domestic_True'],
      dtype='object')

In [198]:
grouped = dataframe.groupby(['Date', 'block_group'])[['Primary Type_HOMICIDE',
       'Primary Type_ROBBERY', 'Primary Type_THEFT', 'Arrest_False',
       'Arrest_True', 'Domestic_False', 'Domestic_True', 'TAVG']].sum().reset_index()

In [206]:
scaler = preprocessing.MinMaxScaler()
scaler.fit(grouped[['TAVG']])
preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
grouped['TAVG'] = scaler.transform(grouped[['TAVG']])
grouped.head()

Unnamed: 0,Date,block_group,Primary Type_HOMICIDE,Primary Type_ROBBERY,Primary Type_THEFT,Arrest_False,Arrest_True,Domestic_False,Domestic_True,TAVG
0,2016-06-01,170316605001,0,0,1,1,0,1,0,0.406977
1,2016-06-02,170317003013,0,0,1,1,0,1,0,0.418605
2,2016-06-02,170318320001,0,0,1,1,0,1,0,0.418605
3,2016-06-02,170318391002,0,1,0,1,0,1,0,0.418605
4,2016-06-04,170318325001,0,0,1,1,0,1,0,0.412791
