In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from datetime import datetime, timedelta

In [5]:
# Columns of the raw export that are not used anywhere further down.
unused_columns = [
    'Case Number', 'Block', 'IUCR', 'Description', 'Arrest',
    'Domestic', 'Beat', 'District', 'Ward', 'Community Area',
    'FBI Code', 'Updated On', 'Location',
]

# Load the raw crime records and clean them in one chain:
#   1. drop the unused columns and shorten the coordinate column names,
#   2. discard rows with any missing value,
#   3. discard rows whose x coordinate is 0 and rows from 2010 or earlier,
#   4. reduce the parsed timestamp to a plain calendar date.
crime_df = (
    pd.read_csv('../data/Crimes_-_2001_to_2021.csv', parse_dates=['Date'], infer_datetime_format=True)
    .drop(columns=unused_columns)
    .rename(columns={'X Coordinate': 'x', 'Y Coordinate': 'y'})
    .dropna()
    .loc[lambda frame: (frame['x'] != 0) & (frame['Year'] > 2010)]
    .assign(Date=lambda frame: frame['Date'].dt.date)
)

In [6]:
# Build a regular grid over the bounding box of the dissolved city shapefile.
path = '../data/shapefile/geo_export.shp'
chicago = gpd.read_file(path).dissolve()
xmin, ymin, xmax, ymax = chicago.total_bounds

# The number of columns is fixed; the number of rows is derived from the
# ratio of the bounding-box extents.
x_extent = xmax - xmin
y_extent = ymax - ymin
n_x_cells = 50
n_y_cells = round((x_extent / y_extent) * n_x_cells)
x_cell_size = x_extent / n_x_cells
y_cell_size = y_extent / n_y_cells

# Assign every crime to a grid cell from its Longitude/Latitude.
# The row index is flipped so that row 0 corresponds to the largest latitudes.
crime_df['x_cell'] = crime_df['Longitude'].sub(xmin).floordiv(x_cell_size).astype(int)
crime_df['y_cell'] = n_y_cells - 1 - crime_df['Latitude'].sub(ymin).floordiv(y_cell_size).astype(int)
crime_df['x_y_cell'] = list(crime_df[['x_cell', 'y_cell']].itertuples(index=False, name=None))

In [7]:
# Preview the first five cleaned records together with their grid-cell columns.
display(crime_df.head(n=5))

Unnamed: 0,ID,Date,Primary Type,Location Description,x,y,Year,Latitude,Longitude,x_cell,y_cell,x_y_cell
0,10224738,2015-09-05,BATTERY,RESIDENCE,1165074.0,1875917.0,2015,41.815117,-87.67,32,30,"(32, 30)"
1,10224739,2015-09-04,THEFT,CTA BUS,1138875.0,1904869.0,2015,41.89508,-87.7654,21,18,"(21, 18)"
3,10224740,2015-09-05,NARCOTICS,SIDEWALK,1152037.0,1920384.0,2015,41.937406,-87.71665,26,12,"(26, 12)"
4,10224741,2015-09-05,ASSAULT,APARTMENT,1141706.0,1900086.0,2015,41.881903,-87.755121,22,20,"(22, 20)"
5,10224742,2015-09-05,BURGLARY,RESIDENCE,1168430.0,1850165.0,2015,41.744379,-87.658431,33,40,"(33, 40)"


In [5]:
# List the crime categories present in the data so the score lists below can be checked against them.
print(crime_df['Primary Type'].unique())

# Each category gets a severity score: 0 (soft), 0.5 (middle) or 1 (bad).
soft_list = [
    'DECEPTIVE PRACTICE', 'INTERFERENCE WITH PUBLIC OFFICER', 'GAMBLING', 'LIQUOR LAW VIOLATION',
    'NON - CRIMINAL', 'NON-CRIMINAL', 'NON-CRIMINAL (SUBJECT SPECIFIED)', 'PUBLIC PEACE VIOLATION'
    ]
soft_list = [(crime_type, 0) for crime_type in soft_list]

middle_list = [
    'INTIMIDATION', 'PROSTITUTION', 'OTHER OFFENSE', 'CRIMINAL DAMAGE', 'STALKING', 'RITUALISM',
    'CONCEALED CARRY LICENSE VIOLATION', 'PUBLIC INDECENCY', 'NARCOTICS', 'CRIMINAL TRESPASS', 'OTHER NARCOTIC VIOLATION'
    ]
middle_list = [(crime_type, 0.5) for crime_type in middle_list]

bad_list = [
    'BATTERY', 'THEFT', 'ASSAULT', 'BURGLARY', 'ROBBERY', 'WEAPONS VIOLATION', 'MOTOR VEHICLE THEFT', 'SEX OFFENSE',
    'OFFENSE INVOLVING CHILDREN', 'CRIM SEXUAL ASSAULT','CRIMINAL SEXUAL ASSAULT', 'HUMAN TRAFFICKING', 'OBSCENITY',
    'ARSON', 'KIDNAPPING', 'HOMICIDE', 'DOMESTIC VIOLENCE'
    ]
bad_list = [(crime_type, 1) for crime_type in bad_list]

score_dict = dict(soft_list + middle_list + bad_list)

# Fail fast if the data contains a category without a score; otherwise the
# problem would only surface later as a non-numeric 'Primary Type' column.
unscored_types = set(crime_df['Primary Type'].unique()) - set(score_dict)
if unscored_types:
    raise ValueError(f'No severity score defined for: {sorted(unscored_types)}')

# Copy of crime_df in which 'Primary Type' holds the numeric severity score.
# Series.map is a direct per-value lookup, and the explicit float cast means the
# column dtype does not depend on pandas inferring a numeric type for the
# replaced object column.
test_crime_df = crime_df.assign(
    **{'Primary Type': crime_df['Primary Type'].map(score_dict).astype(float)}
)

['BATTERY' 'THEFT' 'NARCOTICS' 'ASSAULT' 'BURGLARY' 'ROBBERY'
 'OTHER OFFENSE' 'CRIMINAL DAMAGE' 'WEAPONS VIOLATION'
 'DECEPTIVE PRACTICE' 'CRIMINAL TRESPASS' 'MOTOR VEHICLE THEFT'
 'SEX OFFENSE' 'INTERFERENCE WITH PUBLIC OFFICER'
 'OFFENSE INVOLVING CHILDREN' 'PUBLIC PEACE VIOLATION' 'PROSTITUTION'
 'GAMBLING' 'CRIM SEXUAL ASSAULT' 'LIQUOR LAW VIOLATION' 'ARSON'
 'STALKING' 'KIDNAPPING' 'INTIMIDATION'
 'CONCEALED CARRY LICENSE VIOLATION' 'NON - CRIMINAL' 'HUMAN TRAFFICKING'
 'OBSCENITY' 'CRIMINAL SEXUAL ASSAULT' 'PUBLIC INDECENCY'
 'OTHER NARCOTIC VIOLATION' 'NON-CRIMINAL' 'HOMICIDE'
 'NON-CRIMINAL (SUBJECT SPECIFIED)' 'RITUALISM']


In [6]:
# Aggregate per (date, grid cell): the mean severity score of the crimes
# and the number of crimes recorded in that cell on that day.
grouped_crimes_df = (
    test_crime_df
    .groupby(['Date', 'x_y_cell'])
    .agg(**{
        'Primary Type': ('Primary Type', 'mean'),
        'Count': ('ID', 'count'),
    })
    .reset_index()
)

# Unpack the (x_cell, y_cell) tuples back into two separate integer columns.
cell_pairs = grouped_crimes_df['x_y_cell'].tolist()
grouped_crimes_df[['x_cell', 'y_cell']] = cell_pairs

display(grouped_crimes_df)

Unnamed: 0,Date,x_y_cell,Primary Type,Count,x_cell,y_cell
0,2011-01-01,"(4, 6)",0.500000,2,4,6
1,2011-01-01,"(11, 6)",1.000000,1,11,6
2,2011-01-01,"(11, 7)",0.500000,3,11,7
3,2011-01-01,"(12, 6)",0.833333,3,12,6
4,2011-01-01,"(12, 10)",0.500000,1,12,10
...,...,...,...,...,...,...
1714493,2021-12-31,"(46, 38)",0.500000,1,46,38
1714494,2021-12-31,"(46, 39)",1.000000,2,46,39
1714495,2021-12-31,"(46, 41)",1.000000,1,46,41
1714496,2021-12-31,"(47, 41)",0.500000,1,47,41


In [7]:
# Sanity check: consecutive rows should never be more than one day apart.
# The first diff is undefined (no predecessor), so it is skipped.
one_day = timedelta(days=1)
deltas = grouped_crimes_df['Date'].diff().iloc[1:]
gaps = deltas.loc[deltas > one_day]
print(f'{len(gaps)} gaps with average gap duration: {gaps.mean()}')

0 gaps with average gap duration: NaT


Shape of the input matrix: (1, 365*20, 55, 50). For now, during testing, we only use Count as a feature.

In [8]:
# Rasterise the crime counts into one (n_y_cells, n_x_cells) grid per day.
# Cells without a record for a given day stay 0.
day_group_crimes_df = grouped_crimes_df.groupby('Date')

# ngroup() numbers the dates in the order in which iterating the groupby would
# visit them, so it serves directly as the index along the day axis.
day_index = day_group_crimes_df.ngroup().to_numpy()

# Each (Date, cell) pair occurs once in grouped_crimes_df, so a single
# vectorised assignment replaces the row-by-row Python loop.
all_days = np.zeros((day_group_crimes_df.ngroups, n_y_cells, n_x_cells))
all_days[
    day_index,
    grouped_crimes_df['y_cell'].to_numpy(dtype=int),
    grouped_crimes_df['x_cell'].to_numpy(dtype=int),
] = grouped_crimes_df['Count'].to_numpy()

In [9]:
# Hold out the last 365 entries along the first (day) axis as the test set;
# everything before them is training data.
count_train = all_days[:-365]
count_test = all_days[-365:]
np.save('../data/train_data.npy', count_train)
np.save('../data/test_data.npy', count_test)

In [10]:
# Rasterise the mean severity score ('Primary Type' in grouped_crimes_df) into
# one (n_y_cells, n_x_cells) grid per day. Cells without a record stay 0.
# NOTE(review): a cell with no crimes and a cell with only score-0 crimes both
# end up as 0 here - confirm that this is intended.

# ngroup() numbers the dates in the order in which iterating the groupby would
# visit them, so it serves directly as the index along the day axis.
severity_day_index = day_group_crimes_df.ngroup().to_numpy()

# Each (Date, cell) pair occurs once in grouped_crimes_df, so a single
# vectorised assignment replaces the row-by-row Python loop.
all_days_severity = np.zeros((day_group_crimes_df.ngroups, n_y_cells, n_x_cells))
all_days_severity[
    severity_day_index,
    grouped_crimes_df['y_cell'].to_numpy(dtype=int),
    grouped_crimes_df['x_cell'].to_numpy(dtype=int),
] = grouped_crimes_df['Primary Type'].to_numpy()

In [11]:
# Same split as for the counts: the last 365 entries along the first (day)
# axis form the test set, the rest is training data.
severity_train = all_days_severity[:-365]
severity_test = all_days_severity[-365:]
np.save('../data/train_data_severity.npy', severity_train)
np.save('../data/test_data_severity.npy', severity_test)

### Crime seasonality

In [30]:
# Day of the year for every distinct date in crime_df, in chronological order.
# Sorting the unique dates gives the same sequence as iterating a groupby on
# 'Date', without materialising a sub-frame per day.
dates = [day.timetuple().tm_yday for day in sorted(crime_df['Date'].unique())]

# Repeat each day's value over the whole grid. The grid shape comes from
# n_y_cells / n_x_cells so it cannot drift from the crime grids.
dates2 = np.tile(np.array(dates).reshape(-1, 1, 1), (1, n_y_cells, n_x_cells))

# Last 365 entries are the test set, the rest is training data.
np.save('../data/train_data_seasonality.npy', dates2[:-365])
np.save('../data/test_data_seasonality.npy', dates2[-365:])

### Meteorological Data

In [None]:
# Load the weather table and keep the rows from 2011-01-01 onwards.
# NOTE(review): this assumes one row per day, in chronological order, ending on
# the same day as the crime data, so the 365-entry test split lines up - confirm.
df = pd.read_csv('../data/weather.csv')
df = df[df['datetime'] >= '2011-01-01']

visibility = df['visibility'].values
cloudcover = df['cloudcover'].values

# Repeat every value over the whole grid and store a train/test split per feature.
# The grid shape comes from n_y_cells / n_x_cells so it cannot drift from the crime grids.
for feature_name, daily_values in (('visibility', visibility), ('cloudcover', cloudcover)):
    feature_grid = np.tile(np.asarray(daily_values).reshape(-1, 1, 1), (1, n_y_cells, n_x_cells))
    np.save(f'../data/{feature_name}_train.npy', feature_grid[:-365])
    np.save(f'../data/{feature_name}_test.npy', feature_grid[-365:])