# Dataframe Generation for Model Input
* Author: Callie Clark
* Date updated: 4/3/2021

In [43]:
import pandas as pd
import requests
import datetime as dt
import numpy as np

import dataframe_generation_files as gen

## Format Incident Data

In [44]:
df=gen.format_file('Chicago_Data/Crimes_-_2014.zip','Crimes_-_2014.csv',timezone="America/Chicago")
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Timestamp
0,9470797,HX123824,2014-01-22,031XX S ASHLAND AVE,460,BATTERY,SIMPLE,RESTAURANT,True,False,...,59,08B,1166178.0,1883833.0,2014,05/24/2017 03:50:24 PM,41.836816,-87.665724,"(41.836816162, -87.665724279)",2014-01-22 16:02:00-06:00
5,9729405,HX379028,2014-08-08,044XX W JACKSON BLVD,4651,OTHER OFFENSE,SEX OFFENDER: FAIL REG NEW ADD,RESIDENCE,True,False,...,26,26,1146739.0,1898315.0,2014,05/21/2017 03:53:28 PM,41.876949,-87.736685,"(41.876949072, -87.736685059)",2014-08-08 10:50:00-05:00
6,9510600,HX165568,2014-02-26,037XX N MONTICELLO AVE,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,...,16,05,1151395.0,1924539.0,2014,05/19/2017 03:49:49 PM,41.94882,-87.7189,"(41.948820056, -87.718899701)",2014-02-26 07:40:00-06:00
12,9539289,HX192580,2014-03-20,111XX S STATE ST,460,BATTERY,SIMPLE,GAS STATION,True,False,...,49,08B,1178229.0,1831287.0,2014,05/15/2017 03:51:52 PM,41.692359,-87.623097,"(41.692358646, -87.623096746)",2014-03-20 19:25:00-05:00
15,9801642,HX450761,2014-09-13,070XX S MERRILL AVE,1725,OFFENSE INVOLVING CHILDREN,CONTRIBUTE CRIM DELINQUENCY JUVENILE,OTHER,True,True,...,43,20,1191762.0,1858706.0,2014,05/09/2017 03:51:05 PM,41.767282,-87.572664,"(41.767282129, -87.572663824)",2014-09-13 14:00:00-05:00


## Add Murder Data 
https://home.chicagopolice.org/wp-content/uploads/2014/12/2011-Murder-Report.pdf
(removed from CFS)
TO DO: Clean up and create a function

In [45]:
import geopandas as gpd
from random_timestamp import random_timestamp
import random
from shapely.geometry import Point
from numpy import asarray


def generate_random(number, polygon):
    points = []
    timestamps=[]
    minx, miny, maxx, maxy = polygon.bounds
    while len(points) < number:
        pnt = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if polygon.contains(pnt):
            points.append(asarray(pnt))
            timestamps.append(random_timestamp(year=2014))
            #lat_long.append(asarray(pnt))
    return points, timestamps

fp = "Simulation_Data/Chicago_PB.shp"
polys  = gpd.read_file(fp)

In [46]:
murders_df=pd.read_csv('Chicago_Data/Chicago_Murders_District.csv')
murders_df['Average Murders']=murders_df[['2010 Murders','2011 Murders']].mean(axis=1)
murders_2014=murders_df[['District','Average Murders']]#.set_index('District')
murders_2014['District']=murders_2014['District'].astype(str)
polys=polys.merge(murders_2014,how='left',left_on='dist_num',right_on='District').drop(columns=['District']).fillna(0)
polys.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  murders_2014['District']=murders_2014['District'].astype(str)


Unnamed: 0,dist_label,dist_num,geometry,Average Murders
0,17TH,17,"POLYGON ((-87.71067 41.99737, -87.71067 41.997...",7.0
1,20TH,20,"POLYGON ((-87.66029 41.99092, -87.66029 41.990...",3.5
2,31ST,31,"POLYGON ((-87.82818 41.98384, -87.82816 41.983...",0.0
3,31ST,31,"POLYGON ((-87.83365 41.97535, -87.83366 41.974...",0.0
4,19TH,19,"POLYGON ((-87.64492 41.96973, -87.64431 41.969...",2.0


In [47]:
murder_by_district={}
murders_loc=[]
timestamp_murders=[]
for index_loc in polys.index:
    district=polys.iloc[index_loc,1]
    murder_num=polys.iloc[index_loc,3]
    #print(district,murder_num)
    points, timestamps=generate_random(int(murder_num), polys.geometry[index_loc])
    #print(len(timestamps),len(points))
    #murder_by_district[int(district)]=points
    timestamp_murders=timestamp_murders+timestamps
    murders_loc=murders_loc+points

    

In [48]:
len(murders_loc)

417

In [49]:
df_2 = pd.DataFrame(0, index=np.arange(len(timestamp_murders)), columns=df.columns)
df_2['Timestamp']=timestamp_murders
df_2['Timestamp']=df_2['Timestamp'].dt.tz_localize("America/Chicago",nonexistent='shift_forward',ambiguous='NaT')
df_2['Location']=murders_loc
df_2['Latitude']=df_2.apply((lambda x: x['Location'][1]),axis=1)
df_2['Longitude']=df_2.apply((lambda x: x['Location'][0]),axis=1)
df_2['Date']=df_2.apply((lambda x: x['Timestamp'].date()),axis=1)
df_2['FBI Code']=["01A" for i in range(len(timestamp_murders))]
df_2.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Timestamp
0,0,0,2014-04-26,0,0,0,0,0,0,0,...,0,01A,0,0,0,0,41.98852,-87.737081,"[-87.73708088098995, 41.98851988526843]",2014-04-26 16:09:06-05:00
1,0,0,2014-08-17,0,0,0,0,0,0,0,...,0,01A,0,0,0,0,41.994776,-87.730809,"[-87.73080949106954, 41.994776217303645]",2014-08-17 21:00:07-05:00
2,0,0,2014-08-16,0,0,0,0,0,0,0,...,0,01A,0,0,0,0,41.949941,-87.702104,"[-87.7021038225984, 41.949941201179634]",2014-08-16 02:29:55-05:00
3,0,0,2014-03-21,0,0,0,0,0,0,0,...,0,01A,0,0,0,0,41.950869,-87.702338,"[-87.70233755633693, 41.950868780330076]",2014-03-21 16:23:17-05:00
4,0,0,2014-08-29,0,0,0,0,0,0,0,...,0,01A,0,0,0,0,41.982499,-87.730351,"[-87.73035071528899, 41.98249937119677]",2014-08-29 02:23:29-05:00


In [51]:
df=pd.concat([df,df_2]) #add murders to original DataFrame

In [52]:
gen.check_lat_long(df)

Data has Latitude and Longitude


In [53]:
violent_crime=["01A",'02','03','04A','04B']
property_crime=['05','06','07','09']
index_crime=["01A",'02','03','04A','04B','05','06','07','09']
nonindex_crime=['01B', '08A', '08B','10','11','12','13','14','15','16','17','18','19','20','22','24','26']
nonviolent_crime=nonindex_crime+property_crime
    
df=gen.classify_response_type(df, crime_responder='Police', crime_types=index_crime, UCRcol='FBI Code' )
#TODO may want to indicate the crime type in csv name
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Timestamp,Police
0,9470797,HX123824,2014-01-22,031XX S ASHLAND AVE,460,BATTERY,SIMPLE,RESTAURANT,1,0,...,08B,1166178.0,1883833.0,2014,05/24/2017 03:50:24 PM,41.836816,-87.665724,"(41.836816162, -87.665724279)",2014-01-22 16:02:00-06:00,0
5,9729405,HX379028,2014-08-08,044XX W JACKSON BLVD,4651,OTHER OFFENSE,SEX OFFENDER: FAIL REG NEW ADD,RESIDENCE,1,0,...,26,1146739.0,1898315.0,2014,05/21/2017 03:53:28 PM,41.876949,-87.736685,"(41.876949072, -87.736685059)",2014-08-08 10:50:00-05:00,0
6,9510600,HX165568,2014-02-26,037XX N MONTICELLO AVE,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,0,0,...,05,1151395.0,1924539.0,2014,05/19/2017 03:49:49 PM,41.94882,-87.7189,"(41.948820056, -87.718899701)",2014-02-26 07:40:00-06:00,1
12,9539289,HX192580,2014-03-20,111XX S STATE ST,460,BATTERY,SIMPLE,GAS STATION,1,0,...,08B,1178229.0,1831287.0,2014,05/15/2017 03:51:52 PM,41.692359,-87.623097,"(41.692358646, -87.623096746)",2014-03-20 19:25:00-05:00,0
15,9801642,HX450761,2014-09-13,070XX S MERRILL AVE,1725,OFFENSE INVOLVING CHILDREN,CONTRIBUTE CRIM DELINQUENCY JUVENILE,OTHER,1,1,...,20,1191762.0,1858706.0,2014,05/09/2017 03:51:05 PM,41.767282,-87.572664,"(41.767282129, -87.572663824)",2014-09-13 14:00:00-05:00,0


In [54]:
df=gen.set_priority(df)
df=gen.set_number_responders(df,number_per_call=2)

In [55]:
df=gen.log_norm_service_time(df, average_time=23.2,s=0.18)

Stats of dist: min 9.087332933476365 max 53.32968036420076 mean 23.58141295373639


In [56]:
#Double Service Times for Murders
df.loc[df['FBI Code']=='01A','Service Time']=df.loc[df['FBI Code']=='01A','Service Time']*2

In [57]:
df.tail()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Year,Updated On,Latitude,Longitude,Location,Timestamp,Police,Priority,#Responders,Service Time
412,0,0,2014-05-25,0,0,0,0,0,0,0,...,0,0,41.900226,-87.769835,"[-87.76983508034058, 41.90022610783778]",2014-05-25 16:52:55-05:00,1,1,2,0 days 00:44:00
413,0,0,2014-04-11,0,0,0,0,0,0,0,...,0,0,41.89047,-87.774596,"[-87.77459631150859, 41.890470128305644]",2014-04-11 00:24:29-05:00,1,1,2,0 days 00:48:00
414,0,0,2014-10-29,0,0,0,0,0,0,0,...,0,0,41.882301,-87.761309,"[-87.76130904461667, 41.88230137802864]",2014-10-29 20:04:13-05:00,1,1,2,0 days 00:40:00
415,0,0,2014-07-02,0,0,0,0,0,0,0,...,0,0,41.881338,-87.762123,"[-87.76212340520009, 41.88133758498186]",2014-07-02 20:03:44-05:00,1,1,2,0 days 01:02:00
416,0,0,2014-08-28,0,0,0,0,0,0,0,...,0,0,41.86855,-87.754907,"[-87.75490713096309, 41.86854982108598]",2014-08-28 20:15:53-05:00,1,1,2,0 days 00:46:00


In [78]:
len(df[df['Priority']==1])/len(df)#violent

0.0835322903536739

In [81]:
(len(df[df['Priority']==1])+len(df[df['Priority']==2]))/len(df)#index

0.3990398161192259

## Create Seasonal Profiles
*  Crime frequency and type varies by season

In [58]:
season_map={}
df=gen.create_seasons(df,season_map)
df=gen.calculate_frequency(df)
df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Latitude,Longitude,Location,Timestamp,Police,Priority,#Responders,Service Time,season,Frequency
0,9470797,HX123824,2014-01-22,031XX S ASHLAND AVE,460,BATTERY,SIMPLE,RESTAURANT,1,0,...,41.836816,-87.665724,"(41.836816162, -87.665724279)",2014-01-22 16:02:00-06:00,0,3,2,0 days 00:27:00,winter,622
5,9729405,HX379028,2014-08-08,044XX W JACKSON BLVD,4651,OTHER OFFENSE,SEX OFFENDER: FAIL REG NEW ADD,RESIDENCE,1,0,...,41.876949,-87.736685,"(41.876949072, -87.736685059)",2014-08-08 10:50:00-05:00,0,3,2,0 days 00:35:00,summer,810
6,9510600,HX165568,2014-02-26,037XX N MONTICELLO AVE,610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,0,0,...,41.94882,-87.7189,"(41.948820056, -87.718899701)",2014-02-26 07:40:00-06:00,1,2,2,0 days 00:24:00,winter,662
12,9539289,HX192580,2014-03-20,111XX S STATE ST,460,BATTERY,SIMPLE,GAS STATION,1,0,...,41.692359,-87.623097,"(41.692358646, -87.623096746)",2014-03-20 19:25:00-05:00,0,3,2,0 days 00:27:00,spring,772
15,9801642,HX450761,2014-09-13,070XX S MERRILL AVE,1725,OFFENSE INVOLVING CHILDREN,CONTRIBUTE CRIM DELINQUENCY JUVENILE,OTHER,1,1,...,41.767282,-87.572664,"(41.767282129, -87.572663824)",2014-09-13 14:00:00-05:00,0,3,2,0 days 00:22:00,summer,667


In [59]:
df.tail()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Latitude,Longitude,Location,Timestamp,Police,Priority,#Responders,Service Time,season,Frequency
412,0,0,2014-05-25,0,0,0,0,0,0,0,...,41.900226,-87.769835,"[-87.76983508034058, 41.90022610783778]",2014-05-25 16:52:55-05:00,1,1,2,0 days 00:44:00,spring,809
413,0,0,2014-04-11,0,0,0,0,0,0,0,...,41.89047,-87.774596,"[-87.77459631150859, 41.890470128305644]",2014-04-11 00:24:29-05:00,1,1,2,0 days 00:48:00,spring,772
414,0,0,2014-10-29,0,0,0,0,0,0,0,...,41.882301,-87.761309,"[-87.76130904461667, 41.88230137802864]",2014-10-29 20:04:13-05:00,1,1,2,0 days 00:40:00,fall,781
415,0,0,2014-07-02,0,0,0,0,0,0,0,...,41.881338,-87.762123,"[-87.76212340520009, 41.88133758498186]",2014-07-02 20:03:44-05:00,1,1,2,0 days 01:02:00,summer,814
416,0,0,2014-08-28,0,0,0,0,0,0,0,...,41.86855,-87.754907,"[-87.75490713096309, 41.86854982108598]",2014-08-28 20:15:53-05:00,1,1,2,0 days 00:46:00,summer,724


In [64]:
seasonal_profiles={}
seasonal_profiles=gen.generate_seasonal_profiles(df,seasonal_profiles,quantile_list=[0.5],iterations=1, number_profiles=1)
seasonal_profiles
#To Do fix allowing seasonal and all worst case to repeat

{'summer_0.5': [datetime.date(2014, 7, 13)],
 'fall_0.5': [datetime.date(2014, 10, 22)],
 'spring_0.5': [datetime.date(2014, 5, 14)],
 'winter_0.5': [datetime.date(2014, 1, 11)],
 'summer_worst_case': [datetime.date(2014, 8, 1)],
 'fall_worst_case': [datetime.date(2014, 10, 1)],
 'spring_worst_case': [datetime.date(2014, 6, 1)],
 'winter_worst_case': [datetime.date(2014, 1, 1)]}

In [72]:
#To Do fix allowing seasonal and all worst case to repeat -->currently hardcoding
seasonal_profiles['summer_worst_case']=[dt.date(2014, 9, 1)]

In [73]:
seasonal_profiles

{'summer_0.5': [datetime.date(2014, 7, 13)],
 'fall_0.5': [datetime.date(2014, 10, 22)],
 'spring_0.5': [datetime.date(2014, 5, 14)],
 'winter_0.5': [datetime.date(2014, 1, 11)],
 'summer_worst_case': [datetime.date(2014, 9, 1)],
 'fall_worst_case': [datetime.date(2014, 10, 1)],
 'spring_worst_case': [datetime.date(2014, 6, 1)],
 'winter_worst_case': [datetime.date(2014, 1, 1)]}

In [69]:
daily_profiles={}
gen.quantile_daily_profiles(df, daily_profiles,quantiles=[0.25,0.50,0.75,0.95],iterations=1)
gen.worst_case_profiles(df, daily_profiles,number_profiles=1)
daily_profiles

{'all_data_0.25': [datetime.date(2014, 12, 8)],
 'all_data_0.5': [datetime.date(2014, 5, 7)],
 'all_data_0.75': [datetime.date(2014, 7, 20)],
 'all_data_0.95': [datetime.date(2014, 7, 27)],
 'all_data_worst_case': [datetime.date(2014, 8, 1)]}

## Generate Incident Profiles and send to csvs

In [70]:
gen.create_csv(daily_profiles,df,folder='Chicago_Data')

In [74]:
gen.create_csv(seasonal_profiles,df,folder='Chicago_Data')