In [25]:
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import requests
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster

## gpd.read_file on GEOjson
crashes = gpd.read_file('Traffic Crashes - Crashes.geojson')

In [26]:
endpoint_crashes = 'https://data.cityofchicago.org/resource/85ca-t3if.geojson?$select=crash_record_id,crash_date,posted_speed_limit,traffic_control_device,device_condition,first_crash_type,trafficway_type,lane_cnt,road_defect,prim_contributory_cause,sec_contributory_cause,most_severe_injury,injuries_total,injuries_fatal,injuries_incapacitating,injuries_non_incapacitating,injuries_reported_not_evident,injuries_no_indication,injuries_unknown,crash_hour,crash_day_of_week,crash_month,latitude,longitude,location&$limit=650000'
res = requests.get(endpoint_crashes)
res = res.json()
res

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'geometry': {'type': 'Point',
    'coordinates': [-87.743379906115, 41.807493414229]},
   'properties': {'injuries_fatal': '0',
    'injuries_non_incapacitating': '0',
    'crash_record_id': '593ce1c002b749dcb18ece3390527b4b862dcc8db1856f46d54151fd8441f41262b2401c0b6b33d2bac143429af1b9f7e02c8673385ef7d8f8b124f83df5994a',
    'injuries_incapacitating': '0',
    'injuries_no_indication': '2',
    'latitude': '41.807493414',
    'injuries_unknown': '0',
    'device_condition': 'NO CONTROLS',
    'crash_date': '2022-08-04T09:10:00.000',
    'trafficway_type': 'NOT DIVIDED',
    'traffic_control_device': 'NO CONTROLS',
    'road_defect': 'NO DEFECTS',
    'longitude': '-87.743379906',
    'crash_month': '8',
    'crash_day_of_week': '5',
    'crash_hour': '9',
    'first_crash_type': 'REAR END',
    'injuries_reported_not_evident': '0',
    'most_severe_injury': 'NO INDICATION OF INJURY',
    'prim_contributory_cause': 'FOLL

In [27]:
# this is the line that works when you don't do any select= in the endpoint
crashes_df = gpd.GeoDataFrame.from_features(res['features'])
#crashes_df = pd.DataFrame(res)
#gpd.GeoDataFrame(res)
#crashes_df.columns

In [28]:
crashes_df.head()

Unnamed: 0,geometry,injuries_fatal,injuries_non_incapacitating,crash_record_id,injuries_incapacitating,injuries_no_indication,latitude,injuries_unknown,device_condition,crash_date,...,crash_day_of_week,crash_hour,first_crash_type,injuries_reported_not_evident,most_severe_injury,prim_contributory_cause,sec_contributory_cause,posted_speed_limit,injuries_total,lane_cnt
0,POINT (-87.74338 41.80749),0,0,593ce1c002b749dcb18ece3390527b4b862dcc8db1856f...,0,2,41.807493414,0,NO CONTROLS,2022-08-04T09:10:00.000,...,5,9,REAR END,0,NO INDICATION OF INJURY,FOLLOWING TOO CLOSELY,UNABLE TO DETERMINE,30,0,
1,POINT (-87.64959 41.93625),0,0,eaafdda3cf0ead6ff1d15dd60a2842b0c68b427b136345...,0,2,41.936251149,0,NO CONTROLS,2022-08-04T09:10:00.000,...,5,9,ANGLE,0,NO INDICATION OF INJURY,FAILING TO YIELD RIGHT-OF-WAY,NOT APPLICABLE,30,0,
2,POINT (-87.72255 41.95313),0,0,2ff27a341040d94ffdad5617e393eabb7e73bf3fe002d3...,0,1,41.953127486,0,NO CONTROLS,2022-08-04T08:45:00.000,...,5,8,PARKED MOTOR VEHICLE,0,NO INDICATION OF INJURY,UNABLE TO DETERMINE,UNABLE TO DETERMINE,30,0,
3,POINT (-87.64942 41.94003),0,0,3df614731dd52c246507ea4d51cca21bd351257366b0ba...,0,2,41.940032781,0,FUNCTIONING PROPERLY,2022-08-04T08:40:00.000,...,5,8,SIDESWIPE SAME DIRECTION,0,NO INDICATION OF INJURY,UNABLE TO DETERMINE,UNABLE TO DETERMINE,30,0,
4,POINT (-87.77630 41.94566),0,0,409dbb4ceee8e7967dbe8687ecea232e8d2087a9a6be01...,0,3,41.945664231,0,FUNCTIONING PROPERLY,2022-08-04T08:30:00.000,...,5,8,SIDESWIPE OPPOSITE DIRECTION,0,NO INDICATION OF INJURY,IMPROPER OVERTAKING/PASSING,UNABLE TO DETERMINE,30,0,


In [30]:
type(crashes_df)

geopandas.geodataframe.GeoDataFrame

In [31]:
#filter out crashes where there was debris on roadway
crashes_df = crashes_df.loc[crashes_df['road_defect'] != 'DEBRIS ON ROADWAY'] 

In [33]:
#filter out crashes where the traffic control device was functioning improperly or not functioning
device_not_working = ['FUNCTIONING IMPROPERLY','NOT FUNCTIONING']
crashes_df = crashes_df.loc[~crashes_df['device_condition'].isin(device_not_working)]
#crashes.loc[crashes['crash_record_id'].isin(crash_id_peds)]

In [36]:
crashes_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 632237 entries, 0 to 637934
Data columns (total 25 columns):
 #   Column                         Non-Null Count   Dtype   
---  ------                         --------------   -----   
 0   geometry                       628368 non-null  geometry
 1   injuries_fatal                 630908 non-null  object  
 2   injuries_non_incapacitating    630908 non-null  object  
 3   crash_record_id                632237 non-null  object  
 4   injuries_incapacitating        630908 non-null  object  
 5   injuries_no_indication         630908 non-null  object  
 6   latitude                       628368 non-null  object  
 7   injuries_unknown               630908 non-null  object  
 8   device_condition               632237 non-null  object  
 9   crash_date                     632237 non-null  object  
 10  trafficway_type                632237 non-null  object  
 11  traffic_control_device         632237 non-null  object  
 12  road_def

In [35]:
#cleaning lane count - jk, turns out when you call from the API, there's no 2 & 2.0, 4 & 4.0 problem
crashes_df['lane_cnt'].value_counts()

2          90181
4          48960
1          32335
3           8574
0           7952
6           4444
5           1905
8           1886
7            181
10           158
99           108
9             63
11            27
12            24
20            14
22            13
15             7
30             5
16             5
14             5
40             4
60             3
100            2
21             2
25             2
19             1
41             1
17             1
45             1
28             1
13             1
400            1
433634         1
299679         1
902            1
218474         1
1191625        1
24             1
80             1
44             1
35             1
Name: lane_cnt, dtype: int64

In [47]:
#cleaning crash_date
test_time = crashes_df.loc[15,'crash_date']
type(test_time)
#it's a string so convert to datetime

str

In [69]:
#try it on one single value
timestamp=pd.to_datetime(test_time, format = '%Y/%m/%d %H:%M:%S')

In [58]:
#gives us a Timestamp
type(timestamp)
timestamp

Timestamp('2022-08-04 06:45:00')

In [59]:
#can use Timestamp methods on it
#Timestamp is pandas equivalent of Python datetime 
#https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html
timestamp.month_name()

'August'

In [55]:
#convert whole df
crashes_df['crash_date'] = pd.to_datetime(crashes_df['crash_date'], format = '%Y/%m/%d %H:%M:%S')

In [68]:
#crash day of week, crash hour and crash month look fine as far as their range
crashes_df['crash_day_of_week'].value_counts()

6    102794
7     94000
5     90397
3     89908
4     89524
2     87544
1     78070
Name: crash_day_of_week, dtype: int64

In [None]:
#injury columns to integers
#leave posted_speed_limit, lane_cnt as objects, they are more like categories than numerical measures
#leave crash_month, crash_day_of_week and crash_hour as objects for now, not sure what to make them

In [70]:
crashes_df['injuries_fatal'] = pd.to_numeric(crashes_df['injuries_fatal'])
crashes_df['injuries_non_incapacitating'] = pd.to_numeric(crashes_df['injuries_non_incapacitating'])
crashes_df['injuries_no_indication'] = pd.to_numeric(crashes_df['injuries_no_indication'])
crashes_df['injuries_incapacitating'] = pd.to_numeric(crashes_df['injuries_incapacitating'])
crashes_df['injuries_unknown'] = pd.to_numeric(crashes_df['injuries_unknown'])
crashes_df['injuries_total'] = pd.to_numeric(crashes_df['injuries_total'])
crashes_df['injuries_reported_not_evident'] = pd.to_numeric(crashes_df['injuries_reported_not_evident'])

In [71]:
crashes_df.dtypes

geometry                               geometry
injuries_fatal                          float64
injuries_non_incapacitating             float64
crash_record_id                          object
injuries_incapacitating                 float64
injuries_no_indication                  float64
latitude                                 object
injuries_unknown                        float64
device_condition                         object
crash_date                       datetime64[ns]
trafficway_type                          object
traffic_control_device                   object
road_defect                              object
longitude                                object
crash_month                              object
crash_day_of_week                        object
crash_hour                               object
first_crash_type                         object
injuries_reported_not_evident           float64
most_severe_injury                       object
prim_contributory_cause                 

## read_csv on people.csv
people_df = pd.read_csv('Traffic_Crashes_-_People.csv', usecols = ['PERSON_ID', 'PERSON_TYPE', 'CRASH_RECORD_ID', 'INJURY_CLASSIFICATION','DRIVER_ACTION', 'CRASH_DATE'])

In [72]:
endpoint_people = 'https://data.cityofchicago.org/resource/u6pd-qa9d.json?person_type=PEDESTRIAN&$limit=16000'
res = requests.get(endpoint_people)
res = res.json()
peds_df = pd.DataFrame(res)

In [78]:
peds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15836 entries, 0 to 15835
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   person_id              15836 non-null  object
 1   person_type            15836 non-null  object
 2   crash_record_id        15836 non-null  object
 3   crash_date             15836 non-null  object
 4   city                   14649 non-null  object
 5   state                  14497 non-null  object
 6   zipcode                10890 non-null  object
 7   sex                    15667 non-null  object
 8   age                    14562 non-null  object
 9   safety_equipment       12774 non-null  object
 10  injury_classification  15836 non-null  object
 11  hospital               11371 non-null  object
 12  ems_agency             9744 non-null   object
 13  driver_action          14007 non-null  object
 14  driver_vision          13688 non-null  object
 15  physical_condition 

In [79]:
#remove peds who were impaired by drugs or alcohol or both
physical_condition_list = ['NORMAL','UNKNOWN','REMOVED BY EMS','OTHER','EMOTIONAL']
peds_df = peds_df.loc[peds_df['physical_condition'].isin(physical_condition_list)]

In [80]:
peds_df['injury_classification'].value_counts()

NONINCAPACITATING INJURY    7562
INCAPACITATING INJURY       2391
REPORTED, NOT EVIDENT       2226
NO INDICATION OF INJURY     1923
FATAL                        169
Name: injury_classification, dtype: int64

In [83]:
peds_fatal_df = peds_df.loc[peds_df['injury_classification'] == 'FATAL']
peds_fatal_df.shape

(169, 23)

In [84]:
peds_incapacitating_df = peds_df.loc[peds_df['injury_classification'] == 'INCAPACITATING INJURY']
peds_incapacitating_df.shape

(2391, 23)

In [85]:
peds_nonincapacitating_df = peds_df.loc[peds_df['injury_classification'] == 'NONINCAPACITATING INJURY']
peds_nonincapacitating_df.shape

(7562, 23)

In [90]:
peds_no_signif_injury_df = peds_df.loc[peds_df['injury_classification'].isin(['REPORTED, NOT EVIDENT','NO INDICATION OF INJURY'])]
peds_no_signif_injury_df.shape

(4149, 23)

In [None]:
peds_df = people_df.loc[people_df['PERSON_TYPE'] == 'PEDESTRIAN']
crash_id_peds = peds_df['CRASH_RECORD_ID'].tolist()
crash_id_peds

In [None]:
ped_crashes_df = crashes_df.loc[crashes_df['crash_record_id'].isin(crash_id_peds)].reset_index(drop=True)

In [None]:
ped_crashes_df

In [None]:
ped_crashes_df.loc[0, 'geometry'].x

In [None]:
print(ped_crashes_df.crs)

In [None]:
ped_crashes_df.loc[0, 'geometry']
print(ped_crashes_df.loc[0, 'geometry'])

In [None]:
#there is one row in there with POINT of (0,0) and it throws off plotting the map, so get rid of it
ped_crashes_df = ped_crashes_df[ped_crashes_df['geometry'].x != 0]

In [None]:
#filter out null geometries
ped_crashes_df = ped_crashes_df[ped_crashes_df['geometry'].notna()]

In [None]:
ped_crashes_df.plot();

In [None]:
area_center = [41.881288,-87.686729]
peds_cluster_map = folium.Map(location =  area_center, zoom_start = 11)
marker_cluster = MarkerCluster().add_to(peds_cluster_map)
folium.GeoJson(ped_crashes_df).add_to(peds_cluster_map)

for row_index, row_values in ped_crashes_df.iterrows():
    loc = (row_values['geometry'].y, row_values['geometry'].x)
    
    marker = folium.Marker(location = loc)
    
    marker.add_to(marker_cluster)


peds_cluster_map

In [None]:
#split POINT geometries into separate lat and long columns
ped_crashes_df['lon'] = ped_crashes_df.geometry.apply(lambda p: p.x)
ped_crashes_df['lat'] = ped_crashes_df.geometry.apply(lambda p: p.y)

In [None]:
ped_crashes_df

In [None]:
area_center = [41.881288,-87.686729]
cluster_only_map = folium.Map(location =  area_center, zoom_start = 11)
folium.GeoJson(ped_crashes_df).add_to(cluster_only_map)

locations = ped_crashes_df[['lat', 'lon']].values.tolist()
cluster_only_map.add_child(
    FastMarkerCluster(locations)
)

cluster_only_map

map_37207 = folium.Map(location =  area_center, zoom_start = 12)

folium.GeoJson(polygon37207).add_to(map_37207)

#create a list of locations and pass them to FastMarkerCluster()
#FastMarkerCluster gives us locations only, no detail when you hover over
locations = stops_in_37207[['lat', 'lng']].values.tolist()
map_37207.add_child(
    FastMarkerCluster(locations)
)



#display our map
map_37207

In [None]:
peds_map