In [2]:
from shapely.geometry import Point, MultiLineString
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import json
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
#from geopy.geocoders import Nominatim
%matplotlib inline 

## get crashes_df from the API and clean

In [None]:
# Columns requested from the Chicago data portal crashes resource (85ca-t3if).
crash_columns = [
    'crash_record_id', 'crash_date', 'posted_speed_limit', 'traffic_control_device',
    'device_condition', 'first_crash_type', 'trafficway_type', 'lane_cnt',
    'road_defect', 'prim_contributory_cause', 'sec_contributory_cause',
    'most_severe_injury', 'injuries_total', 'injuries_fatal',
    'injuries_incapacitating', 'injuries_non_incapacitating',
    'injuries_reported_not_evident', 'injuries_no_indication', 'injuries_unknown',
    'crash_hour', 'crash_day_of_week', 'crash_month', 'street_no',
    'street_direction', 'street_name', 'latitude', 'longitude', 'location',
]
# NOTE(review): $limit caps the download at 650000 rows; confirm the dataset has not grown past it.
endpoint_crashes = (
    'https://data.cityofchicago.org/resource/85ca-t3if.geojson'
    + '?$select=' + ','.join(crash_columns)
    + '&$limit=650000'
)
response = requests.get(endpoint_crashes)
# Fail here on an HTTP error instead of later with a KeyError on res['features'].
response.raise_for_status()
res = response.json()

In [None]:
crashes_df = gpd.GeoDataFrame.from_features(res['features'])

In [None]:
crashes_df.info()

In [None]:
type(crashes_df)

In [None]:
# Exclude crashes whose recorded road defect is debris on the roadway.
no_debris = crashes_df['road_defect'].ne('DEBRIS ON ROADWAY')
crashes_df = crashes_df[no_debris]

In [None]:
# Exclude crashes where the traffic control device was malfunctioning or not working at all.
device_not_working = ['FUNCTIONING IMPROPERLY','NOT FUNCTIONING']
device_failed = crashes_df['device_condition'].isin(device_not_working)
crashes_df = crashes_df[~device_failed]

In [None]:
# One row sits at POINT (0, 0) and throws off the map plotted later on, so keep only rows whose x is non-zero.
crashes_df = crashes_df.loc[crashes_df['geometry'].x.ne(0)]

In [None]:
# Build "<street_no> <street_direction> <street_name> CHICAGO IL"; a missing part leaves the whole address null.
street_parts = [crashes_df['street_direction'], crashes_df['street_name']]
crashes_df['full_address'] = crashes_df['street_no'].str.cat(street_parts, sep=' ') + ' CHICAGO IL'

In [None]:
# Drop the handful of rows (5 at the time of writing) that have no full_address.
crashes_df = crashes_df.dropna(subset=['full_address'])

In [None]:
# Count rows with no geometry (3877 at the time of writing).
int(crashes_df['geometry'].isna().sum())

In [None]:
#geopy code. this takes an address and generates long/lat
#checking to see if it works for one single row before applying to larger set
# NOTE(review): the `from geopy.geocoders import Nominatim` line in the import cell is commented out,
# so `Nominatim` has to be imported before this cell can run on a fresh kernel.
geolocator = Nominatim(user_agent="colin")
# .loc[1, ...] looks up the row whose index label is 1
location = geolocator.geocode(crashes_df.loc[1,'full_address'])
print(location.address)
print((location.longitude, location.latitude))

In [None]:
#check what data type is generated
type(location.longitude)

In [None]:
#could try counter, progress bar
# Fill in missing geometries from full_address: every row without a geometry is geocoded and,
# on success, gets a POINT (longitude, latitude) written back into its geometry column.
# Rows that cannot be geocoded are reported by printing their index.
geolocator = Nominatim(user_agent="colin")  # one client reused for every lookup
for index, row in crashes_df.iterrows():
    if row.geometry is not None:
        continue
    try:
        location = geolocator.geocode(row['full_address'])
        if location is None:
            # geocode() returns None when the address has no match (see geopy docs)
            print(index)
            continue
        crashes_df.at[index, 'geometry'] = Point((location.longitude, location.latitude))
    except Exception as err:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still stops this long loop
        print(index, err)

In [None]:
#how many null geometries are there now that we populated missing ones from the address?
#there are still 2097. that must be how many weird addresses there are that it can't generate a lat/long for.
#addresses like '1 w parking lot a' 
crashes_df[crashes_df['geometry'].isna()]

In [None]:
# Keep only rows that ended up with a geometry; addresses that could not be geocoded are dropped here.
has_no_point = crashes_df['geometry'].isna()
crashes_df = crashes_df.loc[~has_no_point]

In [1]:
crashes_df.info()

NameError: name 'crashes_df' is not defined

In [9]:
crashes_df.columns

Index(['injuries_fatal', 'injuries_non_incapacitating', 'crash_record_id',
       'injuries_incapacitating', 'injuries_no_indication', 'latitude',
       'street_no', 'device_condition', 'crash_date', 'trafficway_type',
       'traffic_control_device', 'road_defect', 'longitude', 'crash_month',
       'street_name', 'crash_day_of_week', 'crash_hour', 'first_crash_type',
       'injuries_reported_not_evident', 'most_severe_injury',
       'prim_contributory_cause', 'sec_contributory_cause', 'street_direction',
       'posted_speed_limit', 'injuries_total', 'lane_cnt', 'full_address',
       'month_name', 'year', 'geometry', 'injuries_none'],
      dtype='object')

In [None]:
# The injury counts arrive as strings; convert each count column to a numeric dtype.
injury_count_columns = [
    'injuries_fatal',
    'injuries_non_incapacitating',
    'injuries_no_indication',
    'injuries_incapacitating',
    'injuries_unknown',
    'injuries_total',
    'injuries_reported_not_evident',
]
for column in injury_count_columns:
    crashes_df[column] = pd.to_numeric(crashes_df[column])

In [6]:
# "injuries none" = "injuries reported not evident" + "injuries no indication"
reported_not_evident = crashes_df['injuries_reported_not_evident']
crashes_df['injuries_none'] = reported_not_evident.add(crashes_df['injuries_no_indication'])

In [None]:
#check that 2nd and 3rd add up to 1st
crashes_df[['injuries_none', 'injuries_reported_not_evident', 'injuries_no_indication']]

In [8]:
#had to go back in and rename to "injuries none" and delete 2 created columns (first 2) and 1 unnecessary original column (unknown)
# errors='ignore': the two scratch columns are not created by any cell shown in this notebook, so a
# fresh run would otherwise stop here with a KeyError; it also makes the cell safe to re-run.
crashes_df = crashes_df.drop(
    columns=['no_significant_injury', 'injuries_not_significant', 'injuries_unknown'],
    errors='ignore',
)

In [None]:
crashes_df = crashes_df.reset_index(drop=True)

### create separate year & month/year columns

In [None]:
# crash_date is not converted to datetime for crashes_df in any visible cell (it is kept as text for
# folium), so parse it on the fly; pd.to_datetime is a no-op if the column is already datetime.
crashes_df['month_name'] = pd.to_datetime(crashes_df['crash_date']).dt.month_name()

In [None]:
# Parse crash_date on the fly so the .dt accessor works whether the column holds text or datetimes.
crashes_df['year'] = pd.to_datetime(crashes_df['crash_date']).dt.year

In [None]:
crashes_df.info()

In [10]:
#save crashes_df as geojson
crashes_df.to_file("data/crashes_cleaned.geojson", driver='GeoJSON')

## get pedestrians from the people API, clean, export to CSV

In [None]:
# Download every PEDESTRIAN record from the people resource (u6pd-qa9d) into a DataFrame.
# NOTE(review): $limit caps the download at 16000 rows; confirm the number of pedestrian records is still below it.
endpoint_people = 'https://data.cityofchicago.org/resource/u6pd-qa9d.json?person_type=PEDESTRIAN&$limit=16000'
response = requests.get(endpoint_people)
# Stop on an HTTP error instead of building a DataFrame out of an error payload.
response.raise_for_status()
res = response.json()
peds_df = pd.DataFrame(res)

In [None]:
peds_df['injury_classification'].value_counts()

In [None]:
# Drop pedestrians whose physical_condition is one of the impaired values below.
# This is an exclusion list on purpose: selecting the "good" values
# ['NORMAL','UNKNOWN','REMOVED BY EMS','OTHER','EMOTIONAL'] instead would also throw away ~1000 NaN rows.
impaired_list = [
    'IMPAIRED - ALCOHOL',
    'HAD BEEN DRINKING',
    'IMPAIRED - DRUGS',
    'IMPAIRED - ALCOHOL AND DRUGS',
    'FATIGUED/ASLEEP',
    'ILLNESS/FAINTED',
    'MEDICATED',
]
ped_is_impaired = peds_df['physical_condition'].isin(impaired_list)
peds_df = peds_df[~ped_is_impaired]

In [None]:
# Also exclude rows whose pedpedal_action is "INTOXICATED PED/PEDAL".
ped_not_intoxicated = peds_df['pedpedal_action'].ne('INTOXICATED PED/PEDAL')
peds_df = peds_df[ped_not_intoxicated]

In [None]:
peds_df = peds_df.reset_index(drop=True)

In [None]:
#combine "REPORTED, NOT EVIDENT" and "NO INDICATION OF INJURY" into a single "NO INJURY" value in the injury_classification column
#this makes more semantic sense for our purposes (the 2 original categories probably exist for police/legal purposes)
no_injury_labels = {
    "REPORTED, NOT EVIDENT": 'NO INJURY',
    "NO INDICATION OF INJURY": 'NO INJURY',
}
# vectorized relabel; every other value (including NaN) is left untouched
peds_df['injury_classification'] = peds_df['injury_classification'].replace(no_injury_labels)

In [13]:
#went back in and changed "NO SIGNIFICANT INJURY" to "NO INJURY" because semantically it makes more sense: these are people who were involved in the crash but who were not injured. the field is NOT used in calculating injuries_total
# vectorized rename; a no-op when the old label is not present
peds_df['injury_classification'] = peds_df['injury_classification'].replace("NO SIGNIFICANT INJURY", 'NO INJURY')

In [14]:
#make sure it worked
peds_df['injury_classification'].value_counts()

NONINCAPACITATING INJURY    8075
NO INJURY                   4467
INCAPACITATING INJURY       2575
FATAL                        191
Name: injury_classification, dtype: int64

In [15]:
peds_df.to_csv('data/peds_cleaned_df.csv')

## get cyclists from the people API, clean, export to CSV

In [None]:
# Download every BICYCLE record from the people resource (u6pd-qa9d) into a DataFrame.
# NOTE(review): $limit caps the download at 16000 rows; confirm the number of cyclist records is still below it.
endpoint_people = 'https://data.cityofchicago.org/resource/u6pd-qa9d.json?person_type=BICYCLE&$limit=16000'
response = requests.get(endpoint_people)
# Stop on an HTTP error instead of building a DataFrame out of an error payload.
response.raise_for_status()
res = response.json()
cyclists_df = pd.DataFrame(res)

In [None]:
cyclists_df.info()

In [None]:
# Drop cyclists whose physical_condition is one of the impaired values below.
# This is an exclusion list on purpose: selecting the "good" values
# ['NORMAL','UNKNOWN','REMOVED BY EMS','OTHER','EMOTIONAL'] instead would also throw away the NaN rows.
impaired_list = [
    'IMPAIRED - ALCOHOL',
    'HAD BEEN DRINKING',
    'IMPAIRED - DRUGS',
    'IMPAIRED - ALCOHOL AND DRUGS',
    'FATIGUED/ASLEEP',
    'ILLNESS/FAINTED',
    'MEDICATED',
]
cyclist_is_impaired = cyclists_df['physical_condition'].isin(impaired_list)
cyclists_df = cyclists_df[~cyclist_is_impaired]

In [None]:
# Also exclude rows whose pedpedal_action is "INTOXICATED PED/PEDAL".
cyclist_not_intoxicated = cyclists_df['pedpedal_action'].ne('INTOXICATED PED/PEDAL')
cyclists_df = cyclists_df[cyclist_not_intoxicated]

In [None]:
cyclists_df = cyclists_df.reset_index(drop=True)

In [None]:
# Combine "REPORTED, NOT EVIDENT" and "NO INDICATION OF INJURY" into a single "NO INJURY" value,
# the same label the pedestrian cleaning writes, so both exports agree.
no_injury_labels = {
    "REPORTED, NOT EVIDENT": 'NO INJURY',
    "NO INDICATION OF INJURY": 'NO INJURY',
}
# vectorized relabel; every other value (including NaN) is left untouched
cyclists_df['injury_classification'] = cyclists_df['injury_classification'].replace(no_injury_labels)

In [18]:
#went back in and changed "NO SIGNIFICANT INJURY" to "NO INJURY" because semantically it makes more sense: these are people who were involved in the crash but who were not injured. the field is NOT used in calculating injuries_total
# vectorized rename; a no-op when the old label is not present
cyclists_df['injury_classification'] = cyclists_df['injury_classification'].replace("NO SIGNIFICANT INJURY", 'NO INJURY')

In [19]:
cyclists_df['injury_classification'].value_counts()

NONINCAPACITATING INJURY    4556
NO INJURY                   4026
INCAPACITATING INJURY        862
FATAL                         29
Name: injury_classification, dtype: int64

In [20]:
cyclists_df.to_csv('data/cyclists_cleaned_df.csv')

# crash_date cleaning

In [23]:
cyclists_df['crash_date'] = pd.to_datetime(cyclists_df['crash_date'])

In [14]:
peds_df['crash_date'] = pd.to_datetime(peds_df['crash_date'])

0       2022-08-08 17:22:00
1       2022-08-08 17:12:00
2       2022-08-08 15:34:00
3       2022-08-08 14:54:00
4       2022-08-08 09:28:00
                ...        
15303   2015-09-10 14:00:00
15304   2015-09-07 16:30:00
15305   2015-09-04 23:10:00
15306   2015-08-15 12:30:00
15307   2015-08-11 06:30:00
Name: crash_date, Length: 15308, dtype: datetime64[ns]

In [None]:
#gives us a Timestamp
# NOTE(review): `timestamp` is not assigned in any cell visible in this notebook - presumably a single
# crash_date value left over from an earlier session; define it above so a fresh kernel can run this.
# Only the last expression of a cell is displayed, so the type(...) result below is not shown.
type(timestamp)
timestamp

In [None]:
#can use Timestamp methods on it
#Timestamp is pandas equivalent of Python datetime 
#https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html
timestamp.month_name()

### making this a Timestamp screwed up my folium map. folium map wants a string
### having as Timestamp throws TypeError: Object of type Timestamp is not JSON serializable
#convert crash_date of whole df
#crashes_df['crash_date'] = pd.to_datetime(crashes_df['crash_date'], format = '%Y/%m/%d %H:%M:%S')

In [None]:
#crash day of week, crash hour and crash month look fine as far as their range
crashes_df['crash_day_of_week'].value_counts()

In [None]:
#injury columns to integers
#leave posted_speed_limit, lane_cnt as objects, they are more like categories than numerical measures
#leave crash_month, crash_day_of_week and crash_hour as objects for now, not sure what to make them

In [None]:
crashes_df.dtypes

# get roadway class for each crash by joining crashes with street center lines
## I took the POINTs from crashes and left joined them ON the nearest point within the multiline strings contained in street center lines, augmenting crashes with the info from street center lines

In [21]:
# Reload the cleaned crashes and declare their CRS as EPSG:4326; the trailing expression displays the frame.
crashes_df = gpd.read_file('data/crashes_cleaned.geojson').set_crs(crs='EPSG:4326')
crashes_df

Unnamed: 0,injuries_fatal,injuries_non_incapacitating,crash_record_id,injuries_incapacitating,injuries_no_indication,latitude,street_no,device_condition,crash_date,trafficway_type,...,sec_contributory_cause,street_direction,posted_speed_limit,injuries_total,lane_cnt,full_address,month_name,year,injuries_none,geometry
0,,,0edc78f89df5b72ddaa1c1f567e229ad39c0e098953a9d...,,,41.873520168,3357,NO CONTROLS,2022-08-09 01:40:00,OTHER,...,NOT APPLICABLE,W,30,,,3357 W HARRISON ST CHICAGO IL,August,2022,,POINT (-87.71064 41.87352)
1,0.0,0.0,49ff3f04d16f5a71e6d66436c94f6acfd939c20d6c3651...,0.0,2.0,41.831835811,3410,FUNCTIONING PROPERLY,2022-08-09 00:45:00,NOT DIVIDED,...,NOT APPLICABLE,S,30,0.0,,3410 S DAMEN AVE CHICAGO IL,August,2022,2.0,POINT (-87.67546 41.83184)
2,,,e580e89f187525bf685101a36fc64df499a72be926d5a9...,,,41.838371536,130,FUNCTIONING PROPERLY,2022-08-08 23:55:00,DIVIDED - W/MEDIAN (NOT RAISED),...,NOT APPLICABLE,W,35,,,130 W 31ST ST CHICAGO IL,August,2022,,POINT (-87.63002 41.83837)
3,0.0,0.0,cdd7c5d90668e4d1bda12805ad19cec305667643a88806...,0.0,1.0,41.861481407,2747,NO CONTROLS,2022-08-08 23:00:00,OTHER,...,UNABLE TO DETERMINE,W,30,0.0,,2747 W OGDEN AVE CHICAGO IL,August,2022,1.0,POINT (-87.69494 41.86148)
4,0.0,0.0,e7c26ad2dd7f250b14acafe2d86265616f59f7c642a505...,0.0,2.0,41.837964154,3100,FUNCTIONING PROPERLY,2022-08-08 23:00:00,FOUR WAY,...,DISREGARDING TRAFFIC SIGNALS,S,35,0.0,,3100 S HALSTED ST CHICAGO IL,August,2022,2.0,POINT (-87.64642 41.83796)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631400,0.0,0.0,964aaaeb569e364886cfbdf89ca73e09ca15cd916d87b2...,0.0,2.0,41.835886103,3132,NO CONTROLS,2014-06-25 19:00:00,NOT DIVIDED,...,NOT APPLICABLE,S,30,0.0,,3132 S PULASKI RD CHICAGO IL,June,2014,2.0,POINT (-87.72447 41.83589)
631401,0.0,0.0,1d0232afecbdfd01968555aa956a688fd6f55a2bd1984f...,0.0,2.0,41.884016475,199,FUNCTIONING PROPERLY,2014-02-24 19:45:00,NOT DIVIDED,...,UNABLE TO DETERMINE,N,30,0.0,,199 N SACRAMENTO BLVD CHICAGO IL,February,2014,2.0,POINT (-87.70114 41.88402)
631402,0.0,1.0,957783a4787318f005a7dbc920e4c84cb9ac8aa7329a62...,0.0,1.0,41.760710194,7400,NO CONTROLS,2014-01-21 07:40:00,DIVIDED - W/MEDIAN (NOT RAISED),...,NOT APPLICABLE,S,30,1.0,,7400 S EXCHANGE AVE CHICAGO IL,January,2014,1.0,POINT (-87.56195 41.76071)
631403,0.0,0.0,f62e27317feb174811cf4fefeb9fa1064fea6c0619a873...,0.0,2.0,41.885609917,415,NO CONTROLS,2014-01-18 18:14:00,DIVIDED - W/MEDIAN BARRIER,...,UNABLE TO DETERMINE,W,30,0.0,,415 W LAKE ST CHICAGO IL,January,2014,2.0,POINT (-87.63876 41.88561)


In [None]:
crashes_df.crs

## we only need to get roadway class for ped and cyclist crashes, not all crashes. create crashes_ped_df and crashes_cyclist_df, then join each to street center lines

In [13]:
peds_df = pd.read_csv('data/peds_cleaned_df.csv')

In [3]:
cyclists_df = pd.read_csv('data/cyclists_cleaned_df.csv')

In [24]:
# Keep only the crashes whose crash_record_id appears in the cleaned pedestrian table.
crash_id_peds = peds_df['crash_record_id'].tolist()
involves_ped = crashes_df['crash_record_id'].isin(crash_id_peds)
crashes_ped_df = crashes_df[involves_ped].reset_index(drop=True)

In [25]:
# Keep only the crashes whose crash_record_id appears in the cleaned cyclist table.
crash_id_cyclists = cyclists_df['crash_record_id'].tolist()
involves_cyclist = crashes_df['crash_record_id'].isin(crash_id_cyclists)
crashes_cyclist_df = crashes_df[involves_cyclist].reset_index(drop=True)

## load street center lines from geojson

In [26]:
street_center_lines_geo = gpd.read_file('data/street_center_lines.geojson')
# gpd.explode this would explode multi part geometries into multiple single part geometries

In [27]:
street_center_lines_geo['geometry'] = street_center_lines_geo['geometry'].to_crs(crs='EPSG:4326')

In [28]:
street_center_lines_geo = street_center_lines_geo[['geometry', 'class']]

In [29]:
# Attach the nearest street centerline (and its `class`) to every pedestrian crash.
# how='left' keeps every crash; the measured distance is stored in the `distance` column.
# NOTE(review): both frames are in EPSG:4326, so max_distance=0.001 and `distance` are presumably
# in degrees rather than metres - confirm this tolerance is what is intended.
crashes_ped_aug_df = gpd.sjoin_nearest(crashes_ped_df, street_center_lines_geo, how = 'left', max_distance = 0.001, distance_col= 'distance')




### common sense check - do the roadway classes match up with what you would expect from the addresses? It seems accurate that 500 S LSD is marked as class 1, and plausible that 7100 S Western would be class 2, but it seems wrong that 1000 N Cicero would be class 4. There are two possibilities: 1) when the join picked the nearest point, it actually got a point from Augusta Blvd, which would be class 4 over there. If the crash actually happened *ON CICERO*, then the roadway class would be wrong. 2) The address entered for the crash is wrong, and the crash actually happened on Augusta, not Cicero, based on the long/lat. In that case the 4 listed as roadway class would actually be right.


In [33]:
crashes_ped_aug_df

Unnamed: 0,injuries_fatal,injuries_non_incapacitating,crash_record_id,injuries_incapacitating,injuries_no_indication,latitude,street_no,device_condition,crash_date,trafficway_type,...,lane_cnt,full_address,month_name,year,injuries_none,geometry,index_right,class,distance,roadway_class
0,0.0,0.0,10c02ad83b1b241f057db7e3f4bebdb9a9a2798dc57db4...,1.0,1.0,41.876656066,400,FUNCTIONING PROPERLY,2022-08-08 17:22:00,FOUR WAY,...,,400 S HALSTED ST CHICAGO IL,August,2022,1.0,POINT (-87.64735 41.87666),21960.0,3,0.000013,collector
1,0.0,0.0,e287152209a3cf769764c1df04fa42bc3eddafd1df32ef...,0.0,2.0,41.898729831,1000,FUNCTIONING PROPERLY,2022-08-08 17:12:00,DIVIDED - W/MEDIAN (NOT RAISED),...,,1000 N CICERO AVE CHICAGO IL,August,2022,2.0,POINT (-87.74604 41.89873),1269.0,4,0.000051,other streets
2,0.0,0.0,862babdabad4148c0e2345ae956733d6c0d3e4fb5b5f11...,0.0,1.0,41.880437392,601,FUNCTIONING PROPERLY,2022-08-08 15:34:00,FOUR WAY,...,,601 W MONROE ST CHICAGO IL,August,2022,2.0,POINT (-87.64269 41.88044),40998.0,4,0.000075,other streets
3,0.0,1.0,1a12b6c50c25d0f049e4d33dce433272217e0d220f8963...,0.0,1.0,41.875756255,500,FUNCTIONING PROPERLY,2022-08-08 14:54:00,DIVIDED - W/MEDIAN BARRIER,...,,500 S LAKE SHORE DR NB CHICAGO IL,August,2022,1.0,POINT (-87.61752 41.87576),13034.0,1,0.000147,expressway
4,0.0,0.0,94d427ca74695d355d9dad4d717140f436aeb16f447a24...,0.0,1.0,41.842510273,2701,NO CONTROLS,2022-08-08 09:28:00,PARKING LOT,...,,2701 S CALIFORNIA AVE CHICAGO IL,August,2022,2.0,POINT (-87.69500 41.84251),50025.0,4,0.000068,other streets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14384,0.0,1.0,5c2f4f8b30e69e1de1f45f57e8a38dec2dcf7fc04d7fbe...,0.0,1.0,41.871010349,3400,FUNCTIONING PROPERLY,2015-09-10 14:00:00,NOT DIVIDED,...,2,3400 W POLK ST CHICAGO IL,September,2015,1.0,POINT (-87.71081 41.87101),27603.0,4,0.000110,other streets
14385,0.0,1.0,752aff26b79c9532bec8561c5f3096c2fe4c1fe39ff4c1...,0.0,1.0,41.791283706,5600,NO CONTROLS,2015-09-07 16:30:00,OTHER,...,3,5600 S CICERO AVE CHICAGO IL,September,2015,1.0,POINT (-87.74170 41.79128),35447.0,2,0.000147,arterial
14386,0.0,0.0,55fce492bac7d0ef0e025a0b1d99a6b748436bfa095cf3...,1.0,2.0,41.764608912,7100,NO CONTROLS,2015-09-04 23:10:00,PARKING LOT,...,0,7100 S WESTERN AVE CHICAGO IL,September,2015,2.0,POINT (-87.68339 41.76461),1870.0,2,0.000090,arterial
14387,0.0,1.0,b81b574c31921480db8a53b6e3472948e42f356906dfc1...,0.0,1.0,41.88061965,3997,FUNCTIONING PROPERLY,2015-08-15 12:30:00,NOT DIVIDED,...,4,3997 W MADISON ST CHICAGO IL,August,2015,1.0,POINT (-87.72555 41.88062),54291.0,2,0.000110,arterial


In [31]:
# Street-centerline `class` codes -> readable roadway names.
ROADWAY_CLASS_NAMES = {
    '1': 'expressway',
    '2': 'arterial',
    '3': 'collector',
    '4': 'other streets',
    '5': 'named alley',
    '7': 'tiered',
    '9': 'ramps',
    'E': 'extent',
    'RIV': 'river',
}
# Vectorized lookup: each row is labelled from its own `class` value, independent of index labels
# (the previous row loop wrote through .loc[index], which touches every row sharing that label).
# Codes missing from the table, and rows with no `class`, end up as NaN.
crashes_ped_aug_df['roadway_class'] = crashes_ped_aug_df['class'].map(ROADWAY_CLASS_NAMES)

In [32]:
crashes_ped_aug_df['roadway_class'].value_counts()

other streets    6425
arterial         4120
collector        3502
expressway         99
extent             86
tiered             80
ramps              47
named alley        19
river              10
Name: roadway_class, dtype: int64

In [34]:
#save to geojson
crashes_ped_aug_df.to_file("data/crashes_ped_aug_df.geojson", driver='GeoJSON')

In [38]:
#save to csv
crashes_ped_aug_df.to_csv('data/crashes_ped_aug_df.csv')

In [35]:
# Attach the nearest street centerline (and its `class`) to every cyclist crash.
# how='left' keeps every crash; the measured distance is stored in the `distance` column.
# NOTE(review): both frames are in EPSG:4326, so max_distance=0.001 and `distance` are presumably
# in degrees rather than metres - confirm this tolerance is what is intended.
crashes_cyclist_aug_df = gpd.sjoin_nearest(crashes_cyclist_df, street_center_lines_geo, how = 'left', max_distance = 0.001, distance_col= 'distance')




In [16]:
crashes_cyclist_aug_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 9292 entries, 0 to 9291
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   injuries_fatal                 9292 non-null   float64       
 1   injuries_non_incapacitating    9292 non-null   float64       
 2   crash_record_id                9292 non-null   object        
 3   injuries_incapacitating        9292 non-null   float64       
 4   injuries_no_indication         9292 non-null   float64       
 5   latitude                       9278 non-null   object        
 6   street_no                      9292 non-null   object        
 7   injuries_unknown               9292 non-null   float64       
 8   device_condition               9292 non-null   object        
 9   crash_date                     9292 non-null   datetime64[ns]
 10  trafficway_type                9292 non-null   object        
 11  traffic_c

In [17]:
crashes_cyclist_aug_df['class'].value_counts()

4      4110
3      2869
2      2083
1        64
E        54
9        48
7        33
5        21
RIV      10
Name: class, dtype: int64

In [36]:
# Street-centerline `class` codes -> readable roadway names.
ROADWAY_CLASS_NAMES = {
    '1': 'expressway',
    '2': 'arterial',
    '3': 'collector',
    '4': 'other streets',
    '5': 'named alley',
    '7': 'tiered',
    '9': 'ramps',
    'E': 'extent',
    'RIV': 'river',
}
# Vectorized lookup: each row is labelled from its own `class` value, independent of index labels
# (the previous row loop wrote through .loc[index], which touches every row sharing that label).
# Codes missing from the table, and rows with no `class`, end up as NaN.
crashes_cyclist_aug_df['roadway_class'] = crashes_cyclist_aug_df['class'].map(ROADWAY_CLASS_NAMES)

In [37]:
crashes_cyclist_aug_df

Unnamed: 0,injuries_fatal,injuries_non_incapacitating,crash_record_id,injuries_incapacitating,injuries_no_indication,latitude,street_no,device_condition,crash_date,trafficway_type,...,lane_cnt,full_address,month_name,year,injuries_none,geometry,index_right,class,distance,roadway_class
0,0.0,0.0,73c7c72402a0325ee3217b85b97f1cd880dce1e273294e...,0.0,2.0,42.012821129,1600,NO CONTROLS,2022-08-08 16:35:00,NOT DIVIDED,...,,1600 W TOUHY AVE CHICAGO IL,August,2022,2.0,POINT (-87.67031 42.01282),15801,4,0.000108,other streets
1,0.0,0.0,e7b51330cb3cffbe562d80eb7d25cce413dcf4d3d6f8f2...,0.0,2.0,41.890595335,440,NO CONTROLS,2022-08-08 14:12:00,DIVIDED - W/MEDIAN BARRIER,...,,440 N MICHIGAN AVE CHICAGO IL,August,2022,2.0,POINT (-87.62427 41.89060),55009,7,0.000116,tiered
2,0.0,0.0,8fda3f8d995b2c9ba0cd7cbe35f60aacd6708defbd91db...,0.0,2.0,41.758291098,2,NO CONTROLS,2022-08-08 13:00:00,NOT DIVIDED,...,,2 W 75TH ST CHICAGO IL,August,2022,2.0,POINT (-87.62479 41.75829),32236,2,0.000029,arterial
3,0.0,1.0,e5bb72ff7b983e5d06b7419143e5bf9a6f6028c84cb0de...,0.0,1.0,41.800837848,2899,NO CONTROLS,2022-08-07 19:35:00,NOT DIVIDED,...,,2899 W 51ST ST CHICAGO IL,August,2022,1.0,POINT (-87.69642 41.80084),2237,4,0.000014,other streets
4,0.0,1.0,37a4a38e3879c073141bbe1259d755e6c9706c732e77c6...,0.0,1.0,41.758292422,118,NO CONTROLS,2022-08-07 17:19:00,DIVIDED - W/MEDIAN (NOT RAISED),...,,118 W 75TH ST CHICAGO IL,August,2022,1.0,POINT (-87.62797 41.75829),7348,4,0.000041,other streets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9287,0.0,0.0,0ebcfc991b206474cf68ef0099609684dffb3b0c1e078e...,0.0,1.0,41.915249987,1816,FUNCTIONING PROPERLY,2015-08-26 08:41:00,NOT DIVIDED,...,4,1816 N CLARK ST CHICAGO IL,August,2015,2.0,POINT (-87.63429 41.91525),1177,3,0.000138,collector
9288,0.0,1.0,a2458d8a19e7f502ec81f85ecdbd971c1e4bb12dc34da8...,0.0,1.0,41.89558431,752,FUNCTIONING PROPERLY,2015-08-21 09:30:00,NOT DIVIDED,...,2,752 N OGDEN AVE CHICAGO IL,August,2015,1.0,POINT (-87.65539 41.89558),54633,4,0.000009,other streets
9289,0.0,0.0,3d25ae31f7adea360bd4b6fb138c541a84443dae0fad58...,0.0,3.0,41.87440974,105,NO CONTROLS,2015-08-19 18:45:00,NOT DIVIDED,...,2,105 W HARRISON ST CHICAGO IL,August,2015,3.0,POINT (-87.63085 41.87441),8767,3,0.000110,collector
9290,0.0,0.0,960536ce2853fdd8c904e8ed3ae0ba186535e0049a999b...,0.0,1.0,41.891134744,495,NO CONTROLS,2015-08-19 09:15:00,DIVIDED - W/MEDIAN (NOT RAISED),...,2,495 N MILWAUKEE AVE CHICAGO IL,August,2015,2.0,POINT (-87.64730 41.89113),52900,3,0.000051,collector


In [23]:
crashes_cyclist_aug_df['roadway_class'].value_counts()

other streets    4110
collector        2869
arterial         2083
expressway         64
extent             54
ramps              48
tiered             33
named alley        21
river              10
Name: roadway_class, dtype: int64

In [39]:
#save to geojson
crashes_cyclist_aug_df.to_file("data/crashes_cyclist_aug_df.geojson", driver='GeoJSON')

In [40]:
#save to csv
crashes_cyclist_aug_df.to_csv('data/crashes_cyclist_aug_df.csv')

### how many of the crashes in crashes_ped_aug_df.geojson and crashes_cyclist_aug_df.geojson have the ped or cyclist for that crash_record_id as '1' for unit_no in Vehicles? Unit 1 can mean one of 2 things: it is the "known or perceived vehicle at-fault" OR "if the at-fault vehicle is not evident, the striking unit should be entered as Unit 1."

In [4]:
vehicles_df = pd.read_csv('data/Traffic_Crashes_-_Vehicles.csv')

  vehicles_df = pd.read_csv('data/Traffic_Crashes_-_Vehicles.csv')


In [5]:
vehicles_df.head()

Unnamed: 0,CRASH_UNIT_ID,CRASH_RECORD_ID,RD_NO,CRASH_DATE,UNIT_NO,UNIT_TYPE,NUM_PASSENGERS,VEHICLE_ID,CMRC_VEH_I,MAKE,...,TRAILER1_LENGTH,TRAILER2_LENGTH,TOTAL_VEHICLE_LENGTH,AXLE_CNT,VEHICLE_CONFIG,CARGO_BODY_TYPE,LOAD_TYPE,HAZMAT_OUT_OF_SERVICE_I,MCS_OUT_OF_SERVICE_I,HAZMAT_CLASS
0,829999,24ddf9fd8542199d832e1c223cc474e5601b356f1d77a6...,JD124535,01/22/2020 06:25:00 AM,1,DRIVER,,796949.0,,INFINITI,...,,,,,,,,,,
1,749947,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,JC451435,09/28/2019 03:30:00 AM,1,DRIVER,,834816.0,,HONDA,...,,,,,,,,,,
2,749949,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,JC451435,09/28/2019 03:30:00 AM,2,PARKED,,834819.0,,TOYOTA,...,,,,,,,,,,
3,749950,81dc0de2ed92aa62baccab641fa377be7feb1cc47e6554...,JC451435,09/28/2019 03:30:00 AM,3,PARKED,,834817.0,,GENERAL MOTORS CORPORATION (GMC),...,,,,,,,,,,
4,871921,af84fb5c8d996fcd3aefd36593c3a02e6e7509eeb27568...,JD208731,04/13/2020 10:50:00 PM,2,DRIVER,,827212.0,,BUICK,...,,,,,,,,,,


In [15]:
vehicles_df['UNIT_TYPE'].value_counts()

DRIVER                 1095565
PARKED                  171961
PEDESTRIAN               15945
DRIVERLESS               11582
BICYCLE                   9704
NON-MOTOR VEHICLE         1156
NON-CONTACT VEHICLE        254
DISABLED VEHICLE           150
EQUESTRIAN                   6
Name: UNIT_TYPE, dtype: int64

In [23]:
peds_unit_no = vehicles_df[vehicles_df['UNIT_TYPE'] == 'PEDESTRIAN']

In [24]:
peds_fault_df = peds_unit_no[peds_unit_no['UNIT_NO'] == 1]

In [41]:
peds_fault_df.shape
#1277 peds listed as Unit 1 in Vehicles

(1277, 72)

In [25]:
peds_not_fault_df = peds_unit_no[peds_unit_no['UNIT_NO'] != 1]

In [27]:
peds_not_fault_df.shape

(14668, 72)

In [28]:
cyclists_unit_no = vehicles_df[vehicles_df['UNIT_TYPE'] == 'BICYCLE']

In [29]:
cyclists_fault_df = cyclists_unit_no[cyclists_unit_no['UNIT_NO'] == 1]

In [45]:
cyclists_fault_df.shape
#2761 cyclists listed as Unit 1 in Vehicles

(2761, 72)

In [30]:
cyclists_not_fault_df = cyclists_unit_no[cyclists_unit_no['UNIT_NO'] != 1]

In [32]:
cyclists_not_fault_df.shape

(6943, 72)

In [37]:
crashes_ped_aug_df = gpd.read_file('data/crashes_ped_aug_df.geojson')

In [52]:
crashes_ped_aug_df.shape

(14389, 35)

In [47]:
crashes_cyclist_aug_df = gpd.read_file('data/crashes_cyclist_aug_df.geojson')

In [53]:
crashes_cyclist_aug_df.shape

(9292, 35)

In [42]:
# How many crashes in crashes_ped_aug_df have a pedestrian listed as unit 1 in Vehicles? (1131 at the time of writing)
crash_id_peds = peds_fault_df['CRASH_RECORD_ID'].tolist()
ped_is_unit1 = crashes_ped_aug_df['crash_record_id'].isin(crash_id_peds)
crashes_ped_aug_df[ped_is_unit1].reset_index(drop=True).shape

(1131, 35)

In [51]:
# How many crashes in crashes_cyclist_aug_df have a cyclist listed as unit 1 in Vehicles? (2626 at the time of writing)
crash_id_cyclists = cyclists_fault_df['CRASH_RECORD_ID'].tolist()
cyclist_is_unit1 = crashes_cyclist_aug_df['crash_record_id'].isin(crash_id_cyclists)
crashes_cyclist_aug_df[cyclist_is_unit1].reset_index(drop=True).shape

(2626, 35)