In [113]:
import pandas as pd
import pyparsing as pp


In [2]:

DATA1 = "../data/cycling_safety_louisville.csv"
# DATA1 points to crash data from 2010 to 2017
# This data came from: https://zenodo.org/records/5603036
# Source: https://zenodo.org/records/5603036/files/louisville.zip

DATA2 = "../data/Louisville_Metro_KY_-_Traffic_Fatalities_and_Suspected_Serious_Injuries.csv"
# DATA2 points to crash data from 2016 to 2023.
# This data came from the Louisville Open Data portal
# Source: https://data.louisvilleky.gov/datasets/LOJIC::louisville-metro-ky-traffic-fatalities-and-suspected-serious-injuries-1/explore


In [3]:
df1 = pd.read_csv(DATA1)
df2 = pd.read_csv(DATA2)

## Geolocation fields
`['GPS LATITUDE DECIMAL', 'GPS LONGITUDE DECIMAL', 'Latitude', 'Longitude', 'geometry']`


In [4]:
geodata = df1[['GPS LATITUDE DECIMAL', 'GPS LONGITUDE DECIMAL', 'Latitude', 'Longitude', 'geometry']]
geodata


Unnamed: 0,GPS LATITUDE DECIMAL,GPS LONGITUDE DECIMAL,Latitude,Longitude,geometry
0,38.231850,-85.707933,38.231850,-85.707933,POINT (-85.707933333 38.23185)
1,38.273995,-85.696572,38.273995,-85.696572,POINT (-85.6965716 38.2739947)
2,38.258551,-85.703576,38.258551,-85.703576,POINT (-85.70357610000001 38.2585512)
3,38.250012,-85.697265,38.250012,-85.697265,POINT (-85.6972652 38.2500121)
4,38.195890,-85.793380,38.195890,-85.793380,POINT (-85.7933803 38.1958905)
...,...,...,...,...,...
1268,38.153815,-85.733644,38.153815,-85.733644,POINT (-85.73364410000001 38.1538153)
1269,38.163618,-85.688008,38.163618,-85.688008,POINT (-85.6880079 38.1636178)
1270,38.160030,-85.671480,38.160030,-85.671480,POINT (-85.6714798 38.1600301)
1271,38.198257,-85.626309,38.198257,-85.626309,POINT (-85.62630919999999 38.1982569)


In [5]:

df1[df1['GPS LATITUDE DECIMAL'] != df1['Latitude']]
# This returns an empty dataframe. Great. These two fields are identical
# Test the same for longitude
df1[df1['GPS LONGITUDE DECIMAL'] != df1['Longitude']]
# Same result: an empty data frame

Unnamed: 0.1,Unnamed: 0,MASTER FILE NUMBER,INVESTIGATING AGENCY,LOCAL CODE,COLLISION STATUS CODE,COUNTY NAME,ROADWAY NUMBER,BLOCK/HOUSE #,ROADWAY NAME,ROADWAY SUFFIX,...,RAMP FROM ROADWAY ID,RAMP TO ROADWAY ID,SECONDARY COLLISION INDICATOR,hour,minute,Date,Latitude,Longitude,geometry,index_right


In [6]:
# Dropping the columns with the more cumbersome names to type
#geodata = geodata.drop(['GPS LATITUDE DECIMAL', 'GPS LONGITUDE DECIMAL'], axis=1)
    # The above line is commented out to workaround jupyter. 
    # The first time you run this cell, it will work. After that, jupyter tries to drop columns
    # from the table that no longer exist. 
# TODO Fix how this works.
geodata


Unnamed: 0,GPS LATITUDE DECIMAL,GPS LONGITUDE DECIMAL,Latitude,Longitude,geometry
0,38.231850,-85.707933,38.231850,-85.707933,POINT (-85.707933333 38.23185)
1,38.273995,-85.696572,38.273995,-85.696572,POINT (-85.6965716 38.2739947)
2,38.258551,-85.703576,38.258551,-85.703576,POINT (-85.70357610000001 38.2585512)
3,38.250012,-85.697265,38.250012,-85.697265,POINT (-85.6972652 38.2500121)
4,38.195890,-85.793380,38.195890,-85.793380,POINT (-85.7933803 38.1958905)
...,...,...,...,...,...
1268,38.153815,-85.733644,38.153815,-85.733644,POINT (-85.73364410000001 38.1538153)
1269,38.163618,-85.688008,38.163618,-85.688008,POINT (-85.6880079 38.1636178)
1270,38.160030,-85.671480,38.160030,-85.671480,POINT (-85.6714798 38.1600301)
1271,38.198257,-85.626309,38.198257,-85.626309,POINT (-85.62630919999999 38.1982569)


In [7]:
Float = pp.Word(pp.nums + "-.")
point_expr = Float + Float

test = df1['geometry'][1271]
point_expr.search_string(test).asList()[0]

['-85.62630919999999', '38.1982569']

In [8]:
def parse_point(point:str):
    longtitude, latitude = point_expr.search_string(point).asList()[0]
    return {"Longitude": float(longtitude), "Latitude":float(latitude)}
parse_point(test)
    

{'Longitude': -85.6263092, 'Latitude': 38.1982569}

In [9]:
def validate_point(location):
    record = df1.iloc[location]
    point = parse_point(record['geometry'])
    return ( point['Latitude'] == record['Latitude'] and 
            point['Longitude'] == record['Longitude'])
                 
validate = pd.Series(validate_point(loc) for loc in range(len(df1)))
validate.value_counts()
# Great. All the geolocation data matches. I'll drop the geometry column for now, since I can always 
# generate a new one that will be accurate probably

True    1273
Name: count, dtype: int64

## Time fields

### Date formats galore!

Each data set has date and time field(s) associated with each crash report. The data sets have different formats, and df1 has redundant date / time fields which are also inconsistently formatted. This information is stored as strings.


In [10]:
df1[['COLLISION DATE', 'Date']]
# This dataset has 2 redundant date fields. I  parse each field and deal with the descrete elements

Unnamed: 0,COLLISION DATE,Date
0,2/20/2010,2010-02-20 16:20:00
1,1/13/2010,2010-01-13 13:40:00
2,1/13/2010,2010-01-13 10:00:00
3,1/15/2010,2010-01-15 15:50:00
4,2/2/2010,2010-02-02 06:11:00
...,...,...
1268,12/5/2017,2017-12-05 07:07:00
1269,12/14/2017,2017-12-14 17:09:00
1270,12/19/2017,2017-12-19 10:00:00
1271,12/21/2017,2017-12-21 19:56:00


In [11]:
df2['CollisionDate']

0       2016/10/11 03:08:00+00
1       2016/10/12 13:02:00+00
2       2016/10/12 13:02:00+00
3       2016/10/12 19:31:00+00
4       2016/10/12 23:51:00+00
                 ...          
4896    2022/09/12 22:55:00+00
4897    2022/09/16 05:47:00+00
4898    2022/09/16 22:47:00+00
4899    2022/09/17 00:14:00+00
4900    2022/09/17 02:10:00+00
Name: CollisionDate, Length: 4901, dtype: object


I'm using the module pyparsing to break apart the string into year, month, day, hour, minute, second, etc. I'll use that to build my own columns with the relevant data. Pyparsing is a string parsing module that allows the user to create combinatory grammars that closely map Python syntax and natural language intuitions about structured strings i.e. language.

In [12]:
integer = pp.Word(pp.nums).set_name("integer")
# Define a "Word" in pyparsing which represents an integer:
# integer is a string of numeric characters


In [13]:
date_expr = integer("year") + "/" + integer("month") + "/" + integer("day")
# date_expr has the form year/month/day
# define words as integers with name "year", "month", "day" and combine them into a parsing expression
# (pyparsing ignores whitespace by default)
time_expr = integer("hour") + ":" + integer("minute") + ":" + integer("seconds") + "+" + integer("extra")
# time_expr has the format hour:minute:second+milliseconds (I guess)
# I'm going to ignore everything after "+" but I still need to parse it out.
# define words as integers with the name "hour", "minute", "second", "extra" (probably ignore seconds and extra)

df2_COLLISION_DATE = date_expr + time_expr
# COLLISION DATE is a string of the form date_expr + (whitespace we are ignoring +) time_expr
# combine these into an expression that parses the date-time string in the data



In [14]:
result = df1_COLLISION_DATE.parseString("2016/10/11 03:08:00+00")
# Parse an example string from the data
result
# looks like it works as expected

NameError: name 'df1_COLLISION_DATE' is not defined

In [None]:
r = result.asDict()
# get a dictionary of values from that result

# cast the values of that result dictionary as int
def convert_to_int(dictionary:dict) -> dict:
    for key, value in dictionary.items():
        dictionary[key] = int(value)
    return dictionary
# I'm going to reuse this code on all my parsingresults, so I have defined this function.

r =convert_to_int(r)
r


{'year': 2016,
 'month': 10,
 'day': 11,
 'hour': 3,
 'minute': 8,
 'seconds': 0,
 'extra': 0}

In [None]:
# Dealing with the second dataset
CD = integer("day") + "/" + integer("month") + "/" + integer("year")
# CD: "COLLISION DATE" is a string day/month/year
# day, month, year are integer strings
CD_r = CD.parseString("2/20/2010").asDict()
# Parse an example string from the data and get the dictionary of parsed results
CD_r = convert_to_int(CD_r)
# cast the dictionary values as int
CD_r
# looks good!


{'day': 2, 'month': 20, 'year': 2010}

In [None]:
# deal with df2's "Date" column values
# Date column has two parts:
Date_date_part = integer("year") + '-' + integer("month") + '-' + integer("day")
# the date_part is a string of the form: "year-month-day"
Date_time_part = integer("hour") + ":" + integer("minute") + ":" + integer("second")
# the time_part is a string of the form "hour:minute:second"
Date = Date_date_part + Date_time_part
# combine these into a parsing expression
Date_r = convert_to_int(Date.parseString("2017-12-22 21:51:00").asDict())
# cast the results of the parsed dictionary as int
Date_r


{'year': 2017, 'month': 12, 'day': 22, 'hour': 21, 'minute': 51, 'second': 0}

Now I can parse the complex date formats into dictionaries with keys like: year, month, day, hour, minute, etc.
I'll put this info into new columns I can use to merge the datasets

### Other time fields

Time of the crashes is expressed in different ways between the two datasets.
This is redundant data, but I'm going to try to parse it to verify whether the data is consistent.

In [None]:
df1[["COLLISION TIME", "hour", "minute"]]

Unnamed: 0,COLLISION TIME,hour,minute
0,1620,16,20
1,1340,13,40
2,100008,10,0
3,1550,15,50
4,611,6,11
...,...,...,...
1268,707,7,7
1269,1709,17,9
1270,100002,10,0
1271,1956,19,56


In [None]:
CT = df1['COLLISION TIME']
CT.info()


<class 'pandas.core.series.Series'>
RangeIndex: 1273 entries, 0 to 1272
Series name: COLLISION TIME
Non-Null Count  Dtype
--------------  -----
1273 non-null   int64
dtypes: int64(1)
memory usage: 10.1 KB


In [None]:
def parse_CT(time:int):
    time_input = time
    out = list()
    count = 0
    while time:
        if count >= 3:
            raise ValueError(f"time input is too big: {time_input}")
        else:
            out.append(time % 100)
            time //= 100
            count += 1
    out.reverse()
    return out

In [None]:
parse_CT(231121)

[23, 11, 21]

In [None]:
CT_parsed = CT.apply(parse_CT)
# This will run through the whole series without failure. 
CT_parsed
        

0         [16, 20]
1         [13, 40]
2       [10, 0, 8]
3         [15, 50]
4          [6, 11]
           ...    
1268        [7, 7]
1269       [17, 9]
1270    [10, 0, 2]
1271      [19, 56]
1272      [21, 51]
Name: COLLISION TIME, Length: 1273, dtype: object

In [None]:

CTP = pd.concat((CT_parsed, df1[['hour', 'minute']]), axis=1)
CTP[CTP.hour == 0]

Unnamed: 0,COLLISION TIME,hour,minute
18,[4],0,0
89,[5],0,0
98,[9],0,0
100,[5],0,0
102,[9],0,0
...,...,...,...
1118,[45],0,45
1155,[8],0,0
1165,[1],0,0
1177,[],0,0


In [None]:
df2[['CollisionTime', "HOUR_OF_DAY", "DAY_OF_WEEK"]]

Unnamed: 0,CollisionTime,HOUR_OF_DAY,DAY_OF_WEEK
0,2308.0,11PM,MONDAY
1,902.0,9AM,WEDNESDAY
2,902.0,9AM,WEDNESDAY
3,1531.0,3PM,WEDNESDAY
4,1951.0,7PM,WEDNESDAY
...,...,...,...
4896,1855.0,6PM,MONDAY
4897,147.0,1AM,FRIDAY
4898,1847.0,6PM,FRIDAY
4899,2014.0,8PM,FRIDAY


In [None]:
df2['CollisionTime']
CT2.agg(("min", "max"))

min       0.0
max    2359.0
Name: CollisionTime, dtype: float64

In [None]:
# CT2.apply(parse_CT)
# This won't work because there are NaN in CT2
CT2.dropna().apply(parse_CT)
# Look for other places to put this .dropna in. Super useful.


0        [23.0, 8.0]
1         [9.0, 2.0]
2         [9.0, 2.0]
3       [15.0, 31.0]
4       [19.0, 51.0]
            ...     
4896    [18.0, 55.0]
4897     [1.0, 47.0]
4898    [18.0, 47.0]
4899    [20.0, 14.0]
4900    [22.0, 10.0]
Name: CollisionTime, Length: 4896, dtype: object

## Boolean indicators

`['HIT & RUN INDICATOR', 'SECONDARY COLLISION INDICATOR']`

In [112]:
booleans = df1[['HIT & RUN INDICATOR', 'SECONDARY COLLISION INDICATOR']]
#booleans.info()
booleans.agg((lambda x:x.unique()))
out = booleans.apply(lambda x:x.value_counts())
sums = out[out.columns].agg(sum)


  sums = out[out.columns].agg(sum)


In [159]:
outt = out.T
outt.insert(2, 'sum', sums)
outt.T

Unnamed: 0,HIT & RUN INDICATOR,SECONDARY COLLISION INDICATOR
N,1078,1188
Y,195,13
sum,1273,1201


In [70]:
def normalize_bool(obj):
    if obj == "Y":
        out = True
    elif obj == "N":
        out = False
    else:
        return obj
    return out

In [80]:
booleans[['HIT & RUN INDICATOR', 'SECONDARY COLLISION INDICATOR']].apply(lambda x:x.apply(normalize_bool))

Unnamed: 0,HIT & RUN INDICATOR,SECONDARY COLLISION INDICATOR
0,False,
1,True,False
2,False,False
3,True,False
4,False,False
...,...,...
1268,False,False
1269,False,False
1270,False,False
1271,False,False


In [78]:
booleans

Unnamed: 0,HIT & RUN INDICATOR,SECONDARY COLLISION INDICATOR
0,False,
1,True,False
2,False,False
3,True,False
4,False,False
...,...,...
1268,False,False
1269,False,False
1270,False,False
1271,False,False


In [79]:
booleans.agg(lambda x : x.value_counts())

Unnamed: 0,HIT & RUN INDICATOR,SECONDARY COLLISION INDICATOR
False,1078,1188
True,195,13


In [76]:
# KILLED/INJURED

df1[['KILLED', 'INJURED']].agg(lambda x:x.value_counts())
df1['KILLED'].apply(lambda x:True if x == 1 else False).value_counts()
# Cast values in Killed as boolean values

KILLED
False    1262
True       11
Name: count, dtype: int64