In [3]:
# Look at the two pre-cleaned datasets.
# Do any further discovery, cleaning
# then merge
import pandas as pd
from os import chdir

# Switch the working directory so the relative data paths below resolve.
# NOTE(review): this absolute path is machine-specific; the notebook will not
# run elsewhere without editing it.
chdir("/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents")

# Relative locations of the two pre-cleaned input files.
path_to_cycling_safety_cleaned = "data/preclean/cycling_safety_louisville_clean.csv"
path_to_LOJIC_cleaned = "data/preclean/LOJIC_cycling_data.csv"

# One DataFrame per source; both are referenced by the cells further down.
CSAFE = pd.read_csv(filepath_or_buffer=path_to_cycling_safety_cleaned)
LOJIC = pd.read_csv(filepath_or_buffer=path_to_LOJIC_cleaned)

# Data dictionary for joined data

| column name | LOJIC dtype | CSAFE dtype | notes |
|-------------|-------------|-------------|-------|
|between_street_name_1|object|object||
|between_street_name_2|object|object||
|building_number|object|object|convert CSAFE value to str(int(...))|
|date|object|object|done|
|directional_analysis|object|object|compatible, but no common values|
|hit_and_run|bool|bool|done|
|intersection_roadway_name|object|object||
|investigating_agency|object|object|done|
|latitude|float64|float64|done|
|light_condition|object|object|deal with OTHER and Nan values esp. in LOJIC|
|longitude|float64|float64|done|
|manner_of_collision|object|object|done|
|milepoint|float64|float64|done|
|motor_vehicles_involved|int64|int64|done|
|roadway_character|object|object|done|
|roadway_condition|object|object|done|
|roadway_direction|object|object|done|
|roadway_name|object|object||
|roadway_number|object|object|get rid of whitespace in some strings|
|roadway_suffix|object|object|done|
|secondary_collision|bool|bool|done|
|units_involved|int64|int64|done|
|weather|object|object|done|
|between_street_1|object|| similar to between_street_number_1 |
|between_street_2|object|| similar to between_street_number_2|
|between_street_number_1||object| similar to between_street_1|
|between_street_number_2||object| similar to between_street_2|
|between_street_suffix_1||object||
|between_street_suffix_2||object||
|collision_status_code||object||
|council_district|int64|||
|day_of_week|object|| generate day_of_week for CSAFE |
|directional_analysis_code||float64||
|fatality_indicator|bool||generate fatality_indicator for CSAFE|
|incident_id|int64|||
|injured||int64||
|injury_indicator|bool||generate injury_indicator for CSAFE|
|intersection_roadway|object||similar to CSAFE[intersection_roadway_number]; rename to that to make it compatible|
|intersection_roadway_number||object| similar to LOJIC[intersection_roadway]; no rename|
|intersection_roadway_suffix||object||
|killed||int64||
|light_condition_code||float64|numeric code not present in LOJIC|
|local_code||object||
|manner_of_collision_code||float64||
|master_file_number||int64||
|mode|object|||
|object_id|int64|||
|owner|object|||
|ramp_from_roadway_id||float64||
|ramp_to_roadway_id||float64||
|road_classification|object|||
|roadway_character_code||float64|numeric code not present in LOJIC|
|roadway_condition_code||float64|numeric code not present in LOJIC|
|roadway_type||object||
|roadway_type_code||float64||
|weather_code||float64|numeric code not present in LOJIC|


## Date overlap



Conveniently, my two data sets have an overlap in terms of the date ranges they cover. "CSAFE" has records from 2010-2017 and LOJIC has records from 2016 to 2023. I'll check these records to see if there are any problems merging them.

Also, I may be able to gain insights into how each dataset codes different information.

In [17]:
import numpy as np

# The data is loaded from CSV, which stores Timestamps as strings, so the
# 'date' values are turned back into pd.Timestamp objects before comparing.
# (Possible alternative: split date/time into year/month/day/hour/... columns.)
CSAFE['date'] = CSAFE['date'].apply(pd.Timestamp)
LOJIC['date'] = LOJIC['date'].apply(pd.Timestamp)

# Timestamps present in both datasets. np.intersect1d already returns the
# unique common values in sorted order, so no separate sort step is needed.
date_intersect = np.intersect1d(CSAFE['date'], LOJIC['date'])

# Rows of each dataframe whose timestamp also occurs in the other one.
CSAFE_intersect = CSAFE.loc[CSAFE['date'].isin(date_intersect)]
LOJIC_intersect = LOJIC.loc[LOJIC['date'].isin(date_intersect)]

# Date-indexed, date-sorted versions of the overlap, since 'date' is the
# key the two sources are compared on.
CS = (
    CSAFE_intersect
    .set_index('date')
    .sort_index()
)
LO = (
    LOJIC_intersect
    .set_index('date')
    .sort_index()
)

# Column names that exist in both overlap frames.
cols = list(
    np.intersect1d(CSAFE_intersect.columns, LOJIC_intersect.columns)
)



In [18]:
def do(date_index):
    """Show the fields on which the two sources disagree for one shared date.

    Reads the module-level ``date_intersect``, ``CSAFE_intersect`` and
    ``LOJIC_intersect``. The rows of both overlap frames whose ``date`` equals
    ``date_intersect[date_index]`` are stacked and transposed, so that each
    record becomes a column and each field becomes a row.

    Only the first two record columns are compared.
    NOTE(review): this presumably expects exactly one CSAFE row and one LOJIC
    row per timestamp -- confirm, otherwise the comparison may pair two rows
    from the same source.

    Parameters
    ----------
    date_index : int
        Position of the timestamp of interest within ``date_intersect``.

    Returns
    -------
    pandas.DataFrame
        The transposed records plus a boolean ``TEST`` column, restricted to
        fields where at least one of the two compared values is non-null and
        the two values are not equal.
    """
    target = date_intersect[date_index]

    # One column per record, one row per field.
    side_by_side = pd.concat(
        (
            CSAFE_intersect[CSAFE_intersect.date == target],
            LOJIC_intersect[LOJIC_intersect.date == target],
        )
    ).transpose()

    first = side_by_side.iloc[:, 0]
    second = side_by_side.iloc[:, 1]
    side_by_side['TEST'] = first == second

    # NaN never equals NaN, so fields that are missing on both sides would
    # show up as mismatches; keep only fields with a value on either side.
    has_value = first.notnull() | second.notnull()
    return side_by_side[has_value & ~side_by_side['TEST']]


In [19]:
# Recompute the shared timestamps and the matching rows of each dataset.
date_intersect = np.intersect1d(CSAFE['date'], LOJIC['date'])
CSAFE_intersect = CSAFE.loc[CSAFE['date'].isin(date_intersect)]
LOJIC_intersect = LOJIC.loc[LOJIC['date'].isin(date_intersect)]

# Outer-join the overlapping rows on 'date'; columns present in both frames
# get a source suffix so they can be compared side by side.
m = CSAFE_intersect.merge(
    LOJIC_intersect,
    on='date',
    how='outer',
    suffixes=("_CSAFE", "_LOJIC"),
)

# Columns common to both full datasets, leaving out the join key itself.
column_intersect = [
    name
    for name in np.intersect1d(CSAFE.columns, LOJIC.columns)
    if name != 'date'
]
column_intersect


['between_street_name_1',
 'between_street_name_2',
 'between_street_number_1',
 'between_street_number_2',
 'building_number',
 'day_of_week',
 'directional_analysis',
 'fatality_indicator',
 'hit_and_run',
 'injury_indicator',
 'intersection_roadway_name',
 'intersection_roadway_number',
 'investigating_agency',
 'latitude',
 'light_condition',
 'longitude',
 'manner_of_collision',
 'milepoint',
 'motor_vehicles_involved',
 'roadway_character',
 'roadway_condition',
 'roadway_direction',
 'roadway_name',
 'roadway_number',
 'roadway_suffix',
 'roadway_type',
 'secondary_collision',
 'units_involved',
 'weather']

As it turns out, there's no meaningful information in LOJIC for the overlapping records which isn't already in CSAFE. To join these datasets, I'll first remove the overlap from LOJIC, then concat LOJIC to CSAFE.

In [20]:
# Index labels of the LOJIC rows whose timestamp also occurs in CSAFE.
to_remove = LOJIC.index[LOJIC['date'].isin(date_intersect)]

# LOJIC without those overlapping rows.
LOJIC_clean = LOJIC.drop(index=to_remove)

In [21]:
# Stack CSAFE on top of the de-overlapped LOJIC rows and renumber the index
# so the combined frame has one continuous 0..n-1 range.
full = pd.concat([CSAFE, LOJIC_clean], axis=0, ignore_index=True)
full

Unnamed: 0,investigating_agency,roadway_number,building_number,roadway_name,roadway_suffix,roadway_direction,milepoint,intersection_roadway_number,intersection_roadway_name,between_street_number_1,...,manner_of_collision,roadway_character,light_condition,secondary_collision,date,latitude,longitude,injury_indicator,fatality_indicator,day_of_week
0,LOUISVILLE METRO POLICE DEPT,US0042,,BROWNSBORO,RD,,1.744,,JANE,,...,SINGLE VEHICLE,STRAIGHT & GRADE,DARK-HWY LIGHTED/ON,False,2010-01-13 10:00:00-05:00,38.258551,-85.703576,False,False,WEDNESDAY
1,LOUISVILLE METRO POLICE DEPT,,,ZORN,AVE,,1.476,I 0071,I71 N EXIT2 OFF RAMP TO ZORN AVE,,...,SINGLE VEHICLE,CURVE & LEVEL,DAYLIGHT,False,2010-01-13 13:40:00-05:00,38.273995,-85.696572,True,False,WEDNESDAY
2,LOUISVILLE METRO POLICE DEPT,,,PETERSON,AVE,S,0.278,,,,...,SINGLE VEHICLE,STRAIGHT & GRADE,DAYLIGHT,False,2010-01-15 15:50:00-05:00,38.250012,-85.697265,False,False,FRIDAY
3,LOUISVILLE METRO POLICE DEPT,,,GEORGETOWN,PL,,0.051,,CONN,,...,SINGLE VEHICLE,STRAIGHT & LEVEL,DAYLIGHT,False,2010-02-02 06:11:00-05:00,38.195890,-85.793380,False,False,TUESDAY
4,LOUISVILLE METRO POLICE DEPT,,,PEE WEE REESE,RD,,2.116,,,,...,SINGLE VEHICLE,STRAIGHT & LEVEL,DAYLIGHT,False,2010-02-05 18:20:00-05:00,38.246136,-85.664685,False,False,FRIDAY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369,LYNDON POLICE DEPARTMENT,,,LA GRANGE,RD,,0.706,,,,...,ANGLE,STRAIGHT & LEVEL,DAYLIGHT,False,2023-08-15 07:19:00-04:00,38.265397,-85.594638,False,True,TUESDAY
1370,LOUISVILLE METRO POLICE DEPT,,,FEGENBUSH,LN,,0.233,,NORFOLK,,...,SINGLE VEHICLE,STRAIGHT & LEVEL,DAYLIGHT,False,2023-08-21 14:57:00-04:00,38.183875,-85.638266,True,False,MONDAY
1371,LOUISVILLE METRO POLICE DEPT,,,DELOR,AVE,,0.937,,PACKARD,,...,SINGLE VEHICLE,STRAIGHT & LEVEL,DAYLIGHT,False,2023-08-30 07:49:00-04:00,38.210348,-85.733536,True,False,WEDNESDAY
1372,LOUISVILLE METRO POLICE DEPT,,,OLD BROWNSBORO HILLS,RD,,0.131,,,,...,SINGLE VEHICLE,STRAIGHT & LEVEL,DAYLIGHT,False,2023-09-01 16:20:00-04:00,38.301243,-85.595479,False,True,FRIDAY


In [10]:
path_to_signalized_intersections = "data/raw/Jefferson_County_KY_Signalized_Intersections.csv"

# Read the signalized-intersection file, then keep only the four columns
# used below.
sig = pd.read_csv(filepath_or_buffer=path_to_signalized_intersections)
sig = sig.loc[:, ['OBJECTID', "MAINSTREET", "CROSSSTREET", "ROUTE"]]
sig

Unnamed: 0,OBJECTID,MAINSTREET,CROSSSTREET,ROUTE
0,1,OLD HENRY,I 265 RAMP,KY-3084
1,2,SHELBYVILLE,BECKLEY WOODS,US 60
2,3,SHELBYVILLE,LAKE FOREST,US-60
3,4,BROWNSBORO,I 265 RAMP,KY-22
4,5,HERR,LIME KILN,KY-22
...,...,...,...,...
1086,1087,BILLTOWN RD,I 265 RAMP,KY-1819
1087,1088,OLD HENRY RD,TERRA CROSSING BLVD,KY-3084
1088,1089,WESTPORT RD,I 265 RAMP,KY-1447
1089,1090,BEULAH CHURCH RD,I 265 RAMP,KY-864


In [12]:
import pandasql

pandasql.sqldf("SELECT 

<module 'pandas' from '/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents/venv/lib/python3.11/site-packages/pandas/__init__.py'>