In [1]:
# Look at the two pre-cleaned datasets.
# DO any further discovert, cleaning
# then merge
import pandas as pd
from os import chdir

chdir("/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents")


path_to_cycling_safety_cleaned = "data/clean/cycling_safety_louisville_clean.csv"
path_to_LOJIC_cleaned = "data/clean/LOJIC_cycling_data.csv"

CSAFE = pd.read_csv(path_to_cycling_safety_cleaned)
LOJIC = pd.read_csv(path_to_LOJIC_cleaned)

In [2]:
CSAFE_cols = set(CSAFE.columns.to_list())
LOJIC_cols = set(LOJIC.columns.to_list())
column_intersect = sorted(CSAFE_cols & LOJIC_cols)
column_xor = sorted(CSAFE_cols ^ LOJIC_cols)
all_cols = column_intersect + column_xor

only_CSAFE = CSAFE_cols - LOJIC_cols
only_LOJIC = LOJIC_cols - CSAFE_cols

In [3]:
only_CSAFE

{'between_street_suffix_1',
 'between_street_suffix_2',
 'injured',
 'intersection_roadway_suffix',
 'killed'}

In [4]:
LOJIC['between_street_name_2'].unique()
CSAFE['between_street_suffix_2'].unique()

array([nan, 'DR   ', 'LN   ', 'ST   ', 'AVE  ', 'PL   ', 'RD   ', 'CIR  ',
       'PKWY ', 'WAY  ', 'LOOP ', 'ALY  ', 'HWY  ', 'TER  ', 'BLVD ',
       'TRCE ', 'CT   ', 'TRL  ', '     '], dtype=object)

In [5]:
notes = {'between_street_name_1':None,
 'between_street_name_2':None,
 'building_number':"convert CSAFE value to str(int(...))",
 'date':"done",
 'directional_analysis':"compatible, but no common values",
 'hit_and_run':"done",
 'intersection_roadway_name':None,
 'investigating_agency':'done',
 'latitude':"done",
 'light_condition':'deal with OTHER and Nan values esp. in LOJIC',
 'longitude':"done",
 'manner_of_collision':'done',
 'milepoint':'done',
 'motor_vehicles_involved':'done',
 'roadway_character':'done',
 'roadway_condition':'done',
 'roadway_direction':'done',
 'roadway_name':None,
 'roadway_number':'get rid of whitespace in some strings',
 'roadway_suffix':'done',
 'secondary_collision':'done',
 'units_involved':'done',
 'weather':'done'}

pd.Series(notes)

between_street_name_1                                                None
between_street_name_2                                                None
building_number                      convert CSAFE value to str(int(...))
date                                                                 done
directional_analysis                     compatible, but no common values
hit_and_run                                                          done
intersection_roadway_name                                            None
investigating_agency                                                 done
latitude                                                             done
light_condition              deal with OTHER and Nan values esp. in LOJIC
longitude                                                            done
manner_of_collision                                                  done
milepoint                                                            done
motor_vehicles_involved               

In [6]:

data = pd.concat((LOJIC.dtypes, CSAFE.dtypes, pd.Series(notes)), axis=1)
data

Unnamed: 0,0,1,2
investigating_agency,object,object,done
roadway_number,object,object,get rid of whitespace in some strings
building_number,object,object,convert CSAFE value to str(int(...))
roadway_direction,object,object,done
roadway_name,object,object,
roadway_suffix,object,object,done
roadway_type,object,object,
road_classification,object,,
intersection_roadway_number,object,object,
intersection_roadway_name,object,object,


### General notes

Add a column to identify LOJIC data vs. CSAFE data in case there are any further discrepancies to deal with.

In [7]:

def print_table(data):
    header = "| column name | LOJIC dtype | CSAFE dtype | notes |" 
    spaner = "|-------------|-------------|-------------|-------|"
    row = "|{name}|{LOJICDT}|{CSAFEDT}|{note}|"

    ds = lambda s:"" if s == "nan" else s
    dn = lambda s:ds(str(s))

    print(header)
    print(spaner)
    for name in all_cols:
        record = data.loc[name]
        LODT = dn(record[0])
        CSDT = dn(record[1])
        note = record[2]
        print(row.format(name=name, LOJICDT=LODT, CSAFEDT=CSDT, note=note))
    
    
print_table(data)

| column name | LOJIC dtype | CSAFE dtype | notes |
|-------------|-------------|-------------|-------|
|between_street_name_1|object|object|None|
|between_street_name_2|object|object|None|
|between_street_number_1|object|object|nan|
|between_street_number_2|object|object|nan|
|building_number|object|object|convert CSAFE value to str(int(...))|
|date|object|object|done|
|day_of_week|object|object|nan|
|directional_analysis|object|object|compatible, but no common values|
|fatality_indicator|bool|bool|nan|
|hit_and_run|bool|bool|done|
|injury_indicator|bool|bool|nan|
|intersection_roadway_name|object|object|None|
|intersection_roadway_number|object|object|nan|
|investigating_agency|object|object|done|
|latitude|float64|float64|done|
|light_condition|object|object|deal with OTHER and Nan values esp. in LOJIC|
|longitude|float64|float64|done|
|manner_of_collision|object|object|done|
|milepoint|float64|float64|done|
|motor_vehicles_involved|int64|int64|done|
|roadway_character|object|object

# Data dictionary for joined data

| column name | LOJIC dtype | CSAFE dtype | notes |
|-------------|-------------|-------------|-------|
|between_street_name_1|object|object||
|between_street_name_2|object|object||
|building_number|object|object|convert CSAFE value to str(int(...))|
|date|object|object|done|
|directional_analysis|object|object|compatible, but no common values|
|hit_and_run|bool|bool|done|
|intersection_roadway_name|object|object||
|investigating_agency|object|object|done|
|latitude|float64|float64|done|
|light_condition|object|object|deal with OTHER and Nan values esp. in LOJIC|
|longitude|float64|float64|done|
|manner_of_collision|object|object|done|
|milepoint|float64|float64|done|
|motor_vehicles_involved|int64|int64|done|
|roadway_character|object|object|done|
|roadway_condition|object|object|done|
|roadway_direction|object|object|done|
|roadway_name|object|object||
|roadway_number|object|object|get rid of whitespace in some strings|
|roadway_suffix|object|object|done|
|secondary_collision|bool|bool|done|
|units_involved|int64|int64|done|
|weather|object|object|done|
|between_street_1|object|| similar to between_street_number_1 |
|between_street_2|object|| similar to between_street_number_2|
|between_street_number_1||object| similar to between_street_1|
|between_street_number_2||object| similar to between_street_2|
|between_street_suffix_1||object||
|between_street_suffix_2||object||
|collision_status_code||object||
|council_district|int64|||
|day_of_week|object|| generate day_of_week for CSAFE |
|directional_analysis_code||float64||
|fatality_indicator|bool||generate fatality_indicator for CSAFE|
|incident_id|int64|||
|injured||int64||
|injury_indicator|bool||generate injury_indicator foor CSAFE|
|intersection_roadway|object||similar to CSAFE[intersection_roadway_number] rename to that to make it compatible|
|intersection_roadway_number||object| similar to LOJIC[intersection_roadway]; no rename|
|intersection_roadway_suffix||object||
|killed||int64||
|light_condition_code||float64|numeric code not present in LOJIC|
|local_code||object||
|manner_of_collision_code||float64||
|master_file_number||int64||
|mode|object|||
|object_id|int64|||
|owner|object|||
|ramp_from_roadway_id||float64||
|ramp_to_roadway_id||float64||
|road_classification|object|||
|roadway_character_code||float64|numeric code not present in LOJIC|
|roadway_condition_code||float64|numeric code not present in LOJIC|
|roadway_type||object||
|roadway_type_code||float64||
|weather_code||float64|numeric code not present in LOJIC|


# Common column names:
`['between_street_name_1', 'between_street_name_2', 'building_number', 'date', 'directional_analysis',
    'hit_and_run', 'intersection_roadway_name', 'intersection_roadway_number', 'investigating_agency', 'latitude', 'light_condition', 'longitude', 'manner_of_collision', 'milepoint', 'motor_vehicles_involved', 'roadway_character', 'roadway_condition', 'roadway_direction', 'roadway_name', 'roadway_number', 'roadway_suffix', 'roadway_type',
    'secondary_collision', 'units_involved', 'weather']`


In [8]:
def pair(col_name):
    return pd.DataFrame({"CSAFE":CSAFE[col_name], "LOJIC": LOJIC[col_name]})

def join(col_name):
    return pd.concat((CSAFE[col_name], LOJIC[col_name]), axis=0)

In [9]:
pair('between_street_name_1').dropna()

Unnamed: 0,CSAFE,LOJIC
2,GRINSTEAD,STALLINGS
13,CARDINAL,LONGEST
28,SOUTHLAND,WILSON
32,TAYLOR,LORETTA
51,MADISON,INDUSTRIAL
114,BAXTER,BLUE VALE


In [10]:
pair('between_street_name_2').dropna()

Unnamed: 0,CSAFE,LOJIC
2,GRINSTEAD,KLAGES
13,BLOOM,BEECHWOOD
28,WABASH,DR WILLIAM G WEATHERS
32,WHITMORE,KREMER
51,OLD HARRODS CREEK,ULRICH
114,BARDSTOWN,AUTUMN RIDGE


In [11]:
pair("building_number")
# CSAFE type is stirng representation of an in or float.
# LOJIC is int / string
CSAFE["building_number"].dropna().unique()
#LOJIC["building_number"].dropna()
# Convert CSAFE values to str(int(...))


array(['2300', '4303', '1109', '5013', '100', '1049', '7490', '9111',
       '8006', '10821', '600', '1810', '1121', '3706', '12100', '4100',
       '4901', '8412', '2100', '3600', '3804', '5022', '2700', '5913',
       '6700', '3820', '2500', '7806', '4133', '6101', '2216', '9120',
       '8020', '1500', '1000', '700', '4124', '5501', '1600', '7000',
       '3410', '4111', '5543', '4032', '131', '1900', '3121', '7321',
       '5000', '2200', '9800', '3231', '3521', '4450', '7010', '900',
       '8019', '10300', '12305', '4224', '927', '1228', '967', '1784',
       '3200', '5244', '4911', '1132', '4000', '3742', '1365', '7121',
       '3208', '5604', '1020', '4200', '2106', '2633', '10400', '     ',
       '430', '9410', '634', '3340', '6600'], dtype=object)

In [12]:
CSAFE['building_number'].dropna().unique()


array(['2300', '4303', '1109', '5013', '100', '1049', '7490', '9111',
       '8006', '10821', '600', '1810', '1121', '3706', '12100', '4100',
       '4901', '8412', '2100', '3600', '3804', '5022', '2700', '5913',
       '6700', '3820', '2500', '7806', '4133', '6101', '2216', '9120',
       '8020', '1500', '1000', '700', '4124', '5501', '1600', '7000',
       '3410', '4111', '5543', '4032', '131', '1900', '3121', '7321',
       '5000', '2200', '9800', '3231', '3521', '4450', '7010', '900',
       '8019', '10300', '12305', '4224', '927', '1228', '967', '1784',
       '3200', '5244', '4911', '1132', '4000', '3742', '1365', '7121',
       '3208', '5604', '1020', '4200', '2106', '2633', '10400', '     ',
       '430', '9410', '634', '3340', '6600'], dtype=object)

In [13]:
noblank = CSAFE['building_number'].replace(to_replace='     ', value=pd.NA)
fix = noblank.dropna().apply(lambda x:str(int(float(x))))
CSAFE['building_number'].update(fix)
CSAFE['building_number'].unique()

array([nan, '2300', '4303', '1109', '5013', '100', '1049', '7490', '9111',
       '8006', '10821', '600', '1810', '1121', '3706', '12100', '4100',
       '4901', '8412', '2100', '3600', '3804', '5022', '2700', '5913',
       '6700', '3820', '2500', '7806', '4133', '6101', '2216', '9120',
       '8020', '1500', '1000', '700', '4124', '5501', '1600', '7000',
       '3410', '4111', '5543', '4032', '131', '1900', '3121', '7321',
       '5000', '2200', '9800', '3231', '3521', '4450', '7010', '900',
       '8019', '10300', '12305', '4224', '927', '1228', '967', '1784',
       '3200', '5244', '4911', '1132', '4000', '3742', '1365', '7121',
       '3208', '5604', '1020', '4200', '2106', '2633', '10400', '     ',
       '430', '9410', '634', '3340', '6600'], dtype=object)

In [14]:
pair('date')
# done

Unnamed: 0,CSAFE,LOJIC
0,2010-01-13 10:00:00-05:00,2016-02-03 08:55:00-05:00
1,2010-01-13 13:40:00-05:00,2016-04-15 23:28:00-04:00
2,2010-01-15 15:50:00-05:00,2016-08-03 00:57:00-04:00
3,2010-02-02 06:11:00-05:00,2016-08-22 16:15:00-04:00
4,2010-02-05 18:20:00-05:00,2016-09-11 15:25:00-04:00
...,...,...
1268,2017-12-05 07:07:00-05:00,
1269,2017-12-14 17:09:00-05:00,
1270,2017-12-19 10:00:00-05:00,
1271,2017-12-21 19:56:00-05:00,


In [15]:
DA = pair('directional_analysis')
DA["CSAFE"].unique()
DA["LOJIC"].unique() 
# Different sets of values, no intersection. 
# Maybe I have to drop this; might not be useful.

array(['COLLISION WITH BICYCLE IN INTERSECTION',
       'COLLISION WITH BICYCLIST NON INTERSECTION', nan,
       'COLLISION WITH PEDESTRIAN NON - INTERSECTION',
       'COLLISION WITH PEDESTRIAN IN INTERSECTION',
       'OTHER INTERSECTION COLLISIONS',
       '1 VEHICLE ENTERING/LEAVING ENTRANCE',
       'OTHER ROADWAY OR MID-BLOCK COLLISION'], dtype=object)

In [16]:
DA.apply(lambda x:x.value_counts(dropna=False))

Unnamed: 0,CSAFE,LOJIC
1 VEHICLE ENTERING/LEAVING ENTRANCE,,1.0
COLLISION WITH BICYCLE,704.0,
COLLISION WITH BICYCLE IN INTERSECTION,,45.0
COLLISION WITH BICYCLIST,569.0,
COLLISION WITH BICYCLIST NON INTERSECTION,,57.0
COLLISION WITH PEDESTRIAN IN INTERSECTION,,2.0
COLLISION WITH PEDESTRIAN NON - INTERSECTION,,2.0
OTHER INTERSECTION COLLISIONS,,2.0
OTHER ROADWAY OR MID-BLOCK COLLISION,,1.0
,,1163.0


In [17]:
# Boolean indicators. These are fully compatible. 
pair('hit_and_run'), pair('secondary_collision')


(      CSAFE  LOJIC
 0     False  False
 1      True  False
 2      True  False
 3     False  False
 4     False  False
 ...     ...    ...
 1268  False    NaN
 1269  False    NaN
 1270  False    NaN
 1271  False    NaN
 1272  False    NaN
 
 [1273 rows x 2 columns],
       CSAFE  LOJIC
 0     False  False
 1     False  False
 2     False  False
 3     False  False
 4     False  False
 ...     ...    ...
 1268  False    NaN
 1269  False    NaN
 1270  False    NaN
 1271  False    NaN
 1272  False    NaN
 
 [1273 rows x 2 columns])

In [18]:
pair('intersection_roadway_name')


Unnamed: 0,CSAFE,LOJIC
0,JANE,WINTER
1,I71 N EXIT2 OFF RAMP TO ZORN AVE,
2,,
3,CONN,
4,,3RD
...,...,...
1268,,
1269,WOODED,
1270,JEFFERSON,
1271,ARJAY,


In [19]:
pair('investigating_agency').apply(lambda x:x.value_counts(dropna=False))


Unnamed: 0,CSAFE,LOJIC
ANCHORAGE POLICE DEPARTMENT,1.0,
AUDUBON PARK POLICE DEPARTMENT,1.0,
GRAYMOOR-DEVONDALE POLICE DEPT,2.0,
INDIAN HILLS POLICE DEPARTMENT,5.0,
JEFFERSONTOWN POLICE DEPT,24.0,
LOUISVILLE METRO POLICE DEPT,1160.0,110.0
LYNDON POLICE DEPARTMENT,,2.0
NORTHFIELD POLICE DEPARTMENT,1.0,
PROSPECT POLICE DEPARTMENT,2.0,
SHIVELY POLICE DEPARTMENT,40.0,5.0


In [20]:
# These are all compatible
pair('latitude'), pair('longitude')


(          CSAFE      LOJIC
 0     38.258551  38.235708
 1     38.273995  38.091322
 2     38.250012  38.175111
 3     38.195890  38.151697
 4     38.246136  38.225816
 ...         ...        ...
 1268  38.153815        NaN
 1269  38.163618        NaN
 1270  38.160030        NaN
 1271  38.198257        NaN
 1272  38.124819        NaN
 
 [1273 rows x 2 columns],
           CSAFE      LOJIC
 0    -85.703576 -85.727638
 1    -85.696572 -85.701118
 2    -85.697265 -85.822282
 3    -85.793380 -85.764353
 4    -85.664685 -85.760326
 ...         ...        ...
 1268 -85.733644        NaN
 1269 -85.688008        NaN
 1270 -85.671480        NaN
 1271 -85.626309        NaN
 1272 -85.890658        NaN
 
 [1273 rows x 2 columns])

In [21]:
pair('light_condition').apply(lambda x:x.value_counts(dropna=False))
# 1 value "OTHER" in LOJIC
# Lots on Nan's in LOJIC.


Unnamed: 0,CSAFE,LOJIC
DARK (UNKNOWN ROADWAY LIGHTING),5.0,3
DARK-HWY LIGHTED/OFF,22.0,4
DARK-HWY LIGHTED/ON,159.0,20
DARK-HWY NOT LIGHTED,42.0,23
DAWN,29.0,3
DAYLIGHT,962.0,60
DUSK,54.0,5
OTHER,,1
,,1154


In [22]:
MOC = pair('manner_of_collision')
MOC
#MOC["CSAFE"].unique(), MOC['LOJIC'].unique()


Unnamed: 0,CSAFE,LOJIC
0,SINGLE VEHICLE,SINGLE VEHICLE
1,SINGLE VEHICLE,SINGLE VEHICLE
2,SINGLE VEHICLE,ANGLE
3,SINGLE VEHICLE,SINGLE VEHICLE
4,SINGLE VEHICLE,SINGLE VEHICLE
...,...,...
1268,SINGLE VEHICLE,
1269,SINGLE VEHICLE,
1270,SINGLE VEHICLE,
1271,SINGLE VEHICLE,


In [23]:
pair('milepoint').describe()


Unnamed: 0,CSAFE,LOJIC
count,1207.0,119.0
mean,4.037019,3.696748
std,6.48581,4.603552
min,0.0,0.004
25%,0.3125,0.556
50%,1.17,1.376
75%,5.462,5.277
max,124.526,18.399


In [24]:
pair('motor_vehicles_involved').apply(lambda x:x.unique())
# Slightly different types. Compatible.


CSAFE                         [1, 2, 3]
LOJIC    [1.0, 2.0, 0.0, 3.0, 4.0, nan]
dtype: object

In [25]:
RC = pair('roadway_character')
RC['CSAFE'].unique(), RC['LOJIC'].unique()
# No problems.

(array(['STRAIGHT & GRADE', 'CURVE & LEVEL', 'STRAIGHT & LEVEL',
        'CURVE & GRADE', 'CURVE & HILLCREST', 'STRAIGHT & HILLCREST'],
       dtype=object),
 array(['STRAIGHT & LEVEL', 'STRAIGHT & GRADE', 'CURVE & LEVEL',
        'STRAIGHT & HILLCREST', nan], dtype=object))

In [26]:
 pair('roadway_condition')
# No problem.


Unnamed: 0,CSAFE,LOJIC
0,DRY,DRY
1,DRY,DRY
2,DRY,DRY
3,DRY,DRY
4,DRY,DRY
...,...,...
1268,WET,
1269,DRY,
1270,DRY,
1271,DRY,


In [27]:
pair('roadway_direction')
# No problem.


Unnamed: 0,CSAFE,LOJIC
0,,
1,,
2,S,
3,,
4,,W
...,...,...
1268,,
1269,,
1270,,
1271,,


In [28]:
pair('roadway_name')


Unnamed: 0,CSAFE,LOJIC
0,BROWNSBORO,BARRET
1,ZORN,HUTCHERSON
2,PETERSON,DIXIE
3,GEORGETOWN,NATIONAL
4,PEE WEE REESE,HILL
...,...,...
1268,GRADE,
1269,JEANINE,
1270,POPLAR LEVEL,
1271,WOODGATE,


In [29]:
RN = pair("roadway_number")
join("roadway_number").value_counts()


roadway_number
US0031E    81
US0031W    72
KY1020     55
KY0061     42
US0060     36
US0150     35
US0060A    32
KY0864     23
KY1931     18
KY0155     17
KY1865     16
KY1065     13
KY1447     12
KY1747     11
US0042     10
US0031      9
KY1932      8
KY0907      7
KY2050      7
KY1934      7
KY2049      6
KY2048      6
KY0146      5
I 0264      5
KY3082      5
KY2052      5
KY1703      5
KY1450      5
I 0065      4
KY2051      4
KY1727      4
KY0841      4
KY2054      4
KY1819      4
KY3064      4
KY1631      3
KY2251      3
KY1142      3
KY2055      2
KY2845      2
KY0913      2
KY2860      1
I 0064      1
KY3077      1
KY1230      1
KY2053      1
KY0148      1
I 0265      1
KY1851      1
Name: count, dtype: int64

In [30]:
 join('roadway_suffix').unique()


array(['RD', 'AVE', 'PL', 'PKWY', 'ST', nan, 'TRCE', 'DR', 'LN', 'HWY',
       'BLVD', 'PLZ', 'TRL', 'ALY', 'WAY', 'BYP', 'CIR', 'LOOP', 'CT',
       'PARK', 'TPKE', 'TER', 'FWY'], dtype=object)

In [31]:
 join('units_involved').unique()
pair('units_involved').dtypes



CSAFE      int64
LOJIC    float64
dtype: object

In [32]:
join('weather').unique()
pair('weather')

Unnamed: 0,CSAFE,LOJIC
0,CLEAR,CLEAR
1,CLEAR,CLEAR
2,CLEAR,CLEAR
3,CLEAR,CLEAR
4,CLEAR,CLEAR
...,...,...
1268,RAINING,
1269,CLOUDY,
1270,CLEAR,
1271,CLEAR,


In [33]:
CSAFE_cols - LOJIC_cols

{'between_street_suffix_1',
 'between_street_suffix_2',
 'injured',
 'intersection_roadway_suffix',
 'killed'}

## Unmatched column names:
# LOJIC only

Similar data in CSAFE. Will need to change the name

`['between_street_1', 'between_street_2']`

Useful columns not in CSAFE. Generate these columns for CSAFE

`['council_district', 'day_of_week', 'injury_indicator', 'fatality_indicator']`

Identifiers for each record

 `['incident_id', 'object_id']`

 Other fields

 `['intersection_roadway', 'mode', 'owner', 'road_classification']`

# CSAFE only

Similar data in LOJIC. Will need renames.

`['between_street_number_1', 'between_street_number_2']`

??

`['between_street_suffix_1', 'between_street_suffix_2']`

Numeric codes for different conditions. These are paired with a human readable code in CSAFE.
Similar string values exist for many of these in LOJIC data.
I will drop these numeric columns as they are not directly comparable. 

`['directional_analysis_code', 'light_condition_code', 'manner_of_collision_code',
'roadway_condition_code', 'roadway_character_code', 'weather_code', 'roadway_type_code']`

Generate "fatality_indicator" and "injury_indicator" from these.
`['killed', 'injured']`



 `'intersection_roadway_suffix'`

 Identifiers for record 
 `['local_code', 'master_file_number']`
 
 Reporting code for the incident report: Probably not useful.
`'collision_status_code'`

In [34]:
LOJIC['between_street_number_1'].dropna() # Alphanumeric roadway designator like KY 2049, US31W
CSAFE['between_street_number_1'].dropna() # Same thing
LOJIC['between_street_number_2'].dropna()
CSAFE['between_street_number_2'].dropna()
# These all have the same kind of alphanumeric data. 
# Rectify these names to make them compatible
# * make sure this doesn't clash with anything. 

# DONE

4        KY2048
5        I 0264
42       KY1065
63       KY1020
74       US0060
76       US0150
106      KY0061
114     US0031E
115     US0060A
128      KY1020
160      KY2054
250      US0150
264      KY0061
272      KY1865
293      KY0864
299      US0150
311     US0031E
316     US0031E
329      KY0061
364      KY0864
373     US0031E
406      KY0061
419      US0150
427      KY1703
434     US0060A
436      US0150
443      KY2048
471      KY1020
496     US0031W
527      KY0061
584     US0031E
590     US0060A
627      KY2251
638      KY0061
676      KY1931
677      US0060
681     US0031W
694      KY1065
698      US0150
704      KY1931
741      KY0061
760      KY1020
784     US0031W
811      KY2840
852      US0150
938      KY2048
980     US0060A
984      KY0061
1082     KY2049
1101     KY2860
1157     I 0065
1260     I 0265
Name: between_street_number_2, dtype: object

In [35]:
CSAFE['intersection_roadway_suffix'].dropna()

0       ST   
3       ST   
7       BLVD 
8       AVE  
9       ST   
        ...  
1265    ST   
1266    DR   
1269    WAY  
1270    BLVD 
1271    LN   
Name: intersection_roadway_suffix, Length: 645, dtype: object

In [36]:
CSAFE['killed'].apply(lambda x:True if x > 0 else False).value_counts()

killed
False    1262
True       11
Name: count, dtype: int64

In [37]:
d =CSAFE['date'][0]
pd.Timestamp(d).day_name()


'Wednesday'

## Date overlap



Conveniently, my two data sets have an overlap in terms of the date ranges they cover. "CSAFE" has records from 2010-2017 and LOJIC has records from 2016 to 2023. I'll check these records to see if there are any problems merging them.

Also, I may be able to gain insights into how each dataset codes different information.l

In [38]:
import numpy as np

CSAFE['date'] = CSAFE['date'].apply(pd.Timestamp)
LOJIC['date'] = LOJIC['date'].apply(pd.Timestamp)

# I'm loading data from CSV, which stores Timestamps as strings.
# I have to convert them back to pd.Timestamp if I want to use comparisons
# This is annoying. Perhaps break up date/time into year/month/day/hour/etc... columns?

# Find all Timestamps that are common between all my data.
date_intersect = np.intersect1d(CSAFE['date'], LOJIC['date'])
date_intersect.sort()
date_intersect

# Select the rows corresponding to the common Timestamps from each dataframe
# With these dataframes, set the index to "date" since that's what we'll be comparing against. 
CSAFE_intersect = CSAFE[CSAFE['date'].isin(date_intersect)]
LOJIC_intersect = LOJIC[LOJIC['date'].isin(date_intersect)]
CS = CSAFE_intersect.set_index('date').sort_index()
LO = LOJIC_intersect.set_index('date').sort_index()
#assert all(CSAFE_intersect.index == LOJIC_intersect.index)
cols = list(np.intersect1d(CSAFE_intersect.columns, LOJIC_intersect.columns))



In [39]:
# Merging code for intersect

# Update LOJIC data, the use LOJIC to update CSAFE

m = pd.merge(CSAFE_intersect, LOJIC_intersect, on='date', how='outer', suffixes=(None, "_L"))
ma = m[all_cols]


In [40]:
loc = 16
pd.concat((CSAFE_intersect.iloc[loc], LOJIC_intersect.iloc[loc], ma.iloc[loc]), axis=1)

Unnamed: 0,1263,22,16
investigating_agency,LOUISVILLE METRO POLICE DEPT,LOUISVILLE METRO POLICE DEPT,LOUISVILLE METRO POLICE DEPT
roadway_number,,,
building_number,,,
roadway_name,WILSON,WILSON,WILSON
roadway_suffix,AVE,AVE,AVE
roadway_direction,,,
milepoint,1.679,1.679,1.679
intersection_roadway_number,,,
intersection_roadway_name,HEMLOCK,HEMLOCK,HEMLOCK
intersection_roadway_suffix,ST,,ST


In [41]:
CSAFE['building_number'].unique()

array([nan, '2300', '4303', '1109', '5013', '100', '1049', '7490', '9111',
       '8006', '10821', '600', '1810', '1121', '3706', '12100', '4100',
       '4901', '8412', '2100', '3600', '3804', '5022', '2700', '5913',
       '6700', '3820', '2500', '7806', '4133', '6101', '2216', '9120',
       '8020', '1500', '1000', '700', '4124', '5501', '1600', '7000',
       '3410', '4111', '5543', '4032', '131', '1900', '3121', '7321',
       '5000', '2200', '9800', '3231', '3521', '4450', '7010', '900',
       '8019', '10300', '12305', '4224', '927', '1228', '967', '1784',
       '3200', '5244', '4911', '1132', '4000', '3742', '1365', '7121',
       '3208', '5604', '1020', '4200', '2106', '2633', '10400', '     ',
       '430', '9410', '634', '3340', '6600'], dtype=object)

In [42]:
building_numbers = CSAFE['building_number']
building_numbers = building_numbers.replace(to_replace='     ', value=pd.NA)
building_numbers = building_numbers.dropna().apply(lambda x:str(int(float(x))))
building_numbers.unique()


array(['2300', '4303', '1109', '5013', '100', '1049', '7490', '9111',
       '8006', '10821', '600', '1810', '1121', '3706', '12100', '4100',
       '4901', '8412', '2100', '3600', '3804', '5022', '2700', '5913',
       '6700', '3820', '2500', '7806', '4133', '6101', '2216', '9120',
       '8020', '1500', '1000', '700', '4124', '5501', '1600', '7000',
       '3410', '4111', '5543', '4032', '131', '1900', '3121', '7321',
       '5000', '2200', '9800', '3231', '3521', '4450', '7010', '900',
       '8019', '10300', '12305', '4224', '927', '1228', '967', '1784',
       '3200', '5244', '4911', '1132', '4000', '3742', '1365', '7121',
       '3208', '5604', '1020', '4200', '2106', '2633', '10400', '430',
       '9410', '634', '3340', '6600'], dtype=object)

In [43]:
CSAFE['building_number'].unique()

array([nan, '2300', '4303', '1109', '5013', '100', '1049', '7490', '9111',
       '8006', '10821', '600', '1810', '1121', '3706', '12100', '4100',
       '4901', '8412', '2100', '3600', '3804', '5022', '2700', '5913',
       '6700', '3820', '2500', '7806', '4133', '6101', '2216', '9120',
       '8020', '1500', '1000', '700', '4124', '5501', '1600', '7000',
       '3410', '4111', '5543', '4032', '131', '1900', '3121', '7321',
       '5000', '2200', '9800', '3231', '3521', '4450', '7010', '900',
       '8019', '10300', '12305', '4224', '927', '1228', '967', '1784',
       '3200', '5244', '4911', '1132', '4000', '3742', '1365', '7121',
       '3208', '5604', '1020', '4200', '2106', '2633', '10400', '     ',
       '430', '9410', '634', '3340', '6600'], dtype=object)

In [44]:
#assert len(date_intersect) == 18
# There are 18 rows of data with common dates. This is small enough I can compare them manually. 

date_index = 15
# Change to test different values
date = date_intersect[date_index]

j = pd.concat((LO.loc[date], CS.loc[date]), axis=1, ignore_index=True)
j # Compare all columns/keys


Unnamed: 0,0,1
investigating_agency,SHIVELY POLICE DEPARTMENT,SHIVELY POLICE DEPARTMENT
roadway_number,,
building_number,,
roadway_direction,,
roadway_name,FARNSLEY,FARNSLEY
roadway_suffix,RD,RD
roadway_type,SHIVELY,LOCAL STREET
road_classification,LOCAL,
intersection_roadway_number,,
intersection_roadway_name,RIEDLEY,RIEDLEY


In [45]:
j[(j[0] !=j[1])][j[0].notnull() | j[1].notnull()]
# ^^ This line gives me all the common columns between CSAFE and LOJIC, where
# CSAFE['date'] == LOJIC['date'] AND for all column names common between CSAFE AND LOJIC:
#   CSAFE[column] is not null and != LOJIC[column] which is not null as well.

  j[(j[0] !=j[1])][j[0].notnull() | j[1].notnull()]


Unnamed: 0,0,1
building_number,,
roadway_type,SHIVELY,LOCAL STREET
road_classification,LOCAL,
latitude,38.203145,38.203145
longitude,-85.828062,-85.828062
mode,BICYCLE,
directional_analysis,COLLISION WITH BICYCLE IN INTERSECTION,COLLISION WITH BICYCLE
intersection_roadway_suffix,,RD
killed,,1
injured,,0


In [46]:
K = pd.DataFrame((LO['roadway_type'], CS['roadway_type']), index=("LOJIC", "CSAFE")).transpose()
K.value_counts()


LOJIC    CSAFE       
METRO    LOCAL STREET    8
STATE    STATE           5
         FEDERAL         4
SHIVELY  LOCAL STREET    1
Name: count, dtype: int64

In [47]:
K

Unnamed: 0_level_0,LOJIC,CSAFE
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-02-03 08:55:00-05:00,METRO,LOCAL STREET
2016-08-03 00:57:00-04:00,STATE,FEDERAL
2016-09-11 15:25:00-04:00,METRO,LOCAL STREET
2016-11-09 05:23:00-05:00,STATE,FEDERAL
2016-12-01 16:21:00-05:00,METRO,LOCAL STREET
2016-12-03 06:55:00-05:00,METRO,LOCAL STREET
2017-02-03 19:59:00-05:00,STATE,STATE
2017-03-19 22:38:00-04:00,STATE,STATE
2017-03-31 14:46:00-04:00,STATE,FEDERAL
2017-04-04 19:08:00-04:00,METRO,LOCAL STREET


#### Lat Long values

Some of these don't evaluate of equal. Pick whichever and normalize the resulting column

In [48]:

(CSAFE[CSAFE['date'] == date][['longitude', 'latitude']],
LOJIC[LOJIC['date'] == date][['longitude', 'latitude']])

(      longitude   latitude
 1224 -85.828062  38.203145,
     longitude   latitude
 20 -85.828062  38.203145)

In [49]:
# "directional_analysis"

pd.DataFrame((LO['directional_analysis'], CS['directional_analysis']), index=("LOJIC", "CSAFE")).transpose()

# LOJIC:: COLLISION WITH BICYCLE IN INTERSECTION == CSAFE:: COLLISION WITH BICYCLE
# LOJIC:: COLLISION WITH BICYCLIST IN INTERSECTION == CSAFE: COLLISION WITH BICYCLE

Unnamed: 0_level_0,LOJIC,CSAFE
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-02-03 08:55:00-05:00,COLLISION WITH BICYCLE IN INTERSECTION,COLLISION WITH BICYCLE
2016-08-03 00:57:00-04:00,COLLISION WITH BICYCLIST NON INTERSECTION,COLLISION WITH BICYCLIST
2016-09-11 15:25:00-04:00,COLLISION WITH BICYCLE IN INTERSECTION,COLLISION WITH BICYCLE
2016-11-09 05:23:00-05:00,COLLISION WITH BICYCLIST NON INTERSECTION,COLLISION WITH BICYCLIST
2016-12-01 16:21:00-05:00,COLLISION WITH BICYCLE IN INTERSECTION,COLLISION WITH BICYCLE
2016-12-03 06:55:00-05:00,COLLISION WITH BICYCLE IN INTERSECTION,COLLISION WITH BICYCLE
2017-02-03 19:59:00-05:00,COLLISION WITH BICYCLIST NON INTERSECTION,COLLISION WITH BICYCLIST
2017-03-19 22:38:00-04:00,COLLISION WITH BICYCLIST NON INTERSECTION,COLLISION WITH BICYCLIST
2017-03-31 14:46:00-04:00,COLLISION WITH BICYCLIST NON INTERSECTION,COLLISION WITH BICYCLIST
2017-04-04 19:08:00-04:00,COLLISION WITH BICYCLE IN INTERSECTION,COLLISION WITH BICYCLE


In [50]:
LOJIC['directional_analysis'].value_counts()

directional_analysis
COLLISION WITH BICYCLIST NON INTERSECTION       57
COLLISION WITH BICYCLE IN INTERSECTION          45
COLLISION WITH PEDESTRIAN NON - INTERSECTION     2
COLLISION WITH PEDESTRIAN IN INTERSECTION        2
OTHER INTERSECTION COLLISIONS                    2
1 VEHICLE ENTERING/LEAVING ENTRANCE              1
OTHER ROADWAY OR MID-BLOCK COLLISION             1
Name: count, dtype: int64

LOJIC: COLLISION WITH BICYCL* 
...IN INTERSECTION vs. NON - INTERSECTION
This is useful generate another column that's intersection / non intersection in analysis


`["incident_id", "object_id", "master_file_number", "local_code"]` None of these seem to match up in any meaningful way. Since I can index these on other values, I think I will drop these from the merged dataset. 

In [51]:
CSAFE['intersection_roadway_number'][1]

'I 0071'