In [1]:
# Look at the two pre-cleaned datasets.
# Do any further discovery and cleaning,
# then merge.
import pandas as pd
from os import chdir

# NOTE(review): absolute, machine-specific path. The notebook only runs as-is
# where this directory exists; the relative data paths below are resolved
# against it.
chdir("/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents")


# Locations of the two pre-cleaned CSV files, relative to the directory set above.
path_to_cycling_safety_cleaned = "data/clean/cycling_safety_louisville_clean.csv"
path_to_LOJIC_cleaned = "data/clean/LOJIC_cycling_data.csv"

# Load each cleaned dataset into its own DataFrame.
CSAFE = pd.read_csv(path_to_cycling_safety_cleaned)
LOJIC = pd.read_csv(path_to_LOJIC_cleaned)

In [2]:
# Column names of each dataset, as sets so they can be compared.
CSAFE_cols = set(CSAFE.columns)
LOJIC_cols = set(LOJIC.columns)

# Names present in both datasets, in alphabetical order.
intersect = sorted(CSAFE_cols.intersection(LOJIC_cols))
# Names present in exactly one of the datasets, in alphabetical order.
xor = sorted(CSAFE_cols.symmetric_difference(LOJIC_cols))

# Shared columns first, followed by the dataset-specific ones.
all_cols = [*intersect, *xor]

In [3]:
# Review status for each column shared by both datasets.
# None means no note has been recorded for that column yet.
notes = {
    "between_street_name_1": None,
    "between_street_name_2": None,
    "building_number": "convert CSAFE value to str(int(...))",
    "date": "done",
    "directional_analysis": "compatible, but no common values",
    "hit_and_run": "done",
    "intersection_roadway_name": None,
    "investigating_agency": "done",
    "latitude": "done",
    "light_condition": "deal with OTHER and Nan values esp. in LOJIC",
    "longitude": "done",
    "manner_of_collision": "done",
    "milepoint": "done",
    "motor_vehicles_involved": "done",
    "roadway_character": "done",
    "roadway_condition": "done",
    "roadway_direction": "done",
    "roadway_name": None,
    "roadway_number": "get rid of whitespace in some strings",
    "roadway_suffix": "done",
    "secondary_collision": "done",
    "units_involved": "done",
    "weather": "done",
}

# Display the notes as a Series, one row per column name.
pd.Series(notes)

between_street_name_1                                                None
between_street_name_2                                                None
building_number                      convert CSAFE value to str(int(...))
date                                                                 done
directional_analysis                     compatible, but no common values
hit_and_run                                                          done
intersection_roadway_name                                            None
investigating_agency                                                 done
latitude                                                             done
light_condition              deal with OTHER and Nan values esp. in LOJIC
longitude                                                            done
manner_of_collision                                                  done
milepoint                                                            done
motor_vehicles_involved               

In [None]:

# Side-by-side summary indexed by column name. The three unnamed Series become
# columns 0 (LOJIC dtype), 1 (CSAFE dtype) and 2 (review note); print_table
# reads them by those positions.
data = pd.concat((LOJIC.dtypes, CSAFE.dtypes, pd.Series(notes)), axis=1)
data

In [None]:

def print_table(data, columns=None):
    """Print the dataset column comparison as a markdown table.

    Parameters
    ----------
    data : pandas.DataFrame
        Indexed by column name. Its first three columns are read by position
        as the LOJIC dtype, the CSAFE dtype and the review note.
    columns : iterable, optional
        Row labels of ``data`` to print, in order. When omitted, the
        notebook-level ``all_cols`` list is used, as before.

    Returns
    -------
    None
        The table is written to stdout, one markdown row per line.

    Notes
    -----
    Every missing cell (``None`` or ``NaN``) is rendered as an empty string.
    Previously only the dtype cells were blanked, so columns without a note
    printed the literal text ``None`` or ``nan``.
    """
    header = "| column name | LOJIC dtype | CSAFE dtype | notes |"
    separator = "|-------------|-------------|-------------|-------|"
    row = "|{name}|{LOJICDT}|{CSAFEDT}|{note}|"

    def cell(value):
        """Render a single table cell, leaving missing values blank."""
        return "" if pd.isna(value) else str(value)

    if columns is None:
        # Backward-compatible default: shared columns first, then the rest.
        columns = all_cols

    print(header)
    print(separator)
    for name in columns:
        record = data.loc[name]
        # Positional access, so the lookup does not depend on how the three
        # columns of ``data`` happen to be labelled.
        print(
            row.format(
                name=name,
                LOJICDT=cell(record.iloc[0]),
                CSAFEDT=cell(record.iloc[1]),
                note=cell(record.iloc[2]),
            )
        )
    
    
# Write the comparison table to stdout as pipe-delimited markdown rows.
print_table(data)

# Data dictionary for joined data

| column name | LOJIC dtype | CSAFE dtype | notes |
|-------------|-------------|-------------|-------|
|between_street_name_1|object|object||
|between_street_name_2|object|object||
|building_number|object|object|convert CSAFE value to str(int(...))|
|date|object|object|done|
|directional_analysis|object|object|compatible, but no common values|
|hit_and_run|bool|bool|done|
|intersection_roadway_name|object|object||
|investigating_agency|object|object|done|
|latitude|float64|float64|done|
|light_condition|object|object|deal with OTHER and Nan values esp. in LOJIC|
|longitude|float64|float64|done|
|manner_of_collision|object|object|done|
|milepoint|float64|float64|done|
|motor_vehicles_involved|int64|int64|done|
|roadway_character|object|object|done|
|roadway_condition|object|object|done|
|roadway_direction|object|object|done|
|roadway_name|object|object||
|roadway_number|object|object|get rid of whitespace in some strings|
|roadway_suffix|object|object|done|
|secondary_collision|bool|bool|done|
|units_involved|int64|int64|done|
|weather|object|object|done|
|between_street_1|object|| similar to between_street_number_1 |
|between_street_2|object|| similar to between_street_number_2|
|between_street_number_1||object| similar to between_street_1|
|between_street_number_2||object| similar to between_street_2|
|between_street_suffix_1||object||
|between_street_suffix_2||object||
|collision_status_code||object||
|council_district|int64|||
|day_of_week|object|| generate day_of_week for CSAFE |
|directional_analysis_code||float64||
|fatality_indicator|bool||generate fatality_indicator for CSAFE|
|incident_id|int64|||
|injured||int64||
|injury_indicator|bool||generate injury_indicator for CSAFE|
|intersection_roadway|object|||
|intersection_roadway_number||object||
|intersection_roadway_suffix||object||
|killed||int64||
|light_condition_code||float64|numeric code not present in LOJIC|
|local_code||object||
|manner_of_collision_code||float64||
|master_file_number||int64||
|mode|object|||
|object_id|int64|||
|owner|object|||
|ramp_from_roadway_id||float64||
|ramp_to_roadway_id||float64||
|road_classification|object|||
|roadway_character_code||float64|numeric code not present in LOJIC|
|roadway_condition_code||float64|numeric code not present in LOJIC|
|roadway_type||object||
|roadway_type_code||float64||
|weather_code||float64|numeric code not present in LOJIC|


# Common column names:
`['between_street_name_1', 'between_street_name_2', 'building_number', 'date', 'directional_analysis',
    'hit_and_run', 'intersection_roadway_name', 'investigating_agency', 'latitude', 'light_condition',
    'longitude', 'manner_of_collision', 'milepoint', 'motor_vehicles_involved', 'roadway_character',
    'roadway_condition', 'roadway_direction', 'roadway_name', 'roadway_number', 'roadway_suffix',
    'secondary_collision', 'units_involved', 'weather']`


In [22]:
def pair(col_name):
    """Return one column from both datasets side by side.

    Parameters
    ----------
    col_name : str
        Name of a column present in both ``CSAFE`` and ``LOJIC``.

    Returns
    -------
    pandas.DataFrame
        Two columns labelled ``"CSAFE"`` and ``"LOJIC"``, aligned on the row
        index. Rows that exist in only one dataset hold NaN in the other.
    """
    # Label the columns by dataset. Without ``keys`` both columns would carry
    # ``col_name``, and lookups such as ``pair(...)["CSAFE"]`` used in later
    # cells would raise KeyError.
    return pd.concat(
        (CSAFE[col_name], LOJIC[col_name]),
        axis=1,
        keys=["CSAFE", "LOJIC"],
    )

def join(col_name):
    """Stack one column from both datasets into a single Series.

    The CSAFE values come first, followed by the LOJIC values. Each part
    keeps its original row index, so index labels can repeat.
    """
    stacked = [frame[col_name] for frame in (CSAFE, LOJIC)]
    return pd.concat(stacked, axis=0)

In [18]:
pair('between_street_name_1').dropna()

Unnamed: 0,between_street_name_1,between_street_name_1.1
13,ELMORE,CAPTAIN
14,WILLIS,HAROLD
23,SHELBY,WATTERSON
36,FERN GRADE,KERRY
39,NOBEL,COLORADO
40,KENTUCKY,HUNTINGTON PARK
94,MAGNOLIA,BLUE VALE
101,PATTERSON,KY841 CROSSOVER
117,PARK,CAROL


In [20]:
pair('between_street_name_2').dropna()

Unnamed: 0,between_street_name_2,between_street_name_2.1
13,LAMAR,GLENMARY FARM
14,CANNONS,HARDESTY
23,LOGAN,WATTERSON
36,BEANBLOSSOM,DIXIE
39,NOBEL,COMPTON
40,BRECKINRIDGE,PRODUCE
94,ORMSBY,AUTUMN RIDGE
101,LUCIA,I65 S EXIT125 RAMP FROM KY481
117,IROQUOIS GARDENS,NOLTEMEYER WYNDE


In [21]:
pair("building_number")
# CSAFE values are string representations of an int or a float.
# LOJIC values are int / string.
CSAFE["building_number"].dropna().unique()
#LOJIC["building_number"].dropna()
# TODO: convert CSAFE values to str(int(...)).
# NOTE(review): the output below also contains space-padded values ('4224 ')
# and an all-blank entry ('     '), so the conversion needs to strip
# whitespace and skip blanks as well as drop the '.0' suffix.


array(['2300.0', '4303.0', '1109.0', '5013.0', '100.0', '1049.0',
       '7490.0', '600.0', '9111.0', '8006.0', '10821.0', '1810.0',
       '1121.0', '3706.0', '12100.0', '4100.0', '4901.0', '8412.0',
       '2100.0', '3600.0', '3804.0', '5022.0', '2700.0', '5913.0',
       '6700.0', '3820.0', '2500.0', '7806.0', '4133.0', '6101.0',
       '2216.0', '9120.0', '8020.0', '1500.0', '1000.0', '700.0',
       '4124.0', '5501.0', '1600.0', '7000.0', '3410.0', '4111.0',
       '5543.0', '4032.0', '131.0', '1900.0', '7321.0', '3121.0',
       '5000.0', '2200.0', '9800.0', '3231.0', '3521.0', '4450.0',
       '7010.0', '900.0', '8019.0', '10300.0', '12305.0', '4224 ',
       '927  ', '1228 ', '967  ', '1784.0', '3200.0', '5244.0', '4911.0',
       '1132.0', '3742.0', '4000.0', '1365.0', '7121.0', '3208.0',
       '5604.0', '600  ', '1020 ', '4200 ', '2106 ', '2633 ', '10400',
       '2200 ', '9800 ', '     ', '430  ', '9410 ', '3200 ', '634  ',
       '3340 ', '6600 '], dtype=object)

In [446]:
 pair('date')
# done

Unnamed: 0,CSAFE,LOJIC
0,2010-02-20 16:20:00,2016-02-03 08:55:00-05:00
1,2010-01-13 13:40:00,2017-02-03 19:59:00-05:00
2,2010-01-13 10:00:00,2016-04-15 23:28:00-04:00
3,2010-01-15 15:50:00,2016-09-11 15:25:00-04:00
4,2010-02-02 06:11:00,2016-10-03 22:04:00-04:00
...,...,...
1268,2017-12-05 07:07:00,
1269,2017-12-14 17:09:00,
1270,2017-12-19 10:00:00,
1271,2017-12-21 19:56:00,


In [447]:
DA = pair('directional_analysis')
# Only the last expression in the cell is displayed, so the CSAFE values
# computed on the next line do not appear in the output.
DA["CSAFE"].unique()
DA["LOJIC"].unique() 
# The two datasets use different sets of values, with no intersection.
# This column may have to be dropped; it might not be useful.

array(['COLLISION WITH BICYCLE IN INTERSECTION',
       'COLLISION WITH BICYCLIST NON INTERSECTION',
       'COLLISION WITH PEDESTRIAN NON - INTERSECTION',
       'COLLISION WITH PEDESTRIAN IN INTERSECTION',
       'OTHER INTERSECTION COLLISIONS',
       'OTHER ROADWAY OR MID-BLOCK COLLISION',
       '1 VEHICLE ENTERING/LEAVING ENTRANCE', nan], dtype=object)

In [448]:
DA.apply(lambda x:x.value_counts(dropna=False))

Unnamed: 0,CSAFE,LOJIC
1 VEHICLE ENTERING/LEAVING ENTRANCE,,1.0
COLLISION WITH BICYCLE,704.0,
COLLISION WITH BICYCLE IN INTERSECTION,,45.0
COLLISION WITH BICYCLIST,569.0,
COLLISION WITH BICYCLIST NON INTERSECTION,,57.0
COLLISION WITH PEDESTRIAN IN INTERSECTION,,2.0
COLLISION WITH PEDESTRIAN NON - INTERSECTION,,2.0
OTHER INTERSECTION COLLISIONS,,2.0
OTHER ROADWAY OR MID-BLOCK COLLISION,,1.0
,,1163.0


In [449]:
# Boolean indicators. These are fully compatible. 
pair('hit_and_run'), pair('secondary_collision')


(      CSAFE  LOJIC
 0     False  False
 1      True  False
 2     False  False
 3      True  False
 4     False  False
 ...     ...    ...
 1268  False    NaN
 1269  False    NaN
 1270  False    NaN
 1271  False    NaN
 1272  False    NaN
 
 [1273 rows x 2 columns],
       CSAFE  LOJIC
 0     False  False
 1     False  False
 2     False  False
 3     False  False
 4     False  False
 ...     ...    ...
 1268  False    NaN
 1269  False    NaN
 1270  False    NaN
 1271  False    NaN
 1272  False    NaN
 
 [1273 rows x 2 columns])

In [450]:
pair('intersection_roadway_name')


Unnamed: 0,CSAFE,LOJIC
0,DEERPARK,WINTER
1,I71 N EXIT2 OFF RAMP TO ZORN AVE,3RD
2,JANE,JEFFERSON
3,CONN,LYNNBROOK
4,MUHAMMAD ALI,HANCOCK
...,...,...
699,JACKSON,
700,FEYHURST,
701,WOODED,
702,JEFFERSON,


In [451]:
pair('investigating_agency').apply(lambda x:x.value_counts(dropna=False))


Unnamed: 0,CSAFE,LOJIC
ANCHORAGE POLICE DEPARTMENT,1.0,
AUDUBON PARK POLICE DEPARTMENT,1.0,
GRAYMOOR-DEVONDALE POLICE DEPT,2.0,
INDIAN HILLS POLICE DEPARTMENT,5.0,
JEFFERSONTOWN POLICE DEPT,24.0,
LOUISVILLE METRO POLICE DEPT,1160.0,110.0
LYNDON POLICE DEPARTMENT,,2.0
NORTHFIELD POLICE DEPARTMENT,1.0,
PROSPECT POLICE DEPARTMENT,2.0,
SHIVELY POLICE DEPARTMENT,40.0,5.0


In [452]:
# These are all compatible
pair('latitude'), pair('longitude')


(          CSAFE      LOJIC
 0     38.231850  38.235708
 1     38.273995  38.154226
 2     38.258551  38.091322
 3     38.250012  38.225816
 4     38.195890  38.014473
 ...         ...        ...
 1268  38.153815        NaN
 1269  38.163618        NaN
 1270  38.160030        NaN
 1271  38.198257        NaN
 1272  38.124819        NaN
 
 [1273 rows x 2 columns],
           CSAFE      LOJIC
 0    -85.707933 -85.727638
 1    -85.696572 -85.885464
 2    -85.703576 -85.701118
 3    -85.697265 -85.760326
 4    -85.793380 -85.925157
 ...         ...        ...
 1268 -85.733644        NaN
 1269 -85.688008        NaN
 1270 -85.671480        NaN
 1271 -85.626309        NaN
 1272 -85.890658        NaN
 
 [1273 rows x 2 columns])

In [453]:
 pair('light_condition').apply(lambda x:x.value_counts(dropna=False))
# 1 value "OTHER" in LOJIC
# Lots of NaNs in LOJIC.


Unnamed: 0,CSAFE,LOJIC
DARK (UNKNOWN ROADWAY LIGHTING),5.0,3
DARK-HWY LIGHTED/OFF,22.0,4
DARK-HWY LIGHTED/ON,159.0,20
DARK-HWY NOT LIGHTED,42.0,23
DAWN,29.0,3
DAYLIGHT,962.0,60
DUSK,54.0,5
OTHER,,1
,,1154


In [462]:
MOC = pair('manner_of_collision')
MOC["CSAFE"].unique(), MOC['LOJIC'].unique()


(array(['ANGLE', 'SINGLE VEHICLE', 'SIDESWIPE-SAME DIRECTION', 'HEAD ON',
        'REAR END', 'OPPOSING LEFT TURN', 'SIDESWIPE-OPPOSITE DIRECTION'],
       dtype=object),
 array(['SINGLE VEHICLE', 'ANGLE', 'HEAD ON', nan], dtype=object))

In [455]:
pair('milepoint').describe()


Unnamed: 0,CSAFE,LOJIC
count,1207.0,119.0
mean,4.037019,3.696748
std,6.48581,4.603552
min,0.0,0.004
25%,0.3125,0.556
50%,1.17,1.376
75%,5.462,5.277
max,124.526,18.399


In [456]:
pair('motor_vehicles_involved').apply(lambda x:x.unique())
# Slightly different types. Compatible.


CSAFE                         [1, 2, 3]
LOJIC    [1.0, 2.0, 0.0, 3.0, 4.0, nan]
dtype: object

In [457]:
RC = pair('roadway_character')
RC['CSAFE'].unique(), RC['LOJIC'].unique()
# No problems.

(array(['STRAIGHT & GRADE', 'CURVE & LEVEL', 'STRAIGHT & LEVEL',
        'CURVE & GRADE', 'CURVE & HILLCREST', 'STRAIGHT & HILLCREST'],
       dtype=object),
 array(['STRAIGHT & LEVEL', 'STRAIGHT & GRADE', 'CURVE & LEVEL',
        'STRAIGHT & HILLCREST', nan], dtype=object))

In [459]:
 pair('roadway_condition')
# No problem.


Unnamed: 0,CSAFE,LOJIC
0,DRY,DRY
1,DRY,DRY
2,DRY,DRY
3,DRY,DRY
4,DRY,DRY
...,...,...
1268,WET,
1269,DRY,
1270,DRY,
1271,DRY,


In [None]:
 pair('roadway_direction')
# No problem.


Unnamed: 0,CSAFE,LOJIC
0,S,W
1,S,E
2,W,S
3,S,S
4,S,S
...,...,...
429,,
430,,
431,,
432,,


In [None]:
 pair('roadway_name')


Unnamed: 0,CSAFE,LOJIC
0,BARDSTOWN,BARRET
1,ZORN,GREENBELT
2,BROWNSBORO,HUTCHERSON
3,PETERSON,HILL
4,GEORGETOWN,DIXIE
...,...,...
1259,GRADE,
1260,JEANINE,
1261,POPLAR LEVEL,
1262,WOODGATE,


In [23]:
RN = pair("roadway_number")
join("roadway_number").value_counts()


roadway_number
US0031E     77
US0031W     63
KY1020      49
KY0061      38
US0150      35
            ..
KY1230       1
KY1447       1
KY0148       1
I 0265       1
I 0064       1
Name: count, Length: 77, dtype: int64

In [24]:
 join('roadway_suffix').unique()


array(['RD   ', 'AVE  ', 'PL   ', 'PKWY ', 'ST   ', nan, 'TRCE ', 'DR   ',
       'LN   ', 'HWY  ', 'BLVD ', 'PLZ  ', 'TRL  ', 'ALY  ', 'WAY  ',
       'BYP  ', 'CIR  ', 'LOOP ', 'CT   ', 'PARK ', 'TPKE ', 'TER  ',
       '     ', 'AVE', 'HWY', 'DR', 'ST', 'LN', 'RD', 'TPKE', 'FWY',
       'LOOP', 'PKWY', 'BLVD', 'PL'], dtype=object)

In [None]:
 join('units_involved').unique()
pair('units_involved').dtypes



CSAFE      int64
LOJIC    float64
dtype: object

In [None]:
join('weather').unique()
pair('weather')

Unnamed: 0,CSAFE,LOJIC
0,CLEAR,CLEAR
1,CLEAR,CLOUDY
2,CLEAR,CLEAR
3,CLEAR,CLEAR
4,CLEAR,CLOUDY
...,...,...
1268,RAINING,
1269,CLOUDY,
1270,CLEAR,
1271,CLEAR,


## Unmatched column names:
# LOJIC only

Similar data in CSAFE. Will need to change the name

`['between_street_1', 'between_street_2']`

Useful columns not in CSAFE. Generate these columns for CSAFE

`['council_district', 'day_of_week', 'injury_indicator', 'fatality_indicator']`

 `['incident_id', 'object_id',
 'intersection_roadway', 'mode', 'owner', 'road_classification']`

# CSAFE only

Similar data in LOJIC. Will need renames.

`['between_street_number_1', 'between_street_number_2']`

??

`['between_street_suffix_1', 'between_street_suffix_2']`

Numeric codes for different conditions. These are paired with a human readable code in CSAFE.
Similar string values exist for many of these in LOJIC data.
I will drop these numeric columns as they are not directly comparable. 

`['directional_analysis_code', 'light_condition_code', 'manner_of_collision_code',
'roadway_condition_code', 'roadway_character_code', 'weather_code', 'roadway_type_code']`

Generate "fatality_indicator" and "injury_indicator" from these.
`['killed', 'injured']`

 'intersection_roadway_number', 'intersection_roadway_suffix',
 'local_code',
 'master_file_number',
 'ramp_from_roadway_id', 'ramp_to_roadway_id',
 'collision_status_code', 


 'roadway_type'

In [35]:
# Only the last expression in the cell is displayed; the first three are
# evaluated and their results discarded.
LOJIC['between_street_1'].dropna() # Alphanumeric roadway designator like KY 2049, US31W
CSAFE['between_street_number_1'].dropna() # Same thing
LOJIC['between_street_2'].dropna()
CSAFE['between_street_number_2'].dropna()
# These all have the same kind of alphanumeric data.
# Reconcile these column names to make them compatible.
# * Make sure the new names don't clash with any existing column.

5        I 0264
14       KY2048
42       KY1065
62       KY1020
79       US0060
81       US0150
107      KY0061
115     US0031E
116     US0060A
132      KY1020
162      KY2054
250      US0150
264      KY0061
272      KY1865
293      KY0864
299      US0150
316     US0031E
330      KY0061
334     US0031E
365      KY0864
379     US0031E
403      KY0061
425      US0150
430      KY1703
439      US0150
442      KY2048
451     US0060A
471      KY1020
496     US0031W
527      KY0061
585     US0031E
590     US0060A
627      KY2251
638      KY0061
676      KY1931
677      US0060
681     US0031W
694      KY1065
698      US0150
704      KY1931
741      KY0061
760      KY1020
785     US0031W
811      KY2840
852      US0150
938      KY2048
980     US0060A
984      KY0061
1082     KY2049
1101     KY2860
1157     I 0065
1260     I 0265
Name: between_street_number_2, dtype: object