# Other data discovery
This will contain all my discovery work for other data sources I want to incorporate.

In [128]:
import pandas as pd
import pandasql
from os import chdir, path

# Locations of the two input CSVs, relative to the project directory.
ACCIDENTSPATH = 'data/preclean/cycling_safety_louisville_clean.csv'
SIGNALSPATH = "data/raw/Jefferson_County_KY_Signalized_Intersections.csv"

# Work from the project directory so the relative paths above resolve.
chdir("/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents")


In [129]:
# Signalized intersections: check the CSV is present, then load it.
assert path.exists(SIGNALSPATH)
SIGNALS = pd.read_csv(SIGNALSPATH)
SIGNALS

# Pre-cleaned accident data: check the CSV is present, load it, and keep each
# row's index value in a `row_id` column so rows can be identified later.
assert path.exists(ACCIDENTSPATH)
ACCIDENTS = pd.read_csv(ACCIDENTSPATH).assign(row_id=lambda frame: frame.index)

| column name | type | description | cleaning notes |
|-------------|------|-------------|----------------|
|X|float|longitude coordinate for intersection | keep |
|Y|float|latitude coordinate for intersection | keep|	
|OBJECTID| number | id for row | not necessary|
|UNITID | alphanumeric | | not necessary |
|SIGID | alphanumeric | | not necessary |
|MAINSTREET | string | name of main street for intersection | keep |
|CROSSSTREET | string | name of crossstreet for intersection| keep |
|OWNER2	|character | short description for owner of street signal| redundant; info is stored more verbosely in OWNER column: drop |
|ROUTE | alphanumeric | route number for main street? | keep | 
|MILEPOINT | number | milepoint along road where signal is | questionable utility |
|DESCRIPTION | string | description of intersection (ex. 3rd@Broadway) | redundant; drop |
|TIMES| alphanumeric | time that the signal is operating| only 2 values that are not "24 HOUR" or something similar. Probably not useful? drop. |
|OWNER | string | owner of traffic signal ||
|TYPE | number | numeric code for type of traffic signal ||
|INTID | number |numeric identifier for intersection | drop |
|ATMSID | nan + one float value | doesn't matter, I'm not going to use it | drop |

In [131]:
# These are probably the only columns that are really needed: all I need to
# know is whether a particular intersection has a lighted signal.
SIGNALS.loc[:, ['MAINSTREET', 'CROSSSTREET', 'ROUTE']]

Unnamed: 0,MAINSTREET,CROSSSTREET,ROUTE
0,OLD HENRY,I 265 RAMP,KY-3084
1,SHELBYVILLE,BECKLEY WOODS,US 60
2,SHELBYVILLE,LAKE FOREST,US-60
3,BROWNSBORO,I 265 RAMP,KY-22
4,HERR,LIME KILN,KY-22
...,...,...,...
1086,BILLTOWN RD,I 265 RAMP,KY-1819
1087,OLD HENRY RD,TERRA CROSSING BLVD,KY-3084
1088,WESTPORT RD,I 265 RAMP,KY-1447
1089,BEULAH CHURCH RD,I 265 RAMP,KY-864


In [130]:
# Per-accident identifier taken from the DataFrame index; the SQL queries
# below select it so matched rows can be traced back to ACCIDENTS.
ACCIDENTS['row_id'] = ACCIDENTS.index + 0

# Prefer the street name; fall back to the roadway number when the name is missing.
ACCIDENTS['main_roadway'] = ACCIDENTS.roadway_name.combine_first(ACCIDENTS.roadway_number)
ACCIDENTS['intersection_roadway'] = ACCIDENTS['intersection_roadway_name'].combine_first(
    ACCIDENTS['intersection_roadway_number']
)

# Boolean flag: True -> an intersecting roadway is recorded, i.e. the accident
# occurred in an intersection / between streets.
ACCIDENTS['intersection_indicator'] = ACCIDENTS['intersection_roadway'].notna()

#ACCIDENTS

In [132]:
# Match intersection accidents to signalized intersections: an accident row is
# kept when its main roadway equals a signal's MAINSTREET, its
# intersection_indicator is TRUE, and its intersecting roadway equals that
# signal's CROSSSTREET. ACCIDENT_id is the accident's row_id.
selection = pandasql.sqldf("""SELECT DISTINCT ACCIDENTS.row_id as ACCIDENT_id, ACCIDENTS.main_roadway,
                           SIGNALS.MAINSTREET, ACCIDENTS.intersection_roadway, SIGNALS.CROSSSTREET
                           FROM ACCIDENTS JOIN SIGNALS 
                           ON ACCIDENTS.main_roadway == SIGNALS.MAINSTREET
                           AND ACCIDENTS.intersection_indicator == TRUE
                           AND SIGNALS.CROSSSTREET == ACCIDENTS.intersection_roadway""")

selection

# This gives me the accident rows whose main roadway matches SIGNALS.MAINSTREET
# AND whose intersecting roadway matches SIGNALS.CROSSSTREET on the same signal row,
# i.e. accidents that happened at an intersection listed in the SIGNALS data.
# NOTE(review): only one orientation is matched (main_roadway -> MAINSTREET,
# intersection_roadway -> CROSSSTREET); an accident recorded with the two streets
# swapped would presumably be missed -- confirm whether that case matters.

Unnamed: 0,ACCIDENT_id,main_roadway,MAINSTREET,intersection_roadway,CROSSSTREET
0,7,15TH,15TH,MUHAMMAD ALI,MUHAMMAD ALI
1,8,BARDSTOWN,BARDSTOWN,LONGEST,LONGEST
2,9,BROADWAY,BROADWAY,4TH,4TH
3,10,BARDSTOWN,BARDSTOWN,SHERWOOD,SHERWOOD
4,24,BARDSTOWN,BARDSTOWN,SHERWOOD,SHERWOOD
...,...,...,...,...,...
154,1233,DIXIE,DIXIE,FLOWERVALE,FLOWERVALE
155,1253,MARKET,MARKET,WENZEL,WENZEL
156,1259,DIXIE,DIXIE,PAGES,PAGES
157,1261,CHESTNUT,CHESTNUT,2ND,2ND


In [133]:
# Check how accident intersection roadway numbers line up with SIGNALS.ROUTE:
# join the two tables on exact equality of those columns and show the pairs.
# NOTE(review): the SIGNALS preview shows ROUTE values such as "KY-3084" and
# "US 60", while the values matched here look like "KY1865"; an exact-equality
# join presumably misses routes that are formatted differently -- confirm.
pandasql.sqldf("""SELECT ACCIDENTS.intersection_roadway_number, ACCIDENTS.intersection_roadway_name, SIGNALS.ROUTE FROM 
               ACCIDENTS JOIN SIGNALS on ACCIDENTS.intersection_roadway_number == SIGNALS.ROUTE""")
               

Unnamed: 0,intersection_roadway_number,intersection_roadway_name,ROUTE
0,KY1865,NEW CUT,KY1865
1,KY1865,NEW CUT,KY1865
2,KY1865,NEW CUT,KY1865
3,KY1865,NEW CUT,KY1865
4,KY1865,TAYLOR,KY1865
5,KY1865,TAYLOR,KY1865
6,KY1447,WESTPORT,KY1447
7,KY1447,WESTPORT,KY1447
8,KY1447,WESTPORT,KY1447
9,KY1865,NEW CUT,KY1865


In [147]:
# Distinct (roadway_name, roadway_number) pairs for accidents where either the
# roadway name or the roadway number equals a signal's MAINSTREET and the
# intersecting roadway equals that signal's CROSSSTREET.
#
# The result is kept under its own name instead of `selection`: the
# per-accident `selection` table built earlier carries the ACCIDENT_id column
# that the final cell reads, and this query does not select that column, so
# rebinding `selection` here would make `selection.ACCIDENT_id` fail.
signal_roadways = pandasql.sqldf("""SELECT DISTINCT ACCIDENTS.roadway_name, ACCIDENTS.roadway_number
                           FROM ACCIDENTS JOIN SIGNALS
                           ON (ACCIDENTS.roadway_name == SIGNALS.MAINSTREET OR
                                ACCIDENTS.roadway_number == SIGNALS.MAINSTREET)
                           AND SIGNALS.CROSSSTREET == ACCIDENTS.intersection_roadway""")
signal_roadways

Unnamed: 0,roadway_name,roadway_number
0,15TH,
1,BARDSTOWN,US0031E
2,BROADWAY,US0150
3,LOCUST,
4,JEFFERSON,
...,...,...
72,MAGNOLIA,
73,GARDEN,
74,3RD,
75,ENTERPRISE,


In [None]:

# Flag every accident that was matched to a signalized intersection.
# `selection` must be the per-accident match table whose ACCIDENT_id column
# holds ACCIDENTS.row_id values (the accidents' index labels).
# A single vectorized membership test replaces the row-by-row `.at` loop;
# unlike `.at`, it cannot append a new row for an id missing from the index.
ACCIDENTS['traffic_signal_indicator'] = ACCIDENTS.index.isin(selection['ACCIDENT_id'])

ACCIDENTS.traffic_signal_indicator