# Other data discovery

In [41]:
import pandas as pd
import pandasql
from os import path, chdir

# Input files, given relative to the notebook's working directory.
# Signals: one record per Jefferson County, KY intersection that has a lighted traffic signal.
SIGNALSPATH = "../../data/raw/Jefferson_County_KY_Signalized_Intersections.csv"
# Accidents: pre-cleaned accident data, read in here for testing purposes.
ACCIDENTSPATH = '../../data/preclean/cycling_safety_louisville_cleaned.csv'

# Fail early, pointing at the cleaning script, when the pre-cleaned file is absent.
if path.exists(ACCIDENTSPATH):
    # row_id copies the DataFrame index into a regular column.
    ACCIDENTS = pd.read_csv(ACCIDENTSPATH).assign(row_id=lambda frame: frame.index)
else:
    raise Exception("Run cleaning script: 01_cleaning_cycling_safety.py before running this cell/notebook")


Exception: Run cleaning script: 01_cleaning_cycling_safety.py before running this cell/notebook

In [38]:
# Move into the sibling cleaning folder and execute the cleaning script there
# (presumably to produce the pre-cleaned accident CSV that the load cell requires).
# NOTE(review): chdir changes the working directory for the whole kernel, so the
# relative SIGNALSPATH / ACCIDENTSPATH values resolve against the new directory
# afterwards, and re-running this cell applies the relative chdir again.
chdir('../02_cleaning')

# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# 'data/raw/cycling_safety_louisville.csv'; the script presumably expects to be
# launched from the project root rather than from its own folder — TODO confirm.
%run "01_cleaning_cycling_safety.py"


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/cycling_safety_louisville.csv'

In [12]:
# Load the signalized-intersection table.
# The assertion carries a message so a missing file is reported with the path that was
# tried, instead of a bare AssertionError with no context.
assert path.exists(SIGNALSPATH), f"Signal data not found at: {SIGNALSPATH}"
SIGNALS = pd.read_csv(SIGNALSPATH)
# Last expression of the cell: rendered as the cell output.
SIGNALS


Unnamed: 0,X,Y,OBJECTID,UNITID,SIGID,MAINSTREET,CROSSSTREET,OWNER2,ROUTE,MILEPOINT,DESCRIPTION,TIMES,OWNER,TYPE,INTID,ATMSID
0,-85.499337,38.270632,1,MPW017206,SIG017206,OLD HENRY,I 265 RAMP,S,KY-3084,1.250,OLD HENRY @ I 265 RAMP,24 HOURS,KYTC,1,76747569996,
1,-85.498479,38.241589,2,MPW017211,SIG017211,SHELBYVILLE,BECKLEY WOODS,S,US 60,12.291,SHELBYVILLE @ BECKLEY WOODS,24 HOURS,KYTC,1,78959406728,
2,-85.493550,38.240517,3,MPW017290,SIG017290,SHELBYVILLE,LAKE FOREST,S,US-60,12.580,SHELBYVILLE @ LAKE FOREST,24 HOURS,KYTC,1,123759407630,
3,-85.570212,38.311002,4,MPW015159,SIG015159,BROWNSBORO,I 265 RAMP,S,KY-22,3.700,BROWNSBORO @ I 265 RAMP,24 HOURS,KYTC,1,143907249996,
4,-85.627965,38.283916,5,MPW011726,SIG011726,HERR,LIME KILN,S,KY-22,0.440,HERR @ LIME KILN,24 HOURS,KYTC,1,184229013860,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1086,-85.546671,38.147768,1087,MPW000001,SIG000001,BILLTOWN RD,I 265 RAMP,S,KY-1819,5.290,BILLTOWN RD @ I 265 RAMP,24 HOURS,KYTC,1,822801889996,
1087,-85.495722,38.271982,1088,MPW000002,SIG000002,OLD HENRY RD,TERRA CROSSING BLVD,S,KY-3084,1.480,OLD HENRY @ TERRA CROSSING BLVD,24 HOURS,KYTC,1,2834756E294,
1088,-85.551047,38.296855,1089,MPW000003,SIG000003,WESTPORT RD,I 265 RAMP,S,KY-1447,6.470,WESTPORT RD @ I 265 RAMP,24 HOURS,KYTC,1,698671329996,
1089,-85.614517,38.125808,1090,MPW000004,SIG000004,BEULAH CHURCH RD,I 265 RAMP,S,KY-864,3.450,BEULAH CHURCH RD @ I 265 RAMP,24 HOURS,KYTC,1,978905189996,


## Signalized Intersections Data Dictionary

| column name | type | description | cleaning notes |
|-------------|------|-------------|----------------|
|X|float|longitude coordinate for intersection | keep |
|Y|float|latitude coordinate for intersection | keep|	
|OBJECTID| number | id for row | not necessary|
|UNITID| alphanumeric | | not necessary |
|SIGID | alphanumeric | | not necessary |
|MAINSTREET | string | name of main street for intersection | keep |
|CROSSSTREET | string | name of crossstreet for intersection| keep |
|OWNER2	|character | short description for owner of street signal| redundant; info is stored more verbosely in OWNER column: drop |
|ROUTE | alphanumeric | route number for main street? | keep | 
|MILEPOINT | number | milepoint along road where signal is | questionable utility |
|DESCRIPTION | string | description of intersection (ex. 3rd@Broadway) | redundant; drop |
|TIMES| alphanumeric | time that the signal is operating| only 2 values that are not "24 HOUR" or something similar. Probably not useful? drop. |
|OWNER | string | owner of traffic signal ||
|TYPE | number | numeric code for type of traffic signal ||
|INTID | number |numeric identifier for intersection | drop |
|ATMSID | nan + one float value | doesn't matter, I'm not going to use it | drop |

In [6]:
# Only the street names and the route are needed from the signal data:
# the goal is just to know whether a particular intersection has a lighted signal.
# No cleaning is done on this table; the needed columns are simply pulled out.
SIGNALS.loc[:, ['MAINSTREET', 'CROSSSTREET', 'ROUTE']]

Unnamed: 0,MAINSTREET,CROSSSTREET,ROUTE
0,OLD HENRY,I 265 RAMP,KY-3084
1,SHELBYVILLE,BECKLEY WOODS,US 60
2,SHELBYVILLE,LAKE FOREST,US-60
3,BROWNSBORO,I 265 RAMP,KY-22
4,HERR,LIME KILN,KY-22
...,...,...,...
1086,BILLTOWN RD,I 265 RAMP,KY-1819
1087,OLD HENRY RD,TERRA CROSSING BLVD,KY-3084
1088,WESTPORT RD,I 265 RAMP,KY-1447
1089,BEULAH CHURCH RD,I 265 RAMP,KY-864


In [22]:
# Derive the helper columns used for matching accidents against signals.
#   row_id                 - index column to refer back to the original data
#   main_roadway           - roadway_name, with roadway_number as a backup where the name is null
#                            (a single column is easier to analyse)
#   intersection_roadway   - same combination for the intersecting roadway
#   intersection_indicator - True when an intersecting roadway is recorded, i.e. the accident
#                            occurred in an intersection / between streets
ACCIDENTS = ACCIDENTS.assign(
    row_id=lambda frame: frame.index + 0,
    main_roadway=lambda frame: frame['roadway_name'].combine_first(frame['roadway_number']),
    intersection_roadway=lambda frame: frame['intersection_roadway_name'].combine_first(
        frame['intersection_roadway_number']
    ),
    # Evaluated after intersection_roadway above, so it sees the combined column.
    intersection_indicator=lambda frame: frame['intersection_roadway'].notnull(),
)

In [16]:
# Join accidents to signalized intersections on both street names:
# the accident's main_roadway must equal the signal's MAINSTREET and its
# intersection_roadway must equal the same signal's CROSSSTREET, restricted to
# accidents flagged with intersection_indicator. DISTINCT collapses duplicate result rows.
selection = pandasql.sqldf("""SELECT DISTINCT ACCIDENTS.row_id as ACCIDENT_id, ACCIDENTS.main_roadway,
                           SIGNALS.MAINSTREET, ACCIDENTS.intersection_roadway, SIGNALS.CROSSSTREET
                           FROM ACCIDENTS JOIN SIGNALS 
                           ON ACCIDENTS.main_roadway == SIGNALS.MAINSTREET
                           AND ACCIDENTS.intersection_indicator == TRUE
                           AND SIGNALS.CROSSSTREET == ACCIDENTS.intersection_roadway""")

selection

# Each row is an accident whose main roadway AND intersecting roadway both match a single
# row of the signal data; ACCIDENT_id is the accident's row_id.
# NOTE(review): only one orientation is matched (main -> MAINSTREET, intersecting -> CROSSSTREET);
# an accident recorded with the two streets swapped is not picked up — confirm whether that matters.
# NOTE(review): the comparison is exact string equality, so naming differences between the
# two datasets (e.g. a trailing "RD") would prevent a match — TODO confirm.

Unnamed: 0,ACCIDENT_id,main_roadway,MAINSTREET,intersection_roadway,CROSSSTREET
0,7,15TH,15TH,MUHAMMAD ALI,MUHAMMAD ALI
1,8,BARDSTOWN,BARDSTOWN,LONGEST,LONGEST
2,9,BROADWAY,BROADWAY,4TH,4TH
3,10,BARDSTOWN,BARDSTOWN,SHERWOOD,SHERWOOD
4,24,BARDSTOWN,BARDSTOWN,SHERWOOD,SHERWOOD
...,...,...,...,...,...
154,1233,DIXIE,DIXIE,FLOWERVALE,FLOWERVALE
155,1253,MARKET,MARKET,WENZEL,WENZEL
156,1259,DIXIE,DIXIE,PAGES,PAGES
157,1261,CHESTNUT,CHESTNUT,2ND,2ND


In [17]:
# Exploratory query: join accidents to signals on the route number alone
# (accident intersection_roadway_number == signal ROUTE) and display the matching rows.
# The result is not stored; there is no DISTINCT, so one accident can appear once per matching signal.
# NOTE(review): the SIGNALS preview shows ROUTE written in several formats ('KY-3084', 'US 60',
# 'US-60'), so exact equality may miss routes that differ only in punctuation — TODO confirm.
pandasql.sqldf("""SELECT ACCIDENTS.intersection_roadway_number, ACCIDENTS.intersection_roadway_name, SIGNALS.ROUTE FROM 
               ACCIDENTS JOIN SIGNALS on ACCIDENTS.intersection_roadway_number == SIGNALS.ROUTE""")
               

Unnamed: 0,intersection_roadway_number,intersection_roadway_name,ROUTE
0,KY1865,NEW CUT,KY1865
1,KY1865,NEW CUT,KY1865
2,KY1865,NEW CUT,KY1865
3,KY1865,NEW CUT,KY1865
4,KY1865,TAYLOR,KY1865
5,KY1865,TAYLOR,KY1865
6,KY1447,WESTPORT,KY1447
7,KY1447,WESTPORT,KY1447
8,KY1447,WESTPORT,KY1447
9,KY1865,NEW CUT,KY1865


In [10]:
# Distinct (roadway_name, roadway_number) pairs for accidents that match a signalized
# intersection, where either the roadway name or the roadway number equals the signal's
# MAINSTREET and the intersecting roadway equals its CROSSSTREET.
#
# Stored under its own name instead of `selection`: a later cell reads the ACCIDENT_id
# column of `selection` (the accident-level join), and this result has no such column,
# so rebinding `selection` here breaks a top-to-bottom run of the notebook.
matched_roadways = pandasql.sqldf("""SELECT DISTINCT ACCIDENTS.roadway_name, ACCIDENTS.roadway_number
                           FROM ACCIDENTS JOIN SIGNALS 
                           ON (ACCIDENTS.roadway_name == SIGNALS.MAINSTREET OR
                                ACCIDENTS.roadway_number == SIGNALS.MAINSTREET)
                
                           AND SIGNALS.CROSSSTREET == ACCIDENTS.intersection_roadway""")
matched_roadways

Unnamed: 0,roadway_name,roadway_number
0,15TH,
1,BARDSTOWN,US0031E
2,BROADWAY,US0150
3,LOCUST,
4,JEFFERSON,
...,...,...
72,MAGNOLIA,
73,GARDEN,
74,3RD,
75,ENTERPRISE,


In [18]:

# Flag every accident that was matched to a signalized intersection.
# `selection.ACCIDENT_id` holds ACCIDENTS row labels (row_id is a copy of the index), so one
# vectorized membership test over the index replaces the per-row `.at` loop: rows whose
# label is in the matched set become True, all others False.
ACCIDENTS['traffic_signal_indicator'] = ACCIDENTS.index.isin(selection['ACCIDENT_id'])

ACCIDENTS.traffic_signal_indicator

0       False
1       False
2       False
3       False
4       False
        ...  
1268    False
1269    False
1270     True
1271    False
1272    False
Name: traffic_signal_indicator, Length: 1273, dtype: bool