In [5]:
import geopandas as geo
import pandas as pd

import os
from os import path

# The input shapefile is stored outside of this repository's tracked files.
# TODO write instructions to get the data
# Until then, the location is hard-coded relative to the working directory.
# NOTE(review): the export cell later in this notebook writes under
# "../data/raw", one directory level shallower than PREFIX -- confirm both
# are meant to be resolved from the same working directory.

PREFIX = "../../data/raw/do_not_track"
FILENAME = "Jefferson_County_KY_Signalized_Intersections/Jefferson_County_KY_Signalized_Intersections.shp"
FILEPATH = path.join(PREFIX, FILENAME)


In [6]:
# Read the shapefile at FILEPATH into a GeoDataFrame, then preview the first
# five rows (five is the default for head()).
df = geo.read_file(FILEPATH)
df.head()

Unnamed: 0,OBJECTID,UNITID,SIGID,MAINSTREET,CROSSSTREE,OWNER2,ROUTE,MILEPOINT,DESCRIPTIO,TIMES,OWNER,TYPE,INTID,ATMSID,geometry
0,1,MPW017206,SIG017206,OLD HENRY,I 265 RAMP,S,KY-3084,1.25,OLD HENRY @ I 265 RAMP,24 HOURS,KYTC,1,7674756999,,POINT (-85.49934 38.27063)
1,2,MPW017211,SIG017211,SHELBYVILLE,BECKLEY WOODS,S,US 60,12.291,SHELBYVILLE @ BECKLEY WOODS,24 HOURS,KYTC,1,7895940672,,POINT (-85.49848 38.24159)
2,3,MPW017290,SIG017290,SHELBYVILLE,LAKE FOREST,S,US-60,12.58,SHELBYVILLE @ LAKE FOREST,24 HOURS,KYTC,1,12375940763,,POINT (-85.49355 38.24052)
3,4,MPW015159,SIG015159,BROWNSBORO,I 265 RAMP,S,KY-22,3.7,BROWNSBORO @ I 265 RAMP,24 HOURS,KYTC,1,14390724999,,POINT (-85.57021 38.31100)
4,5,MPW011726,SIG011726,HERR,LIME KILN,S,KY-22,0.44,HERR @ LIME KILN,24 HOURS,KYTC,1,18422901386,,POINT (-85.62797 38.28392)


In [None]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1091 entries, 0 to 1090
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   OBJECTID    1091 non-null   int64   
 1   UNITID      1091 non-null   object  
 2   SIGID       1091 non-null   object  
 3   MAINSTREET  1091 non-null   object  
 4   CROSSSTREE  1091 non-null   object  
 5   OWNER2      1091 non-null   object  
 6   ROUTE       750 non-null    object  
 7   MILEPOINT   734 non-null    float64 
 8   DESCRIPTIO  1090 non-null   object  
 9   TIMES       1090 non-null   object  
 10  OWNER       1091 non-null   object  
 11  TYPE        1091 non-null   object  
 12  INTID       1091 non-null   object  
 13  ATMSID      1 non-null      float64 
 14  geometry    1091 non-null   geometry
dtypes: float64(2), geometry(1), int64(1), object(11)
memory usage: 128.0+ KB


|column name | dtype | type | description | value notes | cleaning notes |
|------------|-------|------|-------------|-------------|----------------|
|OBJECTID|int64|int| Unique id for each stoplight | index | keep |
|UNITID|object|alphanumeric| | not all unique: 1088 unique vs 1091 total records | |
|SIGID|object| | | not all unique: ~1070 | |
|MAINSTREET|object|string| Name of main street | ~366 unique | keep; very important |
|CROSSSTREE|object| string | Name of cross street | 412 unique: more than main street | keep; fix name |
|OWNER|object|string| | 'KYTC', 'METRO', 'JEFFERSONTOWN', 'MIDDLETOWN' | keep |
|OWNER2|object| string | | 'S', 'M', 'JT', 'MT' | duplicate? shortcode for OWNER? |
|ROUTE|object|alphanumeric| Alphanumeric designator for roadway like KY-123 US-123 | some null | keep |
|MILEPOINT|float64| float | Milepoint along roadway | some null | keep |
|DESCRIPTIO|object|string| Compact string description of intersection: MAINSTREET @ CROSSSTREE(T) | redundant? | probably discard but maybe useful if it's not clear what ROUTE means |
|TIMES|object|string| Appears to be a description of the operating hours for that light | Most are different versions of "24 hours", one is ACTUATED, some null | probably can ignore unless ACTUATED value is interesting|
|TYPE|object|int| No idea | 1, 2, 3, 4. Most values are 1 (1018), followed by 3 (44), 2 (27) and 4 (2). | figure out what this means. Is it tied to UNITID or SIGID? |
|INTID|object| | Intersection ID? | ~1048 unique: cross reference with DESCRIPTIO(N), SIGID, etc | probably a good index |
|ATMSID|float64||No idea|Most values are null/nan. Only one non null value. See below | probably drop. Maybe that record is interesting? times value is none|
|geometry|geometry|POINT(Longitude, Latitude)| Approximate longitude, latitude coordinates of the intersection ||keep|

RANDOM NOTE: Ever notice how similar are ACTUATED and ACTIVATED semantically and graphically? cf Roman V vs U. VVEIRD STUFF

In [None]:
# Compare the number of distinct UNITID values against the total row count.
UID = df["UNITID"]

(UID.nunique(), len(UID))

(1088, 1091)

In [None]:
# Pull out the SIGID column and list its distinct values.
SIGID = df["SIGID"]

SIGID.unique()



array(['SIG017206', 'SIG017211', 'SIG017290', ..., 'SIG000003',
       'SIG000004', 'SIG031242'], dtype=object)

In [None]:
# Count the distinct main-street names.
MS = df["MAINSTREET"]
MS.nunique()

366

In [None]:
# Count the distinct cross-street names.
XS = df["CROSSSTREE"]
XS.nunique()

412

In [None]:
# List the distinct values of the OWNER column.
df["OWNER"].unique()

array(['KYTC', 'METRO', 'JEFFERSONTOWN', 'MIDDLETOWN'], dtype=object)

In [None]:
# List the distinct values of the OWNER2 column.
df["OWNER2"].unique()

array(['S', 'M', 'JT', 'MT'], dtype=object)

In [None]:
# List the distinct ROUTE designators, including the missing-value marker.
df["ROUTE"].unique()

array(['KY-3084', 'US 60', 'US-60', 'KY-22', 'US-42', 'KY-146', 'KY-155',
       None, 'US-31E', 'KY-1747', 'KY-913', 'KY-1932', 'KY-2052',
       'KY-1819', 'I-65', 'US 31W', 'KY-1020', 'KY-864', 'KY-2054',
       'KY-1934', 'KY-1631', 'KY-1447', 'KY-1065', 'KY-2845', 'KY-61',
       'I-71', 'US-31W', 'US 150', 'KY-2049', 'KY 913', 'KY-1931',
       'KY-2050', 'KY-2048', 'KY 155', 'KY-1703', 'KY-2251', 'I-264',
       'I-64', 'KY1447', 'US-150', 'KY-1865', 'KY-1450', 'KY-907',
       'KY-3064', 'KY-3082', 'KY-1142', 'KY-1727', 'KY-3077', 'US-60A',
       'KY1865', 'I-265', 'US-31', 'KY-2051', 'KY 1934', 'KY-2860',
       'US 31E', 'US 42', 'KY-2055', 'KY 61', 'US 31'], dtype=object)

In [None]:
# Show the MILEPOINT column.
df["MILEPOINT"]

0        1.250
1       12.291
2       12.580
3        3.700
4        0.440
         ...  
1086     5.290
1087     1.480
1088     6.470
1089     3.450
1090       NaN
Name: MILEPOINT, Length: 1091, dtype: float64

In [None]:
# Show the DESCRIPTIO column.
df["DESCRIPTIO"]

0                OLD HENRY @ I 265 RAMP
1           SHELBYVILLE @ BECKLEY WOODS
2             SHELBYVILLE @ LAKE FOREST
3               BROWNSBORO @ I 265 RAMP
4                      HERR @ LIME KILN
                     ...               
1086           BILLTOWN RD @ I 265 RAMP
1087    OLD HENRY @ TERRA CROSSING BLVD
1088           WESTPORT RD @ I 265 RAMP
1089      BEULAH CHURCH RD @ I 265 RAMP
1090                               None
Name: DESCRIPTIO, Length: 1091, dtype: object

In [None]:
# Tally how often each TIMES value occurs.
df["TIMES"].value_counts()

TIMES
24 HOURS    1086
24 HOUR        2
ACTUATED       1
24HRS          1
Name: count, dtype: int64

In [None]:
# Tally how often each TYPE value occurs.
df["TYPE"].value_counts()

TYPE
1    1018
3      44
2      27
4       2
Name: count, dtype: int64

In [None]:
# Count the distinct INTID values.
df["INTID"].nunique()

1048

In [None]:
# Inspect ATMSID. Only the last expression of a cell is rendered automatically,
# so the value counts are shown explicitly with display(); otherwise the result
# would be computed and silently thrown away.
display(df["ATMSID"].value_counts())
# Only one value is not null -- show the record that carries it.
df[df["ATMSID"].notna()]
# TODO: find out what ATMSID means.

Unnamed: 0,OBJECTID,UNITID,SIGID,MAINSTREET,CROSSSTREE,OWNER2,ROUTE,MILEPOINT,DESCRIPTIO,TIMES,OWNER,TYPE,INTID,ATMSID,geometry
1090,1091,MPW031242,SIG031242,ZORN AVE,I 71 RAMP,S,,,,,KYTC,1,121707553999,3506.0,POINT (-85.69794 38.27573)


In [None]:
# Show the frame's geometry column.
df.geometry

0       POINT (-85.49934 38.27063)
1       POINT (-85.49848 38.24159)
2       POINT (-85.49355 38.24052)
3       POINT (-85.57021 38.31100)
4       POINT (-85.62797 38.28392)
                   ...            
1086    POINT (-85.54667 38.14777)
1087    POINT (-85.49572 38.27198)
1088    POINT (-85.55105 38.29685)
1089    POINT (-85.61452 38.12581)
1090    POINT (-85.69794 38.27573)
Name: geometry, Length: 1091, dtype: geometry

In [None]:
# Show the current working directory; the relative paths used in this notebook
# are resolved against it.
os.getcwd()

'/Users/bencampbell/code/no_traffic_lights/code'

In [None]:
# Now that the columns are understood, write the raw data out in a flat format
# that can be referred back to.
# NOTE(review): the geometry column is presumably serialised as its text
# representation in the CSV -- confirm this is what downstream readers expect.

OUTPREFIX = "../data/raw"
# Create the output directory when it is missing. Merely evaluating
# path.exists() and discarding the result guards nothing, and to_csv would
# fail on a non-existent directory.
os.makedirs(OUTPREFIX, exist_ok=True)
df.to_csv(path.join(OUTPREFIX, "semi-raw_data.csv"))