In [7]:
import pandas as pd
import pyparsing
import os
import logging as log

import string 

# Work from the project root so the relative data paths below resolve.
# Set the LOUISVILLE_BIKE_ROOT environment variable to point at your own
# checkout; when it is unset the original author's location is used, so
# existing runs behave exactly as before.
PROJECT_ROOT = os.environ.get(
    "LOUISVILLE_BIKE_ROOT",
    "/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents",
)
os.chdir(PROJECT_ROOT)


# Raw input CSV and the destination for the cleaned CSV (relative to PROJECT_ROOT).
RAW = "data/raw/cycling_safety_louisville.csv"
CLEAN = "data/clean/cycling_safety_louisville_clean.csv"

# Load the raw data and display it as the cell's output.
df = pd.read_csv(RAW)
df

Unnamed: 0.1,Unnamed: 0,MASTER FILE NUMBER,INVESTIGATING AGENCY,LOCAL CODE,COLLISION STATUS CODE,COUNTY NAME,ROADWAY NUMBER,BLOCK/HOUSE #,ROADWAY NAME,ROADWAY SUFFIX,...,RAMP FROM ROADWAY ID,RAMP TO ROADWAY ID,SECONDARY COLLISION INDICATOR,hour,minute,Date,Latitude,Longitude,geometry,index_right
0,306,1344607,LOUISVILLE METRO POLICE DEPT,8010012297,AC,56,US0031,,BARDSTOWN,RD,...,,,,16,20,2010-02-20 16:20:00,38.231850,-85.707933,POINT (-85.707933333 38.23185),0
1,1506,70803559,LOUISVILLE METRO POLICE DEPT,8010002922,AC,56,,,ZORN,AVE,...,,,N,13,40,2010-01-13 13:40:00,38.273995,-85.696572,POINT (-85.6965716 38.2739947),0
2,1541,70803445,LOUISVILLE METRO POLICE DEPT,8010003041,AC,56,US0042,,BROWNSBORO,RD,...,,,N,10,0,2010-01-13 10:00:00,38.258551,-85.703576,POINT (-85.70357610000001 38.2585512),0
3,1690,70805078,LOUISVILLE METRO POLICE DEPT,8010003577,AC,56,,,PETERSON,AVE,...,,,N,15,50,2010-01-15 15:50:00,38.250012,-85.697265,POINT (-85.6972652 38.2500121),0
4,2876,70811322,LOUISVILLE METRO POLICE DEPT,8010007938,AC,56,,,GEORGETOWN,PL,...,,,N,6,11,2010-02-02 06:11:00,38.195890,-85.793380,POINT (-85.7933803 38.1958905),0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,253098,72129917,LOUISVILLE METRO POLICE DEPT,8017098682,AC,56,,6600,GRADE,LN,...,,,N,7,7,2017-12-05 07:07:00,38.153815,-85.733644,POINT (-85.73364410000001 38.1538153),0
1269,253931,72127942,LOUISVILLE METRO POLICE DEPT,8017101406,AC,56,,,JEANINE,DR,...,,,N,17,9,2017-12-14 17:09:00,38.163618,-85.688008,POINT (-85.6880079 38.1636178),0
1270,254491,72130250,LOUISVILLE METRO POLICE DEPT,8017102897,AC,56,KY0864,,POPLAR LEVEL,RD,...,,,N,10,0,2017-12-19 10:00:00,38.160030,-85.671480,POINT (-85.6714798 38.1600301),0
1271,254702,72131366,LOUISVILLE METRO POLICE DEPT,8017103524,AC,56,,,WOODGATE,LN,...,,,N,19,56,2017-12-21 19:56:00,38.198257,-85.626309,POINT (-85.62630919999999 38.1982569),0


In [8]:
# Parsing expressions for the 'Date' column.

# An integer is a run of one or more characters drawn from '0123456789'.
integer = pyparsing.Word(pyparsing.nums).set_name("integer")

# A 'Date' value is a date string followed by a time string:
#   * the date part is three integers delimited by '-'
#   * the time part is three integers delimited by ':'
_date_part = integer("year") + '-' + integer("month") + '-' + integer("day")

# The seconds field still has to be present in the input, but wrapping it in
# Suppress drops it from the parse results. Second-level precision isn't needed
# for this analysis, and it is doubtful that it is reliable anyway.
_time_part = integer("hour") + ":" + integer("minute") + ":" + pyparsing.Suppress(integer("second"))

date_expr = _date_part + _time_part

def _parse_Date(date:str) -> dict:
    """Break a 'Date' string into its named components as integers.

    The string is parsed with ``date_expr``; every named result of that
    expression (year, month, day, hour, minute -- seconds are suppressed
    there) becomes a key of the returned dict, with its digit string
    converted to ``int``.

    Parameters
    ----------
    date : str
        Text to parse, e.g. ``"2010-09-01 10:44:00"``.

    Returns
    -------
    dict
        Mapping of component name to integer value.
    """
    tokens = date_expr.parse_string(date)
    components = {}
    for field, digits in tokens.as_dict().items():
        components[field] = int(digits)
    return components


In [9]:
def parse(x: str) -> pd.Timestamp:
    """Convert a raw 'Date' string into a ``pd.Timestamp``.

    The string is split into integer components by ``_parse_Date`` and those
    components are passed to ``pd.Timestamp`` as keyword arguments.

    Parameters
    ----------
    x : str
        Raw date/time text, e.g. ``"2010-09-01 10:44:00"``.

    Returns
    -------
    pd.Timestamp
        Timestamp built from the parsed components.
    """
    return pd.Timestamp(**_parse_Date(x))

In [11]:
# Spot-check the parser on one raw value: print the original text, then let
# the cell display the parsed result so the two can be compared.
test = df['Date'].iloc[101]
print(test)
parse(test)

2010-09-01 10:44:00


Timestamp('2010-09-01 10:44:00')

In [13]:

# `parse` expects raw strings, so only convert while the column has not yet
# been turned into datetimes. The guard makes this cell safe to re-run.
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = df['Date'].apply(parse)

In [15]:
# Display the 'Date' column; the dtype shown at the bottom of the repr
# confirms what the column currently holds.
df['Date']

0      2010-02-20 16:20:00
1      2010-01-13 13:40:00
2      2010-01-13 10:00:00
3      2010-01-15 15:50:00
4      2010-02-02 06:11:00
               ...        
1268   2017-12-05 07:07:00
1269   2017-12-14 17:09:00
1270   2017-12-19 10:00:00
1271   2017-12-21 19:56:00
1272   2017-12-22 21:51:00
Name: Date, Length: 1273, dtype: datetime64[ns]

In [17]:
# Columns whose new name is simply the old one lower-cased with underscores.
names = [
    # report identifiers
    'MASTER FILE NUMBER',
    'INVESTIGATING AGENCY',
    'LOCAL CODE',
    'COLLISION STATUS CODE',
    # location
    'ROADWAY NUMBER',
    'ROADWAY NAME',
    'ROADWAY SUFFIX',
    'INTERSECTION ROADWAY NAME',
    # counts
    'UNITS INVOLVED',
    'MOTOR VEHICLES INVOLVED',
    'KILLED',
    'INJURED',
    # code / description pairs
    'WEATHER CODE',
    'WEATHER',
    'ROADWAY CONDITION CODE',
    'ROADWAY CONDITION',
    'ROADWAY TYPE CODE',
    'ROADWAY TYPE',
    'DIRECTIONAL ANALYSIS CODE',
    'DIRECTIONAL ANALYSIS',
    'MANNER OF COLLISION CODE',
    'MANNER OF COLLISION',
    'ROADWAY CHARACTER CODE',
    'ROADWAY CHARACTER',
    'LIGHT CONDITION CODE',
    'LIGHT CONDITION',
    # ramps
    'RAMP FROM ROADWAY ID',
    'RAMP TO ROADWAY ID',
    # derived columns
    'Latitude',
    'Longitude',
    'Date',
]

# Preview the old -> new mapping (displayed as the cell's output).
dict(zip(names, (old.lower().replace(" ", "_") for old in names)))



{'MASTER FILE NUMBER': 'master_file_number',
 'INVESTIGATING AGENCY': 'investigating_agency',
 'LOCAL CODE': 'local_code',
 'COLLISION STATUS CODE': 'collision_status_code',
 'ROADWAY NUMBER': 'roadway_number',
 'ROADWAY NAME': 'roadway_name',
 'ROADWAY SUFFIX': 'roadway_suffix',
 'INTERSECTION ROADWAY NAME': 'intersection_roadway_name',
 'UNITS INVOLVED': 'units_involved',
 'MOTOR VEHICLES INVOLVED': 'motor_vehicles_involved',
 'KILLED': 'killed',
 'INJURED': 'injured',
 'WEATHER CODE': 'weather_code',
 'WEATHER': 'weather',
 'ROADWAY CONDITION CODE': 'roadway_condition_code',
 'ROADWAY CONDITION': 'roadway_condition',
 'ROADWAY TYPE CODE': 'roadway_type_code',
 'ROADWAY TYPE': 'roadway_type',
 'DIRECTIONAL ANALYSIS CODE': 'directional_analysis_code',
 'DIRECTIONAL ANALYSIS': 'directional_analysis',
 'MANNER OF COLLISION CODE': 'manner_of_collision_code',
 'MANNER OF COLLISION': 'manner_of_collision',
 'ROADWAY CHARACTER CODE': 'roadway_character_code',
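 'ROADWAY CHARACTER': 'roadway_character',
 'LIGHT CONDITION CODE': 'light_condition_code',
 'LIGHT CONDITION': 'light_condition',
 'RAMP FROM ROADWAY ID': 'ramp_from_roadway_id',
 'RAMP TO ROADWAY ID': 'ramp_to_roadway_id',
 'Latitude': 'latitude',
 'Longitude': 'longitude',
 'Date': 'date'}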

In [18]:
# Columns whose names contain abbreviations ('#', 'SFX') or a redundant
# 'STREET ROADWAY' phrase, so they need more than a plain lower/underscore rename.
codes = [
    'INTERSECTION ROADWAY #',
    'INTERSECTION ROADWAY SFX',
    'BETWEEN STREET ROADWAY # 1',
    'BETWEEN STREET ROADWAY NAME 1',
    'BETWEEN STREET ROADWAY SFX 1',
    'BETWEEN STREET ROADWAY # 2',
    'BETWEEN STREET ROADWAY NAME 2',
    'BETWEEN STREET ROADWAY SFX 2',
]

def _easy_rename(name:str) -> str:
    return name.replace(" ", "_").lower()

def _less_easy_rename(name:str) -> str:
    """Expand abbreviations in ``name``, then apply ``_easy_rename``.

    Substitutions, applied in this order:
      * "STREET ROADWAY" -> "street"
      * "#"              -> "number"
      * "SFX"            -> "suffix"

    The result is passed to ``_easy_rename`` and returned.
    """
    # The patterns are upper-case, so they must be applied before the
    # lower-casing done inside _easy_rename.
    substitutions = (
        ("STREET ROADWAY", "street"),
        ("#", 'number'),
        ("SFX", 'suffix'),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    return _easy_rename(name)

# Preview the old -> new mapping for the abbreviated column names.
dict(zip(codes, map(_less_easy_rename, codes)))

{'INTERSECTION ROADWAY #': 'intersection_roadway_number',
 'INTERSECTION ROADWAY SFX': 'intersection_roadway_suffix',
 'BETWEEN STREET ROADWAY # 1': 'between_street_number_1',
 'BETWEEN STREET ROADWAY NAME 1': 'between_street_name_1',
 'BETWEEN STREET ROADWAY SFX 1': 'between_street_suffix_1',
 'BETWEEN STREET ROADWAY # 2': 'between_street_number_2',
 'BETWEEN STREET ROADWAY NAME 2': 'between_street_name_2',
 'BETWEEN STREET ROADWAY SFX 2': 'between_street_suffix_2'}