In [51]:
import pandas as pd
import pyparsing as pyp
import os

import string 

os.chdir("/Users/bencampbell/code_louisville/capstone/louisville-bike-accidents")


DATA = "data/raw/cycling_safety_louisville.csv"
df = pd.read_csv(DATA)

In [130]:
# Dropping columns I won't use.
df_dropped = df.drop(['Unnamed: 0', 'COUNTY NAME', 'GPS LATITUDE DECIMAL', 'GPS LONGITUDE DECIMAL',
         "COLLISION TIME", 'geometry', 'index_right'], axis=1)
#df_dropped

In [28]:
columns = df_dropped.columns
columns

Index(['MASTER FILE NUMBER', 'INVESTIGATING AGENCY', 'LOCAL CODE',
       'COLLISION STATUS CODE', 'ROADWAY NUMBER', 'BLOCK/HOUSE #',
       'ROADWAY NAME', 'ROADWAY SUFFIX', 'ROADWAY DIRECTION CODE',
       'MILEPOINT DERIVED', 'COLLISION DATE', 'COLLISION TIME',
       'INTERSECTION ROADWAY #', 'INTERSECTION ROADWAY NAME',
       'INTERSECTION ROADWAY SFX', 'BETWEEN STREET ROADWAY # 1',
       'BETWEEN STREET ROADWAY NAME 1', 'BETWEEN STREET ROADWAY SFX 1',
       'BETWEEN STREET ROADWAY # 2', 'BETWEEN STREET ROADWAY NAME 2',
       'BETWEEN STREET ROADWAY SFX 2', 'UNITS INVOLVED',
       'MOTOR VEHICLES INVOLVED', 'KILLED', 'INJURED', 'WEATHER CODE',
       'WEATHER', 'ROADWAY CONDITION CODE', 'ROADWAY CONDITION',
       'HIT & RUN INDICATOR', 'ROADWAY TYPE CODE', 'ROADWAY TYPE',
       'DIRECTIONAL ANALYSIS CODE', 'DIRECTIONAL ANALYSIS',
       'MANNER OF COLLISION CODE', 'MANNER OF COLLISION',
       'ROADWAY CHARACTER CODE', 'ROADWAY CHARACTER', 'LIGHT CONDITION CODE',
       'LI

In [63]:
# Renaming some of the columns:

easy_column_renames = ['MASTER FILE NUMBER', 'INVESTIGATING AGENCY', 'LOCAL CODE', 'COLLISION STATUS CODE',
     'ROADWAY NUMBER', 'ROADWAY NAME', 'ROADWAY SUFFIX', 'INTERSECTION ROADWAY NAME',
    'UNITS INVOLVED', 'MOTOR VEHICLES INVOLVED', 'KILLED', 'INJURED', 'WEATHER CODE',
    'WEATHER', 'ROADWAY CONDITION CODE', 'ROADWAY CONDITION', 'ROADWAY TYPE CODE', 'ROADWAY TYPE',
    'DIRECTIONAL ANALYSIS CODE', 'DIRECTIONAL ANALYSIS',
    'MANNER OF COLLISION CODE', 'MANNER OF COLLISION',
    'ROADWAY CHARACTER CODE', 'ROADWAY CHARACTER', 'LIGHT CONDITION CODE',
    'LIGHT CONDITION', 'RAMP FROM ROADWAY ID', 'RAMP TO ROADWAY ID', "Latitude", "Longitude",
    ]

def easy_rename(name:str) -> str:
    return name.replace(" ", "_").lower()

less_easy_column_renames = [
     'INTERSECTION ROADWAY #', 'INTERSECTION ROADWAY SFX', 'BETWEEN STREET ROADWAY # 1',
    'BETWEEN STREET ROADWAY NAME 1', 'BETWEEN STREET ROADWAY SFX 1',
    'BETWEEN STREET ROADWAY # 2', 'BETWEEN STREET ROADWAY NAME 2',
    'BETWEEN STREET ROADWAY SFX 2',]

def less_easy_rename(name:str) -> str:
    name = name.replace("STREET ROADWAY", "street")
    name = name.replace("#", 'number').replace("SFX", 'suffix')
    return easy_rename(name)

misc_column_renames = {
    'BLOCK/HOUSE #': "building_number",
    'ROADWAY DIRECTION CODE': 'roadway_direction',
    'MILEPOINT DERIVED': 'milepoint', 
    'HIT & RUN INDICATOR': "hit_and_run",
    'SECONDARY COLLISION INDICATOR': "secondary_collision"}

renames = {name:easy_rename(name) for name in easy_column_renames}
renames.update({name:less_easy_rename(name) for name in less_easy_column_renames})
renames.update(misc_column_renames)



In [64]:
df_renamed = df_dropped.rename(renames, axis=1)
df_renamed.columns

Index(['master_file_number', 'investigating_agency', 'local_code',
       'collision_status_code', 'roadway_number', 'building_number',
       'roadway_name', 'roadway_suffix', 'roadway_direction', 'milepoint',
       'COLLISION DATE', 'COLLISION TIME', 'intersection_roadway_number',
       'intersection_roadway_name', 'intersection_roadway_suffix',
       'between_street_number_1', 'between_street_name_1',
       'between_street_suffix_1', 'between_street_number_2',
       'between_street_name_2', 'between_street_suffix_2', 'units_involved',
       'motor_vehicles_involved', 'killed', 'injured', 'weather_code',
       'weather', 'roadway_condition_code', 'roadway_condition', 'hit_and_run',
       'roadway_type_code', 'roadway_type', 'directional_analysis_code',
       'directional_analysis', 'manner_of_collision_code',
       'manner_of_collision', 'roadway_character_code', 'roadway_character',
       'light_condition_code', 'light_condition', 'ramp_from_roadway_id',
       'ramp_to_r

In [166]:
# Cleaning time/date columns
timecols = ['COLLISION DATE', 'COLLISION TIME', 'hour', 'minute', 'Date']
timedata  = df[timecols]
integer = pyp.Word(pyp.nums).set_name("integer")
timedata


Unnamed: 0,COLLISION DATE,COLLISION TIME,hour,minute,Date
0,2/20/2010,1620,16,20,2010-02-20 16:20:00
1,1/13/2010,1340,13,40,2010-01-13 13:40:00
2,1/13/2010,100008,10,0,2010-01-13 10:00:00
3,1/15/2010,1550,15,50,2010-01-15 15:50:00
4,2/2/2010,611,6,11,2010-02-02 06:11:00
...,...,...,...,...,...
1268,12/5/2017,707,7,7,2017-12-05 07:07:00
1269,12/14/2017,1709,17,9,2017-12-14 17:09:00
1270,12/19/2017,100002,10,0,2017-12-19 10:00:00
1271,12/21/2017,1956,19,56,2017-12-21 19:56:00


In [102]:
dates = timedata['COLLISION DATE']
MMDDYYY_slash = integer("month") + "/" + integer("day") + "/" + integer("year")
test = dates.iloc[101] # Change number in [ ] to test random rows.
print(test)
MMDDYYY_slash.parse_string(test).as_dict()

9/1/2010


{'month': '9', 'day': '1', 'year': '2010'}

In [172]:
times = timedata['Date']
test = times.iloc[230] # Testing random rows.
datetime = (integer("year") + '-' + integer("month") + '-' + integer("day") +
            integer("hour") + ":" + integer("minute") +  ":" + integer("second").suppress())
print(test)
datetime.parse_string(test).asDict()


2011-06-23 15:30:00


{'year': '2011', 'month': '06', 'day': '23', 'hour': '15', 'minute': '30'}

In [170]:
date_expr = (integer("year") + '-' + integer("month") + '-' + integer("day") +
            integer("hour") + ":" + integer("minute") + ":" + integer("second"))
# date_expr is a parsing expression composed of a date string and a time string.
# the date part is a sequence of integers delimited by '-'
# the time part is a sequence of integers delimited by ':'

def _parse_Date(date:str) -> dict:
    parsed = date_expr.parse_string(date).as_dict()
    parsed = {key:int(value) for key, value in parsed.items()}
    return parsed

def clean_date_columns(df:pd.DataFrame) -> pd.DataFrame:
    df = df.drop(['hour', 'minute', 'COLLISION DATE', 'COLLISION TIME'], axis=1)
    parsed_df = pd.DataFrame(df['Date'].apply(_parse_Date).to_list())
    out = pd.concat([df, parsed_df], axis=1)
    return out.drop('Date', axis=1)

cdf = clean_date_columns(df)
cdf[['hour', 'minute', 'year', 'month', 'day']]

Unnamed: 0,hour,minute,year,month,day
0,16,20,2010,2,20
1,13,40,2010,1,13
2,10,0,2010,1,13
3,15,50,2010,1,15
4,6,11,2010,2,2
...,...,...,...,...,...
1268,7,7,2017,12,5
1269,17,9,2017,12,14
1270,10,0,2017,12,19
1271,19,56,2017,12,21


In [160]:
o = timedata['Date'].apply(lambda x:datetime.parse_string(x).as_dict()).to_list()
t = pd.DataFrame(o)
t
t.drop("second", axis=1)

u = pd.concat([df, t], axis=1)
u[['Date', 'year', 'month', 'day', 'hour', 'minute']]

Unnamed: 0,Date,year,month,day,hour,hour.1,minute,minute.1
0,2010-02-20 16:20:00,2010,02,20,16,16,20,20
1,2010-01-13 13:40:00,2010,01,13,13,13,40,40
2,2010-01-13 10:00:00,2010,01,13,10,10,0,00
3,2010-01-15 15:50:00,2010,01,15,15,15,50,50
4,2010-02-02 06:11:00,2010,02,02,6,06,11,11
...,...,...,...,...,...,...,...,...
1268,2017-12-05 07:07:00,2017,12,05,7,07,7,07
1269,2017-12-14 17:09:00,2017,12,14,17,17,9,09
1270,2017-12-19 10:00:00,2017,12,19,10,10,0,00
1271,2017-12-21 19:56:00,2017,12,21,19,19,56,56


In [129]:
def dict_values_str_to_int(dictionary:dict) -> None:
    """Convert string values to int values in a dictionary *in place*"""
    items = dictionary.items()
    for key, value in items:
        dictionary[key] = int(value)

def validate_time_data(location:int):
    data = df.iloc[location][timecols]
    collision_date_parsed = MMDDYYY_slash.parse_string(data['COLLISION DATE']).as_dict()
    dict_values_str_to_int(collision_date_parsed)
    date_parsed = datetime.parse_string(data['Date']).as_dict()
    dict_values_str_to_int(date_parsed)
    df_hour = int(data['hour'])
    df_minute = int(data['minute'])
    assert date_parsed['minute'] == df_minute
    assert date_parsed['hour'] == df_hour
    assert date_parsed['year'] == collision_date_parsed['year']
    assert date_parsed['month'] == collision_date_parsed['month']
    assert date_parsed['day'] == collision_date_parsed['day']
    return True

validate_time_data(101)

all(validate_time_data(n) for n in range(len(df)))


True

In [135]:
t = df.iloc[10]['test'] = 100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t = df.iloc[10]['test'] = 100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t = df.iloc[10]['test'] = 100
