# Join pedestrian crash data

Joining all records from the MassDOT Impact Person-level tables that are recorded as non-motorists

In [199]:
# Libraries in active development
# MDIData is a helper library to ease handling the way data is split across years

# Environment
%cd /kt/data/massdot-impact
%load_ext autoreload
%autoreload
from massdot_impact import MDIData
mdi = MDIData('./raw')

/kt/data/massdot-impact
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [200]:
import pandas
import os
from dateutil.parser import parse
from numpy import nan
import geopandas
from shapely.geometry import Point

## Load all the sheets except 2010, filter to non-motorists with each

In [201]:
for year in [y for y in mdi.d.keys() if y != 2010]:
    mdi.load(year, print_msg=True)
    mdi.d[year]['df'] = mdi.df(year)[ mdi.df(year).PERS_TYPE == 'Non-motorist' ]

loaded the 2002 table into dictionary
loaded the 2003 table into dictionary
loaded the 2004 table into dictionary
loaded the 2005 table into dictionary
loaded the 2006 table into dictionary
loaded the 2007 table into dictionary
loaded the 2008 table into dictionary
loaded the 2009 table into dictionary
loaded the 2011 table into dictionary
loaded the 2012 table into dictionary
loaded the 2013 table into dictionary
loaded the 2014 table into dictionary
loaded the 2015 table into dictionary
loaded the 2016 table into dictionary
loaded the 2017 table into dictionary
loaded the 2018 table into dictionary
loaded the 2019 table into dictionary


### Load 2010 and add to data

2010 is missing the person-level fields (see `codebook-colsqc.ods`) and had to be downloaded manually (also see `NOTES.md`). The downloaded extract has already been filtered for non-motorist records. The column names are lower-case and must be capitalized.

In [202]:
patch = pandas.read_csv('raw-2010/export_5_12_2022_16_04_21.csv', low_memory=False)
patch.columns = [c.upper() for c in patch.columns]

In [203]:
mdi.d[2010] = {}
mdi.d[2010]['df'] = patch

## Unique values in `INJY_STAT_DESCR`

Of use if assigning variable levels.

In [153]:
injury_levels = []

for y in mdi.d.keys():
    injury_levels += list(set(mdi.df(y).INJY_STAT_DESCR.tolist()))
    
injury_levels = set(injury_levels)
for a in injury_levels:
    print(a)

Fatal injury (K)
Non-fatal injury - Incapacitating
Non-fatal injury - Non-incapacitating
No Apparent Injury (O)
Deceased not caused by crash
Suspected Serious Injury (A)
Possible Injury (C)
Unknown
Not reported
nan
Reported but invalid
Suspected Minor Injury (B)
Non-fatal injury - Possible
Not Applicable
No injury


## Cleanup

Fix 2010 datetime

In [204]:
# Fill NA in times
mdi.d[2010]['df'].CRASH_TIME_2 = mdi.d[2010]['df'].CRASH_TIME_2.fillna(value='00:00')

# Drop NA in dates

# Parse date and time into a datetime string
mdi.d[2010]['df']['CRASH_DATETIME'] = mdi.df(2010).apply(lambda r: parse(f"{r.CRASH_DATE} {r.CRASH_TIME_2}").strftime("%m/%d/%Y, %H:%M:%S"), 
                   axis=1)

Fix columns for 2018/2019 datetime values

In [205]:
# Would use a lambda, but it fails without the ability to handle NA
def convtime(val):
    try:
        return parse(val.CRASH_DATE).strftime("%m/%d/%Y, %H:%M:%S")
    except:
        return nan

# This may give the SettingWithCopyWarning but it works
for y in [2018,2019]:
    mdi.d[y]['df']['CRASH_DATETIME'] = nan
    mdi.d[y]['df']['CRASH_DATETIME'] = mdi.d[y]['df'].apply(
        convtime,
        axis=1)

Drop NA dates

In [206]:
for y in mdi.d.keys():
    mdi.d[y]['df'] = mdi.d[y]['df'].dropna(subset='CRASH_DATETIME')

### NA Coords

This will drop a sizeable amount of data from the earlier years. Drops about 4500 rows or 6% of the data.

Quick QC of NA counts:

In [192]:
na_coord_df = pandas.DataFrame(columns=[False,True])

for y in mdi.d:
    na_coord_df.loc[y] = mdi.df(y).X.isna().value_counts()
    
na_coord_df['pct'] = na_coord_df[True]/na_coord_df[False]*100

na_coord_df

Unnamed: 0,False,True,pct
2002,2753,662,24.046495
2003,2625,600,22.857143
2004,2955,468,15.837563
2005,2942,402,13.664174
2006,3186,340,10.671689
2007,3206,279,8.702433
2008,3796,206,5.426765
2009,3961,134,3.382984
2010,3951,187,4.732979
2011,4340,154,3.548387


If using this data for point-based geographic analysis the records with NA coordinates must be dropped. The first few years are not usable for such analysis.

In [198]:
# Run if doing point-based analysis

# for y in mdi.d.keys():
#     mdi.d[y]['df'] = mdi.d[y]['df'].dropna(subset=['X','Y'])

## Concatenate ('join')

In [207]:
mdi_concat = pandas.concat([mdi.d[y]['df'] for y in mdi.d]).drop(['OBJECTID','FID'], axis=1)

In [211]:
mdi_concat

Unnamed: 0,CRASH_NUMB,CITY_TOWN_NAME,CRASH_DATE_TEXT,CRASH_TIME,CRASH_DATETIME,CRASH_HOUR,CRASH_STATUS,CRASH_SEVERITY_DESCR,MAX_INJR_SVRTY_CL,NUMB_VEHC,...,Y.1,CRASH_DATE,CRASH_TIME_2,SPTROOP,FMSCA_RPTBL_CL,FMSCA_RPTBL,FMSCA_RPTBL_VL,TRVL_DIRC_DESCR,T_EXC_TYPE,T_EXC_TIME
48,1481125,BILLERICA,08/23/2002,1:40 PM,2002/08/23 13:40:00,01:00PM to 01:59PM,Closed,Non-fatal injury,Non-fatal injury - Non-incapacitating,1,...,,,,,,,,,,
103,1483656,BROCKTON,09/12/2002,4:34 PM,2002/09/12 16:34:00,04:00PM to 04:59PM,Closed,Non-fatal injury,Non-fatal injury - Non-incapacitating,1,...,,,,,,,,,,
132,1418321,WELLFLEET,02/27/2002,7:29 AM,2002/02/27 07:28:59,07:00AM to 07:59AM,Closed,Property damage only (none injured),No injury,1,...,,,,,,,,,,
271,1445995,SAUGUS,04/13/2002,4:18 PM,2002/04/13 16:18:00,04:00PM to 04:59PM,Closed,Non-fatal injury,Non-fatal injury - Incapacitating,1,...,,,,,,,,,,
323,1477274,WESTON,08/01/2002,12:26 PM,2002/08/01 12:26:00,12:00PM to 12:59PM,Closed,Non-fatal injury,Non-fatal injury - Incapacitating,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324934,4743992,ROCKPORT,07/26/2019,,"07/26/2019, 12:19:00",12:00PM to 12:59PM,Closed,Non-fatal injury,Non-fatal injury - Possible,1,...,,2019/07/26 12:19:00+00,12:19 PM,,,,,,,
324979,5019722,SHIRLEY,06/13/2019,,"06/13/2019, 07:44:00",07:00AM to 07:59AM,Closed,Non-fatal injury,Non-fatal injury - Incapacitating,1,...,,2019/06/13 07:44:00+00,7:44 AM,,,,,,,
324993,5019588,SHIRLEY,02/21/2019,,"02/21/2019, 12:08:00",12:00PM to 12:59PM,Closed,Property damage only (none injured),No injury,1,...,,2019/02/21 12:08:00+00,12:08 PM,,,,,,,
325072,4789834,HANSON,12/10/2019,,"12/10/2019, 12:49:59",12:00PM to 12:59PM,Closed,Non-fatal injury,Possible Injury (C),1,...,,2019/12/10 12:49:59+00,12:50 PM,,,,,,,


Give each one a unique index (this could happen earlier if wanted)

In [220]:
mdi_concat['PERS_ID'] = mdi_concat.apply(lambda r: f"{r.CRASH_NUMB}.{r.PERS_NUMB}", axis=1)
mdi_concat = mdi_concat.set_index('PERS_ID')

Export

In [222]:
mdi_concat.to_csv('proc/mdi-ped.csv', index=True)