In [1]:
#Use a for loop to consolidate the ARIES datasets from 2007-Present.

library(readr)
library(dplyr)

tbl <- tibble()

for (year in 2007:2019) {
    dataset <- paste0("aries_data/aries_crash_data_",year,".csv")
    locTbl <- read_csv(dataset,
                       #col_types argument is manually setting the types for each variable. This is because the sheer volume of missing data made it hard for the parser infer types itself.
                       col_types="dddcddcccdcdcdccdcdcdcdcccccddcdccccdtcdddddccccdcddccccccdcdccdcdcdcccccccccccdddccccdcccccdcdcccccdcdccdc") %>% 
                #Variables of interest to this project included identifiers, injury characterization, geographic position, time, and characterization of the physical environment
                #See ARIES data dictionary for all the variables. https://hub.mph.in.gov/dataset/aries-crash-data-2007-2017/resource/f61a5dcb-5ca3-485a-9ecf-cd3d8740dc9b?inner_span=True
                select(INDIVIDUAL_MR_RECORD,
                            LATDECIMALNMB,
                            LONGDECIMALNMB,
                            COUNTYDESCR,
                            UNIQUELOCATIONID,
                            COLLISION_YEAR,
                            COLLDTE,
                            SPEEDLIMITTXT,
                            TRAFFICCNTRLDESCR,
                            TRAFFICCNTLOPIND,
                            RUMBLESTRIPIND,
                            SURFACETYPEDESCR,
                            ROADTYPEDESCR,
                            SCHOOLZONEIND,
                            MANNERCOLLDESCR,
                            COLLEVENTDESCR,
                            INJUREDNMB,
                            DEADNMB, 
                            UNITNMB,
                                OCCUPSNMB,
                                PERSONNMB,             
                                PERSONTYPEDESCR)
    tbl <- bind_rows(tbl,locTbl)
}

str(tbl)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

“7712 parsing failures.
  row              col expected actual                                   file
41379 SURFACETYPECDE   a double      + 'aries_data/aries_crash_data_2007.csv'
62021 WEATHERCDE       a double      + 'aries_data/aries_crash_data_2007.csv'
62021 PRIMARYFACTORCDE a double      + 'aries_data/aries_crash_data_2007.csv'
62021 UNITTYPECDE      a double      + 'aries_data/aries_crash_data_2007.csv'
71273 UNITTYPECDE      a double      + 'aries_data/aries_crash_data_2007.csv'
..... ................ ........ ...... ......................................
See problems(...) for more details.
“472 parsing failures.
  row              col           expected      actual                                   file
21431 VEHMODELTXT      delimiter or quote I           'aries_data/aries_crash_da

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	7244543 obs. of  22 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  0 40.7 39.2 40.2 41.6 ...
 $ LONGDECIMALNMB      : num  0 -85.4 -85.9 -85.4 -86.2 ...
 $ COUNTYDESCR         : chr  "Bartholomew" "Huntington" "Bartholomew" "Delaware" ...
 $ UNIQUELOCATIONID    : chr  "COLUMBUSSHOPPINGCENTER" "HUNTINGTONAVE" "E25THST" "KILGOREAVE" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.06.22" "2007.11.13" "2007.06.09" "2007.02.27" ...
 $ SPEEDLIMITTXT       : chr  "15" "0" "15" "20" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND    

In [2]:
tbl <- tbl %>%
    #Give each observation its own unique id by combining crash ID, vehicle ID, and person ID
    mutate(index = as.numeric(paste0(INDIVIDUAL_MR_RECORD,UNITNMB,PERSONNMB))) %>%
    #Delete duplicate rows
    distinct()

“NAs introduced by coercion”

In [3]:
library(feather)

#Fast, language-agnostic format for tabular data storage on disk
write_feather(tbl, "pastedARIES.feather")

In [4]:
#This cell debugged the manual typing of variables

# probs <- tibble()

#ariesVars <- c("PERSONTYPEDESCR","GENDERCDE","AGE_GRP","INJSTATUSDESCR","INJNATUREDESCR","TESTGIVENDESCR","RESULTALCHTXT","RESULTDRUGIND", "COUNTYDESCR","COLLDTE","MOTORVEHINVOLVEDNMB","INJUREDNMB","DEADNMB","RDWYSUFFIXTXT","LATDECIMALNMB","LONGDECIMALNMB", "TRAFFICCNTLOPIND","AGGRESSIVEDRIVEIND","HITRUNIND","SCHOOLZONEIND","RUMBLESTRIPIND","CONSTRUCTIND","LIGHTCONDDESCR", "WEATHERDESCR","SURFACETYPECDE_CONDDESCR","TYPEDESCR","PRIMARYFACTORDESCR","MANNERCOLLDESCR","TRAFFICCNTRLDESCR", "UNITTYPEDESCR","OCCUPSNMB","SPEEDLIMITTXT","VEHUSEDESCR","ROADTYPEDESCR","TRAVDIRDESCR","EMGERENCY_RUN","PRECOLLACTDESCR")

# for (year in 2007:2019) {
#     dataset <- paste0("aries_crash_data_",year,".csv")
#     tbl <- read_csv(dataset,
#                        col_types="dddcddcccdcdcdccdcdcdcdcccccddcdccccdtcdddddccccdcddccccccdcdccdcdcdcccccccccccdddccccdcccccdcdcccccdcdccdc") %>% 
#                 select(7:9,18,20,23,25,26,31,34,40,42,43,45,51,52,53,54:58,60,62,63,65,67,69,79,83,87,91,94,96,98,99,104) %>%
#                 filter(PERSONTYPEDESCR %in% c("Pedal Cyclist","Pedestrian"))
#     locprobs <- problems(tbl) %>% 
#                 group_by(col,expected,actual) %>% 
#                 summarize(count = n()) %>%
#                 filter(col %in% ariesVars)
#     probs <- bind_rows(probs,locprobs)
# }

# probs

In [5]:
#This cell generated missing value percentages
# library(purrr)

# tbl %>% map(~ (mean(is.na(.))*100))

In [6]:
#This cell dropped variables that had missing values that were above 75% (Injury status and nature)

# tbl2 <- tbl %>% select(which(colMeans(is.na(.))*100 < 75))

# tbl2 %>% map(~ (mean(is.na(.))*100))

In [7]:
#Show how to identify collisions involving active transportation

print(tbl %>% group_by(PERSONTYPEDESCR) %>% summarize(count = n()))
tbl %>% group_by(COLLEVENTDESCR) %>% summarize(count = n())

[38;5;246m# A tibble: 9 x 2[39m
  PERSONTYPEDESCR                 count
  [3m[38;5;246m<chr>[39m[23m                           [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m Animal Drawn Vehicle Operator     359
[38;5;250m2[39m Driver                        4[4m0[24m[4m4[24m[4m0[24m518
[38;5;250m3[39m Injured                        [4m1[24m[4m6[24m[4m5[24m773
[38;5;250m4[39m Other                             143
[38;5;250m5[39m Owner Trailer                   [4m9[24m[4m3[24m101
[38;5;250m6[39m Owner Vehicle                 2[4m7[24m[4m4[24m[4m4[24m219
[38;5;250m7[39m Pedal Cyclist                   [4m1[24m[4m2[24m533
[38;5;250m8[39m Pedestrian                      [4m2[24m[4m2[24m819
[38;5;250m9[39m [31mNA[39m                             [4m1[24m[4m5[24m[4m0[24m315


COLLEVENTDESCR,count
<chr>,<int>
05,1
06,1
11,1
12,3
Animal Drawn Vehicle,13038
Animal Other Than Deer,27631
Another Motor Vehicle,5649200
Bicycle,19109
Bridge Overhead Structure,4502
Bridge Parapet End,731


In [8]:
library(tibble)
library(dplyr)

#Function to consolidate variables based on mode of active travel

assignMode = function(x, y) {
    if (any(x %in% "Pedestrian", y %in% "Pedestrian")) {
        return("Pedestrian")
    }
    else if (any(x %in% "Pedal Cyclist", y %in% "Bicycle")) {
        return("Bicyclist")
    }
    else {
        return("Other")
    }
}

# toy <- tribble(
#     ~PERSONTYPEDESCR, ~COLLEVENTDESCR, ~UNIMPORTANT,
#     "Pedestrian", "Pedestrian",34,
#     "fsdsdf", "Pedestrian",34,
#     "dsfsd","Bicycle",54,
#     "Pedal Cyclist", "fsdfd",76,
#     "dfsdfd", "sdfsdfds",78,
#     NA,"sdasda",87,
#     NA, "Bicycle",98,
#     "fdsfsd", NA, 03,
#     "Pedestrian",NA,98
    
# )

tbl_with_mode <- tbl %>% rowwise() %>% mutate(activityMode = assignMode(PERSONTYPEDESCR,COLLEVENTDESCR))

In [9]:
#Make dataset of collisions that all involve active transportation

activeTbl <- tbl_with_mode %>% filter(activityMode %in% c("Bicyclist","Pedestrian"))

str(activeTbl)

Classes ‘rowwise_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	85008 obs. of  24 variables:
 $ INDIVIDUAL_MR_RECORD: num  1267560 1267560 1508589 3131760 1572041 ...
 $ LATDECIMALNMB       : num  41.7 41.7 38.3 41 0 ...
 $ LONGDECIMALNMB      : num  -86 -86 -85.8 -85.1 0 ...
 $ COUNTYDESCR         : chr  "Elkhart" "Elkhart" "Floyd" "Allen" ...
 $ UNIQUELOCATIONID    : chr  "LUSHERPRAIRIE" "LUSHERPRAIRIE" "SPRINGSTSTATEST" "CALHOUNSTPAULDINGRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.02.22" "2007.02.22" "2007.05.11" "2007.06.13" ...
 $ SPEEDLIMITTXT       : chr  "30" "30" "25" "35" ...
 $ TRAFFICCNTRLDESCR   : chr  "Traffic Control Signal" "Traffic Control Signal" "Traffic Control Signal" "Traffic Control Signal" ...
 $ TRAFFICCNTLOPIND    : chr  "Y" "Y" "Y" "Y" ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Two Lanes (Two Wa

In [10]:
#Explore roles of people in the active transportation data sets

activeTbl %>% group_by(PERSONTYPEDESCR) %>% summarize(count = n())

“Grouping rowwise data frame strips rowwise nature”

PERSONTYPEDESCR,count
<chr>,<int>
Driver,28771
Injured,441
Other,1
Owner Trailer,264
Owner Vehicle,18268
Pedal Cyclist,12533
Pedestrian,22819
,1911


In [11]:
#Explore how many unique collisions there are in the active transportation data set

str(activeTbl %>% group_by(INDIVIDUAL_MR_RECORD) %>% summarize(count = n()))

“Grouping rowwise data frame strips rowwise nature”

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	36473 obs. of  2 variables:
 $ INDIVIDUAL_MR_RECORD: num  310623 364290 411775 697708 1020241 ...
 $ count               : int  1 2 2 1 2 1 2 2 2 2 ...


In [12]:
#Make dataset of collisions that have no inclusion of active transportation

inactiveTbl <- tbl_with_mode %>% filter(!(activityMode %in% c("Bicyclist","Pedestrian")))

str(inactiveTbl)

Classes ‘rowwise_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	7144772 obs. of  24 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  0 40.7 39.2 40.2 41.6 ...
 $ LONGDECIMALNMB      : num  0 -85.4 -85.9 -85.4 -86.2 ...
 $ COUNTYDESCR         : chr  "Bartholomew" "Huntington" "Bartholomew" "Delaware" ...
 $ UNIQUELOCATIONID    : chr  "COLUMBUSSHOPPINGCENTER" "HUNTINGTONAVE" "E25THST" "KILGOREAVE" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.06.22" "2007.11.13" "2007.06.09" "2007.02.27" ...
 $ SPEEDLIMITTXT       : chr  "15" "0" "15" "20" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCH

In [13]:
#Write datasets to disk

write_feather(activeTbl,"activeARIES.feather")
write_feather(inactiveTbl,"inactiveARIES.feather")