# Consolidating the datasets:

In [1]:
#Use a for loop to consolidate the ARIES datasets from 2007-Present.

library(readr)
library(dplyr)

tbl <- tibble()

for (year in 2007:2019) {
    dataset <- paste0("aries_data/aries_crash_data_",year,".csv")
    locTbl <- read_csv(dataset,
                       #col_types argument is manually setting the types for each variable. This is because the sheer volume of missing data made it hard for the parser infer types itself.
                       col_types="dddcddcccdcdcdccdcdcdcdcccccddcdccccdtcdddddccccdcddccccccdcdccdcdcdcccccccccccdddccccdcccccdcdcccccdcdccdc") %>% 
                #Variables of interest to this project included identifiers, injury characterization, geographic position, time, and characterization of the physical environment
                #See ARIES data dictionary for all the variables. https://hub.mph.in.gov/dataset/aries-crash-data-2007-2017/resource/f61a5dcb-5ca3-485a-9ecf-cd3d8740dc9b?inner_span=True
                select(INDIVIDUAL_MR_RECORD,
                            LATDECIMALNMB,
                            LONGDECIMALNMB,
                            COUNTYDESCR,
                            UNIQUELOCATIONID,
                            COLLISION_YEAR,
                            COLLDTE,
                            SPEEDLIMITTXT,
                            TRAFFICCNTRLDESCR,
                            TRAFFICCNTLOPIND,
                            RUMBLESTRIPIND,
                            SURFACETYPEDESCR,
                            ROADTYPEDESCR,
                            SCHOOLZONEIND,
                            MANNERCOLLDESCR,
                            COLLEVENTDESCR,
                            INJUREDNMB,
                            DEADNMB, 
                            UNITNMB,
                                OCCUPSNMB,
                                PERSONNMB,             
                                PERSONTYPEDESCR)
    tbl <- bind_rows(tbl,locTbl)
}

str(tbl)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

“7712 parsing failures.
  row              col expected actual                                   file
41379 SURFACETYPECDE   a double      + 'aries_data/aries_crash_data_2007.csv'
62021 WEATHERCDE       a double      + 'aries_data/aries_crash_data_2007.csv'
62021 PRIMARYFACTORCDE a double      + 'aries_data/aries_crash_data_2007.csv'
62021 UNITTYPECDE      a double      + 'aries_data/aries_crash_data_2007.csv'
71273 UNITTYPECDE      a double      + 'aries_data/aries_crash_data_2007.csv'
..... ................ ........ ...... ......................................
See problems(...) for more details.
“472 parsing failures.
  row              col           expected      actual                                   file
21431 VEHMODELTXT      delimiter or quote I           'aries_data/aries_crash_da

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	7244543 obs. of  22 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  0 40.7 39.2 40.2 41.6 ...
 $ LONGDECIMALNMB      : num  0 -85.4 -85.9 -85.4 -86.2 ...
 $ COUNTYDESCR         : chr  "Bartholomew" "Huntington" "Bartholomew" "Delaware" ...
 $ UNIQUELOCATIONID    : chr  "COLUMBUSSHOPPINGCENTER" "HUNTINGTONAVE" "E25THST" "KILGOREAVE" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.06.22" "2007.11.13" "2007.06.09" "2007.02.27" ...
 $ SPEEDLIMITTXT       : chr  "15" "0" "15" "20" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND    

In [2]:
library(feather)

#Fast, language-agnostic format for tabular data storage on disk
write_feather(tbl, "raw_pasted_ARIES.feather")

# Quality Measures:

In [3]:
# Delete observations without valid latitude or longitude
tbl <- tbl %>% filter(LATDECIMALNMB != 0 & LONGDECIMALNMB != 0) 
            
str(tbl)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6431818 obs. of  22 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

In [4]:
tbl <- tbl %>% distinct()

str(tbl)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6418722 obs. of  22 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

## Attempting to give a unique ID to each observation reveals there are duplicate rows:

In [5]:
# Give each observation its own "unique" id by combining crash ID, vehicle ID, and person ID
tbl <- tbl %>% mutate(nested_id = as.numeric(paste0(INDIVIDUAL_MR_RECORD,UNITNMB,PERSONNMB)))

str(tbl)

“NAs introduced by coercion”

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6418722 obs. of  23 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

In [6]:
# We still have repeat nested_id's and not all of these repeats are "NA"
paste0("Number of distinct index numbers: ",n_distinct(tbl$nested_id))
paste0("Number of index numbers that are NA: ",sum(is.na(tbl$nested_id)))

In [7]:
# This "toy" was used for validating the script below
# toy <- tribble(
#     ~a, ~b, ~nested_id,
#     1, 2, 3, 
#     1, 2, 3,
#     1, 2, NA,
#     1, 2, NA, 
#     4, 5, 6
# )

# Identify cells that still have the same Nested_ID number
duplicates <- tbl %>% group_by(nested_id) %>% filter(n()>1 & !(nested_id %in% NA)) %>% ungroup() %>% arrange(nested_id)

# Each of these observations has a duplicate Nested_ID, but a slightly different combination of variable values. 
# I'm saving these observations for reference later, but will keep the rows in the dataset.
write_csv(duplicates,"duplicates.csv")

In [8]:
#Giving a truly unique ID number

tbl <- tbl %>% mutate(index = 1:n())

str(tbl)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6418722 obs. of  24 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

In [9]:
write_feather(tbl, "pasted_ARIES_and_IDs.feather")

# Adding variables that mark each observation as involving active transportation or not, then also "pedestrian" versus "bicyclist" versus "other":

In [10]:
#Show how to identify collisions involving active transportation

print(tbl %>% group_by(PERSONTYPEDESCR) %>% summarize(count = n()))
tbl %>% group_by(COLLEVENTDESCR) %>% summarize(count = n())

[38;5;246m# A tibble: 9 x 2[39m
  PERSONTYPEDESCR                 count
  [3m[38;5;246m<chr>[39m[23m                           [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m Animal Drawn Vehicle Operator     280
[38;5;250m2[39m Driver                        3[4m5[24m[4m7[24m[4m3[24m573
[38;5;250m3[39m Injured                        [4m1[24m[4m5[24m[4m2[24m719
[38;5;250m4[39m Other                             140
[38;5;250m5[39m Owner Trailer                   [4m8[24m[4m0[24m886
[38;5;250m6[39m Owner Vehicle                 2[4m4[24m[4m4[24m[4m0[24m048
[38;5;250m7[39m Pedal Cyclist                   [4m1[24m[4m1[24m766
[38;5;250m8[39m Pedestrian                      [4m2[24m[4m1[24m421
[38;5;250m9[39m [31mNA[39m                             [4m1[24m[4m3[24m[4m7[24m889


COLLEVENTDESCR,count
<chr>,<int>
05,1
06,1
11,1
12,3
Animal Drawn Vehicle,10939
Animal Other Than Deer,23234
Another Motor Vehicle,5064198
Bicycle,17755
Bridge Overhead Structure,3976
Bridge Parapet End,622


In [11]:
library(tibble)

#Function to consolidate variables based on mode of active travel

assignMode = function(x, y) {
    if (any(x %in% "Pedestrian", y %in% "Pedestrian")) {
        return("Pedestrian")
    }
    else if (any(x %in% "Pedal Cyclist", y %in% "Bicycle")) {
        return("Bicyclist")
    }
    else {
        return("Other")
    }
}

# toy <- tribble(
#     ~PERSONTYPEDESCR, ~COLLEVENTDESCR, ~UNIMPORTANT,
#     "Pedestrian", "Pedestrian",34,
#     "fsdsdf", "Pedestrian",34,
#     "dsfsd","Bicycle",54,
#     "Pedal Cyclist", "fsdfd",76,
#     "dfsdfd", "sdfsdfds",78,
#     NA,"sdasda",87,
#     NA, "Bicycle",98,
#     "fdsfsd", NA, 03,
#     "Pedestrian",NA,98
    
# )

tbl <- tbl %>% rowwise() %>% mutate(activityMode = assignMode(PERSONTYPEDESCR,COLLEVENTDESCR))

str(tbl)

Classes ‘rowwise_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	6418722 obs. of  25 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIN

In [12]:
write_feather(tbl, "pasted_ARIES_and_IDs_mode.feather")

In [1]:
#R keeps crashing when I don't reload the dataset at this point and try to go immediately to the next cell ¯\_(ツ)_/¯

library(feather)

tbl <- read_feather("pasted_ARIES_and_IDs_mode.feather")

str(tbl)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6418722 obs. of  25 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

In [2]:
library(dplyr)

#Make mode of activity into a binary factor variable

tbl <- tbl %>% mutate(active = as.factor(ifelse(activityMode %in% c("Bicyclist","Pedestrian"),"active","inactive")))

str(tbl)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6418722 obs. of  26 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

In [3]:
write_feather(tbl, "ARIES.feather")

# Subsetting by activity for future convenience

In [4]:
#Make dataset of collisions that all involve active transportation

activeTbl <- tbl %>% filter(active %in% ("active"))

str(activeTbl)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	79511 obs. of  26 variables:
 $ INDIVIDUAL_MR_RECORD: num  1267560 1267560 1508589 3131760 1331273 ...
 $ LATDECIMALNMB       : num  41.7 41.7 38.3 41 38.6 ...
 $ LONGDECIMALNMB      : num  -86 -86 -85.8 -85.1 -85.6 ...
 $ COUNTYDESCR         : chr  "Elkhart" "Elkhart" "Floyd" "Allen" ...
 $ UNIQUELOCATIONID    : chr  "LUSHERPRAIRIE" "LUSHERPRAIRIE" "SPRINGSTSTATEST" "CALHOUNSTPAULDINGRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.02.22" "2007.02.22" "2007.05.11" "2007.06.13" ...
 $ SPEEDLIMITTXT       : chr  "30" "30" "25" "35" ...
 $ TRAFFICCNTRLDESCR   : chr  "Traffic Control Signal" "Traffic Control Signal" "Traffic Control Signal" "Traffic Control Signal" ...
 $ TRAFFICCNTLOPIND    : chr  "Y" "Y" "Y" "Y" ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Two Lanes (Two Way)" "Tw

In [5]:
# Explore roles of people in the active transportation data sets

activeTbl %>% group_by(PERSONTYPEDESCR) %>% summarize(count = n())

PERSONTYPEDESCR,count
<chr>,<int>
Driver,26985
Injured,408
Other,1
Owner Trailer,241
Owner Vehicle,16939
Pedal Cyclist,11766
Pedestrian,21421
,1750


In [6]:
# Explore how many unique collisions there are in the active transportation data set

str(activeTbl %>% group_by(LATDECIMALNMB, LONGDECIMALNMB) %>% summarize(count = n()))

# That reduces the number of unique observations significantly. Tread carefully in the future!

Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame':	27754 obs. of  3 variables:
 $ LATDECIMALNMB : num  1 37.8 37.9 37.9 37.9 ...
 $ LONGDECIMALNMB: num  1 -87.1 -87.1 -87 -87.1 ...
 $ count         : int  334 2 1 2 3 2 2 1 1 3 ...
 - attr(*, "groups")=Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	25684 obs. of  2 variables:
  ..$ LATDECIMALNMB: num  1 37.8 37.9 37.9 37.9 ...
  ..$ .rows        :List of 25684
  .. ..$ : int 1
  .. ..$ : int 2
  .. ..$ : int 3
  .. ..$ : int 4
  .. ..$ : int 5
  .. ..$ : int 6
  .. ..$ : int 7
  .. ..$ : int 8
  .. ..$ : int 9
  .. ..$ : int 10
  .. ..$ : int 11
  .. ..$ : int 12
  .. ..$ : int 13
  .. ..$ : int 14
  .. ..$ : int 15
  .. ..$ : int 16
  .. ..$ : int 17
  .. ..$ : int 18
  .. ..$ : int 19
  .. ..$ : int 20
  .. ..$ : int 21
  .. ..$ : int 22
  .. ..$ : int 23
  .. ..$ : int 24
  .. ..$ : int 25
  .. ..$ : int 26
  .. ..$ : int 27
  .. ..$ : int 28
  .. ..$ : int 29
  .. ..$ : int 30
  .. ..$ : int 31
  .. ..$ : int 32
  .. ..$ : int 33
  

In [7]:
write_feather(activeTbl,"activeARIES.feather")

In [8]:
#Make dataset of collisions that have no inclusion of active transportation

inactiveTbl <- tbl %>% filter(active %in% "inactive")

str(inactiveTbl)

Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	6339211 obs. of  26 variables:
 $ INDIVIDUAL_MR_RECORD: num  9.01e+08 9.01e+08 9.01e+08 9.01e+08 9.01e+08 ...
 $ LATDECIMALNMB       : num  40.7 39.2 40.2 41.6 40 ...
 $ LONGDECIMALNMB      : num  -85.4 -85.9 -85.4 -86.2 -85.9 ...
 $ COUNTYDESCR         : chr  "Huntington" "Bartholomew" "Delaware" "St Joseph" ...
 $ UNIQUELOCATIONID    : chr  "HUNTINGTONAVE" "E25THST" "KILGOREAVE" "WIRELANDRD" ...
 $ COLLISION_YEAR      : num  2007 2007 2007 2007 2007 ...
 $ COLLDTE             : chr  "2007.11.13" "2007.06.09" "2007.02.27" "2007.12.30" ...
 $ SPEEDLIMITTXT       : chr  "0" "15" "20" "10" ...
 $ TRAFFICCNTRLDESCR   : chr  "None" "None" "None" "None" ...
 $ TRAFFICCNTLOPIND    : chr  NA NA NA NA ...
 $ RUMBLESTRIPIND      : chr  "N" "N" "N" "N" ...
 $ SURFACETYPEDESCR    : chr  "ASPHALT" "ASPHALT" "ASPHALT" "ASPHALT" ...
 $ ROADTYPEDESCR       : chr  "Private Drive" "Private Drive" "Private Drive" "Private Drive" ...
 $ SCHOOLZONEIND       : chr 

In [9]:
write_feather(inactiveTbl,"inactiveARIES.feather")

# Legacy cells used for troubleshooting or exploratory analysis:

In [4]:
#This cell debugged the manual typing of variables

# probs <- tibble()

#ariesVars <- c("PERSONTYPEDESCR","GENDERCDE","AGE_GRP","INJSTATUSDESCR","INJNATUREDESCR","TESTGIVENDESCR","RESULTALCHTXT","RESULTDRUGIND", "COUNTYDESCR","COLLDTE","MOTORVEHINVOLVEDNMB","INJUREDNMB","DEADNMB","RDWYSUFFIXTXT","LATDECIMALNMB","LONGDECIMALNMB", "TRAFFICCNTLOPIND","AGGRESSIVEDRIVEIND","HITRUNIND","SCHOOLZONEIND","RUMBLESTRIPIND","CONSTRUCTIND","LIGHTCONDDESCR", "WEATHERDESCR","SURFACETYPECDE_CONDDESCR","TYPEDESCR","PRIMARYFACTORDESCR","MANNERCOLLDESCR","TRAFFICCNTRLDESCR", "UNITTYPEDESCR","OCCUPSNMB","SPEEDLIMITTXT","VEHUSEDESCR","ROADTYPEDESCR","TRAVDIRDESCR","EMGERENCY_RUN","PRECOLLACTDESCR")

# for (year in 2007:2019) {
#     dataset <- paste0("aries_crash_data_",year,".csv")
#     tbl <- read_csv(dataset,
#                        col_types="dddcddcccdcdcdccdcdcdcdcccccddcdccccdtcdddddccccdcddccccccdcdccdcdcdcccccccccccdddccccdcccccdcdcccccdcdccdc") %>% 
#                 select(7:9,18,20,23,25,26,31,34,40,42,43,45,51,52,53,54:58,60,62,63,65,67,69,79,83,87,91,94,96,98,99,104) %>%
#                 filter(PERSONTYPEDESCR %in% c("Pedal Cyclist","Pedestrian"))
#     locprobs <- problems(tbl) %>% 
#                 group_by(col,expected,actual) %>% 
#                 summarize(count = n()) %>%
#                 filter(col %in% ariesVars)
#     probs <- bind_rows(probs,locprobs)
# }

# probs

In [5]:
#This cell generated missing value percentages
# library(purrr)

# tbl %>% map(~ (mean(is.na(.))*100))

In [6]:
#This cell dropped variables that had missing values that were above 75% (Injury status and nature)

# tbl2 <- tbl %>% select(which(colMeans(is.na(.))*100 < 75))

# tbl2 %>% map(~ (mean(is.na(.))*100))