# Preprocessing to break up original dataset file so as to be uploadable onto GitHub (without Git LFS)

In [1]:
library(tidyverse)
library(haven)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# The team wanted to store the raw data on GitHub without having to use
# Git LFS (https://git-lfs.github.com).

# Source for dataset: https://www.fjc.gov/sites/default/files/idb/datasets/cr19_0.sas7bdat

# Passing the URL straight to `read_sas()` did not work when this notebook was
# written (reason unknown), so the file is read from a local copy in the
# working directory. If that copy is missing it is downloaded first;
# `mode = "wb"` requests a binary transfer so the SAS file is not altered.
sas_url <- "https://www.fjc.gov/sites/default/files/idb/datasets/cr19_0.sas7bdat"
sas_path <- "cr19_0.sas7bdat"
if (!file.exists(sas_path)) {
  download.file(sas_url, destfile = sas_path, mode = "wb")
}

cr19_df <- read_sas(sas_path)

In [3]:
# Show the number of rows in the full table; used to pick a sensible chunk size
dim(cr19_df)[1L]

In [4]:
# Split the full table into consecutive pieces of at most `chunk` rows.
# Adapted from https://stackoverflow.com/a/7060331/6328256
chunk <- 40000 # Rows per output file (gives 6 files for this dataset)
n <- nrow(cr19_df)
# Chunk label per row: rows 1..chunk get 1, the next `chunk` rows get 2, etc.
# `seq_len()` keeps this well-defined (an empty vector) for a zero-row table,
# where `1:n` would instead count down as c(1, 0).
r <- (seq_len(n) - 1L) %/% chunk + 1L
# Named list of data frames, one per chunk label ("1", "2", ...)
d <- split(cr19_df, r)

In [6]:
# Create the output directory for the CSV parts. The call is skipped when the
# directory already exists so the notebook can be re-run without a warning.
if (!dir.exists("../raw_data_csvs")) {
  dir.create("../raw_data_csvs")
}

In [7]:
# Write every chunk in `d` to its own file, cr19_part1.csv, cr19_part2.csv, ...
# Looping over the list (rather than naming six chunks by hand) keeps the
# output complete if the row count or chunk size changes.
# Only the first file gets the header row, so the parts can be concatenated
# back into a single CSV.
for (i in seq_along(d)) {
  write_csv(
    d[[i]],
    sprintf("../raw_data_csvs/cr19_part%d.csv", i),
    col_names = (i == 1L)
  )
}