# Preprocessing: split the original dataset file into smaller CSV files that can be uploaded to GitHub (without Git LFS)

In [1]:
library(tidyverse)
library(haven)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.3
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   1.0.0     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
# The raw dataset file is too large to upload to GitHub directly, and the team
# wanted to keep the raw data in the GitHub repository itself rather than use
# Git LFS (https://git-lfs.github.com).

# The SAS file was obtained from:
# https://www.fjc.gov/sites/default/files/idb/datasets/cr19.sas7bdat
cr19_df <- haven::read_sas(data_file = "Project/preprocessing/cr19.sas7bdat")

In [3]:
# Row count of the full dataset, used to decide on a reasonable chunk size
dim(cr19_df)[1L]

In [4]:
# Split cr19_df into consecutive chunks of at most `chunk` rows each.
# Approach adapted from https://stackoverflow.com/a/7060331/6328256
chunk <- 40000 # Rows per output file; so there will be 6 output files
n <- nrow(cr19_df)
# Chunk id per row: rows 1..chunk -> 1, rows chunk+1..2*chunk -> 2, and so on.
# seq_len() is used instead of 1:n so that an empty data frame yields an empty
# id vector rather than the descending sequence c(1, 0).
r <- as.integer((seq_len(n) - 1) %/% chunk + 1)
# d is a list of data frames named "1", "2", ... in original row order
d <- split(cr19_df, r)

In [5]:
# Create the directory for the CSV chunks.
# The dir.exists() guard keeps notebook re-runs from warning that the
# directory already exists, while genuine creation failures still warn.
# recursive = TRUE also creates the parent "Project" directory if missing.
if (!dir.exists("Project/raw_data_csvs")) {
  dir.create("Project/raw_data_csvs", recursive = TRUE)
}

In [6]:
# Create the CSV files: one per chunk in `d`, named after the chunk id
# (cr19_part1.csv, cr19_part2.csv, ...).
# Looping over `d` instead of listing a fixed number of calls means every
# chunk is written even if the row count or chunk size changes, and no call
# refers to a chunk that does not exist.
# Only the first file gets the header row; all later files are written
# without column names.
for (i in seq_along(d)) {
  write_csv(
    d[[i]],
    paste0("Project/raw_data_csvs/cr19_part", names(d)[[i]], ".csv"),
    col_names = (i == 1L)
  )
}