In [1]:
# Display options
options(jupyter.rich_display = FALSE)

In [2]:
# Let's check current working directory
getwd()

[1] "/home/anna/TEACHING/01_winter/dprpy-teachers/solutions/exercises_07"

In [3]:
# Let's see what is in this directory
list.files('.')

 [1] "__MACOSX"                         "2013-citibike-tripdata"          
 [3] "2014-citibike-tripdata"           "2015-citibike-tripdata"          
 [5] "CRAN-packages-download-v2.R"      "CRAN-packages-download.R"        
 [7] "download.R"                       "exemple-nycbikes.R"              
 [9] "JC-201510-citibike-tripdata.csv"  "log-download.txt"                
[11] "log-uzip.txt"                     "nycbikes-analysis.R"             
[13] "nycbikes-data-exploration.ipynb"  "nycbikes-tripdata-clean_up.ipynb"
[15] "nycbikes-tripdata-download.ipynb" "nycbikes-tripdata-download.R"    

In [4]:
# We want to take all relevant nycbikes data and moved them into structured directory

path = '.' # we know that our data are in our current working directory
           # we can change it if we want
# all data are stored in folowing folders after extracting from zip archives
dirs <- file.path(path, c('2013-citibike-tripdata/', 
                          '2014-citibike-tripdata/',
                          '2015-citibike-tripdata/'))

In [5]:
# We will list all the data (with full path)
files <- c(list.files(dirs[-1], recursive = TRUE, # we treat 2013 differently 
                      full.names = TRUE),         # because it is structured differently
           list.files(dirs[1], pattern = '.csv', 
                      full.names = TRUE))

In [6]:
head(files) # let's take a look

[1] "./2014-citibike-tripdata//1_January/201401-citibike-tripdata_1.csv"  
[2] "./2014-citibike-tripdata//10_October/201410-citibike-tripdata_1.csv" 
[3] "./2014-citibike-tripdata//11_November/201411-citibike-tripdata_1.csv"
[4] "./2014-citibike-tripdata//12_December/201412-citibike-tripdata_1.csv"
[5] "./2014-citibike-tripdata//2_February/201402-citibike-tripdata_1.csv" 
[6] "./2014-citibike-tripdata//3_March/201403-citibike-tripdata_1.csv"    

In [7]:
# Let's create the target directory for our data 
outdir <- file.path(path, 'nycbikes')
if(!dir.exists(outdir)){
    dir.create(outdir, recursive = TRUE)
}

In [8]:
# Now it may be useful to split each year into separate subdirectory
for (subdir in file.path(outdir, substr(dirs, 3, 6))){
    dir.create(subdir)
}

In [9]:
# Here we create path that we want to use to move the data
# i.e. target paths
outfiles <- file.path(outdir, substr(basename(files), 1, 4), basename(files))

In [28]:
outfiles

 [1] "./nycbikes/2014" "./nycbikes/2014" "./nycbikes/2014" "./nycbikes/2014"
 [5] "./nycbikes/2014" "./nycbikes/2014" "./nycbikes/2014" "./nycbikes/2014"
 [9] "./nycbikes/2014" "./nycbikes/2014" "./nycbikes/2014" "./nycbikes/2014"
[13] "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015"
[17] "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015"
[21] "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015"
[25] "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015" "./nycbikes/2015"
[29] "./nycbikes/2013" "./nycbikes/2013" "./nycbikes/2013" "./nycbikes/2013"
[33] "./nycbikes/2013" "./nycbikes/2013" "./nycbikes/2013"

In [10]:
# Here we will copy all files into apropriate subdirectory
file.copy(files, outfiles)

 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE

In [11]:
# Let's remove original files
file.remove(files)

 [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[31] TRUE TRUE TRUE TRUE TRUE

In [28]:
# In order to remove the data we no longer need
# We have to remove all files first:
file.remove(list.files(dirs, recursive = TRUE, full.names = TRUE, all.files = TRUE))

[1] TRUE TRUE TRUE

In [30]:
# Then directories
file.remove(list.dirs(dirs, recursive = TRUE))

[1] TRUE TRUE TRUE

In [44]:
# Let's compress the data
library(R.utils)
files <- list.files(outdir, recursive = TRUE, full.names = TRUE)
for (f in files) gzip(f)

In [45]:
list.files(outdir, recursive = TRUE, full.names = TRUE)

 [1] "./nycbikes/2013/201306-citibike-tripdata.csv.gz"  
 [2] "./nycbikes/2013/201307-citibike-tripdata.csv.gz"  
 [3] "./nycbikes/2013/201308-citibike-tripdata.csv.gz"  
 [4] "./nycbikes/2013/201309-citibike-tripdata.csv.gz"  
 [5] "./nycbikes/2013/201310-citibike-tripdata.csv.gz"  
 [6] "./nycbikes/2013/201311-citibike-tripdata.csv.gz"  
 [7] "./nycbikes/2013/201312-citibike-tripdata.csv.gz"  
 [8] "./nycbikes/2014/201401-citibike-tripdata_1.csv.gz"
 [9] "./nycbikes/2014/201402-citibike-tripdata_1.csv.gz"
[10] "./nycbikes/2014/201403-citibike-tripdata_1.csv.gz"
[11] "./nycbikes/2014/201404-citibike-tripdata_1.csv.gz"
[12] "./nycbikes/2014/201405-citibike-tripdata_1.csv.gz"
[13] "./nycbikes/2014/201406-citibike-tripdata_1.csv.gz"
[14] "./nycbikes/2014/201407-citibike-tripdata_1.csv.gz"
[15] "./nycbikes/2014/201408-citibike-tripdata_1.csv.gz"
[16] "./nycbikes/2014/201409-citibike-tripdata_1.csv.gz"
[17] "./nycbikes/2014/201410-citibike-tripdata_1.csv.gz"
[18] "./nycbikes/2014/201411-ci