From 214a07fe9656f9e67cf46906bae87c40ee41d3e5 Mon Sep 17 00:00:00 2001 From: Alex Bokov Date: Tue, 8 Oct 2019 14:04:24 -0500 Subject: [PATCH] isfiletext() checks whether a file is text or binary in a cross-platform manner, completing ticket #236. This can be useful when a file extension is missing or ambiguous --- NAMESPACE | 1 + R/isfiletext.R | 34 +++++++++++++++ man/isfiletext.Rd | 45 ++++++++++++++++++++ tests/testthat/test_isfiletext.R | 71 ++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 R/isfiletext.R create mode 100644 man/isfiletext.Rd create mode 100644 tests/testthat/test_isfiletext.R diff --git a/NAMESPACE b/NAMESPACE index 87d7ab1..cf1b00c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,6 +83,7 @@ export(get_ext) export(import) export(import_list) export(install_formats) +export(isfiletext) export(spread_attrs) importFrom(curl,curl_fetch_memory) importFrom(curl,parse_headers) diff --git a/R/isfiletext.R b/R/isfiletext.R new file mode 100644 index 0000000..9b3dae2 --- /dev/null +++ b/R/isfiletext.R @@ -0,0 +1,34 @@ +#' Determine whether a file is "plain-text" or some sort of binary format +#' +#' @param filename Path to the file +#' @param maxsize Maximum number of bytes to read +#' @param textbytes Which characters are used by normal (though not necessarily +#' just ASCII) text. To detect just ASCII, the following value +#' can be used: `as.raw(c(7:16,18,19,32:127))` +#' @param tf If `TRUE` (default) simply return `TRUE` when `filename` +#' references a text-only file and `FALSE` otherwise. If set to +#' `FALSE` then returns the "non text" bytes found in the file. 
+#'
+#' @return boolean or raw
+#' @export
+#' @examples
+#' library(datasets)
+#' export(iris,"iris.yml")
+#' isfiletext("iris.yml")
+#' ## TRUE
+#'
+#' export(iris,"iris.sav")
+#' isfiletext("iris.sav")
+#' ## FALSE
+#' isfiletext("iris.sav", tf=FALSE)
+#' ## These are the characters found in "iris.sav" that are not printable text
+#' ## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
+isfiletext <- function(filename, maxsize = Inf,
+                       textbytes = as.raw(c(0x7:0x10, 0x12, 0x13, 0x20:0xFF)),
+                       tf = TRUE) {
+  ff <- file(filename, "rb")  # binary mode: bytes are read exactly as stored
+  on.exit(close(ff), add = TRUE)  # close the connection even if readBin() fails
+  bytes <- readBin(ff, raw(), n = min(file.info(filename)$size, maxsize))
+  nontextbytes <- setdiff(bytes, textbytes)  # distinct bytes outside `textbytes`
+  if (tf) length(nontextbytes) == 0 else nontextbytes
+}
\ No newline at end of file
diff --git a/man/isfiletext.Rd b/man/isfiletext.Rd
new file mode 100644
index 0000000..78aa944
--- /dev/null
+++ b/man/isfiletext.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/isfiletext.R
+\name{isfiletext}
+\alias{isfiletext}
+\title{Determine whether a file is "plain-text" or some sort of binary format}
+\usage{
+isfiletext(
+  filename,
+  maxsize = Inf,
+  textbytes = as.raw(c(7:16, 18, 19, 32:255)),
+  tf = TRUE
+)
+}
+\arguments{
+\item{filename}{Path to the file}
+
+\item{maxsize}{Maximum number of bytes to read}
+
+\item{textbytes}{Which characters are used by normal (though not necessarily
+just ASCII) text. To detect just ASCII, the following value
+can be used: `as.raw(c(7:16,18,19,32:127))`}
+
+\item{tf}{If `TRUE` (default) simply return `TRUE` when `filename`
+references a text-only file and `FALSE` otherwise.
If set to
+`FALSE` then returns the "non text" bytes found in the file.}
+}
+\value{
+boolean or raw
+}
+\description{
+Determine whether a file is "plain-text" or some sort of binary format
+}
+\examples{
+library(datasets)
+export(iris,"iris.yml")
+isfiletext("iris.yml")
+## TRUE
+
+export(iris,"iris.sav")
+isfiletext("iris.sav")
+## FALSE
+isfiletext("iris.sav", tf=FALSE)
+## These are the characters found in "iris.sav" that are not printable text
+## 02 00 05 03 06 04 01 14 15 11 17 16 1c 19 1b 1a 18 1e 1d 1f
+}
diff --git a/tests/testthat/test_isfiletext.R b/tests/testthat/test_isfiletext.R
new file mode 100644
index 0000000..ccbb00b
--- /dev/null
+++ b/tests/testthat/test_isfiletext.R
@@ -0,0 +1,71 @@
+context("correctly identifying files as text vs binary")
+library("datasets")
+
+txtformats <- c("arff","csv","csv2","dump","fwf","psv","r","tsv","txt")
+binformats <- c("dbf","dta","rda","rdata","rds","sas7bdat","sav","xlsx","xpt")
+names(iris) <- gsub("\\.","_",names(iris))  # replace dots in column names with underscores
+
+test_that("Required text formats recognized as text", {
+  for(xx in txtformats) expect_true(isfiletext(export(iris,
+                                                      paste0("iris.",xx))),
+                                    label = paste0(xx," should be text"))
+  })
+
+test_that("Required non-text formats recognized as non-text", {
+  for(xx in binformats) expect_false(isfiletext(export(iris,
+                                                       paste0("iris.",xx))),
+                                     label = paste0(xx," should be binary"))
+  })
+
+test_that("csvy recognized as text", {
+  skip_if_not_installed(pkg="csvy")
+  expect_true(isfiletext(export(iris,'iris.csvy')))
+  })
+
+test_that("xml and html recognized as text", {
+  skip_if_not_installed(pkg="xml2")
+  expect_true(isfiletext(export(iris,'iris.xml')))
+  expect_true(isfiletext(export(iris,'iris.html')))
+  })
+
+test_that("json recognized as text", {
+  skip_if_not_installed(pkg="jsonlite")
+  expect_true(isfiletext(export(iris,'iris.json')))
+  })
+
+test_that("yml recognized as text", {
+  skip_if_not_installed(pkg="yaml")
+  expect_true(isfiletext(export(iris,'iris.yml')))
+  })
+
+test_that("pzfx recognized as text", {
+  skip_if_not_installed("pzfx")
+  expect_true(isfiletext(export(iris[, -5], "iris.pzfx")))
+})
+
+# Binary formats gated on optional packages (rmatio, readODS, fst, feather)
+test_that("matlab recognized as binary", {
+  skip_if_not_installed("rmatio")
+  expect_false(isfiletext(export(iris, "iris.matlab")))
+})
+
+test_that("ods recognized as binary", {
+  skip_if_not_installed("readODS")
+  expect_false(isfiletext(export(iris, "iris.ods")))
+})
+
+test_that("fst recognized as binary", {
+  skip_if_not_installed("fst")
+  expect_false(isfiletext(export(iris, "iris.fst")))
+})
+
+test_that("feather recognized as binary", {
+  skip_if_not_installed("feather")
+  expect_false(isfiletext(export(iris, "iris.feather")))
+})
+
+unlink(paste0("iris.", c(txtformats, binformats, "csvy", "xml", "html", "json",
+                         "yml", "pzfx", "matlab", "ods",
+                         "fst", "feather")))
+rm(iris, txtformats, binformats)
+