From 2f5ea4fb484ea71df802bf4d2644c804dbdcac78 Mon Sep 17 00:00:00 2001 From: ncihnegn Date: Fri, 12 Aug 2022 02:25:37 -0700 Subject: [PATCH] Add support for PAX Format, Version 1.0 --- src/archive.rs | 91 +++++++++++++++++++++++++--------- src/entry.rs | 40 +++++++++++++++ src/header.rs | 10 +++- tests/all.rs | 13 +++++ tests/archives/pax_sparse.tar | Bin 0 -> 7168 bytes 5 files changed, 129 insertions(+), 25 deletions(-) create mode 100644 tests/archives/pax_sparse.tar diff --git a/src/archive.rs b/src/archive.rs index e875124a..9e2d4b83 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -9,9 +9,10 @@ use std::path::Path; use crate::entry::{EntryFields, EntryIo}; use crate::error::TarError; +use crate::header::{SparseEntry, BLOCK_SIZE}; use crate::other; use crate::pax::pax_extensions_size; -use crate::{Entry, GnuExtSparseHeader, GnuSparseHeader, Header}; +use crate::{Entry, GnuExtSparseHeader, Header}; /// A top-level representation of an archive file. /// @@ -256,6 +257,7 @@ impl<'a, R: Read> Iterator for Entries<'a, R> { } } +#[allow(unused_assignments)] // https://github.com/rust-lang/rust/issues/22630 impl<'a> EntriesFields<'a> { fn next_entry_raw( &mut self, @@ -277,14 +279,14 @@ impl<'a> EntriesFields<'a> { // Otherwise, check if we are ignoring zeros and continue, or break as if this is the // end of the archive. if !header.as_bytes().iter().all(|i| *i == 0) { - self.next += 512; + self.next += BLOCK_SIZE as u64; break; } if !self.archive.inner.ignore_zeros { return Ok(None); } - self.next += 512; + self.next += BLOCK_SIZE as u64; header_pos = self.next; } @@ -325,11 +327,11 @@ impl<'a> EntriesFields<'a> { // Store where the next entry is, rounding up by 512 bytes (the size of // a header); let size = size - .checked_add(511) + .checked_add(BLOCK_SIZE as u64 - 1) .ok_or_else(|| other("size overflow"))?; self.next = self .next - .checked_add(size & !(512 - 1)) + .checked_add(size & !(BLOCK_SIZE as u64 - 1)) .ok_or_else(|| other("size overflow"))?; Ok(Some(ret.into_entry())) @@ -394,26 +396,65 @@ impl<'a> EntriesFields<'a> { if let Some(pax_extensions_ref) = &pax_extensions { pax_size = pax_extensions_size(pax_extensions_ref); } + // Not an entry + // Keep pax_extensions for the next ustar header + processed -= 1; continue; } let mut fields = EntryFields::from(entry); + fields.pax_extensions = pax_extensions; + pax_extensions = None; // Reset pax_extensions after use + if is_recognized_header && fields.is_pax_sparse() { + gnu_longname = fields.pax_sparse_name(); + } fields.long_pathname = gnu_longname; fields.long_linkname = gnu_longlink; - fields.pax_extensions = pax_extensions; self.parse_sparse_header(&mut fields)?; return Ok(Some(fields.into_entry())); } } fn parse_sparse_header(&mut self, entry: &mut EntryFields<'a>) -> io::Result<()> { - if !entry.header.entry_type().is_gnu_sparse() { + if !entry.is_pax_sparse() && !entry.header.entry_type().is_gnu_sparse() { return Ok(()); } - let gnu = match entry.header.as_gnu() { - Some(gnu) => gnu, - None => return Err(other("sparse entry type listed but not GNU header")), - }; + let mut sparse_map = Vec::::new(); + let mut real_size = 0; + if entry.is_pax_sparse() { + real_size = entry.pax_sparse_realsize()?; + let mut num_bytes_read = 0; + let mut reader = io::BufReader::with_capacity(BLOCK_SIZE, &self.archive.inner); + let mut read_decimal_line = || -> io::Result { + let mut str = String::new(); + num_bytes_read += reader.read_line(&mut str)?; + str.strip_suffix("\n") + .and_then(|s| s.parse::().ok()) + .ok_or_else(|| other("failed to read a decimal line")) + }; + + let num_entries = read_decimal_line()?; + for _ in 0..num_entries { + let offset = read_decimal_line()?; + let size = read_decimal_line()?; + sparse_map.push(SparseEntry { offset, size }); + } + let rem = BLOCK_SIZE - (num_bytes_read % BLOCK_SIZE); + entry.size -= (num_bytes_read + rem) as u64; + } else if entry.header.entry_type().is_gnu_sparse() { + let gnu = match entry.header.as_gnu() { + Some(gnu) => gnu, + None => return Err(other("sparse entry type listed but not GNU header")), + }; + real_size = gnu.real_size()?; + for block in gnu.sparse.iter() { + if !block.is_empty() { + let offset = block.offset()?; + let size = block.length()?; + sparse_map.push(SparseEntry { offset, size }); + } + } + } // Sparse files are represented internally as a list of blocks that are // read. Blocks are either a bunch of 0's or they're data from the @@ -442,13 +483,10 @@ impl<'a> EntriesFields<'a> { let data = &mut entry.data; let reader = &self.archive.inner; let size = entry.size; - let mut add_block = |block: &GnuSparseHeader| -> io::Result<_> { - if block.is_empty() { - return Ok(()); - } - let off = block.offset()?; - let len = block.length()?; - if len != 0 && (size - remaining) % 512 != 0 { + let mut add_block = |block: &SparseEntry| -> io::Result<_> { + let off = block.offset; + let len = block.size; + if len != 0 && (size - remaining) % BLOCK_SIZE as u64 != 0 { return Err(other( "previous block in sparse file was not \ aligned to 512-byte boundary", @@ -474,10 +512,10 @@ impl<'a> EntriesFields<'a> { data.push(EntryIo::Data(reader.take(len))); Ok(()) }; - for block in gnu.sparse.iter() { - add_block(block)? + for block in sparse_map { + add_block(&block)? } - if gnu.is_extended() { + if entry.header.as_gnu().map(|gnu| gnu.is_extended()) == Some(true) { let mut ext = GnuExtSparseHeader::new(); ext.isextended[0] = 1; while ext.is_extended() { @@ -485,14 +523,19 @@ impl<'a> EntriesFields<'a> { return Err(other("failed to read extension")); } - self.next += 512; + self.next += BLOCK_SIZE as u64; for block in ext.sparse.iter() { - add_block(block)?; + if !block.is_empty() { + add_block(&SparseEntry { + offset: block.offset()?, + size: block.length()?, + })?; + } } } } } - if cur != gnu.real_size()? { + if cur != real_size { return Err(other( "mismatch in sparse file chunks and \ size in header", diff --git a/src/entry.rs b/src/entry.rs index cce39d45..b3cfc11d 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -285,6 +285,46 @@ impl<'a> EntryFields<'a> { self.read_to_end(&mut v).map(|_| v) } + pub fn is_pax_sparse(&mut self) -> bool { + if let Some(ref pax) = self.pax_extensions { + let mut extensions = PaxExtensions::new(pax).filter_map(|f| f.ok()); + return extensions + .find(|f| f.key_bytes() == b"GNU.sparse.major" && f.value_bytes() == b"1") + .is_some() + && extensions + .find(|f| f.key_bytes() == b"GNU.sparse.minor" && f.value_bytes() == b"0") + .is_some(); + } + false + } + + pub fn pax_sparse_name(&mut self) -> Option> { + if let Some(ref pax) = self.pax_extensions { + return PaxExtensions::new(pax) + .filter_map(|f| f.ok()) + .find(|f| f.key_bytes() == b"GNU.sparse.name") + .map(|f| f.value_bytes().to_vec()); + } + None + } + + pub fn pax_sparse_realsize(&mut self) -> io::Result { + if let Some(ref pax) = self.pax_extensions { + let pax = PaxExtensions::new(pax) + .filter_map(|f| f.ok()) + .find(|f| f.key_bytes() == b"GNU.sparse.realsize") + .map(|f| f.value_bytes()); + if let Some(field) = pax { + let str = + std::str::from_utf8(&field).map_err(|_| other("failed to read string"))?; + return str + .parse::() + .map_err(|_| other("failed to parse the real size")); + } + } + Err(other("PAX extension GNU.sparse.realsize not found")) + } + fn path(&self) -> io::Result> { bytes2path(self.path_bytes()) } diff --git a/src/header.rs b/src/header.rs index 7e507fc7..4e482ca7 100644 --- a/src/header.rs +++ b/src/header.rs @@ -16,11 +16,13 @@ use std::str; use crate::other; use crate::EntryType; +pub const BLOCK_SIZE: usize = 512; + /// Representation of the header of an entry in an archive #[repr(C)] #[allow(missing_docs)] pub struct Header { - bytes: [u8; 512], + bytes: [u8; BLOCK_SIZE], } /// Declares the information that should be included when filling a Header @@ -110,6 +112,12 @@ pub struct GnuHeader { pub pad: [u8; 17], } +/// Description of a spare entry. +pub struct SparseEntry { + pub offset: u64, + pub size: u64, +} + /// Description of the header of a spare entry. /// /// Specifies the offset/number of bytes of a chunk of data in octal. diff --git a/tests/all.rs b/tests/all.rs index fa38ef62..22436535 100644 --- a/tests/all.rs +++ b/tests/all.rs @@ -1089,6 +1089,19 @@ fn sparse_with_trailing() { assert_eq!(&s[0x100_000..], "1MB through\n"); } +#[test] +fn pax_sparse() { + let rdr = Cursor::new(tar!("pax_sparse.tar")); + let mut ar = Archive::new(rdr); + let td = t!(TempBuilder::new().prefix("tar-rs").tempdir()); + t!(ar.unpack(td.path())); + + let mut s = String::new(); + t!(t!(File::open(td.path().join("sparse_begin.txt"))).read_to_string(&mut s)); + assert_eq!(&s[..5], "test\n"); + assert!(s[5..].chars().all(|x| x == '\u{0}')); +} + #[test] fn path_separators() { let mut ar = Builder::new(Vec::new()); diff --git a/tests/archives/pax_sparse.tar b/tests/archives/pax_sparse.tar new file mode 100644 index 0000000000000000000000000000000000000000..d74bef7b57b53810f3a35c68fae58f9672941e42 GIT binary patch literal 7168 zcmeH{!A`?442FBoQ{)MP?WDL6nO0$6q9j&q zfA*hyKimfY8sb&|;bR}3p2J(ysx+ixF#`Z=j4bk+?YPp<9)NTyksN~44oi@LRvRef zuvi8&4}JE@$DCU1y4jMm+jcwi&a}9($dwpX@+CFvNdjQzmFL^@+jn`sORt8>p;U3Bxs~nS1$g z5dL+u3kv?^7s%3>(Ldt&DV*xxA(=wLIl%sR{R+)1%w1576#B$Xzo3S2 jDolV0FaajO1egF5U;<2l2`~XBzyz286JP>NU_k=k(iU$S literal 0 HcmV?d00001