Skip to content

Commit

Permalink
Initial commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
adavies42 committed Jul 9, 2021
1 parent 6025dc9 commit d4c9d92
Show file tree
Hide file tree
Showing 2 changed files with 397 additions and 0 deletions.
350 changes: 350 additions & 0 deletions q/unzip/unzip.q
@@ -0,0 +1,350 @@
.finos.dep.include"util/util.q"

// Carve a run of bytes into named fields.
// Reads (sum of the widths) entries of data starting at offset, then cuts
// them into consecutive pieces, one per field, in the order the fields are
// listed.
// @param x fields: dictionary of field names and widths
// @param y offset of the first entry to take
// @param z data
// @return dictionary of field names and the corresponding pieces of data
.finos.unzip.priv.split:{
  widths:get x;
  / start position of each field within the extracted run
  starts:sums prev widths;
  chunk:z y+til sum widths;
  (key x)!starts cut chunk}

// Parse a range of data made of consecutive records, each with a fixed-size
// header.
// parser is a function of three arguments:
// its first argument will be (data;extra); extra is passed as :: if not
// included
// its second argument will be the index just past the record's fixed-size
// header (i.e. record start plus the sum of the field widths)
// its third argument will be the raw headers of the record, split and
// labeled according to fields
// it should return (record;next index)
// parser will be called until it returns next index equal to length
// @param x (parser;fields;extra)
// @param y data
// @param z length
// @return parsed records
// @throws parse if a parser returns a next index greater than length
// @see .finos.unzip.priv.split
.finos.unzip.priv.parse:{
  if[2=count x;
    x,:(::);
    ];

  / one step of the iteration; state is (records so far;next index;length)
  f:{
    $[
      / finished: returning the state unchanged makes over converge
      (z 1)=z 2;
        z;
      / overshot the end: the input is malformed, and carrying on would
      / never reach the stop condition above
      (z 1)>z 2;
        '`parse;
      [
        h:.finos.unzip.priv.split[x 1;z 1]y;
        a:x[0][(y;x 2);(z 1)+sum x 1]h;
        (raze(first z;enlist a 0);a 1;z 2)]]};

  / the record list is seeded with a dummy dictionary, which is dropped again
  / from the result
  1_first f[x][y]over(enlist(enlist`)!enlist(::);0;z)}

// Field names and byte widths of the end-of-central-directory record, in the
// order they are split by .finos.unzip.priv.pecd.
// As used elsewhere in this file: sig is the 4-byte signature; dnu and dcd
// must both be zero or the archive is rejected as multi-disk; csz and cof are
// the size and offset of the central directory; cln is the comment length;
// cmt is a zero-width placeholder that pecd fills in using cln.
// den and ten are not referenced in this file; presumably the entry counts
// (this disk / total) of the PKZIP layout -- TODO confirm.
.finos.unzip.priv.wecd:`sig`dnu`dcd`den`ten`csz`cof`cln`cmt!4 2 2 2 2 4 4 2 0

// Parse end-of-central-directory record.
// Every field except sig and cmt is converted from its raw bytes to a number;
// cmt is read as the cln characters that immediately follow the fixed-size
// fields, so bytes trailing the record do not end up in the comment.
// @param x bytes, starting at the record's signature
// @return end-of-central-directory record
// @see .finos.unzip.priv.wecd
.finos.unzip.priv.pecd:{
  r:.finos.unzip.priv.split[.finos.unzip.priv.wecd;0]x;
  / decode each numeric field from its byte-reversed representation
  r:![r;();0b;{y!x y}[{({0x0 sv reverse x};x)}'](key r)except`sig`cmt];
  / the comment starts right after the fixed-size part of the record, whose
  / length is the sum of the field widths
  r:update cmt:"c"$x(sum .finos.unzip.priv.wecd)+til cln from r;
  r}

// Field names and byte widths of the fixed-size part of a central directory
// record, in the order they are split for .finos.unzip.priv.pcd.
// As used by pcd: ver and vrr are decoded as version numbers, flg, iat and
// xat as bit vectors, mtm and mdt as a time and a date, and nln, xln and cln
// as the lengths of the file name, extra field and comment that follow the
// fixed-size part. crc is left as raw bytes.
.finos.unzip.priv.wcd:`sig`ver`vrr`flg`cmp`mtm`mdt`crc`csz`usz`nln`xln`cln`dnu`iat`xat`lof!4 2 2 2 2 2 2 4 4 4 2 2 2 2 2 4 4

// Parse a central directory record.
// Decodes the fixed-size header fields, then reads the variable-length file
// name (fnm), extra field (xfd, left as raw bytes) and comment (cmt) that
// follow it.
// @param x (bytes;extra); extra is not used here
// @param y index of the first byte after the fixed-size header
// @param z header, as split according to .finos.unzip.priv.wcd
// @return (record;next index)
// @see .finos.unzip.priv.parse
.finos.unzip.priv.pcd:{
  b:x 0;

  / field decoders; each takes the raw bytes of one header field
  num:{0x0 sv reverse x};
  bits:{0b vs 0x0 sv reverse x};
  vers:{("i"$first x)%10};
  dostime:{"v"$24 60 60 sv 1 1 2*2 sv'0 5 11 cut 0b vs 0x0 sv reverse x};
  dosdate:{.finos.util.ymd . 1980 0 0+2 sv'0 7 11 cut 0b vs 0x0 sv reverse x};

  r:update
    ver:vers ver,
    vrr:vers vrr,
    flg:bits flg,
    cmp:num cmp,
    mtm:dostime mtm,
    mdt:dosdate mdt,
    csz:num csz,
    usz:num usz,
    nln:num nln,
    xln:num xln,
    cln:num cln,
    dnu:num dnu,
    iat:bits iat,
    xat:bits xat,
    lof:num lof
    from z;

  / variable-length parts, laid out back to back after the fixed-size header
  r:update
    fnm:`$"c"$b y+til nln,
    xfd:b y+nln+til xln,
    cmt:"c"$b y+nln+xln+til cln
    from r;

  (r;exec y+nln+xln+cln from r)}

// Field names and byte widths of an extra field record's header: a 2-byte id
// followed by a 2-byte size (sz), which .finos.unzip.priv.pxfd uses as the
// length of the payload that follows the header.
.finos.unzip.priv.wxfd:`id`sz!2 2

// Parse an extra field record.
// Recognised ids (compared after reversing the two id bytes):
// 0x0001 ZIP64: adds usz, csz, lof, dnu (as many as sz covers, in that order)
// 0x5455 extended timestamp: adds mtime, atime, ctime (as many as sz covers)
// 0x7875 Unix uid/gid: adds uid and gid
// Any other id is logged as a warning and returned with only id and sz
// decoded.
// @param x (bytes;extra); extra is not used here
// @param y index of the first payload byte (just past the id/sz header)
// @param z header, as split according to .finos.unzip.priv.wxfd
// @return (record;next index)
// @throws parse if a payload is inconsistent with its declared size
// @throws nyi if a uid/gid record has a version other than 1
// @see .finos.unzip.priv.parse
.finos.unzip.priv.pxfd:{
  x:x 0;

  r:update
    reverse id,
    0x0 sv reverse sz
    from z;

  r:$[
    / ZIP64
    0x0001~r`id;
      [
        / a variable number of fields in fixed order: parse according to size.
        / The payload is clipped to sz bytes before cutting, so that the last
        / field cannot absorb extra field records that follow this one.
        r,:0x0 sv'reverse each{((count y)#x)!y}[`usz`csz`lof`dnu](sums prev{(1+((type y)$sums y)?x)#y}[r`sz]8 8 8 4h)cut(r`sz)#y _x;
        r];

    / Extended Timestamp
    0x5455~r`id;
      [
        / check field size matches flag byte
        if[(r`sz)<>1+4*sum 0b vs first(r`sz)#y _x;
          '`parse;
          ];
        / a variable number of fields in fixed order: parse according to size
        r,:ltime .finos.util.timestamp_from_epoch each 0x0 sv'reverse each{((count y)#x)!y}[`mtime`atime`ctime]4 cut 1_(r`sz)#y _x;
        r];

    / Unix uid/gid: a version byte, then two length-prefixed values
    0x7875~r`id;
      [
        / check version
        if[1<>first(r`sz)#y _x;
          '`nyi;
          ];
        / check field size is consistent with data
        if[(r`sz)<>3+last{r:x 1;x:x 0;s:first x;((1+s)_x;r+s)}over(1_(r`sz)#y _x;0);
          '`parse;
          ];
        r,:0x0 sv'reverse each`uid`gid!last{r:x 1;x:x 0;s:first x;x:1_x;$[s;(s _ x;r,enlist s#x);(x;r)]}over(1_(r`sz)#y _x;());
        r];

      [
        .finos.log.warning(-3!r`id),": unimplemented extra field id; skipping";
        r]];

  (r;exec y+sz from r)}

// Field names and byte widths of the fixed-size header of a file data record,
// in the order they are split for .finos.unzip.priv.pfd.
// nln and xln are the lengths of the file name and extra field that follow
// the header; csz is the number of file data bytes pfd reads after those.
.finos.unzip.priv.wfd:`sig`ver`os`flg`cmp`mtm`mdt`crc`csz`usz`nln`xln!4 1 1 2 2 2 2 4 4 4 2 2

// Parse a file data record: a fixed-size header followed by the file name,
// the extra field, and the (possibly compressed) file data.
// Decodes the header, parses the extra field (merging in a ZIP64 record when
// one is present), and inflates the data into fdu if the file is selected by
// extra; files that are not selected get an empty fdu.
// @param x (bytes;extra); extra is (::) to inflate every file, otherwise the
// file name(s) to inflate
// @param y index of the first byte after the fixed-size header
// @param z header, as split according to .finos.unzip.priv.wfd
// @return (record;next index)
// @throws parse if csz or usz is -1 but there is no extra field, or if the
// extra field holds more than one ZIP64 record
// @throws nyi if the file is selected and its compression method is neither
// 0 (stored) nor 8 (deflate)
// @see .finos.unzip.priv.parse
.finos.unzip.priv.pfd:{
e:x 1;
x:x 0;

/ decode the header fields in place; crc is left as raw bytes
r:update
{("i"$first x)%10}ver,
first os,
0b vs 0x0 sv reverse flg,
0x0 sv reverse cmp,
{"v"$24 60 60 sv 1 1 2*2 sv'0 5 11 cut 0b vs 0x0 sv reverse x}mtm,
{.finos.util.ymd . 1980 0 0+2 sv'0 7 11 cut 0b vs 0x0 sv reverse x}mdt,
0x0 sv reverse csz,
0x0 sv reverse usz,
0x0 sv reverse nln,
0x0 sv reverse xln
from z;

r:update fnm:`$"c"$x y+til nln from r;

r:update xfd:x y+nln+til xln from r;

/ a size of -1 with no extra field to supply the real value is rejected
/ (presumably -1 marks a size held in a ZIP64 record -- TODO confirm)
if[(not r`xln)&any -1=r`csz`usz;
'`parse;
];

if[r`xln;
r:update .finos.unzip.priv.parse[(.finos.unzip.priv.pxfd;.finos.unzip.priv.wxfd);xfd;count xfd]from r;

/ if ZIP64 record, upsert
/ (its id and sz entries are dropped first; more than one such record is an error)
r,:exec{$[not any i:0x0001~/:x[;`id];();1=sum i;2_x first where i;'`parse]}xfd from r;

/ ignore any other extra fields for now
];

/ fdt: the csz bytes of file data
/ dtd: the 3 bytes after the file data when flg 3 is set, otherwise empty
/ NOTE(review): flg comes from 0b vs, which presumably lists the most
/ significant bit first, so flg 3 may not be the flag bit intended -- confirm.
/ NOTE(review): confirm the expected length of this trailing record (3 bytes
/ here, and 3 in the next-index calculation at the end).
r:update
fdt:x y+nln+xln+til csz,
dtd:x{(x*count y)#y}[flg 3]y+nln+xln+csz+til 3
from r;

/ TODO can this filter be applied any earlier?
r:$[
(e~(::))|(r`fnm)in e;
[
.finos.log.info"inflating ",string r`fnm;

$[
/ no compression: copy
0=r`cmp;update fdu:"c"$fdt from r;

/ deflate: reframe as gzip stream and inflate
/ (fixed 10-byte header, then the data, then crc and the low 4 bytes of usz)
8=r`cmp;update fdu:"c"$(.Q.gz 0x1f8b0800000000000003,fdt,crc,4#reverse 0x0 vs usz mod prd 32#2)from r;

'`nyi]];
update fdu:""from r];

(r;exec y+nln+xln+csz+3*flg 3 from r)}

// Extract one file from an archive using unzip(1).
// The archive path, member name and temporary file path are single-quoted
// for the shell, so names taken from an archive cannot inject shell syntax.
// The temporary file is removed whether or not extraction succeeds.
// NOTE(review): unzip(1) may still treat the member name as a wildcard
// pattern -- confirm for names containing *, ? or [.
// @param x hsym: path of the archive
// @param y sym: name of the file to extract
// @return character vector
// @throws whatever the shell command or the file read signals, after cleanup
.finos.unzip.priv.unzip_system:{
  / quote a string for sh: wrap in single quotes, escaping embedded ones
  quote:{"'",ssr[x;"'";"'\\''"],"'"};
  f:hsym`$first system"mktemp";
  / run under protected evaluation; on failure delete the temporary file
  / before re-signalling the error
  r:.[
    {[quote;a;m;t]
      system"(unzip -p ",quote[1_string a]," ",quote[string m]," >",quote[1_string t],")";
      "c"$read1 t};
    (quote;x;y;f);
    {[t;e]hdel t;'e}[f]];
  hdel f;
  r}

// Perform various zip-related operations.
// Possible values for x, and expected z arg in each case:
// `list: List files in an archive.
// z: ignored
// `unzip: Extract (specific file(s) from) an archive.
// z: sym, sym vector, or (::) to unzip all files
// See https://users.cs.jmu.edu/buchhofp/forensics/formats/pkzip.html,
// https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT, etc.
// @param x sym
// @param y hsym, character vector, or byte vector
// @param z see above
// @return for `list, a table keyed on name with size and timestamp columns;
// for `unzip, a dictionary of filenames and character vectors, or a single
// character vector when z is a sym atom
// @throws domain if x is not `list or `unzip, or if z is not a sym, sym
// vector, or (::) when unzipping
// @throws type if y is not an hsym, character vector, or byte vector
// @throws nyi for multi-disk archives
// @throws parse if verification against unzip(1) fails
.finos.unzip.priv.unzip:{
if[not x in`list`unzip;
'`domain;
];

/ accept file; save filename as n
/ (n is only assigned here, so it will be () if y is not an hsym)
if[-11h=type y;
n:y;
y:read1 y;
];

/ accept chars
if[10h=type y;
y:"x"$y;
];

/ accept bytes
if[4h<>type y;
'`type;
];

if[`unzip=x;
if[not(11h=abs type z)|z~(::);
'`domain;
];
/
if[not z~(::);
'`nyi;
];
\
];

.finos.log.info"processing ",$[-11h=type n;1_string n;"archive"];

/ look for end-of-central-directory signature
/ assume last match is valid
/ more sophisticated algos are possible, but they can be implemented as needed
cds:("c"$y)ss"c"$0x504b0506;
if[1>count cds;
'"no cds";
];
cds:last cds;

/ parse end-of-central-directory record
/ (passed every byte from the signature to the end of the input)
ecd:.finos.unzip.priv.pecd y(first cds)+til(count y)-first cds;

/ punt on multi-disk archives
if[0<>ecd`dnu;'`nyi];
if[0<>ecd`dcd;'`nyi];

/ TODO delete?
/ blank record table
/o:enlist(enlist`)!enlist(::);

r:$[
`list=x;
[
/ bytes of central directory record
cd:y(ecd`cof)+til ecd`csz;

/ parse central directory
cd:.finos.unzip.priv.parse[(.finos.unzip.priv.pcd;.finos.unzip.priv.wcd);cd;count cd];
1!select name:fnm,size:usz,timestamp:mdt+mtm from cd];
`unzip=x;
[
/ parse file data
/ (records are read from index 0 up to the central directory offset;
/ z is handed to the parser as the filter of files to inflate)
fd:.finos.unzip.priv.parse[(.finos.unzip.priv.pfd;.finos.unzip.priv.wfd;z);y;ecd`cof];

r:exec fnm!fdu from fd;

/ narrow the result to what was asked for: a sub-dictionary for a sym
/ vector, a single value for a sym atom, everything otherwise
r:$[
11h=type z;
z#r;
-11h=type z;
r z;
r];

/ optional cross-check against unzip(1); only possible when the archive
/ was given as a file
if[.finos.unzip.verify&-11h=type n;
.finos.log.info"verifying";
v:r~$[
-11h=type z;
.finos.unzip.priv.unzip_system[n]z;
{y!x y}[n .finos.unzip.priv.unzip_system/:]key r];
if[not v;
'`parse;
];
.finos.log.info"verified";
];
r];
/ not reached in practice: x was validated on entry
'`domain];

r}

// Set to true to verify extraction against unzip(1).
// Verification only runs when the archive is passed as an hsym, since it
// re-extracts each file from that path with .finos.unzip.priv.unzip_system
// and raises parse on any mismatch.
// N.b. will not work if .finos.unzip.unzip is called from a thread
.finos.unzip.verify:0b

// List the files in an archive.
// @param x hsym, character vector, or byte vector
// @return table keyed on name, with size and timestamp columns
.finos.unzip.list:{[x]
  .finos.unzip.priv.unzip[`list;x;::]}

// Extract every file from an archive.
// @param x hsym, character vector, or byte vector
// @return dictionary of filenames and character vectors
.finos.unzip.unzip:{[x]
  .finos.unzip.priv.unzip[`unzip;x;::]}

// Extract specific files from an archive.
// @param x hsym, character vector, or byte vector
// @param y sym or sym vector: name(s) of the file(s) to extract
// @return for a sym vector, a dictionary of filenames and character vectors;
// for a single sym, that file's character vector
.finos.unzip.unzip2:{[x;y]
  .finos.unzip.priv.unzip[`unzip;x;y]}
47 changes: 47 additions & 0 deletions q/util/util.q
@@ -0,0 +1,47 @@
// Log stubs: each writes its message to stdout behind a fixed severity
// prefix and returns nothing.
// @param msg character vector
.finos.log.critical:{[msg]-1"CRITICAL: ",msg;}
.finos.log.error:{[msg]-1"ERROR: ",msg;}
.finos.log.warning:{[msg]-1"WARNING: ",msg;}
.finos.log.info:{[msg]-1"INFO: ",msg;}
.finos.log.debug:{[msg]-1"DEBUG: ",msg;}

// Run garbage collection and log, at debug level, the figure that .Q.gc
// reports.
.finos.util.free:{[]
  freed:.Q.gc[];
  .finos.log.debug"freed ",(string freed)," bytes";}

// Build a date from year, month and day numbers.
// The three numbers are rendered as text, padded on the left with zeros to 4,
// 2 and 2 characters, joined with dots and parsed as a date. Applied
// item-wise, so atoms or equal-length vectors may be passed.
// @param x year
// @param y month
// @param z day
// @return date
.finos.util.ymd:{
  parts:string(x;y;z);
  padded:"0"^-4 -2 -2$parts;
  "D"$"."sv padded}'

// Convert seconds since 1970.01.01 to a (global) timestamp.
// @param x number or number vector
// @return timestamp or timestamp vector
.finos.util.timestamp_from_epoch:{
  / 1970.01.01 as a nanosecond count
  origin:"j"$1970.01.01D;
  nanos:1000000000*x;
  "p"$origin+nanos}

// Print progress, with peach and try-catch.
// The weight function is used to measure progress more accurately when
// different arguments will take significantly different amounts of time.
// When this is not the case, pass a constant function (e.g. {1}).
// E.g. to (re/de)compress files, set/unset .z.zd and pass x as hcount, y
// as {x set get x}, and z as the files.
// A debug line is logged before each item is processed, giving the current
// time, the position in the list, the cumulative weight, the time elapsed,
// and an estimated completion time.
// @param x monadic function: weight (e.g. hcount, {1}, etc.)
// @param y monadic function
// @param z list: args for y
// @return dict: z!@[(1b;)y@;;(0b;)]peach z
.finos.util.progress:{
/ worker for one item: s is the start time, f the function, a the args,
/ w the running totals of the weights, i the index of the item
f:{[s;f;a;w;i]
/ start time plus elapsed time scaled up by the fraction of work reached
eta:{x+(abs type e)$(e:y-x)%z};
/ render a dictionary as "key: value" pairs separated by spaces
dll:{" "sv(key x){": "sv(string x;$[10<>type y;string;]y)}'get x};
/ render "x/y (percent%)"
progper:{
paren:{"(",x,")"};
prog:{"/"sv(neg count string y)$string(x;y)};
per:{.Q.fmt[6;2;100*x],"%"};
" "sv(prog[x;y];paren per x%y)};
/ NOTE(review): p is assigned in the last list item but read in earlier
/ ones; this presumably relies on list items being evaluated right to
/ left -- confirm before reordering
.finos.log.debug dll`now`position`work`elapsed`eta!(
p;
progper[i+1;count a];
progper[w i;last w];
p-s;
eta[s;p:.z.P;(w i)%last w]
);
/ apply f under protected evaluation: (1b;result) or (0b;error)
@[(1b;)f@;a i;(0b;)]};
/ weights are computed up front and accumulated with sums
z!f[.z.P;y;z;w:sums x peach z]peach til count z}

0 comments on commit d4c9d92

Please sign in to comment.