Skip to content
Browse files

Initial commit

  • Loading branch information...
0 parents commit 046528fb3a57616845c851b44334e2da1df7128f Anthony D. Urso committed Nov 4, 2010
Showing with 21,106 additions and 0 deletions.
  1. +110 −0 build.xml
  2. +5 −0 conf/log4j.properties
  3. +1,255 −0 docs/sawzall-intrinsics.html
  4. +2,913 −0 docs/sawzall-language.html
  5. +1,964 −0 docs/sawzall-spec.html
  6. +845 −0 docs/sawzall-style-guide.html
  7. BIN lib/avro-1.3.2.jar
  8. BIN lib/commons-cli-1.2.jar
  9. BIN lib/commons-codec-1.4.jar
  10. BIN lib/commons-logging-1.1.1.jar
  11. BIN lib/hadoop-0.21.0-mrunit.jar
  12. BIN lib/hadoop-common-0.21.0.jar
  13. BIN lib/hadoop-mapred-0.21.0.jar
  14. BIN lib/javacc.jar
  15. BIN lib/javassist-3.8.0.GA.jar
  16. BIN lib/jtb132.jar
  17. BIN lib/junit-4.8.2.jar
  18. BIN lib/log4j-1.2.15.jar
  19. BIN lib/protobuf-2.0.3.jar
  20. BIN lib/scannotation-1.0.2.jar
  21. +162 −0 src/java/sizzle/aggregators/Aggregator.java
  22. +38 −0 src/java/sizzle/aggregators/AggregatorSpec.java
  23. +31 −0 src/java/sizzle/aggregators/CollectionAggregator.java
  24. +94 −0 src/java/sizzle/aggregators/DistinctAggregator.java
  25. +76 −0 src/java/sizzle/aggregators/FloatHistogramAggregator.java
  26. +59 −0 src/java/sizzle/aggregators/FloatMeanAggregator.java
  27. +68 −0 src/java/sizzle/aggregators/FloatQuantileAggregator.java
  28. +64 −0 src/java/sizzle/aggregators/FloatSumAggregator.java
  29. +101 −0 src/java/sizzle/aggregators/HistogramAggregator.java
  30. +79 −0 src/java/sizzle/aggregators/IntHistogramAggregator.java
  31. +59 −0 src/java/sizzle/aggregators/IntMeanAggregator.java
  32. +71 −0 src/java/sizzle/aggregators/IntQuantileAggregator.java
  33. +64 −0 src/java/sizzle/aggregators/IntSumAggregator.java
  34. +45 −0 src/java/sizzle/aggregators/LogAggregator.java
  35. +59 −0 src/java/sizzle/aggregators/MaximumAggregator.java
  36. +49 −0 src/java/sizzle/aggregators/MeanAggregator.java
  37. +183 −0 src/java/sizzle/aggregators/MinimaxAggregator.java
  38. +60 −0 src/java/sizzle/aggregators/MinimumAggregator.java
  39. +37 −0 src/java/sizzle/aggregators/MrcounterAggregator.java
  40. +30 −0 src/java/sizzle/aggregators/OutputAggregator.java
  41. +112 −0 src/java/sizzle/aggregators/QuantileAggregator.java
  42. +124 −0 src/java/sizzle/aggregators/SortedCountingSet.java
  43. +30 −0 src/java/sizzle/aggregators/StderrAggregator.java
  44. +30 −0 src/java/sizzle/aggregators/StdoutAggregator.java
  45. +49 −0 src/java/sizzle/aggregators/TextAggregator.java
  46. +262 −0 src/java/sizzle/aggregators/TopAggregator.java
  47. +69 −0 src/java/sizzle/aggregators/UniqueAggregator.java
  48. +865 −0 src/java/sizzle/compiler/CodeGeneratingVisitor.java
  49. +109 −0 src/java/sizzle/compiler/FunctionTrie.java
  50. +455 −0 src/java/sizzle/compiler/IndexeeFindingVisitor.java
  51. +326 −0 src/java/sizzle/compiler/NameFindingVisitor.java
  52. +224 −0 src/java/sizzle/compiler/SizzleCompiler.java
  53. +125 −0 src/java/sizzle/compiler/StaticDeclarationCodeGeneratingVisitor.java
  54. +562 −0 src/java/sizzle/compiler/SymbolTable.java
  55. +823 −0 src/java/sizzle/compiler/TypeCheckingVisitor.java
  56. +34 −0 src/java/sizzle/compiler/TypeException.java
  57. +41 −0 src/java/sizzle/functions/FunctionSpec.java
  58. +237 −0 src/java/sizzle/functions/SizzleCasts.java
  59. +206 −0 src/java/sizzle/functions/SizzleEncodingIntrinsics.java
  60. +138 −0 src/java/sizzle/functions/SizzleFileIntrinsics.java
  61. +160 −0 src/java/sizzle/functions/SizzleMathIntrinsics.java
  62. +215 −0 src/java/sizzle/functions/SizzleSortIntrinsics.java
  63. +251 −0 src/java/sizzle/functions/SizzleSpecialIntrinsics.java
  64. +381 −0 src/java/sizzle/functions/SizzleStringIntrinsics.java
  65. +894 −0 src/java/sizzle/functions/SizzleTimeIntrinsics.java
  66. +188 −0 src/java/sizzle/io/EmitKey.java
  67. +258 −0 src/java/sizzle/io/EmitValue.java
  68. +111 −0 src/java/sizzle/runtime/SizzleCombiner.java
  69. +48 −0 src/java/sizzle/runtime/SizzleMapper.java
  70. +104 −0 src/java/sizzle/runtime/SizzleReducer.java
  71. +63 −0 src/java/sizzle/runtime/SizzleRunner.java
  72. +22 −0 src/java/sizzle/types/SizzleAny.java
  73. +130 −0 src/java/sizzle/types/SizzleArray.java
  74. +21 −0 src/java/sizzle/types/SizzleBool.java
  75. +21 −0 src/java/sizzle/types/SizzleBytes.java
  76. +31 −0 src/java/sizzle/types/SizzleFingerprint.java
  77. +50 −0 src/java/sizzle/types/SizzleFloat.java
  78. +214 −0 src/java/sizzle/types/SizzleFunction.java
  79. +42 −0 src/java/sizzle/types/SizzleInt.java
  80. +116 −0 src/java/sizzle/types/SizzleMap.java
  81. +57 −0 src/java/sizzle/types/SizzleScalar.java
  82. +21 −0 src/java/sizzle/types/SizzleString.java
  83. +332 −0 src/java/sizzle/types/SizzleTable.java
  84. +34 −0 src/java/sizzle/types/SizzleTime.java
  85. +141 −0 src/java/sizzle/types/SizzleTuple.java
  86. +81 −0 src/java/sizzle/types/SizzleType.java
  87. +83 −0 src/java/sizzle/types/SizzleVarargs.java
  88. +8 −0 src/jtb/sizzle.jtb
  89. +5 −0 src/proto/p4stat.proto
  90. +6 −0 src/proto/querylog.proto
  91. +6 −0 src/proto/sizzle_document.proto
  92. +6 −0 src/proto/sizzle_location.proto
  93. +579 −0 src/test/javaxtools/compiler/CharSequenceCompiler.java
  94. +73 −0 src/test/javaxtools/compiler/CharSequenceCompilerException.java
  95. +66 −0 src/test/sizzle/aggregators/TestCollectionAggregator.java
  96. +126 −0 src/test/sizzle/aggregators/TestDistinctAggregator.java
  97. +82 −0 src/test/sizzle/aggregators/TestFloatHistogramAggregator.java
  98. +54 −0 src/test/sizzle/aggregators/TestFloatMeanAggregator.java
  99. +103 −0 src/test/sizzle/aggregators/TestFloatQuantileAggregator.java
  100. +54 −0 src/test/sizzle/aggregators/TestFloatSumAggregator.java
  101. +82 −0 src/test/sizzle/aggregators/TestIntHistogramAggregator.java
  102. +54 −0 src/test/sizzle/aggregators/TestIntMeanAggregator.java
  103. +103 −0 src/test/sizzle/aggregators/TestIntQuantileAggregator.java
  104. +54 −0 src/test/sizzle/aggregators/TestIntSumAggregator.java
  105. +152 −0 src/test/sizzle/aggregators/TestMaximumAggregator.java
  106. +152 −0 src/test/sizzle/aggregators/TestMinimumAggregator.java
  107. +75 −0 src/test/sizzle/aggregators/TestMrcounterAggregator.java
  108. +105 −0 src/test/sizzle/aggregators/TestSortedCountingSet.java
  109. +66 −0 src/test/sizzle/aggregators/TestTextAggregator.java
  110. +247 −0 src/test/sizzle/aggregators/TestTopAggregator.java
  111. +120 −0 src/test/sizzle/aggregators/TestUniqueAggregator.java
  112. +498 −0 src/test/sizzle/compiler/TestCodeGeneratingVisitor.java
  113. +93 −0 src/test/sizzle/compiler/TestFunctionTrie.java
  114. +43 −0 src/test/sizzle/compiler/TestIndexeeFindingVisitor.java
  115. +400 −0 src/test/sizzle/compiler/TestTypeCheckingVisitor.java
  116. +17 −0 src/test/sizzle/functions/LocationInfo.java
  117. +45 −0 src/test/sizzle/functions/SizzleTestFunctions.java
  118. +58 −0 src/test/sizzle/functions/TestSizzleSpecialIntrinsics.java
  119. +22 −0 src/test/sizzle/functions/TestSizzleStringIntrinsics.java
  120. +45 −0 src/test/sizzle/functions/TestSizzleTimeIntrinsics.java
  121. +32 −0 src/test/sizzle/utils/MakeNormalDoubles.java
  122. +32 −0 src/test/sizzle/utils/MakeRandomDoubles.java
  123. +32 −0 src/test/sizzle/utils/MakeRandomInts.java
Sorry, we could not display the entire diff because it was too big.
110 build.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" ?>
+<project name="sizzle" default="package">
+ <property name="src.proto" location="src/proto"/>
+
+ <path id="classpath">
+ <pathelement location="build/classes" />
+ <pathelement location="build/testclasses" />
+ <fileset dir="lib" includes="*.jar" />
+ </path>
+
+ <target name="init">
+ <mkdir dir="build/java/sizzle/parser" />
+ <mkdir dir="build/javacc" />
+ <mkdir dir="build/classes" />
+ <mkdir dir="build/testclasses" />
+ <mkdir dir="dist" />
+ </target>
+
+ <target name="translate-proto" depends="init">
+ <apply executable="protoc" parallel="true">
+ <arg value="--proto_path=${src.proto}" />
+ <arg value="--java_out=build/java" />
+ <srcfile />
+ <fileset dir="${src.proto}" includes="*.proto" />
+ </apply>
+ </target>
+
+ <target name="tree" depends="init">
+ <java jar="lib/jtb132.jar" fork="true" dir="build/java/sizzle/parser">
+ <arg value="-o" />
+ <arg value="../../../javacc/sizzle.jj" />
+ <arg value="-p" />
+ <arg value="sizzle.parser" />
+ <arg value="../../../../src/jtb/sizzle.jtb" />
+ </java>
+ </target>
+
+ <target name="parser" depends="tree">
+ <javacc target="build/javacc/sizzle.jj" outputdirectory="build/java/sizzle/parser" javacchome="lib" static="true" />
+ </target>
+
+ <target name="compile-generated" depends="translate-proto,parser">
+ <javac includeantruntime="true" srcdir="build/java" destdir="build/classes" debug="on" debuglevel="lines,vars,source">
+ <classpath refid="classpath" />
+ </javac>
+ </target>
+
+ <target name="compile" depends="compile-generated">
+ <javac includeantruntime="true" srcdir="src/java" destdir="build/classes" debug="on" debuglevel="lines,vars,source">
+ <classpath refid="classpath" />
+ </javac>
+ </target>
+
+ <target name="compile-tests" depends="compile">
+ <javac includeantruntime="true" srcdir="src/test" destdir="build/testclasses" debug="on" debuglevel="lines,vars,source">
+ <classpath refid="classpath" />
+ </javac>
+ </target>
+
+ <target name="test" depends="compile-tests">
+ <junit fork="yes" haltonfailure="yes">
+ <classpath refid="classpath" />
+ <batchtest>
+ <formatter type="plain" usefile="false" />
+ <fileset dir="build/testclasses" includes="**/Test*.class" />
+ </batchtest>
+ </junit>
+ <delete file="SecurityAuth.audit" />
+ </target>
+
+ <target name="doc" depends="init">
+ <javadoc destdir="build/classes" sourcepath="src/java" packagenames="sizzle.*">
+ <classpath refid="classpath" />
+ </javadoc>
+ </target>
+
+ <target name="package" depends="test,doc">
+ <jar destfile="dist/sizzle-runtime.jar">
+ <fileset dir="build/classes">
+ <patternset>
+ <include name="sizzle/" />
+ <exclude name="**/parser/" />
+ <exclude name="**/compiler/" />
+ <exclude name="**/types/" />
+ </patternset>
+ </fileset>
+ </jar>
+ <jar destfile="dist/sizzle-compiler.jar">
+ <manifest>
+ <attribute name="Main-Class" value="sizzle.compiler.SizzleCompiler" />
+ </manifest>
+ <fileset dir="build/classes">
+ <patternset>
+ <exclude name="**/io/" />
+ <exclude name="**/runtime/" />
+ </patternset>
+ </fileset>
+ <fileset dir="conf" />
+ <zipfileset excludes="META-INF/" src="lib/log4j-1.2.15.jar" />
+ <zipfileset excludes="META-INF/" src="lib/commons-cli-1.2.jar" />
+ <zipfileset excludes="META-INF/" src="lib/scannotation-1.0.2.jar" />
+ <zipfileset excludes="META-INF/" src="lib/javassist-3.8.0.GA.jar" />
+ </jar>
+ </target>
+
+ <target name="clean" depends="init">
+ <delete dir="build" />
+ <delete dir="dist" />
+ </target>
+</project>
5 conf/log4j.properties
@@ -0,0 +1,5 @@
+log4j.rootCategory=INFO, console
+
+log4j.appender.console = org.apache.log4j.ConsoleAppender
+log4j.appender.console.layout = org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern = %d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
1,255 docs/sawzall-intrinsics.html
@@ -0,0 +1,1255 @@
+<!doctype html public "-//w3c//dtd html 4.01 transitional//en">
+<html>
+<head>
+<title>A Manual for the Sawzall Intrinsics</title>
+<link rel="stylesheet" href="docstyle.css">
+</head>
+<body>
+
+<h1> <a name="title">A Manual for the Sawzall Intrinsics</a> </h1>
+
+<p>This document describes the properties of the most important
+pre-defined identifiers ("intrinsics") in the Sawzall language.
+
+<p>
+The starting point for this document was the result of running the Sawzall
+interpreter <tt>szl</tt> with the <tt>--explain</tt> option, like this:
+<pre>
+ for i in $(szl --explain=)
+ do
+ szl --explain=$i
+ done
+</pre>
+That script will always provide up to date, if rudimentary, information.
+<p>
+Throughout this document, the term <i>undef</i> refers to the explicitly
+undefined value created by various incorrect or incalculable operations,
+such as dividing by zero or the contents of a non-existent file.
+
+<br><br>
+<h2> <a name="types">Basic Data Types</a> </h2>
+These are the basic data types defined by Sawzall.
+Although `basic', some of them (<tt>string</tt>, <tt>bytes</tt>) have
+array-like structure.
+
+<br><br><hr><pre><font color=green>
+type int = int; # basic type
+</font></pre>
+The <tt>int</tt> type represents a signed 64-bit quantity.
+
+<br><br><hr><pre><font color=green>
+type uint = uint; # basic type
+</font></pre>
+The <tt>uint</tt> type represents an unsigned 64-bit quantity.
+
+<br><br><hr><pre><font color=green>
+type float = float; # basic type
+</font></pre>
+In the current implementation, the <tt>float</tt> type represents
+a 64-bit IEEE floating-point value.
+
+<br><br><hr><pre><font color=green>
+type string = string; # basic type
+</font></pre>
+In the current implementation, the <tt>string</tt> type represents a string
+of 16-bit Unicode characters. Individual characters may be accessed
+(read or written) using indexing, exactly as if the string were an array of
+unsigned 16-bit integers.
+
+<br><br><hr><pre><font color=green>
+type time = time; # basic type
+</font></pre>
+In the current implementation, the <tt>time</tt> type represents
+an unsigned 64-bit quantity
+recording microseconds since the Unix epoch of January 1 1970 00:00 GMT.
+
+<br><br><hr><pre><font color=green>
+type bytes = bytes; # basic type
+</font></pre>
+The <tt>bytes</tt> type represents a string of 8-bit
+unsigned bytes.
+Individual bytes may be accessed (read or written) using
+indexing, exactly as if the <tt>bytes</tt> object were an array of unsigned
+8-bit integers.
+
+<br><br><hr><pre><font color=green>
+type bool = bool; # basic type
+</font></pre>
+The <tt>bool</tt> type represents a Boolean value.
+
+<br><br><hr><pre><font color=green>
+type fingerprint = fingerprint; # basic type
+</font></pre>
+The <tt>fingerprint</tt> type represents
+an unsigned 64-bit quantity
+calculated using an implementation-dependent hash function.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="i/o">I/O Variables</a> </h2>
+
+<hr><pre><font color=green>
+input: bytes;
+</font></pre>
+The <tt>input</tt> variable is set to the raw byte-level representation
+of each successive input record.
+
+<br><br><hr><pre><font color=green>
+stdout: table collection of s: string file("/dev/stdout") format("%s\n", s);
+</font></pre>
+The <tt>stdout</tt> variable provides a convenient destination for
+formatted output. <tt>Emit</tt> statements that send text to
+<tt>stdout</tt> will print the text to standard output (file descriptor 1).
+A newline is added automatically to each string emitted.
+
+<br><br><hr><pre><font color=green>
+stderr: table collection of s: string file("/dev/stderr") format("%s\n", s);
+</font></pre>
+The <tt>stderr</tt> variable provides a convenient destination for
+formatted debugging output. <tt>Emit</tt> statements that send text to
+<tt>stderr</tt> will print the text to standard error (file descriptor 2).
+A newline is added automatically to each string emitted.
+
+<br><br><hr><pre><font color=green>
+output: table collection of bytes file("/dev/stdout");
+</font></pre>
+The <tt>output</tt> variable provides a convenient destination for
+<tt>emit</tt> statements that write to standard output.
+The data sent to <tt>output</tt> is unmodified and uninterpreted.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="constants">Named Constants</a> </h2>
+Many are self-explanatory or familiar.
+
+<br><br><hr><pre><font color=green>
+static true: bool = true; # literal
+</font></pre>
+
+<br><hr><pre><font color=green>
+static false: bool = false; # literal
+</font></pre>
+
+<br><hr><pre><font color=green>
+static PI: float = 3.1415926535897931; # literal
+</font></pre>
+A floating-point approximation of Pi.
+
+<br><br><hr><pre><font color=green>
+static Inf: float = Inf; # literal
+static inf: float = Inf; # literal
+</font></pre>
+IEEE 754 infinity.
+
+<br><br><hr><pre><font color=green>
+static NaN: float = NaN; # literal
+static nan: float = NaN; # literal
+</font></pre>
+IEEE 754 not a number.
+
+<br><br><hr><pre><font color=green>
+static SECOND: time = SECOND; # literal
+</font></pre>
+A million microseconds.
+
+<br><br><hr><pre><font color=green>
+static SEC: time = SEC; # literal
+</font></pre>
+Synonym for <tt>SECOND</tt>.
+
+<br><br><hr><pre><font color=green>
+static MINUTE: time = MINUTE; # literal
+</font></pre>
+Sixty <tt>SECONDS</tt>.
+
+<br><br><hr><pre><font color=green>
+static MIN: time = MIN; # literal
+</font></pre>
+Synonym for <tt>MINUTE</tt>.
+
+<br><br><hr><pre><font color=green>
+static HOUR: time = HOUR; # literal
+</font></pre>
+Sixty <tt>MINUTES</tt>.
+
+<br><br><hr><pre><font color=green>
+static HR: time = HR; # literal
+</font></pre>
+Synonym for <tt>HOUR</tt>.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="special">Special Functions</a> </h2>
+
+These functions have special properties, such as variable types,
+variable numbers of parameters,
+or parameters that are types rather than values. Some of the syntax
+used to describe them, e.g. "...", default arguments and overloading,
+is not part of the Sawzall language.
+
+<br><br><hr><pre><font color=green>
+abs: function(v: int);
+abs: function(v: float);
+</font></pre>
+Return the absolute value of the argument. The type must be one of int or float.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+assert: function(condition: bool [, message: string] );
+</font></pre>
+If <tt>condition</tt> is false, print the <tt>message</tt> to standard
+error, with the prefix <tt>assertion failed:</tt>, and exit.
+The message may be empty or absent altogether.
+
+<br><br><hr><pre><font color=green>
+def: function(v: <i>type</i>): bool;
+</font></pre>
+The <tt>def</tt> function returns a boolean value according to whether <tt>v</tt>
+has a defined value. It can serve as a guard to protect code against undefined values.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+fingerprintof: function(v: <i>type</i>): fingerprint;
+</font></pre>
+The <tt>fingerprintof</tt> function
+returns the 64-bit fingerprint of the argument, which may be of any type.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+inproto: function(f: <i>type</i>): bool;
+</font></pre>
+The <tt>inproto</tt> function tests whether the field <tt>f</tt> was present
+in the proto buffer converted into the proto tuple containing the field <tt>f</tt>. <tt>f</tt> must be of the form
+<i>proto_tuple_var</i><tt>.</tt><i>field_name</i>. Consequently,
+<tt>inproto</tt> must only be applied to fields of proto tuples. If the
+proto tuple field was set explicitly (e.g. via an assignment to that
+field) or by conversion from a proto buffer that contains an explicit value for
+that field, <tt>inproto</tt> returns <tt>true</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+inprotocount: function(f: <i>type</i>): int;
+</font></pre>
+
+Returns the number of fields in a proto tuple that have the inproto bit set.
+Fields in nested tuples are taken into account. In the case of an array of
+nested tuples, the fields in each tuple get counted.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+clearproto: function(f: <i>type</i>): bool;
+</font></pre>
+The clearproto function clears a field in the proto buffer converted into the
+proto tuple containing the field f. f must be of the form
+proto_tuple_var.field_name. Consequently, clearproto must only be applied to
+fields of proto tuples. clearproto will make a subsequent <tt>inproto()</tt> on
+the same field return false. However, the memory for this field will not be
+freed until the whole protocol buffer goes out of scope.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+new: function(array of <i>type</i>, n: int, v: <i>type</i>): array of <i>type</i>;
+</font></pre>
+In this form, the <tt>new</tt> function creates an array of <tt>n</tt> elements
+of the specified <i>type</i>, all initialized to the value <tt>v</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+new: function(string, n: int, v: int): string;
+</font></pre>
+In this form, the <tt>new</tt> function creates a string of <tt>n</tt>
+copies of the character specified by the value <tt>v</tt>, which must not be
+zero.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+new: function(bytes, n: int, v: int): bytes;
+</font></pre>
+In this form, the <tt>new</tt> function creates a bytes value consisting of
+<tt>n</tt> copies of the least significant byte of the value <tt>v</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+new: function(<i>maptype</i>): <i>maptype</i>;
+</font></pre>
+In this form, the <tt>new</tt> function creates an empty map of
+the specified <i>maptype</i>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+regex: function(<i>type</i> [, <i>base</i>] ): string;
+</font></pre>
+The <tt>regex</tt> function returns a string holding a regular expression
+suitable for matching text representing values of the specified <i>type</i>.
+For example, <tt>regex(int)</tt> generates a string to match integer constants
+as represented in Sawzall or C++ programs (<tt>-23</tt>, <tt>0x1f</tt>, etc.).
+The implementation is rudimentary; at the moment, only <tt>int</tt> and <tt>float</tt>
+types are supported.
+When the type is <tt>int</tt>, an optional numerical base may be specified
+for the conversion.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+saw: function(s: string, r: regex, ...): array of string;
+sawn: function(n: int, s: string, r: regex, ...): array of string;
+sawzall: function(s: string, r: regex, ...): array of string;
+</font></pre>
+These functions slice string <tt>s</tt> into pieces according to subsequent
+matches of the regular expressions.
+The <tt>saw</tt> function is the simplest; it runs along the list of regular
+expressions exactly once, while <tt>sawn</tt> does it at most <tt>n</tt>
+times and <tt>sawzall</tt> continues until the input string is exhausted.
+More detail should appear in this document, but for now please refer
+to the <a href="sawzall-language.html">Sawzall language design document</a> for more information.
+<br>These functions return <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+haskey: function(m: map[<i>keytype</i>] of <i>valuetype</i>, key: <i>keytype</i>): bool;
+</font></pre>
+Return a boolean reporting whether the key is present in the map.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+keys: function(m: map[<i>keytype</i>] of <i>valuetype</i>): array of <i>keytype</i>;
+</font></pre>
+Return an array holding, in no particular order, the set of keys present in the map <tt>m</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+len: function(v: <i>type</i>): int;
+</font></pre>
+Return the number of elements in <tt>v</tt>, which must be an array or map
+or of type <tt>string</tt> or <tt>bytes</tt>.
+<br>If <tt>string</tt>, the value is the number of Unicode characters in the string;
+<br>if <tt>bytes</tt>, the number of bytes.
+<br>If a map, the value is the number of distinct keys present.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+lookup: function(m: map[<i>keytype</i>] of <i>valuetype</i>, key: <i>keytype</i>, value: <i>valuetype</i>): <i>valuetype</i>;
+</font></pre>
+Return the element of the map indexed by the key or, if there is no such element, the specified default value.
+Assuming the map, key, and value are defined, equivalent to (using C <tt>?:</tt>
+ notation): <tt>def(m[key])? m[key] : value</tt>, but more efficient.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+max: function(v1: <i>type</i>, v2: <i>type</i>): <i>type</i>;
+</font></pre>
+Return the maximum of <tt>v1</tt> and <tt>v2</tt>.
+The <i>type</i> must be one of <tt>int</tt>, <tt>time</tt>, <tt>string</tt>, or <tt>float</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+min: function(v1: <i>type</i>, v2: <i>type</i>): <i>type</i>;
+</font></pre>
+Return the minimum of <tt>v1</tt> and <tt>v2</tt>.
+The <i>type</i> must be one of <tt>int</tt>, <tt>time</tt>, <tt>string</tt>, or <tt>float</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+sort(array of basic_type): array of basic_type;
+</font></pre>
+Return the sorted version of an array. Only scalar values can be sorted.
+Values will be arranged in increasing order. (An optional comparison
+function, which takes two elements and returns int {-,0,+}, is accepted
+as a second argument, but it is ignored.)
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+sortx(array of basic_type): array of int;
+</font></pre>
+Return the index vector that sorts an array. Only scalar values can be
+sorted. The index vector arranges array values in increasing order.
+(An optional comparison function, which takes two elements and returns
+int {-,0,+}, is accepted as a second argument, but it is ignored.)
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+DEBUG: function(): int;
+</font></pre>
+(Obsolete) This function is for use by the implementers only.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="convert">Conversions</a> </h2>
+
+<hr><pre><font color=green>
+convert: function(<i>type1</i>, v: <i>type2</i>, parameters: <i>type3</i>, ...): <i>type1</i>;
+</font></pre>
+The <tt>convert</tt> operator converts value <tt>v</tt>, of type <i>type2</i>, into
+the corresponding value of type <i>type1</i>.
+The parameters depend on the particular types being converted.
+For instance, <tt>convert(int, "23", 16);</tt> interprets the string <tt>"23"</tt>
+as a hexadecimal number.
+<br>Returns <i>undef</i> only if an error occurs.
+<p>
+The following basic conversions are supported; each row is a "from" type and each column is a "to" type. Parameter values are allowed only as
+noted, and are always optional.
+<p>
+<table border=1 cellpadding=4>
+ <th>
+ <td><tt><b>bool</b></tt></td>
+ <td><tt><b>int</b></tt></td>
+ <td><tt><b>float</b></tt></td>
+ <td><tt><b>time</b></tt></td>
+ <td><tt><b>fingerprint</b></tt></td>
+ <td><tt><b>string</b></tt></td>
+ <td><tt><b>bytes</b></tt></td>
+ <td><tt><b>uint</b></tt></td>
+ </th>
+ <tr>
+ <td><tt><b>bool</b></tt></td>
+ <td>identity</td>
+ <td>0 and 1</td>
+ <td>&nbsp;</td>
+ <td>&nbsp;</td>
+ <td>&nbsp;</td>
+ <td>"false" and "true"</td>
+ <td>&nbsp;</td>
+ <td>0 and 1</td>
+ </tr>
+ <tr>
+ <td><tt><b>int</b></tt></td>
+ <td>&nbsp;</td>
+ <td>identity</td>
+ <td>as in C++</td>
+ <td>reinterpret</td>
+ <td>reinterpret</td>
+ <td><i>param</i>: base</td>
+ <td><i>param</i>: encoding format</td>
+ <td>bitwise identity</td>
+ </tr>
+ <tr>
+ <td><tt><b>float</b></tt></td>
+ <td>&nbsp;</td>
+ <td>as in C++</td>
+ <td>identity</td>
+ <td>&nbsp;</td>
+ <td>&nbsp;</td>
+ <td>as in C++</td>
+ <td>&nbsp;</td>
+ <td>as in C++</td>
+ </tr>
+ <tr>
+ <td><tt><b>time</b></tt></td>
+ <td>&nbsp;</td>
+ <td>reinterpret</td>
+ <td>&nbsp;</td>
+ <td>identity</td>
+ <td>&nbsp;</td>
+ <td><i>param</i>: time zone</td>
+ <td>&nbsp;</td>
+ <td>reinterpret</td>
+ </tr>
+ <tr>
+ <td><tt><b>fingerprint</b></tt></td>
+ <td>&nbsp;</td>
+ <td>reinterpret</td>
+ <td>&nbsp;</td>
+ <td>&nbsp;</td>
+ <td>identity</td>
+ <td>as <tt>0x%016xp</tt></td>
+ <td>unpack</td>
+ <td>reinterpret</td>
+ </tr>
+ <tr>
+ <td><tt><b>string</b></tt></td>
+ <td>matches <tt>^[Tt]</tt></td>
+ <td><i>param</i>: base</td>
+ <td>as in C++</td>
+ <td><i>param</i>: time zone</td>
+ <td><i>param</i>: base</td>
+ <td>identity</td>
+ <td><i>param</i>: string encoding</td>
+ <td><i>param</i>: base</td>
+ </tr>
+ <tr>
+ <td><tt><b>bytes</b></tt></td>
+ <td>&nbsp;</td>
+ <td><i>param</i>: encoding format</td>
+ <td>&nbsp;</td>
+ <td>&nbsp;</td>
+ <td>pack</td>
+ <td><i>param</i>: string encoding</td>
+ <td>identity</td>
+ <td><i>param</i>: encoding format</td>
+ </tr>
+ <tr>
+ <td><tt><b>uint</b></tt></td>
+ <td>&nbsp;</td>
+ <td>bitwise identity</td>
+ <td>as in C++</td>
+ <td>reinterpret</td>
+ <td>reinterpret</td>
+ <td><i>param</i>: base</td>
+ <td><i>param</i>: encoding format</td>
+ <td>identity</td>
+ </tr>
+</table>
+<p>
+Notes:
+<ul>
+ <li>
+ Reinterpreted values are based on 64-bit signed or unsigned
+ underlying values.
+ </li>
+ <li>
+ Supported string encodings include "UTF-8" (the default), "latin-1",
+ and "hex" (each byte is represented as two characters in hexadecimal).
+ Conversion of bytes to string also accepts "array-literal", which yields
+ a string suitable for use as a bytes value in a Sawzall program. The
+ exact format of this string may depend on whether any of the bytes have
+ a value outside the range of printable ASCII characters.
+ </li>
+ <li>
+ Bases from 2 to 36 are supported (see <tt>strtol</tt>); the default output
+ base is 10; the default input base is 10 unless it is overridden by a
+ leading "<tt>0x</tt>" for hex or "<tt>0</tt>" for octal. An explicit input
+ base of 0 is the same as omitting the base.
+ </li>
+ <li>
+ Unpacked fingerprints are exactly eight bytes long; order is big-endian.
+ </li>
+ <li>
+ Time zones are RFC822 (e.g. <tt>"CDT"</tt>, <tt>"EST5EDT"</tt>, or
+ <tt>"GMT"</tt>) or Olson identifiers (e.g. <tt>"America/Los_Angeles"</tt>);
+ the default time zone is PST8PDT.
+ <a href="http://code.google.com/p/tzdata/">Olson identifiers are
+ preferred</a>.
+ </li>
+ <li>
+ Supported int encoding formats for conversion between bytes and int are
+ "fixed32-big", "fixed32-little", "fixed64-big", "fixed64-little" (32- and
+ 64-bit big- and little-endian packed bytes), "saw" (an alias of
+ "fixed64-big"), "varint" (64-bit
+ run-length-encoded format used in protocol buffers), and "zigzag" (a variant
+ of "varint" which uses the ZigZag encoding to encode negative numbers
+ efficiently). The encoding parameter is required.
+ </li>
+ <li>
+ When converting from string to int, if you fail to specify a base,
+ the base used for the conversion is determined by the appearance
+ of the input. C-like rules are followed (as for C's <tt>strtol</tt>
+ function), meaning that for example "08" fails to convert
+ because 8 is not a valid octal digit. Specify an explicit
+ base to avoid this problem. If you want to use base 0, document
+ your choice by making the base explicit.
+ </li>
+</ul>
+
+A limited number of conversions are supported on compound types:
+<ul>
+ <li>
+ array to array: an array may be converted to a different array type
+ if the conversion of the element types is one of these:
+ <ul>
+ <li> any of the above basic conversions. The additional parameters are
+ allowed or required exactly as in the corresponding non-array
+ conversions and have the same meaning.
+ <li> proto tuple to bytes, as described below.
+ <li> bytes to proto tuple, as described below.
+ <li> tuple to tuple, as described below.
+ </ul>
+ </li>
+ <br>
+ <li>
+ array to map: an array with an even number of elements may be converted
+ to a map if the array element type is either the same as or convertible
+ (using a basic conversion) to the key type and the array element type is
+ either the same as or convertible (using a basic conversion) to the value
+ type; except that conversions from bytes to int or uint, and from int or
+ uint to bytes, are not allowed because there is no default value for the
+ encoding parameter.
+ <p>
+ It is not possible to supply additional parameters for the base, encoding
+ or time zone. The conversions from the type of an array element to the
+ key type and from the type of an array element to the value type always
+ use the corresponding default value for any optional parameter. See the
+ notes above regarding the default values.
+ </li>
+ <br>
+ <li>
+ array, map or tuple to string: returns a text representation of the value
+ (except when converting array of int to string with "unicode"; see below).
+ </li>
+ <li>
+ function to string: returns the function name, a text representation of the
+ function or a string indicating that the value is anonymous.
+ </li>
+ <br>
+ <li>
+ proto tuple to bytes: returns the encoded protocol buffer byte
+ stream.
+ </li>
+ <li>
+ bytes to proto tuple: treats the bytes value as an encoded protocol buffer
+ byte stream and returns the decoded value.
+ </li>
+ <br>
+ <li>
+ tuple to tuple: returns the corresponding value; a value of tuple type may
+ be converted to a different but structurally equivalent tuple type.
+ </li>
+ <br>
+ <li>
+ string to array of int: each array element gets the code point value
+ of a single character of the string; the parameter must be present and
+ must have the value <tt>"unicode"</tt>.
+ </li>
+ <li>
+ array of int to string, when the parameter is present and has the value
+ <tt>"unicode"</tt>: each array element supplies the code point value
+ of a single character of the string.
+ </li>
+ <br>
+ <li>
+ array to tuple: an array may be converted to a tuple if there is one array
+ element for each tuple field and each array element can be converted
+ to the corresponding field. Conversions from bytes to int or uint, and
+ from int or uint to bytes, are not allowed because there is no default
+ value for the encoding parameter. All other supported conversions are
+ allowed including conversions on compound types.
+ <p>
+ It is not possible to supply additional parameters for the base, encoding
+ or time zone. The conversion from the type of an array element to the
+ type of a field always uses the corresponding default value for any
+ optional parameter. See the notes above regarding the default values.
+ </li>
+</ul>
+<br><hr>
+
+<br>
+<h2 id="time">Time</h2>
+These functions manipulate time values.
+Although one may do simple arithmetic to add a minute, say,
+because of daylight saving time, leap years, and other inconveniences many
+such operations require more sophistication, which these functions
+provide.
+<p>
+Most time functions accept an optional argument indicating the time zone; the
+default time zone is PST8PDT.
+
+<br><br><hr><pre><font color=green>
+addday: function(t: time, n: int [, tz: string] ): time;
+</font></pre>
+Return the time <tt>n</tt> days after <tt>t</tt>.
+The value of <tt>n</tt> may be negative, or <tt>n</tt>
+may be absent altogether (<tt>addday(t)</tt>), in which
+case <tt>n</tt> defaults to 1.
+An optional third argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+addmonth: function(t: time, n: int [, tz: string] ): time;
+</font></pre>
+Like <tt>addday</tt>, but for months.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+addweek: function(t: time, n: int [, tz: string] ): time;
+</font></pre>
+Like <tt>addday</tt>, but for weeks.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+addyear: function(t: time, n: int [, tz: string] ): time;
+</font></pre>
+Like <tt>addday</tt>, but for years.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+dayofmonth: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric day of the month; for January 17, return 17, etc.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+dayofweek: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric day of the week, from Monday=1 to Sunday=7.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+dayofyear: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric day of the year. January 1 is day 1.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+hourof: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric hour of the day, from 0 to 23. Midnight is 0, 1AM is 1, etc.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+minuteof: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric minute of the hour, from 0 to 59.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+monthof: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric month of the year. January is 1.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+secondof: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric second of the minute, from 0 to 59.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+yearof: function(t: time [, tz: string] ): int;
+</font></pre>
+The numeric year value, such as 2003.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+trunctoday: function(t: time [, tz: string] ): time;
+</font></pre>
+Truncate <tt>t</tt> to the zeroth microsecond of the day.
+Useful when creating variables indexed to a particular day, since
+all times in the day truncated with <tt>trunctoday</tt> will fold to
+the same value, which is the first time value in that day.
+An optional second argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+trunctohour: function(t: time [, tz: string] ): time;
+</font></pre>
+Like <tt>trunctoday</tt>, but truncate to the start of the hour.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+trunctominute: function(t: time [, tz: string] ): time;
+</font></pre>
+Like <tt>trunctoday</tt>, but truncate to the start of the minute.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+trunctomonth: function(t: time [, tz: string] ): time;
+</font></pre>
+Like <tt>trunctoday</tt>, but truncate to the start of the month.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+trunctosecond: function(t: time [, tz: string] ): time;
+</font></pre>
+Like <tt>trunctoday</tt>, but truncate to the start of the second.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+trunctoyear: function(t: time [, tz: string] ): time;
+</font></pre>
+Like <tt>trunctoday</tt>, but truncate to the start of the year.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+now: function(): time;
+</font></pre>
+Return the current time at the moment of execution. Note that the <tt>time</tt>
+value returned does not depend on a time zone.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+formattime: function(formatstring: string, t: time [, tz: string]): string;
+</font></pre>
+Return a string containing the time argument formatted according to the format
+string <tt>formatstring</tt>. The syntax of the format string is the same as in ANSI C strftime.
+An optional third argument, a string, names a time zone.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="files">Files and other OS resources</a> </h2>
+
+<hr><pre><font color=green>
+load: function(file: string): bytes;
+</font></pre>
+Return the entire contents of the named <tt>file</tt> as an uninterpreted byte stream.
+<br>Returns <i>undef</i> only if the file cannot be opened or read.
+
+
+<br><br><hr><pre><font color=green>
+getenv: function(variable: string): string;
+</font></pre>
+Return the contents of the named environment
+<tt>variable</tt> as a string. The raw data is interpreted as UTF-8 in the
+same manner as the default conversion from bytes to string.
+<br>Returns <i>undef</i> only if the variable does not exist.
+
+<br><br><hr><pre><font color=green>
+getadditionalinput: function(variable: string): bytes;
+</font></pre>
+A map of strings to bytes may be provided to Proc by the process running
+szl. Return the bytes mapped to by the argument.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+setadditionalinput: function(label: string, value: bytes);
+</font></pre>
+Stores a (label, value) pair.
+<br>Never returns <i>undef</i>.
+
+
+<br><br><hr><pre><font color=green>
+lockadditionalinput: function();
+</font></pre>
+Prevents further calls to setadditionalinput for this record.
+<br>Never returns <i>undef</i>.
+
+
+<br><br><hr><pre><font color=green>
+type resourcestats = {
+ initializedavailablemem: int,
+ initializedallocatedmem: int,
+ initializedusertime: time,
+ initializedsystemtime: time,
+ availablemem: int,
+ allocatedmem: int,
+ usertime: time,
+ systemtime: time
+};
+</font></pre>
+
+<pre><font color=green>
+getresourcestats: function(): resourcestats;
+</font></pre>
+
+Return a tuple of type resourcestats containing resource usage statistics. The
+first set of numbers reports the statistics after static initialization. The
+second set reports the values consumed by processing the current input record.
+The availablemem figure reports total size of the heap; allocatedmem is the
+amount in use on the heap. Memory is measured in bytes, and time is measured
+in microseconds. Availability and accuracy of these values are implementation
+dependent.
+<br>Never returns <i>undef</i>.
+
+
+<hr>
+<br>
+<i>The database intrinsics are not implemented. They are provided
+as a recommendation for any future implementation of database access.</i>
+<pre><font color=green>
+type SQL_DB = int; # basic type
+
+dbconnect: function(dbspec: string, defaultspec: string): int;
+</font></pre>
+Connects to a database with the dbspecs and returns a db object. It is
+recommended to declare the db object as static so only one connection is made per
+worker.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<pre><font color=green>
+dbquery: function(db: int, query: string): array of array of string;
+</font></pre>
+Executes a sql query on the given database object. Returns an array of array
+of string, each array of string representing one row of results. For most
+queries such as SELECT statements, the results can be declared as static to
+avoid excessive queries on the database.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="strings">String Manipulation</a> </h2>
+
+<hr><pre><font color=green>
+lowercase: function(s: string): string;
+</font></pre>
+Return the string <tt>s</tt> with all characters converted to lower case,
+as defined by Unicode.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+uppercase: function(s: string): string;
+</font></pre>
+Return the string <tt>s</tt> with all characters converted to upper case,
+as defined by Unicode.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+strfind: function(p: string, s: string): int;
+</font></pre>
+Search for the first occurrence of the literal string <tt>p</tt>
+within <tt>s</tt> and return the integer index of its first character,
+or <tt>-1</tt> if it does not occur.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+strrfind: function(p: string, s: string): int;
+</font></pre>
+Search for the last occurrence of the literal string <tt>p</tt>
+within <tt>s</tt> and return the integer index of its first character,
+or <tt>-1</tt> if it does not occur.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+bytesfind: function(p: bytes, b: bytes): int;
+</font></pre>
+Search for the first occurrence of the literal bytes <tt>p</tt>
+within <tt>b</tt> and return the integer index of its first byte,
+or <tt>-1</tt> if it does not occur.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+bytesrfind: function(p: bytes, b: bytes): int;
+</font></pre>
+Search for the last occurrence of the literal bytes <tt>p</tt>
+within <tt>b</tt> and return the integer index of its first byte,
+or <tt>-1</tt> if it does not occur.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+strreplace: function(str: string, lit: string, rep: string, replace_all: bool): string; </font></pre>
+Return a copy of string <tt>str</tt>, with non-overlapping instances of
+<tt>lit</tt> replaced by <tt>rep</tt>. If <tt>replace_all</tt> is
+false, only the first found instance is replaced.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+match: function(r: string, s: string): bool;
+</font></pre>
+Search for a match of the regular expression <tt>r</tt> within <tt>s</tt>,
+and return a boolean value indicating whether a match was found.
+(The regular expression syntax is that of
+<a href="http://www.pcre.org/">PCRE</a>.)
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+matchposns: function(r: string, s: string): array of int;
+</font></pre>
+Search for a match of the regular expression <tt>r</tt> within <tt>s</tt>,
+and return an array consisting of character positions within <tt>s</tt>
+defined by the match. Positions 0 and 1 of the array report the location
+of the match of the entire expression, subsequent pairs report the location
+of matches of successive parenthesized subexpressions.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+matchstrs: function(r: string, s: string): array of string;
+</font></pre>
+Search for a match of the regular expression <tt>r</tt> within <tt>s</tt>,
+and return an array of strings consisting of matched substrings of <tt>s</tt>.
+The 0th string is the entire match; following elements of the array hold
+matches of successive parenthesized subexpressions.
+This function is equivalent to using <tt>matchposns</tt> to find successive
+locations of matches and creating array slices of <tt>s</tt> with the indices returned.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+splitcsvline: function(line: bytes): array of bytes;
+</font></pre>
+The function splitcsvline takes a line of UTF-8 bytes and splits it
+at the commas. It returns the array of fields produced.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+splitcsv: function(lines: bytes, fields: array of int): array of bytes;
+</font></pre>
+The function splitcsv takes an array of UTF-8 bytes containing lines of text,
+such as that produced by the load() builtin. It splits each line as if by
+<tt>splitcsvline</tt>, and then selects the fields
+indicated by the second argument (numbered starting at 1).
+The return value is a flat array of the collected fields.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+format: function(fmt: string, ...): string;
+</font></pre>
+Return a string containing the arguments formatted according to the format
+string fmt. The syntax of the format string is essentially that of ANSI C with
+the following differences:
+<ul>
+ <li><tt>%b</tt> prints a boolean, "true" or "false".</li>
+ <li><tt>%c</tt> prints a (u)int as a Unicode character in UTF-8.</li>
+ <li><tt>%k</tt> like <tt>%c</tt> with single quotes and backslash escapes for special characters.</li>
+ <li><tt>%s</tt> prints a Sawzall string as UTF-8.</li>
+ <li><tt>%q</tt> like <tt>%s</tt> with double quotes and backslash escapes for special characters.</li>
+ <li><tt>%p</tt> prints a fingerprint, in the format <tt>0x%.16x</tt>.</li>
+ <li><tt>%t</tt> prints a time, in the format of the Unix function ctime without a newline.</li>
+ <li><tt>%T</tt> prints a Sawzall type of the argument; <tt>%#T</tt> expands user-defined types.</li>
+ <li><tt>%d</tt> / <tt>%i</tt> / <tt>%o</tt> / <tt>%u</tt> / <tt>%x</tt> /
+ <tt>%X</tt> apply to a Sawzall (u)int and have no '<tt>l</tt>' or '<tt>h</tt>' modifiers.</li>
+ <li><tt>%e</tt> / <tt>%f</tt> / <tt>%g</tt> / <tt>%E</tt> / <tt>%G</tt> apply
+ to a Sawzall float and have no '<tt>l</tt>' or '<tt>h</tt>' modifiers.</li>
+ <li>format verbs '<tt>n</tt>' and '<tt>*</tt>' are not supported.</li>
+</ul>
+Never returns <i>undef</i>.
+
+<br><br><hr>
+
+<br>
+<h2> <a name="arithmetic">Arithmetic</a> </h2>
+
+<hr><pre><font color=green>
+highbit: function(n: int): int;
+</font></pre>
+Return an integer representing the bit position of the
+highest one bit in <tt>n</tt>.
+If <tt>n</tt> is zero, the result is 0; if <tt>n</tt> is 1, the result is 1;
+if <tt>n</tt> is 15, the result is 4, etc.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+rand: function(): float;
+</font></pre>
+Return a random floating point number in the range 0.0&lt;=<i>x</i>&lt;1.0.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+nrand: function(n: int): int;
+</font></pre>
+Return a random integer in the range 0&lt;=<i>x</i>&lt;<tt>n</tt>.
+<br>Returns <i>undef</i> only if <tt>n</tt> is negative or zero.
+
+<hr>
+<br>
+The intrinsics <tt>__undefine</tt>, <tt>__raise_segv</tt>,
+<tt>__addressof</tt> and <tt>__heapcheck</tt> are for testing purposes only.
+
+<br><br><hr>
+
+<br>
+<h2><a name="floatingpoint">Floating Point Math Functions</a> </h2>
+
+These functions are wrappers for the corresponding
+ones in <tt>math.h</tt>. Instead of checking domain
+and range we just return whatever the underlying library
+does. The value <tt>errno</tt> is not available nor checked.
+In general you'll have to check the results using
+<tt>isfinite()</tt>.
+
+<br><br><hr><pre><font color=green>
+ln: function(x: float): float;
+</font></pre>
+The natural logarithm of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+log10: function(x: float): float;
+</font></pre>
+The logarithm base 10 of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+exp: function(x: float): float;
+</font></pre>
+The exponential, base <i>e</i>, of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+sqrt: function(x: float): float;
+</font></pre>
+The square root of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+pow: function(x: float, y: float): float;
+</font></pre>
+The exponential, base <tt>x</tt>, of <tt>y</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+sin: function(x: float): float;
+</font></pre>
+The sine of <tt>x</tt>, where <tt>x</tt> is in radians.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+cos: function(x: float): float;
+</font></pre>
+The cosine of <tt>x</tt>, where <tt>x</tt> is in radians.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+tan: function(x: float): float;
+</font></pre>
+The tangent of <tt>x</tt>, where <tt>x</tt> is in radians.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+asin: function(x: float): float;
+</font></pre>
+The arc sine of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+acos: function(x: float): float;
+</font></pre>
+The arc cosine of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+atan: function(x: float): float;
+</font></pre>
+The arc tangent of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+atan2: function(x: float, y: float): float;
+</font></pre>
+The arc tangent of <tt>y</tt>/<tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+cosh: function(x: float): float;
+</font></pre>
+The hyperbolic cosine of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+sinh: function(x: float): float;
+</font></pre>
+The hyperbolic sine of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+tanh: function(x: float): float;
+</font></pre>
+The hyperbolic tangent of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+acosh: function(x: float): float;
+</font></pre>
+The hyperbolic arc cosine of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+asinh: function(x: float): float;
+</font></pre>
+The hyperbolic arc sine of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+atanh: function(x: float): float;
+</font></pre>
+The hyperbolic arc tangent of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+fabs: function(x: float): float;
+</font></pre>
+The absolute value of <tt>x</tt>.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+ceil: function(x: float): float;
+</font></pre>
+Round <tt>x</tt> up to the nearest integer.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+floor: function(x: float): float;
+</font></pre>
+Round <tt>x</tt> down to the nearest integer.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+round: function(x: float): float;
+</font></pre>
+Round <tt>x</tt> to the nearest integer, but round halfway cases away from zero.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+trunc: function(x: float): float;
+</font></pre>
+Round <tt>x</tt> to the nearest integer not larger in absolute value.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+isnan: function(x: float): bool;
+</font></pre>
+Tests if a float value is an IEEE NaN.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+isinf: function(x: float): bool;
+</font></pre>
+Tests if a float value is an IEEE Inf.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+isfinite: function(x: float): bool;
+</font></pre>
+Tests if a float value is not +-Inf or NaN.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+isnormal: function(x: float): bool;
+</font></pre>
+Tests if a float value is neither zero, subnormal, Inf, nor NaN.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr>
+
+<br>
+<h2 id="encryptencode">Encryption and encoding functions</h2>
+
+<hr><pre><font color=green>
+tobase64: function(input: bytes, websafe: bool): bytes;
+</font></pre>
+The function tobase64 takes an input bytes array and returns a bytes array
+containing its base64 encoding. The boolean flag, if set, invokes the
+web-safe encoding that maps + to - and / to _ and does not pad the output
+with =.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+frombase64: function(input: bytes, websafe: bool): bytes;
+</font></pre>
+The function frombase64 takes an input bytes array and returns a bytes array
+containing its base64 decoding. The boolean flag, if set, invokes the
+web-safe decoding that maps - back to + and _ back to /.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+gunzip: function(compressed_data: bytes): bytes;
+</font></pre>
+Decompress gzip compressed data. The data must contain a valid gzip header and
+footer (as in a .gz file), but data after the footer is ignored.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+gzip: function(uncompressed_data: bytes): bytes;
+</font></pre>
+Compress data using gzip.
+<br>Never returns <i>undef</i>.
+
+<br><br><hr><pre><font color=green>
+zlibuncompress: function(compressed_data: bytes, skip_header: bool): bytes;
+</font></pre>
+Uncompresses the zipped data using zlib, and returns the uncompressed data.
+Extra data past the valid zlib data results in an error. Optional parameter gives
+intermediate buffer size for decompression in bytes (default 8192).
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr><pre><font color=green>
+zlibcompress: function(uncompressed_data: bytes, skip_header: bool): bytes;
+</font></pre>
+Compresses the data using zlib, and returns the compressed data.
+<br>Returns <i>undef</i> only if an error occurs.
+
+<br><br><hr>
+
+</body>
+</html>
2,913 docs/sawzall-language.html
2,913 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
1,964 docs/sawzall-spec.html
@@ -0,0 +1,1964 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 TRANSITIONAL//EN">
+<html>
+<head>
+<title>The Sawzall Language Specification</title>
+ <link rel="stylesheet" href="guidestyle.css">
+ <style>
+ div.nav-2-columns {margin-top:20px}
+ div.nav-2-columns td {width:50%}
+ div.nav-2-columns table {width:100%}
+ </style>
+</head>
+<body>
+
+
+<h1>
+ <a name="title">The Sawzall Language Specification</a>
+</h1>
+<p>
+
+<!----------------------------------------------------------------------------->
+<h2>
+ <a name="objective">Objective</a>
+</h2>
+This document is a semi-formal specification of the Sawzall language.
+It contains a formal definition of the language syntax with concise
+explanations of type and semantic rules. It accompanies the
+<a href="sawzall-language.html">
+Sawzall Language</a> document, which is a more informal but broader
+description of the language and the supporting environment.
+
+
+<!----------------------------------------------------------------------------->
+
+<!-- The "nav" div contains the top-level TOC. JS fills it in. -->
+<div id="nav" class="nav-2-columns"></div>
+
+
+
+<h2>
+ <a name="organization">Organization and conventions</a>
+</h2>
+To make it easier to learn the language, Sawzall's syntax
+was designed to match C syntax closely where sensible; this is the case
+in particular for statements and expressions. Also, identifiers, literals
+(character, integer, floating point, string and other constants) largely follow
+C notation. It is the hope that many of the finer points can be ignored
+in the beginning since most programmers will be able to extrapolate from
+previous experience. Thus, to facilitate a quick start, in the following the
+language is presented in a top-down fashion rather than the more
+conventional bottom-up approach used for language specifications.
+<p>
+To describe the language syntax in detail, Extended Backus-Naur Form
+(EBNF) is used: Alternatives are separated by vertical bars |. Parentheses
+( and ) are used for grouping. Expressions enclosed in square brackets [ and ]
+are optional. Curly braces { and } denote repetition (0 or more times) of the
+enclosed expression. Names of EBNF productions referring to terminal symbols
+start with a lower-case letter; all other production names start with a capital
+letter. Literal characters and strings are enclosed in single quotes ' and '.
+A precise definition of EBNF can be found in the <a href="#appendix">
+Appendix</a>.
+<p>
+Occasionally, the language syntax is presented in a slightly
+simplified form to facilitate reading. For instance, this is the case for the
+definition of string literals where a complete formal definition would
+become very cumbersome to understand, without adding value to the
+reader. Annotations in <font size=-1>small font</font> may be used
+to explain some of the fine points in those cases.
+
+
+<!----------------------------------------------------------------------------->
+<h2>
+ <a name="model">Programming model</a>
+</h2>
+Sawzall was designed for parallel log processing. To make parallelization
+possible, some restrictions have been built into the language by design.
+Instead of specifying how to process an entire set of log entries, a
+Sawzall program specifies how to process an individual log entry
+<em>independent</em> of any other log entry. Thus several log
+entries may be processed in parallel by different executions of the
+same Sawzall program, possibly on different machines.
+<p>
+Since processing
+is independent, data about all the processed log entries is aggregated
+<em>externally</em> to a Sawzall program proper. To that end, Sawzall
+supports the declaration of <a href="#output_types">output types</a>
+and corresponding output variables. Output variables are the
+connection to specific aggregators (such as accumulators, collectors,
+filters, etc.) of data. A special construct, the <a href="#emit">emit</a>
+statement, is used to send data to such an aggregator. The aggregator
+receives the data emitted to it, and aggregates it in a manner specific
+to its type. For example, a <em>sum</em> aggregator, referred to via
+an output variable of <em>sum</em> output type, will add up all the
+values it receives. A <em>collection</em> aggregator will simply collect
+all the data, etc. The Sawzall language doesn't specify a particular
+implementation for aggregators, but the Szl implementation does provide
+an implementation.
+<p>
+Processing of a large set of log entries may take a considerable
+amount of time even if they are processed in parallel. Furthermore,
+it is not unlikely to occasionally encounter corrupted log entries. It is
+important to be able to handle these cases gracefully, without abortion
+of the process. To that end, Sawzall supports the notion of <em>defined</em>
+and <em>undefined</em> values. Expressions and statements may
+be silently aborted if undefined values are encountered. It is the hope
+that most programs will be able to continue execution even in the presence
+of occasional errors caused by corrupted input data.
+
+
+<!----------------------------------------------------------------------------->
+<h2>
+ <a name="structure">Overall program structure</a>
+</h2>
+A Sawzall program is simply a sequence of
+<a href="#declarations">declarations</a>
+and <a href="#statements">statements</a>.
+<pre><font color=orangered>
+Program = { Declaration | Statement }.
+</font></pre>
+
+<h4>Example</h4>
+<pre>
+# Program to find the three most commonly used words in a list.
+
+# Input is text lines containing a word and a count, separated with a comma.
+
+ topwords: table top(3) of word: string weight count: int;
+ fields: array of bytes = splitcsvline(input);
+ w: string = string(fields[0]);
+ c: int = int(string(fields[1]), 10);
+ if (c != 0) {
+ emit topwords <- w weight c;
+ }
+</pre>
+
+
+<!----------------------------------------------------------------------------->
+<h2>
+ <a name="declarations">Declarations and scope rules</a>
+</h2>
+A declaration introduces a name (syntactically an
+<a href="#identifiers">identifier</a>) and
+associates it with a language entity. In Sawzall there are
+<a href="#type_declarations">type</a> and (static and non-static)
+<a href="#variable_declarations">variable</a> declarations.
+<pre><font color=orangered>
+Declaration = TypeDecl | StaticVarDecl | VarDecl.
+</font></pre>
+
+Once a name has been introduced with a declaration, it can be used within
+the <em>scope</em> of the declaration to <em>refer</em> to the associated entity.
+The scope of a declaration extends from the point of
+the declaration of the name to the end of the immediate surrounding
+<a href="#block">block</a>. Blocks - and therefore scopes - may be
+nested. A name declared in a nested scope <em>shadows</em> equal
+names declared in outer scopes; thus a name always refers to its innermost
+declaration. A Sawzall program implicitly defines a block enclosing all
+declarations and statements of the program.
+
+
+<h3><a name="type_declarations">Type declarations</a></h3>
+A type declaration associates a <em>type name</em> with a
+<a href="#types">type specification</a>. Within the scope of the
+declaration, the type name can then be used interchangeably with the
+type specification.
+<pre><font color=orangered>
+TypeDecl = 'type' type_name '=' Type ';'.
+type_name = identifier.
+</font></pre>
+
+<h4>Examples</h4>
+<pre>
+type my_bool = bool;
+type Coordinates = { x: float, y: float };
+type CityMap = map [city_name: string] of Coordinates;
+</pre>
+
+
+<h3><a name="variable_declarations">Variable declarations</a></h3>
+A variable declaration introduces a variable. The declaration associates
+the variable with a <em>variable name</em> and a <em>variable type</em>.
+A variable holds a (possibly <a href="#undefined_values">undefined</a>)
+value of the associated variable type.
+<p>
+The variable type may be specified explicitly, followed by an optional
+<em>initialization expression</em>. Alternatively, the explicit type
+specification may be elided in favor of the initialization expression,
+in which case the type of the variable is the type of the initialization
+expression. Thus, either the variable type or the initialization expression
+(but not both) may be missing.
+A variable may
+be declared <em>static</em> by prefixing the declaration with the
+keyword <code>static</code>.
+<pre><font color=orangered>
+StaticVarDecl = 'static' VarDecl.
+VarDecl = var_name ':' [Type] ['=' Expression | Block] ';'.
+var_name = identifier.
+</font></pre>
+The initialization expression for <em>static</em> variables <em>may</em>
+be evaluated only once per program run (implementation specific).
+In particular, if the value of an initialization expression for a static variable
+changes during program execution, the program is <em>erroneous</em>.
+Variables of an <a href="#output_types">output type</a> are always static,
+and the <code>static</code> keyword may be omitted in that case.
+<p>
+Initialization expressions for static variables must not <a href="#declarations">refer</a>
+to non-static variables; if variable names appear in the expression, the
+associated variables must be static. In particular, if a (non-intrinsic) function
+is called in an initialization expression, that function must be stored in a
+static variable.
+
+<p>
+A variable declaration with an initialization expression of the form:
+<pre>
+variable_name: variable_type = expression;
+</pre>
+is a shortcut for the variable declaration followed by an
+<a href="#assignment">assignment</a>, possibly with a
+<a href="#conversions">conversion</a> <code>(variable_type)</code>.
+The conversion is omitted if <code>expression</code>
+already is of type <code>variable_type</code>:
+<pre>
+variable_name: variable_type;
+variable_name = variable_type(expression);
+</pre>
+If the type of <code>expression</code> is exactly <code>variable_type</code>
+the declaration may be further shortened to:
+<pre>
+variable_name := expression;
+</pre>
+Notice that even though there is no explicit type specified here, it nonetheless
+declares a new variable (the <code>:</code> indicates the variable declaration).
+Obviously, in this case no conversion is implied.
+<p>
+If the initialization expression is a <a href="#functions">function</a>, the variable type must be
+a <a href="#function_types">function type</a>.
+Therefore, a function declaration looks like this:
+<pre>
+function_name: function_type = function_type {
+ # function body
+};
+</pre>
+Since there is no type conversion possible for functions, the function types in the
+declaration must be identical and this syntax can always be
+shortened to:
+<pre>
+function_name := function_type {
+ # function body
+};
+</pre>
+<font size=-1>A slightly different notation was previously supported as a special
+case for functions:
+<pre>
+function_name: function_type {
+ # function body
+};
+</pre>
+but that form has been obsoleted by the appearance of the <tt>:=</tt> style.
+</font>
+<p>
+The evaluation of the initialization expression may result in an
+<em>undefined value</em>. In that case, the undefined
+value may be silently assigned to the variable (implementation-specific,
+see also <a href="#undefined_values">Undefined values</a> below).
+
+<h4>Examples</h4>
+<pre>
+n := 10;
+counter: int = 0;
+static pi := 3.14159265;
+hypot := sqrt(x*x + y*y);
+static word := load(&quot;/home/szluser/current/word&quot;);
+a: array of float = { 2.4, PI, float("1.234") };
+unique_language_values: table unique (10) of {language: string, value: string};
+average := function(list: array of float): float;
+min := function(x: int, y: int): int { if (x < y) return x; return y; };
+static country_codes: array of string = LoadCountries("countries.txt");
+</pre>
+
+
+<!----------------------------------------------------------------------------->
+<h2>
+ <a name="undefined_values">Undefined values</a>
+</h2>
+The value of an expression may be either <em>defined</em> or
+<em>undefined</em>. If the value is defined, it is one of the set of
+values specified by its <a href="#types">type</a>. If the value is undefined,
+it does not have a value.
+<p>
+If an undefined value is used within an expression, the value of the
+expression becomes undefined. In particular, if an undefined value is
+used as an argument for a function call, the function is not called
+and the function result is undefined.
+<p>
+If an undefined value is assigned to
+a variable, the variable becomes undefined. In particular, if an undefined
+value is stored as array element, tuple field, or map entry, the entire
+array, tuple, or map
+becomes undefined. Thus, <em>variables are either entirely defined or
+entirely undefined</em>. Initially, all variables are undefined until a
+defined value is assigned to them. Function parameters are defined
+upon invocation of the function because it can only be called if
+all arguments are defined.
+<p>
+The intrinsic function <code>def(x: T): bool</code> can be used to test
+an arbitrary expression <code>x</code> of any type <code>T</code> for definedness.
+<code>def(x)</code> returns <code>true</code> if the value of <code>x</code> is defined;
+it returns <code>false</code> otherwise. Thus, <code>def()</code> acts like a guard
+against the propagation of undefined values.
+
+<h4>Current implementation</h4>
+Except for initialization expressions, return expressions,
+and expressions guarded with <code>def()</code>,
+by default the current implementation aborts with a run-time error if an
+undefined value is encountered. When the <code>--ignore_undefs</code>
+flag is specified with either the <code>szl</code> or <code>saw</code> application,
+all undefined values are propagated silently, without
+runtime errors.
+<p>
+Undefined initialization expressions don't cause a run-time
+trap so that it is possible to write code such as
+<pre>
+x: T = expr; # potentially undefined
+if (def(x))
+ # x is defined; it can be used w/o runtime trap
+else
+ # x is undefined; using it will cause a runtime trap
+ # (unless used again in one of the exception cases described here)
+</pre>
+without the need to re-evaluate <code>expr</code> if its value is defined.
+<p>
+Undefined return expressions don't cause a run-time trap so that it is
+possible to return an undefined value from a user-defined function; usually
+to indicate that the function failed. For instance, one might write a function
+<code>f</code> that returns a result of type <code>T</code>, and also use
+the result mechanism to indicate if the function was successful or not:
+<pre>
+f := function(...): T {
+ ...
+ # return an undefined value to indicate failure of f
+ u: T;
+ return u;
+};
+
+result: T = f(...);
+if (def(result))
+ # function succeeded; result is defined
+else
+ # function failed; result is undefined
+</pre>
+
+
+<!----------------------------------------------------------------------------->
+<h2>
+ <a name="types">Types</a>
+</h2>
+A type specifies a (possibly infinite) <em>set of values</em> and an
+<em>interface</em>. Sawzall is a statically typed language, that is, the
+types of all variables and expressions are known at compile-time. The
+interface of a type determines which operations are legal on values of
+that type. The interface is implicitly defined for all the basic types. For
+composite types, the interface is explicitly defined via their structure.
+
+<h3><a name="value_semantics">Value semantics</a></h3>
+All Sawzall types have <em>value semantics</em>; a variable always stores an
+entire copy of a value and not a reference to it. This is of particular
+importance for <a href="#composite_types">composite types</a>
+such as arrays, maps, and tuples: If a value
+<code>x</code> held in a variable <code>v1</code> is assigned to a second
+variable <code>v2</code>, changing <code>v2</code> after the assignment -
+for instance by modifying components of <code>v2</code> - won't affect the
+value of <code>v1</code>.
+
+<h4>Implementation note</h4>
+The current implementation uses a copy-on-write scheme to implement
+value semantics efficiently; a copy is made only when modifying a shared
+value.
+
+
+<h3>Type specifications</h3>
+A type specification may be preceded by the <code>proto</code> keyword.
+In that case, the type specification must be a tuple type (either a type
+name referring to a tuple type, or a tuple type specification). The resulting
+tuple type is an <em>automatic</em> proto tuple type generated from the original
+tuple type (see <a href="#auto_protos">Automatic proto tuple types</a>
+for details).
+<pre><font color=orangered>
+Type =
+ type_name | ArrayType | MapType | TupleType |
+ OutputType | FunctionType | 'proto' Type.
+type_name = identifier.
+</font></pre>
+
+<h3><a name="basic_types">Basic types</a></h3>
+<em>Basic types</em> are intrinsically known to the system and have
+predeclared names. There are 8 basic types in Sawzall:
+<pre>
+bool # the boolean values true and false
+bytes # arrays of unsigned bytes
+int # 64-bit signed integer values
+float # 64-bit IEEE floating point values
+time # unsigned integral representations of time, with microsecond resolution
+fingerprint # unsigned hash values computed by an implementation-defined hash function
+string # arrays of unicode characters
+uint # 64-bit unsigned integer values
+</pre>
+
+
+<h3><a name="composite_types">Composite types</a></h3>
+Composite types are composed of 0 or more components. Array, tuple,
+and map types are composite types. Each component
+may have an optional <em>component name</em> (documentation only),
+and each component has a <em>component type</em>.
+<pre><font color=orangered>
+Component = [component_name ':'] ComponentType.
+component_name = identifier.
+ComponentType = Type.
+</font></pre>
+
+<h3>Array types</h3>
+An array is a composite type consisting of an (unspecified) number of
+components, called <em>elements</em>, which are all of the same type.
+An array type specifies the type of the elements, the <em>element type</em>.
+It may also specify an <em>element name</em>, which serves documentary
+purposes only. The number of elements of an array is called its <em>length</em>.
+The length of an array is determined at execution time. The elements of an
+array are designated by indices, which are integer values between 0 and
+the length minus 1.
+
+<pre><font color=orangered>
+ArrayType = 'array' 'of' Element.
+Element = Component.
+</font></pre>
+
+<h4>Examples</h4>
+<pre>
+array of int
+array of names: string
+array of array of point: Point
+</pre>
+
+<h3><a name="tuple_types">Tuple types</a></h3>
+A tuple is a composite type consisting of a fixed number of
+<em>members</em> of possibly different types.
+Members may be <a href="#composite_types">components</a>,
+which are called (simple or proto) <em>fields</em>.
+A tuple type specifies a type for each field, and possibly a name.
+If there is no name, the field is called <em>anonymous</em>.
+No two named fields within a tuple can have the same name.
+Members may also be <a href="#type_declarations">type declarations</a>
+or <a href="#variable_declarations">static variable declarations</a>.
+A tuple type acts as a scope for these declarations.
+<p>
+The type of a tuple field, static declaration or type declaration may refer
+to the name of an enclosing type; that is, type declarations may be
+recursive. When used in a field or type declaration, the inner type reference
+must occur within an array type or map type enclosed by the outer type
+declaration.
+(That is, there must be a way to omit the value of the field. Otherwise
+it would be impossible to represent an object of that type.)
+<p>
+Although types may be recursive, objects can neither contain
+nor refer to enclosing objects or themselves (directly or indirectly).
+This is because Sawzall is value-based and has no pointers or references.
+<p>
+Types cannot be mutually recursive.
+Since Sawzall does not have forward references it is not possible to have two
+types (e.g. tuples) each of which makes use of the other.
+<p>
+Only named fields may have an integer value,
+called a (field) <em>tag</em>, associated with them. Tags are used to associate
+the field with a field in the protocol buffer representation of the tuple
+(see also
+<a href="http://code.google.com/p/protobuf/">Protocol Buffers</a>). No two fields within a tuple
+can have the same tag, and all tags must be >&nbsp;0. Tuples with tagged fields
+are called <em>proto tuples</em>; tuples without tagged fields are called
+<em>simple tuples</em>. All fields of proto tuples must have tags.
+<p>
+Tagged fields of proto tuples may also
+specify a <em>constant</em> default value. The default value is used to
+initialize a field during proto conversion if the corresponding field value is
+missing in the proto buffer being converted. The special intrinsic
+<code>inproto(field: T): bool</code> can be used to test whether a proto
+tuple field was present in the protocol buffer converted into the proto
+tuple (see also
+<a href="sawzall-intrinsics.html">
+A Manual for the Sawzall Intrinsics</a>).
+<p>
+Tagged fields of proto tuples may also specify
+a corresponding underlying protocol buffer field type. This is the field
+type the tuple field is converted into when the tuple is converted from
+a Sawzall tuple back into the protocol buffer representation. If no
+protocol buffer field type is specified, a default field type is assumed
+(see table below).
+<p>
+<font size=-1>For historical reasons, a proto tuple type may start with
+the keyword <code>parsedmessage</code> to indicate that the tuple is
+a proto tuple. The keyword is treated as a comment.
+</font>
+
+<pre><font color=orangered>
+TupleType = SimpleTupleType | ProtoTupleType.
+
+SimpleTupleType = '{' [SimpleMemberList] '}'.
+SimpleMemberList = SimpleMember {',' SimpleMember} [','].
+SimpleMember = TypeDecl | StaticVarDecl | SimpleFieldDecl.
+SimpleFieldDecl = Component.
+
+ProtoTupleType = ['parsedmessage'] '{' [ProtoMemberList] '}'.
+ProtoMemberList = ProtoMember {',' ProtoMember} [','].
+ProtoMember = TypeDecl | StaticVarDecl | ProtoFieldDecl.
+ProtoFieldDecl = Component ['=' ProtoFieldDefault] '@' proto_field_tag [':' proto_field_type].
+ProtoFieldDefault = Expression.
+proto_field_tag = integer.
+proto_field_type = identifier.
+</font></pre>
+
+The type declarations for proto tuples are usually automatically generated
+by the protocol-compiler from a <code>.proto</code> file. The following table
+shows the relationship between the Sawzall and protocol buffer field types:
+<p>
+<table border="1" width="80%">
+ <tbody>
+ <tr>
+ <td><b>Protocol buffer type</b></td>
+ <td><b>Sawzall type</b></td>
+ <td><b>Comments</b></td>
+ </tr>
+ <tr>
+ <td><code>bool</code></td>
+ <td><code>bool</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ <tr>
+ <td><code>boolean</code></td>
+ <td><code>bool</code></td>
+ </tr>
+ <tr>
+ <td><code>string</code></td>
+ <td><code>bytes</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ <tr>
+ <td><code>fixed32</code></td>
+ <td><code>int</code></td>
+ <td><code>may overflow</code></td>
+ </tr>
+ <tr>
+ <td><code>fixed64</code></td>
+ <td><code>fingerprint</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ <tr>
+ <td><code>fixed64</code></td>
+ <td><code>int</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ <tr>
+ <td><code>fixed64</code></td>
+ <td><code>time</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ <tr>
+ <td><code>int32</code></td>
+ <td><code>int</code></td>
+ <td><code>may overflow</code></td>
+ </tr>
+ <tr>
+ <td><code>int64</code></td>
+ <td><code>int</code></td>
+ </tr>
+ <tr>
+ <td><code>uint64</code></td>
+ <td><code>uint</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ <tr>
+ <td><code>float</code></td>
+ <td><code>float</code></td>
+ <td><code>may overflow</code></td>
+ </tr>
+ <tr>
+ <td><code>double</code></td>
+ <td><code>float</code></td>
+ <td><code>default conversion</code></td>
+ </tr>
+ </tbody>
+</table>
+<p>
+For each protocol buffer field type there is a corresponding Sawzall
+field type; for each Sawzall field type there may be more than one
+protocol buffer field type. Some Sawzall fields may lose information
+(overflow may occur) if they are converted into a protocol buffer
+field of a narrower type (note that all basic Sawzall types - except
+<code>bytes</code> and <code>string</code> - are 64 bits wide).
+The default conversions are used if no
+protocol buffer field type is specified explicitly.
+
+<h4><a name="auto_protos">Automatic proto tuple types</a></h4>
+A proto tuple type can be obtained automatically by preceding a
+simple tuple type with the keyword <code>proto</code>. The resulting
+proto tuple type consists of the original tuple type converted into
+a <code>parsedmessage</code> proto tuple type augmented with field
+tags. The tags are assigned in increasing order, starting with tag value
+1 for the first field, tag value 2 for the second field, and so forth. The
+same conversion is applied to any nested tuples in the tuple type (field
+tags start with tag value 1 again; see also the examples below).
+A <code>proto</code> keyword applied to a proto tuple type
+does not change its type (i.e., <code>proto</code> is idempotent).
+
+<h4>Implementation restriction</h4>
+The current implementation may only support a restricted set of constant
+expressions for field default values of proto tuples.
+
+<h4>Examples</h4>
+<pre>
+{}
+
+{ x: float, y: float, int }
+
+# the Vector tuple type
+{ x: float, y: float,
+ static Magnitude := function(p: Vector): float {
+ return sqrt(p.x*p.x + p.y*p.y);
+ }
+}
+
+{ ip: int = 0xffffff00 @ 1,
+ value: bytes = bytes("britney") @ 2, # proto strings are Sawzall bytes
+ timestamp: time @ 5,
+ type Server = {
+ id: int,
+ location: string
+ },
+ static location1 := "ROB",
+ static location2 := "GRI",
+ static location3 := "BGI",
+}
+
+parsedmessage {
+ g: array of TimeProtocol_G @ 1, # 11
+ debug: array of bytes @ 4: bytes # 34
+}
+
+proto T # T must be a tuple type
+
+proto {
+ x: int,
+ y: float
+}
+
+proto proto proto { # proto is idempotent
+ x: int,
+ t: {
+ s: bytes,
+ t: bytes
+ }
+}
+
+parsedmessage { # this type is equivalent to the previous one
+ x: int @ 1,
+ t: parsedmessage {
+ s: bytes @ 1,
+ t: bytes @ 2
+ } @ 2
+}
+</pre>
+
+<h3>Map types</h3>
+A map is a composite type consisting of an (unspecified) number of
+components, called <em>key-value pairs</em>, which are all of the same
+type. A map type specifies the <em>key</em> and <em>value types</em>.
+It may also specify <em>key</em> and <em> value names</em>, which
+serve documentary purposes only. The number of key-value pairs of a map
+is called its <em>length</em>. The length of a map is determined at execution
+time. The values of a map are designated by their keys, which are values of
+key type.
+
+<pre><font color=orangered>
+MapType = 'map' '[' Key ']' 'of' Value.
+Key = Component.
+Value = Component.
+</font></pre>
+
+<h4>Examples</h4>
+<pre>
+map [int] of bool
+map [symbol: string] of int
+map [point: {x: float, y: float}] of name: string
+</pre>
+
+
+<h3><a name = "output_types">Output types</a></h3>
+Output types specify language-external containers which aggregate data in
+a type-specific fashion. Sawzall is extensible with respect to output types.
+The following output types are supported:
+<ol>
+<li>collection: a simple collection or concatenation of the data
+<li>maximum: a precise sample of the N highest-weighted data items.
+<li>minimum: a precise sample of the N lowest-weighted data items.
+<li>sample: a statistical sampling of N items.
+<li>set: a set (unique elements) containing at most N items per index.
+<li>sum: an arithmetic sum of the data.
+<li>top: statistical estimators for the `most frequent N' data items.
+<li>unique: statistical estimators for the total number of unique data items.
+<li>weightedsample: a statistical sampling of N items, biased towards items with higher weights.</li>
+<li>recordio: output to a simple record-based binary file.
+</ol>
+Each kind of output type specifies a particular aggregation method. Output
+types are used to declare output variables which represent the connection
+to the language external aggregators. <a href="#emit"><code>emit</code></a>
+statements are used to send data to an aggregator.
+<p>
+Output types may be parametrized and indexed. Parameters are used to
+set up the aggregation variable. Indices are used to create "arrays" of
+aggregators. Unlike with arrays, indices of various types (not just <code>int</code>)
+may be specified. Some output types require the specification of a weight
+type. When emitting to a variable of output type, the weight is used to
+scale the emitted value for aggregation.
+
+<pre><font color=orangered>
+OutputType =
+ 'table' table_type [Parameter] {Index} 'of' Element [Weight]
+ [FileSpec | ProcSpec ] [FormatSpec].
+table_type = identifier.
+Parameter = '(' Expression ')'.
+Index = '[' Component ']'.
+Element = Component.
+Weight = 'weight' Component.
+FileSpec = 'file' '(' ArgumentList ')'.
+ProcSpec = 'proc' '(' ArgumentList ')'.
+FormatSpec = 'format' '(' ArgumentList ')'.
+ArgumentList = ExprList.
+ExprList = Expression {',' Expression }.
+</font></pre>
+
+If a <code>file</code> or <code>proc</code> specifier is present, data is not emitted
+to an external aggregator, but to a file or process respectively, with the file
+or process name computed via the specifier as described below. Unless
+a <code>format</code> specifier is present as well, the element type must be
+<code>bytes</code>.
+<code>file</code> or <code>proc</code> specifiers are only allowed
+for <code>collection</code> output types.
+<p>
+If a <code>format</code> specifier is present, any value emitted is
+formatted into a string as expressed via the specifier.
+The emitted values are always of type <code>string</code>, independently
+of the element type specified in the output type; except if there is
+a <code>file</code> or <code>proc</code> specifier present as well, in which
+case the string is converted into (UTF-8 encoded) bytes before
+emission.
+<p>
+The <code>file</code>, <code>proc</code>, and <code>format</code> specifiers format
+expressions similarly to <code>printf</code> in C: The first argument must be
+a string possibly containing format characters. The remaining arguments
+must correspond in number and type to the format characters in the
+format string. Within the arguments of <code>file</code> or <code>proc</code>
+specifiers, output type index values may be referred to via the
+corresponding component name of a particular index. Within the arguments
+of <code>format</code> specifiers, the element value may be referred to via
+its component name.
+
+<h4>Examples</h4>
+<pre>
+# Type of intrinsic output variable stdout
+table collection of x: string file("/dev/stdout") format("%s\n", x)
+
+# Type used to collect all the values into a single stream