Permalink
Browse files

Initial commit.

  • Loading branch information...
0 parents commit af87985f963608d5c0d75f8c86421d22e8783c8f @cespare committed Sep 12, 2012
Showing with 2,693 additions and 0 deletions.
  1. +7 −0 LICENSE
  2. +42 −0 README.md
  3. +149 −0 smaz.go
  4. +133 −0 smaz_test.go
  5. +2,362 −0 testdata/pg5200.txt
@@ -0,0 +1,7 @@
+Copyright (c) 2012 Caleb Spare
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,42 @@
+# go-smaz
+
+go-smaz is a pure Go implementation of [smaz](https://github.com/antirez/smaz), a library for compressing
+short strings (particularly containing English words).
+
+## Installation
+
+    $ go get github.com/cespare/go-smaz
+
+## Usage
+
+``` go
+import "github.com/cespare/go-smaz"
+s := "Now is the time for all good men to come to the aid of the party."
+compressed := smaz.Compress([]byte(s)) // type is []byte
+decompressed, err := smaz.Decompress(compressed) // type is []byte; string(decompressed) == s
+```
+
+Also see the [API documentation](http://go.pkgdoc.org/github.com/cespare/go-smaz).
+
+## Notes
+
+go-smaz is not a direct port of the C version. It is not guaranteed that the output of `smaz.Compress` will be
+precisely the same as the C library. However, the output should be decompressible by the C library, and the
+output of the C library should be decompressible by `smaz.Decompress`.
+
+Right now go-smaz is very slow -- a very hasty benchmark on my beefy quad-core desktop showed compression
+running at < 2MB/s and decompression at ~14MB/s. Initially I just hacked up a working implementation with no
+thought for performance; I'll profile it and make it a lot faster when I get a chance.
+
+## Author
+
+Caleb Spare ([cespare](https://github.com/cespare))
+
+## License
+
+MIT Licensed.
+
+## Other implementations
+
+* [The original C implementation](https://github.com/antirez/smaz)
+* [Javascript](https://npmjs.org/package/smaz)
149 smaz.go
@@ -0,0 +1,149 @@
+// Package smaz implements the smaz algorithm (https://github.com/antirez/smaz) for compressing small
+// strings.
+package smaz
+
+import (
+ "bytes"
+ "errors"
+)
+
// codes is the smaz codebook: codes[i] is the string that the compressed byte i expands to.
//
// The table has 254 entries, covering byte values 0 through 253; the values 254 and 255 are not codebook
// indexes because the compressed format uses them to introduce verbatim data. The position of each
// entry is part of the compressed format, so entries must never be reordered, and every entry must be
// distinct or some codes become unreachable for the compressor.
//
// Whitespace matters: index 0 is a single space, index 40 is three spaces and index 52 is two spaces,
// matching the reference C codebook.
var codes = []string{" ",
	"the", "e", "t", "a", "of", "o", "and", "i", "n", "s", "e ", "r", " th",
	" t", "in", "he", "th", "h", "he ", "to", "\r\n", "l", "s ", "d", " a", "an",
	"er", "c", " o", "d ", "on", " of", "re", "of ", "t ", ", ", "is", "u", "at",
	"   ", "n ", "or", "which", "f", "m", "as", "it", "that", "\n", "was", "en",
	"  ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s", "nd ", "ed ",
	"w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ", "his",
	"st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al", "to ",
	"b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"",
	"hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ",
	"ri", "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p",
	"es ", "by", "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la",
	"h ", "ne", "as ", "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o",
	"ur", "li", "ll", "ch", "had", "this", "e t", "g ", "e\r\n", " wh", "ere",
	" co", "e o", "a ", "us", " d", "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e",
	"s a", "ma", "one", "t t", "or ", "but", "el", "so", "l ", "e s", "s,", "no",
	"ter", " wa", "iv", "ho", "e a", " r", "hat", "s t", "ns", "ch ", "wh", "tr",
	"ut", "/", "have", "ly ", "ta", " ha", " on", "tha", "-", " l", "ati", "en ",
	"pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our", "who", "its", "z",
	"fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><", "ver", "ad",
	" we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi", "div",
	"e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com",
}
+
+var codeArrays = make([][]byte, len(codes))
+var prefixToCode = make(map[string]byte)
+var maxCodeLength = 0 // TODO: Unnecessary when we switch to a trie implementation
+
+// Library initialization.
+func init() {
+ for i, code := range codes {
+ codeArrays[i] = []byte(code)
+ prefixToCode[code] = byte(i)
+ if len(code) > maxCodeLength {
+ maxCodeLength = len(code)
+ }
+ }
+}
+
+// Compress a byte array and return a new byte array with the compressed data.
+// TODO: This function is written in an extremely naive manner for the time being and is very slow. I will
+// reimplement (and then profile/optimize it) after I get done with go-trie, which will be a natural fit for
+// this problem.
+func Compress(input []byte) (compressed []byte) {
+ var outputBuffer bytes.Buffer
+ var verbatim bytes.Buffer
+ remaining := len(input)
+ position := 0
+
+ flushVerbatim := func() {
+ // We can write a max of 255 continuous verbatim characters, because the length of the continous verbatim
+ // section is represented by a single byte.
+ for verbatim.Len() > 0 {
+ chunk := verbatim.Next(255)
+ if len(chunk) == 1 {
+ // 254 is code for a single verbatim byte
+ outputBuffer.WriteByte(byte(254))
+ } else {
+ // 255 is code for a verbatim string. It is followed by a byte containing the length of the string.
+ outputBuffer.WriteByte(byte(255))
+ outputBuffer.WriteByte(byte(len(chunk)))
+ }
+ outputBuffer.Write(chunk)
+ }
+ verbatim.Reset()
+ }
+
+ for remaining > 0 {
+ // Find the longest matching substring, brute-force
+ longestPossibleMatch := maxCodeLength
+ if remaining < longestPossibleMatch {
+ longestPossibleMatch = remaining
+ }
+ matchFound := false
+ for i := longestPossibleMatch; i > 0; i-- {
+ prefix := input[position : position+i]
+ /*fmt.Printf("Prefix: %v\n", string(prefix))*/
+ if code, ok := prefixToCode[string(prefix)]; ok {
+ // Match found
+ remaining -= i
+ position += i
+ flushVerbatim()
+ outputBuffer.WriteByte(code)
+ matchFound = true
+ break
+ }
+ }
+ if !matchFound {
+ verbatim.WriteByte(input[position])
+ remaining -= 1
+ position += 1
+ }
+ }
+ flushVerbatim()
+
+ return outputBuffer.Bytes()
+}
+
+var DecompressionError = errors.New("Invalid or corrupt compressed data.")
+
+// Decompress a byte array and return a new array with the decompressed data. If the decompression fails for
+// some reason, err will be non-nil.
+func Decompress(compressed []byte) (output []byte, err error) {
+ var decompressed bytes.Buffer
+ var remaining = len(compressed)
+ var position = 0
+ var dummy []byte
+
+ for remaining > 0 {
+ switch compressed[position] {
+ case 254:
+ // Verbatim byte
+ if remaining < 2 {
+ return dummy, err
+ }
+ decompressed.WriteByte(compressed[position+1])
+ remaining -= 2
+ position += 2
+ case 255:
+ // Verbatim string
+ if remaining < 2 {
+ return dummy, err
+ }
+ length := int(compressed[position+1])
+ if remaining < length+2 {
+ return dummy, err
+ }
+ decompressed.Write(compressed[position+2 : position+length+2])
+ remaining -= length + 2
+ position += length + 2
+ default:
+ // Look up encoded value
+ decompressed.Write([]byte(codes[int(compressed[position])]))
+ remaining--
+ position++
+ }
+ }
+
+ return decompressed.Bytes(), nil
+}
@@ -0,0 +1,133 @@
+package smaz
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ . "launchpad.net/gocheck"
+ "os"
+ "testing"
+)
+
+func Test(t *testing.T) { TestingT(t) }
+
+type SmazSuite struct{}
+
+var _ = Suite(&SmazSuite{})
+
// antirezTestStrings is a set of short sample inputs used by TestCorrectness: English and Italian
// phrases, URLs, a file path, text with many digits, and the empty string.
var antirezTestStrings = []string{
	"",
	"This is a small string",
	"foobar",
	"the end",
	"not-a-g00d-Exampl333",
	"Smaz is a simple compression library",
	"Nothing is more difficult, and therefore more precious, than to be able to decide",
	"this is an example of what works very well with smaz",
	"1000 numbers 2000 will 10 20 30 compress very little",
	"and now a few italian sentences:",
	"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura",
	"Mi illumino di immenso",
	"L'autore di questa libreria vive in Sicilia",
	"try it against urls",
	"http://google.com",
	"http://programming.reddit.com",
	"http://github.com/antirez/smaz/tree/master",
	"/media/hdb1/music/Alben/The Bla",
}
+
+func (s *SmazSuite) TestCorrectness(c *C) {
+ // Set up our slice of test strings.
+ inputs := make([][]byte, 0)
+ for _, testInput := range antirezTestStrings {
+ inputs = append(inputs, []byte(testInput))
+ }
+ // An array with every possible byte value in it.
+ allBytes := make([]byte, 256)
+ for i := 0; i < 256; i++ {
+ allBytes[i] = byte(i)
+ }
+ inputs = append(inputs, allBytes)
+ // A long array of all 0s (the longest continuous string that can be represented is 256; any longer than
+ // this and the compressor will need to split it into chunks)
+ allZeroes := make([]byte, 300)
+ for i := 0; i < 300; i++ {
+ allZeroes[i] = byte(0)
+ }
+ inputs = append(inputs, allZeroes)
+
+ for _, testInput := range inputs {
+ compressed := Compress(testInput)
+ decompressed, err := Decompress(compressed)
+ c.Assert(err, IsNil)
+ if len(testInput) == 0 {
+ // Can't use DeepEquals for a nil slice and an empty slice -- they're different
+ c.Assert(decompressed, HasLen, 0)
+ } else {
+ c.Assert(testInput, DeepEquals, decompressed)
+ }
+
+ if len(testInput) > 1 && len(testInput) < 50 {
+ compressionLevel := 100 - ((100.0 * len(compressed)) / len(testInput))
+ if compressionLevel < 0 {
+ fmt.Printf("'%s' enlarged by %d%%\n", testInput, -compressionLevel)
+ } else {
+ fmt.Printf("'%s' compressed by %d%%\n", testInput, compressionLevel)
+ }
+ }
+ }
+}
+
// loadTestData reads ./testdata/pg5200.txt and returns its lines, one byte slice per line, without the
// line terminators. If the file cannot be opened it prints a message and exits the process with status
// 1. Reading stops at the first error ReadLine reports, which is normally the end of the file.
func loadTestData() [][]byte {
	file, err := os.Open("./testdata/pg5200.txt")
	if err != nil {
		fmt.Printf("Error opening test data file: %v\n", err)
		os.Exit(1)
	}
	defer file.Close()

	var lines [][]byte
	var current bytes.Buffer
	reader := bufio.NewReader(file)
	for {
		part, isPrefix, err := reader.ReadLine()
		if err != nil {
			break
		}
		current.Write(part)
		if isPrefix {
			// The line was longer than the reader's buffer; keep collecting its pieces.
			continue
		}
		// Copy the line out before the buffer is reused for the next one.
		lines = append(lines, append([]byte(nil), current.Bytes()...))
		current.Reset()
	}
	return lines
}
+
+func (s *SmazSuite) BenchmarkCompression(c *C) {
+ c.StopTimer()
+ testStrings := loadTestData()
+ /*fmt.Printf("The test corpus contains %d lines and %d bytes of text.", len(testStrings), totalSize)*/
+ c.StartTimer()
+ for i := 0; i < c.N; i++ {
+ for _, testString := range testStrings {
+ Compress(testString)
+ }
+ }
+}
+
+func (s *SmazSuite) BenchmarkDecompression(c *C) {
+ c.StopTimer()
+ testStrings := loadTestData()
+ compressedStrings := make([][]byte, len(testStrings))
+ for i, testString := range testStrings {
+ compressedStrings[i] = Compress(testString)
+ }
+ c.StartTimer()
+ for i := 0; i < c.N; i++ {
+ for _, compressed := range compressedStrings {
+ Decompress(compressed)
+ }
+ }
+}
Oops, something went wrong.

0 comments on commit af87985

Please sign in to comment.