Add fold benchmark

haskell · Jun 16, 2011 · 8de5ea9 · 8de5ea9
1 parent 4e79b4e
commit 8de5ea9
Show file tree

Hide file tree

Showing 3 changed files with 120 additions and 0 deletions.
diff --git a/tests/benchmarks/ruby/fold.rb b/tests/benchmarks/ruby/fold.rb
@@ -0,0 +1,50 @@
+#!/usr/bin/env ruby
+
+require './utils.rb'
+
+def fold(filename, max_width)
+  File.open(filename, 'r:utf-8') do |file|
+    # Words in this paragraph
+    paragraph = []
+
+    file.each_line do |line|
+      # If we encounter an empty line, we reformat and dump the current
+      # paragraph
+      if line.strip.empty? then
+        puts fold_paragraph(paragraph, max_width)
+        puts
+        paragraph = []
+      # Otherwise, we append the words found in the line to the paragraph
+      else
+        paragraph.concat line.split
+      end
+    end
+
+    # Last paragraph
+    puts fold_paragraph(paragraph, max_width) unless paragraph.empty?
+  end
+end
+
+# Fold a single paragraph to the desired width
+def fold_paragraph(paragraph, max_width)
+  # Gradually build our output
+  str = paragraph.first
+  width = str.length
+
+  paragraph.drop(1).each do |word|
+    if width + word.length + 1 <= max_width then
+      str += ' ' + word
+      width += word.length + 1
+    else
+      str += "\n" + word
+      width = word.length
+    end
+  end
+
+  str
+end
+
+ARGV.each do |f|
+  t = benchmark { fold(f, 80) }
+  STDERR.puts "#{f}: #{t}"
+end
diff --git a/tests/benchmarks/src/Data/Text/Benchmarks.hs b/tests/benchmarks/src/Data/Text/Benchmarks.hs
@@ -23,6 +23,7 @@ import qualified Data.Text.Benchmarks.WordFrequencies as WordFrequencies
 
 import qualified Data.Text.Benchmarks.Programs.BigTable as Programs.BigTable
 import qualified Data.Text.Benchmarks.Programs.Cut as Programs.Cut
+import qualified Data.Text.Benchmarks.Programs.Fold as Programs.Fold
 import qualified Data.Text.Benchmarks.Programs.Sort as Programs.Sort
 import qualified Data.Text.Benchmarks.Programs.StripTags as Programs.StripTags
 import qualified Data.Text.Benchmarks.Programs.Throughput as Programs.Throughput
@@ -54,6 +55,7 @@ benchmarks = do
     ps <- bgroup "Programs" `fmap` sequence
         [ Programs.BigTable.benchmark sink
         , Programs.Cut.benchmark (tf "russian.txt") sink 20 40
+        , Programs.Fold.benchmark (tf "russian.txt") sink
         , Programs.Sort.benchmark (tf "russian.txt") sink
         , Programs.StripTags.benchmark (tf "yiwiki.xml") sink
         , Programs.Throughput.benchmark (tf "russian.txt") sink

diff --git a/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Fold.hs b/tests/benchmarks/src/Data/Text/Benchmarks/Programs/Fold.hs
@@ -0,0 +1,68 @@
+-- | Benchmark which reformats paragraphs, like the @fold@ unix utility.
+--
+-- Tested in this benchmark:
+--
+-- * Reading the file
+--
+-- * Splitting into paragraphs
+--
+-- * Reformatting the paragraphs to a certain line width
+--
+-- * Concatenating the results using the text builder
+--
+-- * Writing back to a handle
+--
+{-# LANGUAGE OverloadedStrings #-}
+module Data.Text.Benchmarks.Programs.Fold
+    ( benchmark
+    ) where
+
+import Data.List (foldl')
+import Data.List (intersperse)
+import Data.Monoid (mempty, mappend, mconcat)
+import System.IO (Handle)
+import Criterion (Benchmark, bench)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified Data.Text.Lazy.Builder as TLB
+import qualified Data.Text.Lazy as TL
+import qualified Data.Text.Lazy.IO as TL
+
+-- | Build the \"Fold\" benchmark: read the input file as strict text, fold
+-- it to 80 columns and write the result to the given handle.
+--
+benchmark :: FilePath -> Handle -> IO Benchmark
+benchmark input sink = return (bench "Fold" run)
+  where
+    run = do
+        contents <- T.readFile input
+        TL.hPutStr sink (fold 80 contents)
+
+-- | We represent a paragraph by a word list: the whitespace-separated words
+-- of all its lines, in order. The original line breaks are not kept.
+--
+type Paragraph = [T.Text]
+
+-- | Fold a text
+--
+-- The text is split into paragraphs, every paragraph is re-wrapped to the
+-- given width, and the results are joined with an empty line in between.
+--
+fold :: Int -> T.Text -> TL.Text
+fold maxWidth text = TLB.toLazyText (mconcat separated)
+  where
+    folded    = map (foldParagraph maxWidth) (paragraphs text)
+    separated = intersperse "\n\n" folded
+
+-- | Fold a paragraph
+--
+-- Words are joined by single spaces; a newline is used instead whenever the
+-- next word would make the current line longer than the given width. A word
+-- that is itself longer than the width gets a line of its own and is not
+-- split. An empty paragraph gives an empty builder.
+--
+foldParagraph :: Int -> Paragraph -> TLB.Builder
+foldParagraph maxWidth paragraph = case paragraph of
+    []           -> mempty
+    first : rest ->
+        fst (foldl' step (TLB.fromText first, T.length first) rest)
+  where
+    -- The accumulator pairs the output built so far with the length of its
+    -- last line.
+    step (acc, used) word
+        | fits      = (appendWith " ", extended)
+        | otherwise = (appendWith "\n", wordLen)
+      where
+        wordLen        = T.length word
+        extended       = used + wordLen + 1
+        fits           = extended <= maxWidth
+        appendWith sep = acc `mappend` sep `mappend` TLB.fromText word
+
+-- | Divide a text into paragraphs
+--
+-- A paragraph is a maximal run of lines that contain at least one word;
+-- lines that are empty or contain only whitespace separate paragraphs. Any
+-- number of such blank lines, including blank lines at the very start or
+-- end of the text, acts as a single separator, so the result never
+-- contains an empty paragraph.
+--
+paragraphs :: T.Text -> [Paragraph]
+paragraphs = splitParagraphs . map T.words . T.lines
+  where
+    -- Blank lines are dropped before every 'break', so the prefix found by
+    -- 'break' is empty only when no lines with words are left.
+    splitParagraphs ls = case break null (dropWhile null ls) of
+        ([], _)  -> []
+        (p,  lr) -> concat p : splitParagraphs lr