Introduce master log compression.
Logs in the db are stored as chunks of text.
The original implementation expected a compress operation to be done at
the end of the step, but that would be costly in time and db operations,
with a risk of db fragmentation.

This patch does live compression instead: a chunk is stored compressed
whenever its compressed form is smaller than the raw one.
The compress operation then "just" has to optimize/regroup all
contiguous text chunks, compress them and replace them in the db.
The compression algorithm is configurable in the master cfg.
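
In essence (a minimal illustrative sketch of the decision, using a
hypothetical helper name; the real change is in master/buildbot/db/logs.py
below):

    import bz2

    def choose_stored_form(chunk):
        # Sketch only: compress the chunk and keep the compressed form
        # only when it is actually smaller than the raw bytes.
        compressed = bz2.compress(chunk, 9)
        if len(compressed) < len(chunk):
            return compressed, 'bz2'   # store compressed, remember the mode
        return chunk, 'raw'            # compression did not pay off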

Signed-off-by: Sebastien Fusilier <sebastien.fusilier@intel.com>
Sebastien Fusilier committed May 29, 2015
1 parent 0552bea commit 9343126
Showing 3 changed files with 80 additions and 10 deletions.
9 changes: 7 additions & 2 deletions master/buildbot/config.py
@@ -297,8 +297,13 @@ def copy_str_param(name, alt_key=None):

if 'logCompressionMethod' in config_dict:
logCompressionMethod = config_dict.get('logCompressionMethod')
if logCompressionMethod not in ('bz2', 'gz'):
error("c['logCompressionMethod'] must be 'bz2' or 'gz'")
if logCompressionMethod not in ('raw', 'bz2', 'gz', 'lz4'):
error("c['logCompressionMethod'] must be 'raw', 'bz2', 'gz' or 'lz4'")
elif logCompressionMethod == "lz4":
try:
import lz4
except ImportError:
error("To set c['logCompressionMethod'] to 'lz4' you must install the lz4 library ('pip install lz4')")
self.logCompressionMethod = logCompressionMethod

copy_int_param('logMaxSize')
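
For reference, with this patch applied the method is selected in master.cfg
roughly like this (a sketch; 'lz4' additionally requires the lz4 Python
package, otherwise the check above reports a configuration error):

    # master.cfg (sketch)
    c = BuildmasterConfig = {}
    # one of 'raw', 'bz2', 'gz' or 'lz4' once this patch is applied
    c['logCompressionMethod'] = 'lz4'
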
70 changes: 63 additions & 7 deletions master/buildbot/db/logs.py
@@ -19,12 +19,39 @@
from twisted.internet import defer
from twisted.python import log

def dumps_gzip(data):
import zlib
return zlib.compress(data, 9)

def read_gzip(data):
import zlib
return zlib.decompress(data)

def dumps_lz4(data):
import lz4
return lz4.dumps(data)

def read_lz4(data):
import lz4
return lz4.loads(data)

def dumps_bz2(data):
import bz2
return bz2.compress(data, 9)

def read_bz2(data):
import bz2
return bz2.decompress(data)

class LogsConnectorComponent(base.DBConnectorComponent):

# Postgres and MySQL will both allow bigger sizes than this. The limit
# for MySQL appears to be max_packet_size (default 1M).
MAX_CHUNK_SIZE = 65536
COMPRESSION_MODE = {"raw": {"id": 0, "dumps": lambda x: x, "read": lambda x: x},
"gz": {"id": 1, "dumps": dumps_gzip, "read": read_gzip},
"bz2": {"id": 2, "dumps": dumps_bz2, "read": read_bz2},
"lz4": {"id": 3, "dumps": dumps_lz4, "read": read_lz4}}

def _getLog(self, whereclause):
def thd(conn):
@@ -68,8 +95,11 @@ def thd(conn):
q = q.order_by(tbl.c.first_line)
rv = []
for row in conn.execute(q):
assert not row.compressed, "compressed rows not supported yet"
content = row.content.decode('utf-8')
# Retrieve associated "reader" and extract the data
data = [y["read"] for y in self.COMPRESSION_MODE.itervalues() if y["id"] == row.compressed][0](
row.content)
content = data.decode(self.master.config.logEncoding)

if row.first_line < first_line:
idx = -1
count = first_line - row.first_line
@@ -119,20 +149,32 @@ def thd(conn):
# fact that no character but u'\n' maps to b'\n' in UTF-8.

first_line = chunk_first_line = row[0]
remaining = content.encode('utf-8')
remaining = content.encode(self.master.config.logEncoding)
while remaining:
chunk, remaining = self._splitBigChunk(remaining, logid)

last_line = chunk_first_line + chunk.count('\n')

# Set the default compressed mode to "raw" id
compressed_mode = self.COMPRESSION_MODE["raw"]["id"]
# Do we have to compress the chunk?
if self.master.config.logCompressionMethod != "raw":
start = time.clock()
compressed_chunk = self.COMPRESSION_MODE[self.master.config.logCompressionMethod]["dumps"](chunk)
# Is it useful to compress the chunk?
if len(chunk) > len(compressed_chunk):
compressed_mode = self.COMPRESSION_MODE[self.master.config.logCompressionMethod]["id"]
chunk = compressed_chunk

conn.execute(self.db.model.logchunks.insert(),
dict(logid=logid, first_line=chunk_first_line,
last_line=last_line, content=chunk,
compressed=0))
compressed=compressed_mode))
chunk_first_line = last_line + 1

conn.execute(self.db.model.logs.update(whereclause=(self.db.model.logs.c.id == logid)),
num_lines=last_line + 1)
return (first_line, last_line)

return first_line, last_line
return self.db.pool.do(thd)

def _splitBigChunk(self, content, logid):
@@ -155,7 +197,7 @@ def _splitBigChunk(self, content, logid):
truncline = content[:self.MAX_CHUNK_SIZE]
while truncline:
try:
truncline.decode('utf-8')
truncline.decode(self.master.config.logEncoding)
break
except UnicodeDecodeError:
truncline = truncline[:-1]
@@ -176,6 +218,20 @@ def thd(conn):

def compressLog(self, logid):
# TODO: compression not supported yet

# Compression is done on chunks directly in the appendLog method to:
# - reduce re-processing of the log in this method (run at the end of the step)
# - avoid too many DB operations and, with them, the risk of DB fragmentation
#
# So, this method shall 'just' analyze whether there is a benefit to
# regrouping all contiguous chunks, (re)compressing them and replacing them in the db.
# e.g. given:
# - chunk raw 1
# - chunk low compress 2
# - chunk high compress 3
# if grouping chunks 1 & 2 and high-compressing them yields something smaller,
# then this method shall rewrite them as:
# - chunk high compress 1&2
# - chunk high compress 3
return defer.succeed(None)

def _logdictFromRow(self, row):
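
The compressLog comment above only describes the planned optimization pass; a
rough, purely hypothetical sketch of what regrouping the contiguous chunks of
one log might look like (not part of this commit, helper names invented):

    def regroup_chunks(chunks, dumps, mode_id):
        # chunks: ordered list of (first_line, last_line, raw_bytes) for one
        # log, already decompressed. Merge them and keep the recompressed
        # blob only if it beats the sum of the individual pieces.
        if not chunks:
            return []
        merged = b"".join(content for _, _, content in chunks)
        recompressed = dumps(merged)
        if len(recompressed) < sum(len(content) for _, _, content in chunks):
            return [(chunks[0][0], chunks[-1][1], recompressed, mode_id)]
        # not worth it: keep the rows as they are (raw mode, id 0)
        return [(first, last, content, 0) for first, last, content in chunks]
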
11 changes: 10 additions & 1 deletion master/docs/manual/cfg-global.rst
@@ -212,9 +212,18 @@ The default value is 4096, which should be a reasonable default on most file sys
This setting has no impact on status plugins, and merely affects the required disk space on the master for build logs.

The :bb:cfg:`logCompressionMethod` controls what type of compression is used for build logs.
The default is 'bz2', and the other valid option is 'gz'.
The default is 'bz2'; the other valid options are 'raw' (no compression), 'gz', and 'lz4' (requires the lz4 package).
'bz2' offers better compression at the expense of more CPU time.
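
For example (with the lz4 package installed)::

    c['logCompressionMethod'] = 'lz4'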

The table below shows some stats extracted from 50 "Pyflakes" runs (results may differ according to log type).

.. csv-table:: Space saving details
:header: "compression", "raw log size", "compressed log size", "space saving", "compression speed"

"bz2", "2.981 MB", "0.603 MB", "79.77%", "3.433 MB/s"
"gz", "2.981 MB", "0.568 MB", "80.95%", "6.604 MB/s"
"lz4", "2.981 MB", "0.844 MB", "71.68%", "77.668 MB/s"

The :bb:cfg:`logMaxSize` parameter sets an upper limit (in bytes) to how large logs from an individual build step can be.
The default value is None, meaning no upper limit to the log size.
Any output exceeding :bb:cfg:`logMaxSize` will be truncated, and a message to this effect will be added to the log's HEADER channel.