## Tests methods for compressing readings sent through Notecard

In [3]:
import pickle
import random
import string
import time

In [4]:
# Fabricate a set of random sensor IDs to stand in for real ones:
# `sensor_ct` IDs, each made of 20 random lowercase ASCII letters.
sensor_ct = 15
sensor_ids = [
    ''.join(random.choices(string.ascii_lowercase, k=20))
    for _ in range(sensor_ct)
]
sensor_ids

['jhiohhfcbbkjuvbiysvy',
 'oqccxwwxztehlaijzfxw',
 'tctaomyhafucoibintnu',
 'fypvjmvmeabgjvklatds',
 'juvdkefspavjjxpstwzz',
 'zqtvymwjgeufpcdbuwjd',
 'nhzobkyezpciyonqjfeh',
 'sdfgdqqgjvyxkvwpqsdi',
 'sgylpkfkxptiwpsghgkv',
 'mpdtzfztnbaqkfhfgyue',
 'acyrgmjxhiytmywsvgdi',
 'uppzmvaavvrhvyupqyyk',
 'oslrfivmatrbocogcdhe',
 'poljozyirfdyfkkmurlk',
 'vbjfzyatlnyxqkedetkw']

In [5]:
# Simulate six reporting rounds spaced 600 seconds apart. In every round each
# sensor contributes one (timestamp, sensor_id, value) tuple whose value is a
# random float rounded to 6 decimal places.
ts = round(time.time(), 1)       # starting timestamp, to the tenth of a second
readings = [
    (ts + tstep * 600, sensor, round(random.random(), 6))
    for tstep in range(6)
    for sensor in sensor_ids
]
readings

[(1670628481.8, 'jhiohhfcbbkjuvbiysvy', 0.908177),
 (1670628481.8, 'oqccxwwxztehlaijzfxw', 0.777778),
 (1670628481.8, 'tctaomyhafucoibintnu', 0.728349),
 (1670628481.8, 'fypvjmvmeabgjvklatds', 0.215285),
 (1670628481.8, 'juvdkefspavjjxpstwzz', 0.599619),
 (1670628481.8, 'zqtvymwjgeufpcdbuwjd', 0.067323),
 (1670628481.8, 'nhzobkyezpciyonqjfeh', 0.622199),
 (1670628481.8, 'sdfgdqqgjvyxkvwpqsdi', 0.02338),
 (1670628481.8, 'sgylpkfkxptiwpsghgkv', 0.633144),
 (1670628481.8, 'mpdtzfztnbaqkfhfgyue', 0.061425),
 (1670628481.8, 'acyrgmjxhiytmywsvgdi', 0.689591),
 (1670628481.8, 'uppzmvaavvrhvyupqyyk', 0.985525),
 (1670628481.8, 'oslrfivmatrbocogcdhe', 0.393341),
 (1670628481.8, 'poljozyirfdyfkkmurlk', 0.263446),
 (1670628481.8, 'vbjfzyatlnyxqkedetkw', 0.279305),
 (1670629081.8, 'jhiohhfcbbkjuvbiysvy', 0.501815),
 (1670629081.8, 'oqccxwwxztehlaijzfxw', 0.030233),
 (1670629081.8, 'tctaomyhafucoibintnu', 0.533779),
 (1670629081.8, 'fypvjmvmeabgjvklatds', 0.502442),
 (1670629081.8, 'juvdkefspavjjxp

In [6]:
# Baseline for every ratio computed afterwards: the UTF-8 bytes of the
# readings list's text representation.
data = bytes(str(readings), 'utf-8')
print('Length as uncompressed text', len(data))

Length as uncompressed text 4494


### Remember that we then have to convert compressed bytes to Base 64
Base64 emits 4 characters for every 3 bytes, so the effective compression ratio will be lowered to 0.75 x the value shown here.

In [7]:
import bz2

# bz2-compress the text form of the readings and report how many times
# smaller it is than the uncompressed text.
c = bz2.compress(data)
bz2_ratio = len(data) / len(c)
print('bz2 compression ratio', bz2_ratio)

bz2 compression ratio 4.676378772112383


In [8]:
# Serialize the readings with pickle's most compact (highest) protocol and
# compare its size with the uncompressed text baseline in `data`.
pickle_data = pickle.dumps(readings, protocol=pickle.HIGHEST_PROTOCOL)
pickle_ratio = len(data) / len(pickle_data)
print('pickle alone', pickle_ratio)

pickle alone 1.9446127217654694


In [9]:
# Same comparison, but with the pickled bytes additionally run through bz2.
pickle_bz2 = bz2.compress(pickle_data)
print('pickle with bz2 compression', len(data) / len(pickle_bz2))

pickle with bz2 compression 2.9682959048877144


Looks like BZ2 compression of the string representation of the Readings Array is best.

Will then need to convert to Base64 and could assign to the Payload key of the Note.

## Test of compression, decompression

In [20]:
import base64

# Full encode path: text repr -> UTF-8 bytes -> bz2 -> Base64 string.
#
# Each stage gets its own name. The global `data` holds the *uncompressed*
# text bytes that the compression-ratio cells divide by; reusing it here for
# the compressed bytes would make those cells report wrong ratios if they are
# re-run after this one.
readings_text = str(readings)
readings_bz2 = bz2.compress(readings_text.encode('utf-8'))
sdata = base64.b64encode(readings_bz2).decode('utf-8')

# (uncompressed text length, bz2 length, Base64 string length)
len(readings_text), len(readings_bz2), len(sdata)

(4494, 961, 1284)

In [21]:
# Show the Base64 string, i.e. the text that could be assigned to the Note's Payload key.
print(sdata)

QlpoOTFBWSZTWR7YzbYABULbgGAAQOV/4AAKP///8FAE6ru1EiLu8UUooMiNJ6kABobTUABpoGIAVUAAmBGEGCDTCmgkoAAZAAAEVU2k/VPJMamp6j1HqGgDRo0CKUAaNTU8Ap6poAAxHzuQSABd8v5zuROUoYfmWpr5KuFbFmcPXJIwktcYERInRO8tzcmxN8uxZO6F1OHTC+nGDy+gAkiiEkEW0Sc9ULWoFpBEjR0UQqV5O8FdGcU9ppWJ6U5VY+J8QuHU5l1MpRECmqoxo22yichFPFBTwEU1VQmIpsBvEUEcuc52PZrE78FVVZjj7bt4KWY34BQqgeCDrwkwqsk7uhrrHempRwIJHY+1nXFa7ctOM1VJLWAjnvFCqG7664eFdFjHWRYfC+ZQin8G83S8vC4S8PG1q2IltdnJL2SaynJKHa5J6WrtHKCS/W5JukwADYpeJTRQQycp5qam9imdqkJNk3LvkBLeHw7olIOHGxfDdlqPoWFEM5GSxBMcBBBJJBBJJJBJBBLnWXoWZeDEd2MI76U4XesS72LxYYV17u9d8rw81JU801hmicxXVEEh0QSLgQhaPRwUWSCRtYmJKaW5hUIWQSROVriAnjH7VPZWReZbsufcmeTQzLn+Oy+brALw5qqFUOI2tvolpxFORRsQtJwOOCRQwgyQxuMRuQxOJxBwpuMJqRBKQYWdTqJXy7YAD1q7e08WNrOluqo2crA0U/MLEdnEnr6dnr1RzILqkcwzlMQSUriqoyOsE46bAbAdYMwfAOAPUFQZg+gbwe48x5DYPmOANB/B6DyHmPQd4/o7BgfscxmMx9hoHcfUaBhvIgiIjIYG0ZDiMg3iEjUf6DUNg+Q3DuNw6DkNY0jcNQ1jQMhyHyH1HoYGQ6jmOQ0joO47jUajaMDeNR4DA2jkN47x/hmNw9w0BsHMfA4DiOI7DyNY1j7DmNI0DsOoPY8jSOwPQ7j2NhkxEMGA6DA9jMcxmMxxHkdB

In [109]:
# Overrides `sdata` with a fixed Base64 string; the next cell Base64-decodes it,
# bz2-decompresses it and parses the result back into a readings list.
sdata = 'QlpoOTFBWSZTWZGlX9gAAPUbgEDlf+AACooj3wAwAPigjRENp6oAANAxgAAAABElMp5TUyYEaYEsiPVOwzuNgTpFdecm6qqq31XGq6cbj0tCQkJOKBwEMjsdgyYAlDQchoNTOsENJem9qOQhoVzodoGhtN1b2ENWrhoiAAYpNC8oaMjMxKq7gSYziUBJcmMC6FUQyBbIhjAheWupMSTMzWAcWkevD0PsYD2HwND8GAPwfQ9x9jQ7ByPsbj8GCGBDUfAh6BoH/F3JFOFCQkaVf2A='

In [110]:
import ast

# Restore the Base64 string to a readings array:
# Base64 text -> bz2 bytes -> UTF-8 text -> Python list of tuples.
data_compressed = base64.b64decode(sdata)
restored_text = bz2.decompress(data_compressed).decode('utf-8')

# SECURITY: this was previously `eval(...)`. eval() executes arbitrary Python,
# which is unsafe for text that (in the intended use) arrives in a Note
# payload. ast.literal_eval() only accepts Python literals (lists, tuples,
# numbers, strings, ...), which is all a readings array contains.
# It needs a str rather than bytes, hence the explicit decode above.
# The decompressed bytes are also kept out of the global `data`, which the
# compression-ratio cells use as their uncompressed baseline.
readings_restored = ast.literal_eval(restored_text)
readings_restored

[(1670729849.7, 'test_version', 3.8),
 (1670729849.7, 'test_uptime', 27055.0),
 (1670729552.2, 'test_cpu_temp', 53.471),
 (1670730449.8, 'test_version', 3.8),
 (1670730449.8, 'test_uptime', 27655.0),
 (1670730152.2, 'test_cpu_temp', 53.191),
 (1670731049.7, 'test_version', 3.8),
 (1670731049.7, 'test_uptime', 28255.0),
 (1670730752.2, 'test_cpu_temp', 53.142),
 (1670731649.7, 'test_version', 3.8),
 (1670731649.7, 'test_uptime', 28855.0),
 (1670731352.2, 'test_cpu_temp', 53.175),
 (1670732249.8, 'test_version', 3.8),
 (1670732249.8, 'test_uptime', 29455.0),
 (1670731952.2, 'test_cpu_temp', 52.789),
 (1670732849.7, 'test_version', 3.8),
 (1670732849.7, 'test_uptime', 30055.0),
 (1670732552.2, 'test_cpu_temp', 52.533)]

## Another Compression Method

Use a 1-byte number to identify the sensor ID (limit of 256 IDs in one batch of readings),
express timestamp as a delta from a base timestamp.  The delta would be expressed in tenths
of a second; since less than one hour of data is transmitted, we only need the numbers
0 - 36,000 to express the delta, which fits in a 2-byte unsigned integer.  The value would be
expressed as a 4-byte, single-precision floating point value.

Total record size is therefore 7 bytes.  Just build a large bytearray of 7-byte records.
Compress with bz2 and then encode as a base64 string.

In [74]:
# Size in bytes of the plain-text (UTF-8) form of the readings, for reference.
raw_str_data = bytes(str(readings), 'utf-8')
len(raw_str_data)

4494

In [75]:
# Scratch check of how a bytearray grows: start from two single byte values,
# then extend it with a bytes literal.
recs = bytearray([4, 5])
recs.extend(b'abc')
recs

bytearray(b'\x04\x05abc')

In [86]:
# Split the (timestamp, sensor_id, value) tuples into three parallel tuples.
ts_arr, id_arr, val_arr = zip(*readings)

# create dictionary of all unique sensor IDs mapped to a sensor integer.
# dict.fromkeys() removes duplicates while keeping first-seen order, so the
# same readings always yield the same ID -> integer assignment. Iterating a
# set() of strings does not guarantee that: its order depends on string hash
# randomization and can change between kernel restarts.
ids_uniq = list(dict.fromkeys(id_arr))
sensor_map = {sensor_id: idx for idx, sensor_id in enumerate(ids_uniq)}

# get the minimum ts to use as a base
ts_base = min(ts_arr)


In [81]:
# need specify Byte order to avoid automatic padding
import struct
rec = struct.pack('<HBd', 36000, 255, 2.434e-5)
struct.unpack('<HBd', rec)

(36000, 255, 2.434e-05)

In [82]:
# Pack every reading into a fixed-width binary record and concatenate them.
#
# Need to use a Double for the val field in order to accommodate counters.
# Format '<HBd': the '<' byte-order prefix avoids automatic padding;
#   H = timestamp delta from ts_base in tenths of a second (unsigned 2 bytes)
#   B = integer sensor ID from sensor_map (unsigned 1 byte)
#   d = reading value (8-byte double)
# giving 11 bytes per record.
rec_struct = struct.Struct('<HBd')
recs = b''.join(                        # one join instead of repeated `bytes +=`
    rec_struct.pack(
        # round(), not int(): the float difference can land just below the
        # intended whole number (e.g. 5999.9999...), and int() would truncate
        # that to one tenth too few.
        round((rd_ts - ts_base) * 10),
        sensor_map[rd_id],
        rd_val,
    )
    # Generator-local names, so the global `ts` set by the readings cell is
    # not overwritten as it was by the former `for ts, ... in readings` loop.
    for rd_ts, rd_id, rd_val in readings
)
len(recs)

990

In [83]:
# bz2 makes the packed byte array **bigger** (1090 bytes out vs 990 bytes in
# for the recorded run).  So this approach is worse than simply compressing
# the string representation of the readings array, and more complicated.
recs_bz2 = bz2.compress(recs)
len(recs_bz2), len(recs)

(1090, 990)

In [93]:
# Does swapping the string sensor IDs for their small integers help the
# original method (bz2 over the string representation)?
id_int_arr = [sensor_map[sensor_id] for sensor_id in id_arr]
readings_2 = list(zip(ts_arr, id_int_arr, val_arr))

readings_2_bz2 = bz2.compress(str(readings_2).encode('utf-8'))
readings_bz2_baseline = bz2.compress(str(readings).encode('utf-8'))
print('Integer IDs', len(readings_2_bz2))
print('String IDs', len(readings_bz2_baseline))

# ***BUT*** the Sensor Map has to be sent along with this payload.  Author's
# recorded result: the total was 1189 bytes, versus 1284 bytes for simply
# compressing the unaltered string version of the readings array.  So the
# savings are really very little and not worth the additional complexity.

Integer IDs 565
String IDs 961


In [94]:
# What about also substituting ts deltas in tenths of a second (relative to
# ts_base) for the absolute timestamps?
# round() rather than int(): the float product can fall just below the
# intended whole number (e.g. 5999.9999...), which int() would truncate to a
# delta that is one tenth too small.
ts_delta_arr = [round(10 * (reading_ts - ts_base)) for reading_ts in ts_arr]
readings_3 = list(zip(ts_delta_arr, id_int_arr, val_arr))
print('Integer IDs + Delta ts', len(bz2.compress(str(readings_3).encode('utf-8'))))
# Very little additional compression


Integer IDs + Delta ts 540


In [98]:
# How does this approach behave for a small array?  Build a sample body with
# just two readings and measure the text length of its readings list.
d = dict(
    storeKey="124343abc",
    readings=[
        [6542342.2, "abc123", 23.4],
        [6542344.8, "xyz_456", 33.4],
    ],
)
len(str(d['readings']))

59

In [99]:
# The same two readings with the string sensor IDs replaced by integers.
small_rd = [[6542342.2, 0, 23.4], [6542344.8, 1, 33.4]]
small_rd_bz2 = bz2.compress(str(small_rd).encode('utf-8'))
print('Small Compressed', len(small_rd_bz2))

# not terrible, except do need to add the sensor map into the body.


Small Compressed 65


In [100]:
# Repeat the integer-sensor-ID experiment with lzma in place of bz2.
import lzma

readings_2_lzma = lzma.compress(str(readings_2).encode('utf-8'))
print('Integer IDs', len(readings_2_lzma))

# a little worse than bz2.


Integer IDs 616


### Experiments

In [112]:
from pathlib import Path
import sys

# Put the directory containing the logger's settings.py at the front of the
# module search path, then display the resulting search path.
stg_path = Path('/boot/pi_logger/settings.py')
sys.path[:0] = [str(stg_path.parent)]
sys.path

['/boot/pi_logger',
 '/home/alan/notecard-server/test',
 '/home/alan/anaconda3/lib/python39.zip',
 '/home/alan/anaconda3/lib/python3.9',
 '/home/alan/anaconda3/lib/python3.9/lib-dynload',
 '',
 '/home/alan/anaconda3/lib/python3.9/site-packages',
 '/home/alan/anaconda3/lib/python3.9/site-packages/IPython/extensions',
 '/home/alan/.ipython']