In [42]:
from pyln.proto.primitives import varint_decode
import bz2
from binascii import hexlify

In [41]:
import click
import bz2
from pyln.proto.primitives import varint_decode

from pathlib import Path
import struct


In [63]:
from binascii import hexlify


import io
import struct
import ipaddress


class ChannelAnnouncement(object):
    def __init__(self):
        self.num_short_channel_id = None
        self.node_signatures = [None, None]
        self.bitcoin_signatures = [None, None]
        self.features = None
        self.chain_hash = None
        self.node_ids = [None, None]
        self.bitcoin_keys = [None, None]

    @property
    def short_channel_id(self):
        return "{}x{}x{}".format(
            (self.num_short_channel_id >> 40) & 0xFFFFFF,
            (self.num_short_channel_id >> 16) & 0xFFFFFF,
            (self.num_short_channel_id >> 00) & 0xFF,
        )

    def __eq__(self, other):
        return (
            self.num_short_channel_id == other.num_short_channel_id
            and self.bitcoin_keys == other.bitcoin_keys
            and self.chain_hash == other.chain_hash
            and self.node_ids == other.node_ids
            and self.features == other.features
        )

    def serialize(self):
        raise ValueError()

    def __str__(self):
        na = hexlify(self.node_ids[0]).decode("ASCII")
        nb = hexlify(self.node_ids[1]).decode("ASCII")
        return "ChannelAnnouncement(scid={short_channel_id}, nodes=[{na},{nb}])".format(
            na=na, nb=nb, short_channel_id=self.short_channel_id
        )

    def __json__(self):
        return {
            '_type': 'channel_announcement',
            'scid': self.short_channel_id,
            'features': hexlify(self.features).decode('ASCII'),
            'node_id_1': hexlify(self.node_ids[0]).decode('ASCII'),
            'node_id_2': hexlify(self.node_ids[1]).decode('ASCII'),
            'chain_hash': hexlify(self.chain_hash).decode('ASCII'),
        }


class ChannelUpdate(object):
    def __init__(self):
        self.signature = None
        self.chain_hash = None
        self.num_short_channel_id = None
        self.timestamp = None
        self.message_flags = None
        self.channel_flags = None
        self.cltv_expiry_delta = None
        self.htlc_minimum_msat = None
        self.fee_base_msat = None
        self.fee_proportional_millionths = None
        self.htlc_maximum_msat = None

    def __json__(self):
        return {
            '_type': 'channel_update',
            'scid': self.short_channel_id,
            'timestamp': self.timestamp,
            'message_flags': hexlify(self.message_flags).decode('ASCII'),
            'channel_flags': hexlify(self.channel_flags).decode('ASCII'),
            'cltv_expiry_delta': self.cltv_expiry_delta,
            'htlc_minimum_msat': self.htlc_minimum_msat,
            'fee_base_msat': self.fee_base_msat,
            'fee_proportional_millionths': self.fee_proportional_millionths,
            'htlc_maximum_msat': self.htlc_maximum_msat,
            'chain_hash': hexlify(self.chain_hash).decode('ASCII'),
        }

    @property
    def short_channel_id(self):
        return "{}x{}x{}".format(
            (self.num_short_channel_id >> 40) & 0xFFFFFF,
            (self.num_short_channel_id >> 16) & 0xFFFFFF,
            (self.num_short_channel_id >> 00) & 0xFF,
        )

    @property
    def direction(self):
        (b,) = struct.unpack("!B", self.channel_flags)
        return b & 0x01

    def serialize(self):
        raise ValueError()

    def __str__(self):
        return "ChannelUpdate(scid={short_channel_id}, timestamp={timestamp})".format(
            timestamp=self.timestamp, short_channel_id=self.short_channel_id
        )

    def __eq__(self, other):
        return (
            self.chain_hash == other.chain_hash
            and self.num_short_channel_id == other.num_short_channel_id
            and self.timestamp == other.timestamp
            and self.message_flags == other.message_flags
            and self.channel_flags == other.channel_flags
            and self.cltv_expiry_delta == other.cltv_expiry_delta
            and self.htlc_minimum_msat == other.htlc_minimum_msat
            and self.fee_base_msat == other.fee_base_msat
            and self.fee_proportional_millionths == other.fee_proportional_millionths
            and self.htlc_maximum_msat == other.htlc_maximum_msat
        )


class Address(object):
    def __init__(self, typ=None, addr=None, port=None):
        self.typ = typ
        self.addr = addr
        self.port = port

    def __eq__(self, other):
        return (
            self.typ == other.typ
            and self.addr == other.addr
            and self.port == other.port
        )

    def __len__(self):
        l = {
            1: 6,
            2: 18,
            3: 12,
            4: 37,
        }
        return l[self.typ] + 1

    def __str__(self):
        addr = self.addr
        if self.typ == 1:
            addr = ".".join([str(c) for c in addr])

        protos = {
            1: "ipv4",
            2: "ipv6",
            3: "torv2",
            4: "torv3",
        }

        proto = protos.get(self.typ, 'UNKNOWN')
        return f"{proto}://{addr}:{self.port}"


class NodeAnnouncement(object):
    def __init__(self):
        self.signature = None
        self.features = ""
        self.timestamp = None
        self.node_id = None
        self.rgb_color = None
        self.alias = None
        self.addresses = None

    def __str__(self):
        return "NodeAnnouncement(id={hexlify(node_id)}, alias={alias}, color={rgb_color})".format(
            node_id=self.node_id, alias=self.alias, rgb_color=self.rgb_color
        )

    def __eq__(self, other):
        return (
            self.features == other.features
            and self.timestamp == other.timestamp
            and self.node_id == other.node_id
            and self.rgb_color == other.rgb_color
            and self.alias == other.alias
        )

    def __json__(self):
        return {
            '_type': 'node_announcement',
            'features': self.features.hex(),
            'timestamp': self.timestamp,
            'node_id': self.node_id.hex(),
            'rgb_color': self.rgb_color.hex(),
            'addresses': []  # TODO Add missing addresses
        }


def parse(b):
    if not isinstance(b, io.BytesIO):
        b = io.BytesIO(b)
    (typ,) = struct.unpack("!H", b.read(2))

    parsers = {
        256: parse_channel_announcement,
        257: parse_node_announcement,
        258: parse_channel_update,
        3503: parse_ignore,
    }

    if typ not in parsers:
        raise ValueError("No parser registered for type {typ}".format(typ=typ))

    return parsers[typ](b)


def parse_ignore(b):
    return None


def parse_channel_announcement(b):
    if not isinstance(b, io.BytesIO):
        b = io.BytesIO(b)

    ca = ChannelAnnouncement()
    ca.node_signatures = (b.read(64), b.read(64))
    ca.bitcoin_signatures = (b.read(64), b.read(64))
    (flen,) = struct.unpack("!H", b.read(2))
    ca.features = b.read(flen)
    ca.chain_hash = b.read(32)[::-1]
    (ca.num_short_channel_id,) = struct.unpack("!Q", b.read(8))
    ca.node_ids = (b.read(33), b.read(33))
    ca.bitcoin_keys = (b.read(33), b.read(33))
    return ca


def parse_channel_update(b):
    if not isinstance(b, io.BytesIO):
        b = io.BytesIO(b)

    cu = ChannelUpdate()
    cu.signature = b.read(64)
    cu.chain_hash = b.read(32)[::-1]
    (cu.num_short_channel_id,) = struct.unpack("!Q", b.read(8))
    (cu.timestamp,) = struct.unpack("!I", b.read(4))
    cu.message_flags = b.read(1)
    cu.channel_flags = b.read(1)
    (cu.cltv_expiry_delta,) = struct.unpack("!H", b.read(2))
    (cu.htlc_minimum_msat,) = struct.unpack("!Q", b.read(8))
    (cu.fee_base_msat,) = struct.unpack("!I", b.read(4))
    (cu.fee_proportional_millionths,) = struct.unpack("!I", b.read(4))
    t = b.read(8)
    if len(t) == 8:
        (cu.htlc_maximum_msat,) = struct.unpack("!Q", t)
    else:
        cu.htlc_maximum_msat = None

    return cu


def parse_address(b):
    if not isinstance(b, io.BytesIO):
        b = io.BytesIO(b)

    t = b.read(1)
    if len(t) != 1:
        return None

    a = Address()
    (a.typ,) = struct.unpack("!B", t)

    if a.typ == 1:
        a.addr = b.read(4)
        (a.port,) = struct.unpack("!H", b.read(2))
    elif a.typ == 2:
        a.addr = b.read(16)
        a.addr = '[' + format(ipaddress.IPv6Address(a.addr)) + ']'
        (a.port,) = struct.unpack("!H", b.read(2))
    elif a.typ == 3:
        a.addr = b.read(10)
        a.addr = to_base_32(a.addr) + '.onion'
        (a.port,) = struct.unpack("!H", b.read(2))
    elif a.typ == 4:
        a.addr = b.read(35)
        a.addr = to_base_32(a.addr) + '.onion'
        (a.port,) = struct.unpack("!H", b.read(2))
    else:
        a.addr = b.getvalue()[1:]
        a.port = None
    return a

# https://github.com/alexbosworth/bolt07/blob/519c94a7837e687bf7478a74779d5ea493a76a44/addresses/encode_base32.js
def to_base_32(addr):
    alphabet = 'abcdefghijklmnopqrstuvwxyz234567'
    byte = 8
    lastIndex = 31
    word = 5
    bits = 0
    base32 = ''
    value = 0

    for char in addr:
        bits += byte
        value = (value << byte) | char

        while bits >= word:
            base32 += alphabet[(value >> (bits - word)) & lastIndex]
            bits -= word

    if bits > 0:
        base32 += alphabet[(value << (word - bits)) & lastIndex]
    
    return base32

def parse_node_announcement(b):
    if not isinstance(b, io.BytesIO):
        b = io.BytesIO(b)

    na = NodeAnnouncement()
    na.signature = b.read(64)
    (flen,) = struct.unpack("!H", b.read(2))
    na.features = b.read(flen)
    (na.timestamp,) = struct.unpack("!I", b.read(4))
    na.node_id = b.read(33)
    na.rgb_color = b.read(3)
    na.alias = b.read(32)
    (alen,) = struct.unpack("!H", b.read(2))
    abytes = io.BytesIO(b.read(alen))
    na.addresses = []
    while True:
        addr = parse_address(abytes)
        if addr is None:
            break
        else:
            na.addresses.append(addr)

    return na


In [None]:
class DatasetStream:
    def __init__(self, file_stream, decode=True):
        self.stream = file_stream
        self.decode = decode

        # Read header
        header = self.stream.read(4)
        assert len(header) == 4
        assert header[:3] == b"GSP"
        assert header[3] == 1

    def __iter__(self):
        return self

    def __next__(self):
        pos = self.stream.tell()
        length = varint_decode(self.stream)

        if length is None:
            raise StopIteration()

        msg = self.stream.read(length)
        if len(msg) != length:
            raise ValueError(
                "Error reading dataset at {pos}: incomplete read of {length} bytes, only got {mlen} bytes".format(
                    pos=pos, length=length, mlen=len(msg)
                )
            )
        if not self.decode:
            return msg

        return parse(msg)



class GossipStore:
    """A gossip_store file allowing streaming of messages.
    """

    def __init__(self, path: Path):
        self.path = path

    

In [62]:
with open('gossip-20201014.gsp.bz2', 'rb') as f:
    print(struct.unpack("!B", f.read(1)))

(66,)


In [60]:
        with open('gossip-20201014.gsp.bz2', 'rb') as f:
            version, = struct.unpack("!B", f.read(1))
            while True:
                hdr = f.read(8)
                if len(hdr) < 8:
                    break

                length, crc = struct.unpack("!II", hdr)
                if version, > 3:
                    f.read(4)  # Throw away the CRC

                # deleted = (length & 0x80000000 != 0)
                # important = (length & 0x40000000 != 0)
                length = length & (~0x80000000) & (~0x40000000)
                msg = f.read(length)
                typ, = struct.unpack("!H", msg[:2])
                if version, <= 3 and typ in [4096, 4097, 4098]:
                    msg = msg[4:]

    


SyntaxError: invalid syntax (Temp/ipykernel_15700/2842176013.py, line 9)

In [None]:
class DatasetFile(click.File):
    def __init__(self, decode=True):
        click.File.__init__(self)
        self.decode = decode

    def convert(self, value, param, ctx):
        f = bz2.open(value, "rb") if value.endswith(".bz2") else open(value, "rb")
        return DatasetStream(f, self.decode)


In [52]:
with bz2.open('gossip-20201014.gsp.bz2', 'rb') as f:
    content = f.read()
    print(varint_decode(f))


None


In [64]:
parse(content[0:10])

ValueError: No parser registered for type 18259

In [66]:
hexlify(content[0:1000]).decode("ASCII")

'47535001fd01b001008896bdc5a13d79d4fc359174ae270c38d92f77d806d4798819e59b62857bfb00749c7ba98fe565a0d8d27f83bac4221916fb9c71d730a32c9a0ab968876fbada261890127472ffb27d8249a3069befa47dcaf6e5a4244ab53e7345dd98be40633f5c347a59d4b8d0f3c28746403e20b62e35435bde44bff4c4005a723ffa1091743181f8eb36e4622f08068aae12f273a483753f08c962f4654a54d5fe48b8ac673bc1a0357b5f08b341da126ee768f7cf3464f8d819c9b1449feefdad6d12d17fb608bdfd4108f8d600881202970cdaf05bdf5b2061e65191167529a2892fff28f8abf29765a0ff51f25c9e78279a90a0bdcbb71e00714a12624b70d6dfcc5700006fe28c0ab6f1b372c1a6a246ae63f74f931e8365e15a089c68d619000000000009b73d0009a5000003864ef025fde8fb587d989186ce6a4a186895ee44a926bfc370e2c366597a3f8f03b31e5bbf2cdbe115b485a2b480e70a1ef3951a0dc6df4b1232e0e56f3dce18d6039de04ba5d35eb2daa426fd386befd6816aa5b12461f059a2ea69009472423ae2024e2179a7e81758df35e03dbcc38e5ecac57252b97405a96e65354a0e6021e864fd01b00100641039061fd0132b6713ab6634d2c2bf4227feea07ec4dd94b6885b167a2e96510fce2059e5ec9b28b27100576e2f81959af6bba460a209

In [73]:
hexlify(content[0:1000])

b'47535001fd01b001008896bdc5a13d79d4fc359174ae270c38d92f77d806d4798819e59b62857bfb00749c7ba98fe565a0d8d27f83bac4221916fb9c71d730a32c9a0ab968876fbada261890127472ffb27d8249a3069befa47dcaf6e5a4244ab53e7345dd98be40633f5c347a59d4b8d0f3c28746403e20b62e35435bde44bff4c4005a723ffa1091743181f8eb36e4622f08068aae12f273a483753f08c962f4654a54d5fe48b8ac673bc1a0357b5f08b341da126ee768f7cf3464f8d819c9b1449feefdad6d12d17fb608bdfd4108f8d600881202970cdaf05bdf5b2061e65191167529a2892fff28f8abf29765a0ff51f25c9e78279a90a0bdcbb71e00714a12624b70d6dfcc5700006fe28c0ab6f1b372c1a6a246ae63f74f931e8365e15a089c68d619000000000009b73d0009a5000003864ef025fde8fb587d989186ce6a4a186895ee44a926bfc370e2c366597a3f8f03b31e5bbf2cdbe115b485a2b480e70a1ef3951a0dc6df4b1232e0e56f3dce18d6039de04ba5d35eb2daa426fd386befd6816aa5b12461f059a2ea69009472423ae2024e2179a7e81758df35e03dbcc38e5ecac57252b97405a96e65354a0e6021e864fd01b00100641039061fd0132b6713ab6634d2c2bf4227feea07ec4dd94b6885b167a2e96510fce2059e5ec9b28b27100576e2f81959af6bba460a20

In [68]:
from binascii import unhexlify
unhexlify(content[0:1000])

Error: Non-hexadecimal digit found

In [35]:
content[0:5000]

b'GSP\x01\xfd\x01\xb0\x01\x00\x88\x96\xbd\xc5\xa1=y\xd4\xfc5\x91t\xae\'\x0c8\xd9/w\xd8\x06\xd4y\x88\x19\xe5\x9bb\x85{\xfb\x00t\x9c{\xa9\x8f\xe5e\xa0\xd8\xd2\x7f\x83\xba\xc4"\x19\x16\xfb\x9cq\xd70\xa3,\x9a\n\xb9h\x87o\xba\xda&\x18\x90\x12tr\xff\xb2}\x82I\xa3\x06\x9b\xef\xa4}\xca\xf6\xe5\xa4$J\xb5>sE\xdd\x98\xbe@c?\\4zY\xd4\xb8\xd0\xf3\xc2\x87F@> \xb6.5C[\xdeD\xbf\xf4\xc4\x00Zr?\xfa\x10\x91t1\x81\xf8\xeb6\xe4b/\x08\x06\x8a\xae\x12\xf2s\xa4\x83u?\x08\xc9b\xf4eJT\xd5\xfeH\xb8\xacg;\xc1\xa05{_\x08\xb3A\xda\x12n\xe7h\xf7\xcf4d\xf8\xd8\x19\xc9\xb1D\x9f\xee\xfd\xadm\x12\xd1\x7f\xb6\x08\xbd\xfdA\x08\xf8\xd6\x00\x88\x12\x02\x97\x0c\xda\xf0[\xdf[ a\xe6Q\x91\x16u)\xa2\x89/\xff(\xf8\xab\xf2\x97e\xa0\xffQ\xf2\\\x9ex\'\x9a\x90\xa0\xbd\xcb\xb7\x1e\x00qJ\x12bKp\xd6\xdf\xccW\x00\x00o\xe2\x8c\n\xb6\xf1\xb3r\xc1\xa6\xa2F\xaec\xf7O\x93\x1e\x83e\xe1Z\x08\x9ch\xd6\x19\x00\x00\x00\x00\x00\t\xb7=\x00\t\xa5\x00\x00\x03\x86N\xf0%\xfd\xe8\xfbX}\x98\x91\x86\xcejJ\x18h\x95\xeeD\xa9&\xbf\xc3p\xe2\xc3fYz?\x8f\x03\xb3

In [39]:
len(content)

1339792490

In [72]:
content[1]

83

In [21]:
print(type(content))

<class 'bytes'>


In [22]:
for i in content[0:10]:
    print(i)

71
83
80
1
253
1
176
1
0
136


In [19]:
int(content[0:10])

ValueError: invalid literal for int() with base 10: b'GSP\x01\xfd\x01\xb0\x01\x00\x88'

In [None]:
from pyln.proto.primitives import varint_decode
import bz2

def read_dataset(filename: str):
    with bz2.open(filename, 'rb') as f:
        header = f.read(4)
        print(header[3])
        assert(header[:3] == b'GSP' and header[3] == 1)
        while True:
            length = varint_decode(f)
            msg = f.read(length)
            if len(msg) != length:
                raise ValueError(f"Incomplete message read from {filename}")

            yield msg

In [12]:
str(content)

In [6]:
print(content)