## Data Types ##
```
Name   | Number of bytes | Description
------------------------------------
Int    |               4 | integer (little-endian)
String |        Variable | (See Note 1)
```

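Note 1: A String has three parts. The first is a single indicator byte: either 0x00, meaning the next two parts are not present (the string is empty), or 0x0b (decimal 11), meaning the next two parts are present. If the indicator is 0x0b, it is followed by a ULEB128 value giving the byte length of the string, and then by the string itself, encoded in UTF-8. ULEB128 (unsigned LEB128) is a variable-length integer encoding: each byte carries 7 bits of the value, least-significant group first, and the high bit is set on every byte except the last.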

## collection.db format ##
### Header ###
```
Data type | Description
-----------------------------------
Int       | Version (e.g. 20150203)
Int       | Number of collections
```
### Per collection ###
```
Data type | Description
-------------------------------------------------------------------------------------
String    | Name of the collection
Int       | Number of beatmaps in the collection
String*   | Beatmap MD5 hash. Repeated for as many beatmaps as are in the collection.
```

In [8]:
# Location of the osu! collection database that the cells below parse.
# Written as a raw string so the Windows backslashes need no escaping.
osu_collection_file = r'C:\Users\Ar\AppData\Local\osu!\collection.db'

In [114]:
def get_uleb128(content):
    """Decode one unsigned LEB128 integer (at most 32 bits) from *content*.

    Decoding starts at index 0. Each byte contributes its low 7 bits,
    least-significant group first; a clear high bit marks the final byte.
    Bytes after the final one are ignored. The input is echoed to stdout
    as a debugging trace.

    Args:
        content: An indexable sequence of integer byte values
            (e.g. ``bytes`` or ``bytearray``).

    Returns:
        A ``(size, value)`` tuple: the number of bytes consumed and the
        decoded integer.

    Raises:
        ValueError: If *content* is empty or ends before the final byte,
            if the encoding is still unterminated after five bytes, or if
            the fifth byte carries bits that do not fit in 32 bits.
    """
    print('get_uleb128: content:', content)
    value = 0
    # A 32-bit value never needs more than five 7-bit groups.
    for i, byte in enumerate(content[:5]):
        value |= (byte & 0x7f) << (i * 7)
        if byte & 0x80:
            continue  # continuation bit set: more bytes follow
        # Only the low 4 bits of a fifth byte fit into a 32-bit result.
        if i == 4 and byte & 0xf0:
            raise ValueError('uleb128 number does not fit in 32 bits')
        return i + 1, value
    # Ran out of input, or five bytes went by without a terminating byte.
    raise ValueError('truncated or unterminated uleb128 number')

In [100]:
#uleb128 decoder!
def uleb128_value(m, off):
    """Decode an unsigned LEB128 integer from *m*, starting at index *off*.

    At most five bytes are consumed. The first four contribute their low
    7 bits each; a fifth byte, if reached, is shifted in whole at bit 28
    and ends the decoding regardless of its high bit. The result is
    echoed to stdout as a debugging trace.

    Args:
        m: The encoded data. May be a ``str`` of single-character byte
            values or a bytes-like object (``bytes``, ``bytearray``).
        off: Index in *m* of the first encoded byte.

    Returns:
        A ``(result, size)`` tuple: the decoded integer and the number of
        bytes consumed. Note the order is the reverse of ``get_uleb128``.

    Raises:
        IndexError: If *m* ends before the encoding is complete.
    """
    def byte_at(index):
        # Indexing bytes/bytearray already yields an int on Python 3;
        # indexing a str yields a one-character string that needs ord().
        item = m[index]
        return item if isinstance(item, int) else ord(item)

    result = 0
    size = 0
    while True:
        cur = byte_at(off + size)
        if size < 4:
            result |= (cur & 0x7f) << (7 * size)
        else:
            # Fifth byte: taken as-is, no continuation handling after it.
            result |= cur << 28
        size += 1
        if size == 5 or cur <= 0x7f:
            break
    print('result:%x, size:%x' % (result, size))
    return result, size

In [101]:
def read_int(f):
    """Read the next four bytes of *f* as an unsigned little-endian integer.

    If fewer than four bytes remain, whatever was read is still converted
    (an empty read yields 0).
    """
    raw = f.read(4)
    return int.from_bytes(raw, 'little')

In [112]:
def read_string(f):
    """Read one osu! ``String`` value from the binary stream *f*.

    Layout: an indicator byte (0x00 = no string follows, 0x0b = string
    follows); when present, a ULEB128 byte length and then that many bytes
    of UTF-8 text. Each step is traced to stdout.

    Args:
        f: A binary file-like object positioned at the indicator byte.

    Returns:
        The decoded text, or ``''`` when the indicator byte is 0x00.

    Raises:
        ValueError: If the indicator byte is neither 0x00 nor 0x0b
            (including end of file), or if the stream ends inside the
            length prefix or the string payload.
        UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    print('read_string:')
    # part 1: indicator byte - 0x00 = next two parts are not present, or 0x0b = next two parts are present
    coll_name_indicator = f.read(1)
    print('  indicator:', coll_name_indicator)
    if coll_name_indicator == b'\x00':
        return ''
    if coll_name_indicator != b'\x0b':
        raise ValueError('unexpected string indicator byte: %r' % (coll_name_indicator,))

    # part 2: ULEB128 - representing the byte length of the following string.
    # The prefix is variable-length (strings of 128+ bytes need more than one
    # byte), so keep reading while the continuation bit (0x80) is set.
    # get_uleb128 decodes at most five bytes, so stop there.
    length_bytes = bytearray()
    while True:
        chunk = f.read(1)
        if not chunk:
            raise ValueError('unexpected end of file inside string length')
        length_bytes += chunk
        if not chunk[0] & 0x80 or len(length_bytes) == 5:
            break
    coll_name_len = get_uleb128(bytes(length_bytes))[1]
    print('  length:', coll_name_len)

    # part 3: string
    coll_name_bytes = f.read(coll_name_len)
    if len(coll_name_bytes) != coll_name_len:
        raise ValueError('unexpected end of file inside string data')
    print('  bytes:', coll_name_bytes)
    coll_name_str = str(coll_name_bytes, 'utf-8')
    print('  string:', coll_name_str)
    return coll_name_str

In [119]:
# Parse collection.db and print its contents: a header (version, number of
# collections) followed by one record per collection (name, beatmap count,
# then one MD5 hash string per beatmap).
# The reads below consume the stream sequentially, so their order must match
# the on-disk layout.
with open(osu_collection_file, "rb") as f:
    
    # header
    
    # read version
    version = read_int(f)
    print('version:', version)
    
    # read no. of collections
    coll_count = read_int(f)
    print('no. of collections:', coll_count)
    
    # collection records, one per iteration
    for c in range(coll_count):
        # read name
        coll_name = read_string(f)
        print('collection name:', coll_name)
        # read no. of beatmaps in collection
        coll_beatmap_count = read_int(f)
        print('no. of beatmaps:', coll_beatmap_count)
        
        for b in range(coll_beatmap_count):
            # read beatmap MD5 hash (stored in the same String format as the name)
            beatmap_md5 = read_string(f)
            print('#', b, 'beatmap MD5:', beatmap_md5)

version: 20190207
no. of collections: 8
read_string:
  indicator: b'\x0b'
get_uleb128: content: b'\x06'
  length: 6
  bytes: b'_Attic'
  string: _Attic
collection name: _Attic
no. of beatmaps: 63
read_string:
  indicator: b'\x0b'
get_uleb128: content: b' '
  length: 32
  bytes: b'b0140f0a92d5aee17239584b3eb20352'
  string: b0140f0a92d5aee17239584b3eb20352
# 0 beatmap MD5: b0140f0a92d5aee17239584b3eb20352
read_string:
  indicator: b'\x0b'
get_uleb128: content: b' '
  length: 32
  bytes: b'a1fe604eeed945a2557fbdc4c9b624ba'
  string: a1fe604eeed945a2557fbdc4c9b624ba
# 1 beatmap MD5: a1fe604eeed945a2557fbdc4c9b624ba
read_string:
  indicator: b'\x0b'
get_uleb128: content: b' '
  length: 32
  bytes: b'0fb944a47b004cc03ceac0774ff78987'
  string: 0fb944a47b004cc03ceac0774ff78987
# 2 beatmap MD5: 0fb944a47b004cc03ceac0774ff78987
read_string:
  indicator: b'\x0b'
get_uleb128: content: b' '
  length: 32
  bytes: b'd9a6ee05d23b026f3f0f627ea976e601'
  string: d9a6ee05d23b026f3f0f627ea976e601
# 3 be