In [2]:
from sys import getsizeof
from collections import Mapping, Container
import array, collections
from tf.fabric import Fabric
from tf.timestamp import Timestamp
tm = Timestamp()

In [3]:
def deep_getsizeof(o, ids):
    """Find the memory footprint of a Python object
 
    This is a recursive function that drills down a Python object graph
    like a dictionary holding nested dictionaries with lists of lists
    and tuples and sets.
 
    The sys.getsizeof function does a shallow size of only. It counts each
    object inside a container as pointer only regardless of how big it
    really is.
 
    :param o: the object
    :param ids:
    :return:
    """
    d = deep_getsizeof
    if id(o) in ids:
        return 0
 
    r = getsizeof(o)
    ids.add(id(o))
 
    if isinstance(o, str):
        return r
 
    if isinstance(o, Mapping):
        return r + sum(d(k, ids) + d(v, ids) for k, v in o.items())
 
    if isinstance(o, Container):
        return r + sum(d(x, ids) for x in o)
 
    return r 

def dgetsizeof(o): return deep_getsizeof(o, set())

In [4]:
def tinf(msg, reset=False):
    if reset:
        tm.indent(reset=True)
    tm.info(msg)

def nbytes(by):
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    result = ''
    for i in range(len(units)):
        if by < 1024 or i == len(units) - 1:
            fmt = '{:>5}{}' if i == 0 else '{:>5.1f}{}'
            return fmt.format(by, units[i])
        by /= 1024

def size(data): return nbytes(dgetsizeof(data))

def sizes(bunch):
    tc = 0
    for x in bunch:
        lc = dgetsizeof(x)
        tc += lc
        print('{:<10}: {:>10}'.format('component', nbytes(lc)))
    print('{:<10}: {:>10}'.format('total', nbytes(tc)))


def fill(n):
    if n == 1000000:
        for i in range(100): yield i
        for i in range(200,300): yield i
        for i in range(500, 1001): yield i
        for i in (1002, 1004, 1006, 1008, 1010, 1012): yield i
        for i in range(1012, 2000): yield 2*i
        for i in range(4000, 1002305): yield i
    elif n == 100000:
        for i in range(100): yield i
        for i in range(200,300): yield i
        for i in range(500, 1001): yield i
        for i in (1002, 1004, 1006, 1008, 1010, 1012): yield i
        for i in range(1012, 2000): yield 2*i
        for i in range(4000, 102305): yield i

def testcases(): return range(1000, 1011)

def check(bunch, getter):
    for i in testcases():
        print('{}={}'.format(repr(i), repr(getter(bunch, i))))

def times(n, bunch, getter):
    tinf('Getting all keys', reset=True)
    for i in range(n):
        x = getter(bunch, i)
    tinf('Done')

def get0(data, i): return data[0].get(i, None)

def experiment(n, data, getter):
    sizes(data)
    check(data, getter)
    times(n, data, getter)

In [5]:
TF = Fabric()

95 features found and 0 ignored


In [6]:
T = TF.load('sp lex')

  0.00s loading features ...
   |     0.08s B otype                from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.90s B oslots               from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.00s M otext                from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.03s B book                 from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.02s B chapter              from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.02s B verse                from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.20s B g_cons               from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.24s B g_cons_utf8          from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.20s B g_voc_lex            from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.36s B g_voc_lex_utf8       from /Users/dirk/github/text-fabric-data/hebrew/etcbc4c
   |     0.31s B g_word        

In [7]:
for ft in sorted(TF.features.keys()):
    print('{:<20} {:>20}'.format(ft, size(TF.features[ft].data)))

__levDown__                       297.9MB
__levUp__                         481.4MB
__levels__                          2.7KB
__order__                          44.2MB
__rank__                           44.2MB
__sections__                        2.7MB
book                                3.4MB
book@am                             6.3KB
book@ar                             5.9KB
book@bn                             6.1KB
book@da                             4.9KB
book@de                             4.9KB
book@el                             6.0KB
book@en                             4.8KB
book@es                             5.2KB
book@fa                             6.0KB
book@fr                             5.1KB
book@he                             5.9KB
book@hi                             6.0KB
book@id                             4.8KB
book@ja                             5.8KB
book@ko                             5.7KB
book@la                             4.8KB
book@nl                           

In [61]:
def optimize1(bd):
    values = []
    keys = array.array('I')
    index = 0
    for i in sorted(bd):
        value = bd[i]
        values.append(value)
        keys.append(index)
        index += len(value)
    return (keys, ''.join(values))

def get1(optD, i):
    (keys, values) = optD
    index = keys[i]
    index2 = keys[i+1] if i+1 < len(keys) else len(values)
    value = values[index:index2]
    return value

In [62]:
def optimize2(bd):
    values = []
    keys = array.array('I')
    lengths = array.array('H')
    index = 0
    for i in sorted(bd):
        value = bd[i]
        values.append(value)
        keys.append(index)
        lval = len(value)
        lengths.append(lval)
        index += lval
    return (keys, lengths, ''.join(values))

def get2(optD, i):
    (keys, lengths, values) = optD
    index = keys[i]
    length = lengths[i]
    return values[index:index+length]

In [63]:
def optimize3(bd):
    keys = {}
    values = []

    valindexes = array.array('I')
    vallengths = array.array('H')
    valindex = 0
    index = 0
    
    for i in sorted(bd):
        keys[i] = index

        value = bd[i]
        lval = len(value)

        values.append(value)
        valindexes.append(valindex)
        valindex += lval
        vallengths.append(lval)
        index +=1
    return (keys, valindexes, vallengths, ''.join(values))

def get3(optD, i):
    (keys, valindexes, vallengths, values) = optD
    index = keys.get(i, None)
    if index == None: return None
    valindex = valindexes[index]
    vallength = vallengths[index]
    return values[valindex:valindex+vallength]

In [90]:
def experiments(n, exclude=set()):
    bunch0 = dict((i, str(i)*10) for i in fill(n))
    experiments = (
        ((bunch0,), get0),
        (optimize1(bunch0), get1),
        (optimize2(bunch0), get2),
        (optimize3(bunch0), get3),
    )
    for (i,x) in enumerate(experiments):
        if i in exclude: continue
        print('Experiment {}'.format(i))
        experiment(n, *x)

In [91]:
experiments(100000, exclude=set())

Experiment 0
component :     18.1MB
total     :     18.1MB
1000='1000100010001000100010001000100010001000'
1001=None
1002='1002100210021002100210021002100210021002'
1003=None
1004='1004100410041004100410041004100410041004'
1005=None
1006='1006100610061006100610061006100610061006'
1007=None
1008='1008100810081008100810081008100810081008'
1009=None
1010='1010101010101010101010101010101010101010'
  0.00s Getting all keys
  0.04s Done
Experiment 1
component :      3.1MB
component :      4.7MB
total     :      7.8MB
1000='2610261026102610261026102610261026102610'
1001='2612261226122612261226122612261226122612'
1002='2614261426142614261426142614261426142614'
1003='2616261626162616261626162616261626162616'
1004='2618261826182618261826182618261826182618'
1005='2620262026202620262026202620262026202620'
1006='2622262226222622262226222622262226222622'
1007='2624262426242624262426242624262426242624'
1008='2626262626262626262626262626262626262626'
1009='2628262826282628262826282628262826282628'
101

In [19]:
x = 1
getsizeof(1)

28

In [24]:
x = array.array('I', range(100))

In [25]:
getsizeof(x)

472

In [22]:
x.append(1)

In [57]:
def in256(n):
    digits = []
    x = n
    for i in range(3):
        digits.append(x%256)
        x = int(x/256)
    return tuple(digits)

In [58]:
in256(1000000)

(64, 66, 15)

In [66]:
dgetsizeof(1000000)

28

In [67]:
x = in256(1000000)

In [68]:
dgetsizeof(x)

156

In [69]:
dgetsizeof(64)

28

In [85]:
x = dict(((i,i) for i in range(1000000)))
y = {}
for i in range(1000000):
    (t1, t2, t3) = in256(i)
    y.setdefault(t1, {}).setdefault(t2, {})[t3] = i

In [86]:
print('x={}\ny={}'.format(nbytes(dgetsizeof(x)), nbytes(dgetsizeof(y))))

x= 74.7MB
y= 83.7MB
