In [9]:
import numpy as np
import numba
import itertools
import os
import sys
import deepdish as dd

# Speed/Memory Tradeoff for using Python ints vs. Numpy.uint8

In [56]:
# Baseline: time to evaluate a plain Python int literal.
%timeit 5

6.36 ns ± 0.086 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [57]:
# Size in bytes that sys.getsizeof reports for a plain Python int object.
sys.getsizeof(5)

28

In [58]:
# Time to construct a NumPy uint8 scalar from a Python int.
%timeit np.uint8(5)

260 ns ± 5.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [59]:
# Size in bytes that sys.getsizeof reports for a NumPy uint8 scalar object.
sys.getsizeof(np.uint8(5))

25

All those bytes are going to add up, but no one will ever notice the time.

I'm still unsure about this. The speed difference should pretty much always take precedence, EXCEPT when the data is being stored.

In [60]:
# Baseline: addition of two Python int literals.
# NOTE(review): CPython may constant-fold `2 + 2` at compile time, so this
# probably measures little more than loop overhead -- confirm before relying on it.
%timeit 2 + 2

9.33 ns ± 0.231 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [61]:
# Mixed addition: Python int + NumPy scalar.
# NOTE(review): this uses np.uint (NumPy's default unsigned integer), not
# np.uint8 like the rest of this section -- confirm whether np.uint8 was intended.
%timeit 2 + np.uint(2)

3.28 µs ± 81.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [78]:
# Addition of two freshly constructed uint8 scalars; the timing includes
# both np.uint8(...) constructor calls as well as the addition itself.
%timeit np.uint8(2) + np.uint8(2)

565 ns ± 11.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


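Pretty sure addition will never be needed, but if arithmetic is done, it is clearly better to keep both operands as `np.uint8`s than to mix a Python int with a NumPy scalar (plain Python ints are still the fastest by far).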

# Initial card array building

In [69]:
# Build a 5-element uint8 array from a Python list literal.
%timeit np.array([0,0,0,0,0], dtype=np.uint8)

1.78 µs ± 22.9 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [71]:
# Same list literal, but with NumPy's default integer dtype.
%timeit np.array([0,0,0,0,0], dtype=int)

1.66 µs ± 22.2 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [34]:
# Build the same 5-element uint8 array with np.zeros (Python int size argument).
%timeit np.zeros(5, dtype=np.uint8)

1.08 µs ± 5.48 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [45]:
# As above, but the size is passed as a NumPy scalar instead of a Python int.
# NOTE(review): the scalar is np.uint, not np.uint8 -- confirm which was intended.
%timeit np.zeros(np.uint(5), dtype=np.uint8)

1.6 µs ± 6.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [63]:
# NumPy-scalar size argument with the default integer dtype.
%timeit np.zeros(np.uint(5), dtype=int)

1.51 µs ± 9.59 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [64]:
# Python int size argument with the default integer dtype.
%timeit np.zeros(5, dtype=int)

999 ns ± 7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


There is no point in passing NumPy scalars (the cells above used `np.uint(5)`) as the size argument: a plain Python int is faster for either dtype.

In [76]:
# Size in bytes reported for a 5-element uint8 array (array object + data).
sys.getsizeof(np.zeros(5, dtype=np.uint8))

101

In [77]:
# Size in bytes reported for a 5-element array of the default integer dtype.
sys.getsizeof(np.zeros(5, dtype=int))

136

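I'd say the size difference between the int types (101 vs. 136 bytes for a 5-card array) is worth the small difference in build time: about 80 ns for `np.zeros(5, ...)`, and at most ~600 ns even against the slowest variant measured above.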

# Hashing

In [137]:
# Seed the legacy global RNG so the sample hand is the same on every run,
# then draw five values from 1..52 inclusive (the upper bound is exclusive).
np.random.seed(12341234)
arr = np.random.randint(1, 53, size=5)

In [138]:
# Candidate 1: hash the array's printed text form (e.g. "[ 1  2 ...]").
%timeit hash(str(arr))

43.9 µs ± 367 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [139]:
%timeit hash(arr.tostring())

170 ns ± 1.48 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [140]:
# Candidate 3: hash the text form of the array's buffer object.
# NOTE(review): `arr.data` is a memoryview, and str() of a memoryview is its
# repr (something like "<memory at 0x...>"), not the buffer contents -- so this
# does not look like a content-based hash; confirm before using it as a key.
%timeit hash(str(arr.data))

926 ns ± 6.92 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [141]:
%timeit hash(str(arr.data.tobytes))

1.21 µs ± 16.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


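`.tostring()` is the winner. (Note: `.tostring()` is a deprecated alias of `.tobytes()`; prefer `.tobytes()` in new code — it returns exactly the same bytes.)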

## Prove the hashes are completely collision-free

In [43]:
# Maps hash value -> the hand (uint8 array) that produced it.
combo_dict = dict()

In [44]:
# Five 0 entries followed by the card values 1..52 (57 entries in total).
cards = [0, 0, 0, 0, 0, *range(1, 53)]

In [45]:
# All 5-element combinations of `cards`. Because `cards` contains five 0s,
# the same tuple can be produced more than once (hence "with_repeats").
# NOTE: itertools.combinations returns a lazy, one-shot iterator -- once it
# has been consumed, this cell must be re-run to get the hands again.
possible_hands_with_repeats = itertools.combinations(cards, 5)

In [46]:
# De-duplicated set of 5-card hands (tuples) drawn from `cards`.
# The combinations are generated here instead of reusing the iterator built in
# an earlier cell: that iterator is one-shot, so re-running this cell after it
# had been consumed would silently produce an empty set and make the collision
# check below trivially "pass".
possible_hands = set(itertools.combinations(cards, 5))

In [47]:
# Check that no two distinct hands share the same hash of their raw uint8 bytes.
# On the first collision, both hands are printed and the loop stops; if nothing
# is printed, every hand hashed to a distinct value.
#
# NOTE: hash() of a bytes object is salted per interpreter process (unless
# PYTHONHASHSEED is fixed), so this result only holds for the current session.

# Start from an empty table so re-running this cell does not report every hand
# as "colliding" with the copy of itself stored by the previous run.
combo_dict.clear()

for hand in possible_hands:
    curr_hand = np.array(hand, dtype=np.uint8)
    # tobytes() replaces the deprecated alias ndarray.tostring(); same bytes.
    curr_hash = hash(curr_hand.tobytes())
    if curr_hash in combo_dict:
        print(f"already in: {curr_hash}, {combo_dict[curr_hash]}")
        print(f"to insert: {curr_hash}, {curr_hand}")
        break
    combo_dict[curr_hash] = curr_hand

# Space Differences of .h5 files using `int`s vs. `np.uint8`s

In [5]:
# Dictionary of 10000 plain Python ints (each key maps to itself), written to
# disk with deepdish so its file size can be measured.
int_dict = {}
for key in range(10000):
    int_dict.update({key: key})
dd.io.save("int_dict.h5", int_dict)

In [15]:
# On-disk size, in bytes, of the file holding the Python-int dictionary.
a = os.stat("int_dict.h5").st_size
a

1114072

In [7]:
# Same experiment as the Python-int dictionary, but with np.uint8 values.
#
# uint8 can only hold 0..255, so the value is wrapped explicitly. Calling
# np.uint8(i) with i >= 256 relied on silent overflow in older NumPy and raises
# OverflowError in NumPy 2.x. `i % 256` stores exactly the values the old
# wrap-around produced, so the saved data is unchanged where the old code ran.
uint8_dict = {}
for i in range(10000):
    uint8_dict[i] = np.uint8(i % 256)
# NOTE: saved without an ".h5" suffix (unlike "int_dict.h5"); the name is kept
# as-is because the size measurement reads this exact path.
dd.io.save("uint8_dict", uint8_dict)

In [16]:
# On-disk size, in bytes, of the file holding the np.uint8 dictionary.
b = os.stat("uint8_dict").st_size
b

1204784

In [17]:
# Python-int file size minus np.uint8 file size, in bytes;
# a negative value means the Python-int file is the smaller one.
a - b

-90712

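Normal Python ints are actually smaller on disk: the `.h5` file holding plain ints is about 90 KB (90,712 bytes) smaller than the one holding `np.uint8` values.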

# Misc

In [38]:
# Example hand stored as a 5-element uint8 array.
a = np.array((1, 3, 5, 13, 51), dtype="uint8")

In [39]:
# Display the array to confirm its values and dtype.
a

array([ 1,  3,  5, 13, 51], dtype=uint8)

In [40]:
# Raw byte representation of the hand: one byte per card for a uint8 array.
# `tobytes()` replaces `tostring()`, a deprecated alias that returns the same
# bytes object.
a.tobytes()

b'\x01\x03\x05\r3'

In [29]:
# The same hand as a plain Python list, for comparison with the array form.
b = [1, 3, 5, 13, 51]

In [31]:
# Text form of the list, i.e. what a str()-based hash key would be built from.
str(b)

'[1, 3, 5, 13, 51]'