In [39]:
import collections
import sys
import re

# Thought for the day:
# Optimization is the altar where maintainability is sacrificed


In [10]:
# An object is hashable if it has a hash value that never changes over its lifetime.
# It needs a __hash__() method (and an __eq__() method so it can be compared with other keys).
# Atomic immutable types (str, bytes, numeric types) are all hashable.
# Lists aren't hashable because they are mutable: list sets __hash__ to None.

a = (1, 2, 3)  # a tuple is hashable as long as every item inside it is hashable
hash(a)

# hash([1, 2, 4]) would raise TypeError: unhashable type: 'list'

529344067295497451

In [5]:
s = frozenset([1, 2, 4, 5])  # frozenset(iterable) builds an immutable set from the items of the (mutable) list
print(s)
hash(s)  # works because a frozenset is immutable; a plain set is mutable and therefore unhashable

frozenset({1, 2, 4, 5})


-7299030190757708740

In [15]:
# A dictcomp (dictionary comprehension) builds a dict from any iterable of
# key/value pairs; the syntax mirrors that of a listcomp.
nation_data = [
    ('UK', 1),
    ('China', 2),
    ('USA', 3),
]
# Each (country, code) tuple is unpacked into one key/value entry.
nation_dict = {nation: number for nation, number in nation_data}
nation_dict

{'UK': 1, 'China': 2, 'USA': 3}

In [14]:
# Iterating a dict yields its keys, so this frozenset holds only the keys of nation_dict.
# The keys are strings and str hashes are salted per interpreter run, so the number shown changes between sessions.
hash(frozenset(nation_dict))

7811147137430375765

In [19]:
# 'duck typing' (e.g. how dict.update(m) treats its argument): see if there is a keys method; if so use it,
# if not iterate assuming the items are key/value pairs


# setdefault adds the key with the given value if the key is missing; if the key is already present
# it leaves the stored value untouched. Either way it returns the value now stored under the key.
index = {'a': 1, 'b': 2}
index.setdefault('a', 3)  # 'a' already exists, so it keeps the value 1
index.setdefault('c', 7)  # 'c' is missing, so 'c': 7 is added
index

{'a': 1, 'b': 2, 'c': 7}

In [37]:
# defaultdict = dict subclass that takes a factory function; when a missing key is looked up with d[key],
# the factory is called with no arguments to create that key's initial value (existing values are not coerced)

d = collections.defaultdict(list)  # missing keys start out as an empty list

d['red'].append(2)  # 'red' is missing, so list() creates [] first and then 2 is appended to it
d['red'].append(3)
d['orange'].append(3)

d

defaultdict(list, {'red': [2, 3], 'orange': [3]})

In [47]:
class StrKeyDict0(dict):
    """dict that retries a failed lookup with the key converted to ``str``.

    Keys are stored exactly as they are given; only lookups (``d[key]``,
    ``get`` and ``in``) fall back to ``str(key)`` when the key is not found.
    """

    def __missing__(self, key):
        """Handle a key that ``dict.__getitem__`` could not find.

        Raises:
            KeyError: if ``key`` is already a ``str``, or if its ``str`` form
                is missing as well.
        """
        if not isinstance(key, str):
            # Non-string key: retry once with its string form.
            return self[str(key)]
        # A missing str key has no further fallback (this also stops the recursion).
        raise KeyError(key)

    def get(self, key, default=None):
        """Return ``self[key]`` (including the str fallback), or ``default``."""
        try:
            value = self[key]
        except KeyError:
            return default
        return value

    def __contains__(self, key):
        """Return True if ``key`` or ``str(key)`` is one of the stored keys."""
        # Search the keys view rather than ``self``: ``in self`` would call
        # this method again.
        stored_keys = self.keys()
        if key in stored_keys:
            return True
        return str(key) in stored_keys


# Quick check with a plain string key.
a = StrKeyDict0()
a['b'] = 4
print(a['b'])
a.keys()

dict_keys(['b'])

In [80]:
##### types of dict
# collections.OrderedDict: remembers insertion order, so it can be iterated in order as well as looked up by key

# ChainMap: groups one or more dicts/mappings so they can be searched like a single dict; a lookup tries each
# mapping in the order given. Get the keys using list(chain_map_name)
import builtins
# Same search order Python uses for a name: local names, then globals, then builtins.
# NOTE: at the top level of a cell locals() is the same namespace as globals().
pylookup = collections.ChainMap(locals(), globals(), vars(builtins))
print(len(list(pylookup)))  # number of distinct names visible across the chained namespaces
print(list(pylookup)[:10])  # view first 10 items
print(pylookup['nation_data'])  # relies on nation_data having been defined by an earlier cell


311
['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__build_class__', '__import__', 'abs', 'all', 'any']
[('UK', 1), ('China', 2), ('USA', 3)]


In [83]:
## Counter: dict subclass that maps each item to the number of times it has been seen
ct = collections.Counter('abracadabra')  # counts every character of the string
print(ct)
ct.update('aaaa')  # update() adds to the existing counts rather than replacing them
print(ct)
print(ct.most_common(4))  # the 4 highest counts as (item, count) pairs, largest first

Counter({'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1})
Counter({'a': 9, 'b': 2, 'r': 2, 'c': 1, 'd': 1})
[('a', 9), ('b', 2), ('r', 2), ('c', 1)]


In [85]:
# collections.UserDict - behaves like a normal dict, but is a wrapper class that keeps its items in an internal dict, self.data
# UserDict is easier to subclass than the built-in dict, e.g. the StrKeyDict class below, which subclasses it

class StrKeyDict(collections.UserDict):
    """Mapping that stores every key as ``str`` and looks keys up by their ``str`` form."""

    def __missing__(self, key):
        """Handle a key that was not found in ``self.data``.

        Raises:
            KeyError: if ``key`` is already a ``str``, or if its ``str`` form
                is missing as well.
        """
        if not isinstance(key, str):
            # Non-string key: retry once with its string form.
            return self[str(key)]
        # A missing str key has no further fallback (this also stops the recursion).
        raise KeyError(key)

    def __contains__(self, key):
        """Return True if ``str(key)`` is a stored key."""
        text_key = str(key)
        return text_key in self.data

    def __setitem__(self, key, item):
        """Store ``item`` under ``str(key)``."""
        text_key = str(key)
        self.data[text_key] = item


# Quick check with a plain string key.
a = StrKeyDict()
a['b'] = 4
print(a['b'])
print(a.keys())


# UserDict subclasses collections.abc.MutableMapping (a mutable object that supports key/value pairs)

4
KeysView({'b': 4})


In [100]:
fset = frozenset([1, 2, 3, 3, 3])  # takes an iterable and returns an immutable set
print(fset)        # duplicates have been removed
print(set(fset))   # a regular (mutable) set with the same elements
# list(some_set) returns a de-duped list: a popular use for sets (a frozenset is unnecessary for this).
# Sets are unordered, so the order of the resulting list is not guaranteed.
print(list(fset))

# Defining a set this way uses a "set literal".
# According to https://renzolucioni.com/pythons-set-literals/ this is about twice as fast
# as the set constructor, set([list]).
set2 = {1, 4, 3, 2 , 10}
# Searching fset takes on the order of 1 microsecond per value in set2
# (more or less insensitive to the size of fset).
print(fset.intersection(set2))
print(fset.union(set2))

{}  # makes an empty dict, not an empty set




# iterable = any object capable of returning values one at a time

frozenset({1, 2, 3})
{1, 2, 3}
[1, 2, 3]
frozenset({1, 2, 3})
frozenset({1, 2, 3, 4, 10})


{}

In [102]:
# Compare the bytecode generated for the set constructor and for a set literal.
from dis import dis

# dis() prints the disassembly itself and returns None, so it is called
# directly: wrapping it in print() would add a stray "None" line to the output.
dis('set([1])')
print()  # blank line to separate the two listings
dis('{1}')
# The constructor form needs more bytecode operations (look up the name `set`,
# build a temporary list, call the function) than the literal, which builds the
# set directly with BUILD_SET -- so the literal is the cheaper way to initialise.
# The numeric columns are the source line, the bytecode offset and the operand;
# they are not timings.

  1           0 LOAD_NAME                0 (set)
              2 LOAD_CONST               0 (1)
              4 BUILD_LIST               1
              6 CALL_FUNCTION            1
              8 RETURN_VALUE
None
  1           0 LOAD_CONST               0 (1)
              2 BUILD_SET                1
              4 RETURN_VALUE
None


In [108]:
from unicodedata import name

# name() looks up the long Unicode name of a single character.
print(name(chr(100)))  # chr(100) is 'd'
print(name('d'))
# Characters with code points 32-255 that have 'SIGN' in their names; the ''
# default is what name() returns for a character that has no name.
{
    chr(code_point)
    for code_point in range(32, 256)
    if 'SIGN' in name(chr(code_point), '')
}


LATIN SMALL LETTER D
LATIN SMALL LETTER D


{'#',
 '$',
 '%',
 '+',
 '<',
 '=',
 '>',
 '¢',
 '£',
 '¤',
 '¥',
 '§',
 '©',
 '¬',
 '®',
 '°',
 '±',
 'µ',
 '¶',
 '×',
 '÷'}

In [7]:
# A hash table is a sparse array (an array that always has empty cells). The cells in the table are called 'buckets'.
# At least a third of the buckets are kept empty.
# A dict's hash table has a bucket for each item, which contains a reference to the key and a reference to the value.
#
print(hash('adam'))  # str hashes are salted per interpreter run, so this value changes between sessions
print(hash(1.00000001))
print(hash(1.00000002))

# Ideally, objects that are similar but not equal should have hash values that differ hugely

print(bin(hash(1.00000001)))  # view hash codes in binary
print(bin(hash(1.00000002)))
print(bin(hash(1.00000003)))

# Not all bits of the hash value are used to search a dictionary's hash table: especially for
# smaller tables, only some of the bits are used to pick a bucket. If that bucket already holds a
# different key, this is a 'hash collision', in which case further bits of the hash are used to pick
# another bucket and the lookup is tried again. This repeats until the key or an empty bucket is found
# (an empty bucket means the key is absent, which gives a KeyError), so a collision by itself never causes an error.
# perturb in the C implementation is used to shuffle in more hash bits in the event of a hash collision

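# Whenever an item is added to a dict, Python evaluates whether it needs to grow the hash table. This
# means rebuilding the table (and, in older Python versions, possibly reordering the items), so adding or
# removing items while iterating through the dict can go wrong: Python raises RuntimeError if the dict
# changes size during iteration. Build the changes separately and apply them after the loop instead.
# If you need a dict to be ordered, use collections.OrderedDict (since Python 3.7 plain dicts also keep insertion order)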

# For an object to be hashable it needs to support hash() and the == operator, which are implemented by defining:
# __hash__()
# __eq__()   e.g. a == b invokes a.__eq__(b)
# Objects that compare equal must also have the same hash value.





354905103511653659
23058429953
46116860417
0b10101011110011000111011100000000001
0b101010111100110001110111001000000001
0b1000000011011001010110010101000000001


NameError: name 'eq' is not defined

In [None]:
# tuples are more memory efficient than dicts, as they don't have to store a hash table

# defining __slots__ (a class attribute, not a method) lets you change how an object's attributes are stored:
# by default they live in a per-instance dict, which is generally good but not always memory-optimal
# (though with a modern machine, it might take a while before you have to worry about this)

# __missing__() on a dict subclass sets what to do when a d[key] lookup can't find the key

