Anna Mrukwa  
Makrokierunek sem. 6

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Supporting functions

In [None]:
def is_not_prime(number):
  for i in range(2, number//2):
    if number % i == 0:
      return True
  return False

In [None]:
def string_hash(element, size):
  h = 0 
  c = 29
  for s in range(len(element)):
    h = (c*h+s) % size
  return h

def string_hash_2(element, size):
  h = 0
  c1 = 31415
  c2 = 27183
  for s in range(len(element)):
    h = (c1*h+s) % size
    c1 = (c1*c2) % (size-1)
  if h < 0:
    return h + size
  if h == 0:
    return 1
  return h

In [None]:
def dict_hash(element, size):
  for key in element:
    return string_hash(key, size)

def dict_hash_2(element, size):
  for key in element:
    return string_hash_2(key, size)


# Basic hash table

In [None]:
class hash_table():
  def __init__(self,size) :
    # works for inheriting classes
    self.size = size
    self.table = [None]*self.size
    self.taken_slots = 0
  
  def is_empty(self):
    # works for inheriting classes
    if self.taken_slots == 0:
      return True
    return False

  def _find_next_size(self):
    # works for inheriting classes
    newsize = self.size*2
    while is_not_prime(newsize):
      newsize += 1
    return newsize

  def _rehash(self, newsize):
    rehashed_els = 0
    newtable = [None]*newsize
    for i in range(self.size):
      if self.table[i] is not None:
        # found element
        rehashed_els +=1
        el = self.table[i]
        # get new hash
        idx = self._hash(el, newsize)
        # and offset in case collision occurs
        offset = self._hash_2(el, newsize)
        while newtable[idx] is not None: 
          # collision
          # no need to check for repetitions here
          idx += offset
          idx = idx % newsize
        newtable[idx] = el
      if rehashed_els == self.taken_slots:
        break
    return newtable
  
  def _expand_table(self):
    # works for inheriting classes
    newsize = self._find_next_size()
    rehashed_els = 0
    self.table = self._rehash(newsize)
    self.size = newsize

  def _hash(self, element, size):
    # works for inheriting classes
    if isinstance(element, str):
      return string_hash(element, size)
    if isinstance(element, dict):
      return dict_hash(element, size)
    return element % size

  def _hash_2(self, element, size):
    # should be overwritten
    offset = 0
    return offset

  def _resolve_collision(self, idx, element):
    # works for inheriting classes
    offset = self._hash_2(element, self.size)
    while self.table[idx] is not None and self.table[idx] != element:
      idx += offset
      idx = idx % self.size
    return idx

  def _find_idx(self, element):
    # works for inheriting classes
    ex_no = 1
    idx = self._hash(element, self.size)
    if self.table[idx] == element:
      return idx, ex_no
    first_pos = idx
    offset = self._hash_2(element, self.size)
    while self.table[idx] != element:  
      ex_no += 1
      idx += offset
      idx = idx % self.size
      if idx == first_pos: # and self.table[idx] is not None could be added when using the magic value in the remove_element method
        # print("Item not found")
        return -1, ex_no
    return idx, ex_no

  def insert_element(self, element): 
    # works for inheriting classes
    idx = self._hash(element, self.size)
    if self.table[idx] is not None and self.table[idx] != element:
      idx = self._resolve_collision(idx, element)
    self.table[idx] = element
    self.taken_slots += 1
    if self.taken_slots == self.size:
      self._expand_table()
  
  def remove_element(self, element): 
    # works for inheriting classes
    idx, _ = self._find_idx(element)
    if idx != -1:
      self.taken_slots -= 1
      self.table[idx] = None # or magic number signalling the deletion

  def find_element(self, element): 
    # works for inheriting classes
    if self.is_empty():
      print("Hash table is empty")
      return -1, 0
    idx, ex_no = self._find_idx(element)
    return idx, ex_no


# Linear probing

In [None]:
class ht_lp(hash_table):
  def __init__(self, size=1000003):
    super().__init__(size)

  def _hash_2(self, element, size=None):
    offset = 1
    return offset

# Double hashing

In [None]:
class ht_dh(hash_table):
  def __init__(self, size=1000003, small_prime=251):
    self.small_prime=small_prime
    super().__init__(size)

  def _hash_2(self, element, size):
    # parametrized depending on element
    if isinstance(element, str):
      return string_hash_2(element, size)
    if isinstance(element, dict):
      return dict_hash_2(element, size)
    return self.small_prime - (element % self.small_prime)

# Testing

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
load_factors = [str(i*10)+"%" for i in range(5,10)]
lf = [i/10 for i in range(5,10)]
methods = ["Linear probing", "Double hashing"]
tasks = ["search hit", "search miss"]

## Integers

In [None]:
hit_numbers = pd.read_csv("/content/drive/My Drive/AaDS/set_of_1050000_random_numbers.txt", header=None)
hit_numbers = hit_numbers.to_numpy().flatten()

In [None]:
miss_numbers = pd.read_csv("/content/drive/My Drive/AaDS/set_of_1050000_random_numbers_for_search_miss.txt", header=None)
miss_numbers = miss_numbers.to_numpy().flatten()

In [None]:
lp_counts_miss = []
lp_counts_hit = []
dh_counts_miss = []
dh_counts_hit = []
prev_el_no = 0
probing = ht_lp()
double_h = ht_dh()

for cap in lf:
  elements_no = int(probing.size*cap)
  print(cap)
  # fill the table to the desired capacity
  for i in tqdm(range(prev_el_no, elements_no)):
    element = hit_numbers[i]
    probing.insert_element(element)
    double_h.insert_element(element)
  # perform search
  # because we dont want only the ones that did not need collision solving
  sampling_indices = np.random.choice(range(elements_no), 
                                      size=100, replace=False)
  miss_probing = np.zeros(100)
  hit_probing = np.zeros(100)
  miss_dh = np.zeros(100)
  hit_dh = np.zeros(100)
  for i in tqdm(range(100)):
    idx = sampling_indices[i]
    _, hit_probing[i] = probing.find_element(hit_numbers[idx])
    _, miss_probing[i] = probing.find_element(miss_numbers[idx])
    _, hit_dh[i] = double_h.find_element(hit_numbers[idx])
    _, miss_dh[i] = double_h.find_element(miss_numbers[idx])
  lp_counts_miss.append(miss_probing.mean())
  lp_counts_hit.append(hit_probing.mean())
  dh_counts_miss.append(miss_dh.mean())
  dh_counts_hit.append(hit_dh.mean())
  prev_el_no = elements_no

0.5


100%|██████████| 500001/500001 [00:02<00:00, 225453.47it/s]
100%|██████████| 100/100 [02:18<00:00,  1.39s/it]


0.6


100%|██████████| 100000/100000 [00:00<00:00, 145050.67it/s]
100%|██████████| 100/100 [02:19<00:00,  1.39s/it]


0.7


100%|██████████| 100001/100001 [00:00<00:00, 110843.07it/s]
100%|██████████| 100/100 [02:18<00:00,  1.38s/it]


0.8


100%|██████████| 100000/100000 [00:01<00:00, 74455.44it/s]
100%|██████████| 100/100 [02:20<00:00,  1.41s/it]


0.9


100%|██████████| 100000/100000 [00:02<00:00, 34790.08it/s]
100%|██████████| 100/100 [02:17<00:00,  1.37s/it]


In [None]:
del miss_numbers, hit_numbers # for memory's sake

In [None]:
df = {}
# Linear probing
data = dict(zip(tasks, [lp_counts_hit, lp_counts_miss]), index= load_factors)
df[methods[0]] = pd.DataFrame.from_dict(data)
df[methods[0]].rename(columns = {'index':'load factor'}, inplace=True)
df[methods[0]].set_index('load factor', inplace=True)
# Double hashing
data = dict(zip(tasks, [dh_counts_hit, dh_counts_miss]), index= load_factors)
df[methods[1]] = pd.DataFrame.from_dict(data)
df[methods[1]].rename(columns = {'index':'load factor'}, inplace=True)
df[methods[1]].set_index('load factor', inplace=True)

df = pd.concat(df, axis=1)
df

Unnamed: 0_level_0,Linear probing,Linear probing,Double hashing,Double hashing
Unnamed: 0_level_1,search hit,search miss,search hit,search miss
load factor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
50%,1.67,1000004.0,1.43,1000004.0
60%,1.59,1000004.0,1.48,1000004.0
70%,2.3,1000004.0,2.0,1000004.0
80%,3.06,1000004.0,2.25,1000004.0
90%,6.47,1000004.0,2.64,1000004.0


Hit search number: initial hash + size of the hash table (the key is rehashed until we reach the initial hash again). The solution would be that after deletion of the element some sort of magic value would be inserted instead of None (None could be used as a stop condition as well - if the element was not inserted, then there is the None value), which would signal that the search should not be stopped because of the deletion of the colliding element that was inserted beforehead.

## Strings

In [None]:
dictionary = pd.read_csv("/content/drive/My Drive/AaDS/count_1w.txt", sep='\t', header=None)
dictionary = dictionary.drop([2577, 12819]) # it is "null" and "nan" and numpy does not read it correctly
dictionary = dictionary.iloc[:,0].to_numpy().flatten() 
hit_words = dictionary[:dictionary.shape[0]//2]
miss_words = dictionary[dictionary.shape[0]//2:]
del dictionary

In [None]:
lp_counts_miss = []
lp_counts_hit = []
dh_counts_miss = []
dh_counts_hit = []


prev_el_no = 0
probing = ht_lp(size=166667)
double_h = ht_dh(size=166667)

for cap in lf:
  elements_no = int(probing.size*cap)
  print(cap)
  # fill the table to the desired capacity
  for i in tqdm(range(prev_el_no, elements_no)):
    element = hit_words[i]
    probing.insert_element(element)
    double_h.insert_element(element)
  # perform search
  # because we dont want only the ones that did not need collision solving
  sampling_indices = np.random.choice(range(elements_no), 
                                      size=100, replace=False)
  miss_probing = np.zeros(100)
  hit_probing = np.zeros(100)
  miss_dh = np.zeros(100)
  hit_dh = np.zeros(100)
  for i in tqdm(range(100)):
    idx = sampling_indices[i]
    _, hit_probing[i] = probing.find_element(hit_words[idx])
    _, miss_probing[i] = probing.find_element(miss_words[idx])
    _, hit_dh[i] = double_h.find_element(hit_words[idx])
    _, miss_dh[i] = double_h.find_element(miss_words[idx])
  lp_counts_miss.append(miss_probing.mean())
  lp_counts_hit.append(hit_probing.mean())
  dh_counts_miss.append(miss_dh.mean())
  dh_counts_hit.append(hit_dh.mean())
  prev_el_no = elements_no

0.5


100%|██████████| 83333/83333 [06:17<00:00, 220.87it/s]
100%|██████████| 100/100 [00:12<00:00,  8.11it/s]


0.6


100%|██████████| 16667/16667 [03:40<00:00, 75.68it/s]
100%|██████████| 100/100 [00:12<00:00,  7.95it/s]


0.7


100%|██████████| 16666/16666 [04:34<00:00, 60.75it/s]
100%|██████████| 100/100 [00:13<00:00,  7.69it/s]


0.8


100%|██████████| 16667/16667 [06:41<00:00, 41.54it/s]
100%|██████████| 100/100 [00:13<00:00,  7.32it/s]


0.9


100%|██████████| 16667/16667 [07:49<00:00, 35.52it/s]
100%|██████████| 100/100 [00:13<00:00,  7.21it/s]


In [None]:
del hit_words, miss_words # for memory's sake

In [None]:
df = {}
# Linear probing
data = dict(zip(tasks, [lp_counts_hit, lp_counts_miss]), index= load_factors)
df[methods[0]] = pd.DataFrame.from_dict(data)
df[methods[0]].rename(columns = {'index':'load factor'}, inplace=True)
df[methods[0]].set_index('load factor', inplace=True)
# Double hashing
data = dict(zip(tasks, [dh_counts_hit, dh_counts_miss]), index= load_factors)
df[methods[1]] = pd.DataFrame.from_dict(data)
df[methods[1]].rename(columns = {'index':'load factor'}, inplace=True)
df[methods[1]].set_index('load factor', inplace=True)

df = pd.concat(df, axis=1)
df

Unnamed: 0_level_0,Linear probing,Linear probing,Double hashing,Double hashing
Unnamed: 0_level_1,search hit,search miss,search hit,search miss
load factor,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
50%,7140.81,166668.0,5343.14,166668.0
60%,9804.51,166668.0,6534.08,166668.0
70%,12298.85,166668.0,8605.33,166668.0
80%,13789.52,166668.0,9991.13,166668.0
90%,16719.68,166668.0,11315.71,166668.0


# Phone book

In [None]:
class phonebook(ht_dh):
  # insert remains the same cause there may be 2 people with the same name
  # and different phone numbers
  def __init__(self, size=97, small_prime=37):
    super().__init__(size, small_prime)  

  def _find_all(self, name):
    indices = []
    idx = self._hash(name, self.size)
    identifier = None
    if self.table[idx] is not None:
      for key in self.table[idx]:
        if key == name:
          indices.append(idx)
    first_pos = idx

    while True:
      idx += self._hash_2(name, self.size)
      idx = idx % self.size
      if idx == first_pos:
        break
      if self.table[idx] is not None:
        for key in self.table[idx]:
          if key == name:
            indices.append(idx)
          break

    if len(indices)==0:
      print("User not found")
    return indices

  def find_element(self, name): 
    if self.is_empty():
      print("Phonebook is empty")
    indices = self._find_all(name)
    for idx in indices:
      for key in self.table[idx]:
        print("User found: "+key+" "+self.table[idx][key])
        break

  def remove_element(self, element): 
    idx, _ = self._find_idx(element)
    if idx != -1:
      self.taken_slots -= 1
      self.table[idx] = None
      print("User removed.")
    else:
      print("User already not in the data.")
  
  def show(self):
    for user in self.table:
      if user is None:
        print("XXXXX XXXXXX XXXXXXXXXXX")
      else:
        for key in user:
          print(key+": "+user[key])
  

## Generator

In [None]:
!pip install names-dataset
import numpy as np
from names_dataset import NameDataset

In [None]:
def generate_number(rng):
  x = rng.choice(10, size=9).tolist()
  x = "".join([str(i) for i in x])
  x = ' '.join([str(x[i:i+3]) for i in range(0, len(x), 3)])
  return x

In [None]:
def generate_name(rng, fnames, lnames):
  fname = rng.choice(fnames)
  lname = rng.choice(lnames)
  return fname +" "+ lname

In [None]:
def generate_person(rng, fnames, lnames):
  number = generate_number(rng)
  name = generate_name(rng, fnames, lnames)
  return {name: number}

### Get people

In [None]:
nd = NameDataset()

In [None]:
fname = nd.get_top_names(n=50, gender='Male', country_alpha2='US')["US"]["M"]
fname.extend(nd.get_top_names(n=50, gender='Female', country_alpha2='US')["US"]["F"])
lname = nd.get_top_names(n=100, country_alpha2='US',use_first_names=False)["US"]

In [None]:
rng = np.random.default_rng(1)

In [None]:
people = []
for i in range(20):
  person = generate_person(rng, fname, lname)
  print(person)
  people.append(person)

{'Matt Ortega': '457 901 892'}
{'Katie James': '428 246 500'}
{'Oscar Mitchell': '858 347 131'}
{'Luis Thomas': '134 925 207'}
{'Michelle Vega': '441 979 072'}
{'Scott Vazquez': '271 394 521'}
{'Jennifer Guerrero': '473 679 407'}
{'Anna Ortiz': '430 467 825'}
{'Mark Medina': '385 565 970'}
{'Tony Diaz': '806 778 158'}
{'Steven Marie': '402 868 883'}
{'Christina Thomas': '290 862 748'}
{'Brian Martinez': '426 688 998'}
{'Javier Allen': '438 745 580'}
{'Jesus Allen': '499 848 062'}
{'Christopher Patel': '782 883 078'}
{'Jeff Mendez': '137 306 715'}
{'Diana Taylor': '062 546 136'}
{'Stephanie Johnson': '075 614 883'}
{'Chris Lee': '823 958 179'}


Generate additional person outside the phonebook:

In [None]:
person = generate_person(rng, fname, lname)
person

{'Emily Contreras': '123 082 971'}

Add people in the phonebook and the person not in it to the search:

In [None]:
searched_users = []

for key in people[7]:
  searched_users.append(key)

for key in people[8]:
  searched_users.append(key)

for key in person:
  searched_users.append(key)

## Show phonebook work

In [None]:
Phonebook =  phonebook(31, 23)
for p in people:
  Phonebook.insert_element(p)

In [None]:
Phonebook.find_element(searched_users[0])
Phonebook.find_element(searched_users[1])
Phonebook.find_element(searched_users[2])

User found: Anna Ortiz 430 467 825
User found: Mark Medina 385 565 970
User not found


In [None]:
Phonebook.show()

XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Scott Vazquez: 271 394 521
Stephanie Johnson: 075 614 883
Diana Taylor: 062 546 136
Jesus Allen: 499 848 062
Christopher Patel: 782 883 078
Chris Lee: 823 958 179
Tony Diaz: 806 778 158
Christina Thomas: 290 862 748
Katie James: 428 246 500
XXXXX XXXXXX XXXXXXXXXXX
Jennifer Guerrero: 473 679 407
Luis Thomas: 134 925 207
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Anna Ortiz: 430 467 825
Javier Allen: 438 745 580
XXXXX XXXXXX XXXXXXXXXXX
Mark Medina: 385 565 970
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Jeff Mendez: 137 306 715
Brian Martinez: 426 688 998
Matt Ortega: 457 901 892
Steven Marie: 402 868 883
XXXXX XXXXXX XXXXXXXXXXX
Oscar Mitchell: 858 347 131
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Michelle Vega: 441 979 072


In [None]:
Phonebook.remove_element(people[7])
Phonebook.find_element(searched_users[0])

User removed.
User not found


In [None]:
Phonebook.show()

XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Scott Vazquez: 271 394 521
Stephanie Johnson: 075 614 883
Diana Taylor: 062 546 136
Jesus Allen: 499 848 062
Christopher Patel: 782 883 078
Chris Lee: 823 958 179
Tony Diaz: 806 778 158
Christina Thomas: 290 862 748
Katie James: 428 246 500
XXXXX XXXXXX XXXXXXXXXXX
Jennifer Guerrero: 473 679 407
Luis Thomas: 134 925 207
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Javier Allen: 438 745 580
XXXXX XXXXXX XXXXXXXXXXX
Mark Medina: 385 565 970
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Jeff Mendez: 137 306 715
Brian Martinez: 426 688 998
Matt Ortega: 457 901 892
Steven Marie: 402 868 883
XXXXX XXXXXX XXXXXXXXXXX
Oscar Mitchell: 858 347 131
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Michelle Vega: 441 979 072


In [None]:
Phonebook.find_element(searched_users[1])
Phonebook.insert_element(person)
Phonebook.find_element(searched_users[2])
Phonebook.insert_element(person)
print("Adding another person with the same name and searching again:")
Phonebook.insert_element({searched_users[2]: '123 456 789'})
Phonebook.find_element(searched_users[2])

User found: Mark Medina 385 565 970
User found: Emily Contreras 123 082 971
Adding another person with the same name and searching again:
User found: Emily Contreras 123 082 971
User found: Emily Contreras 123 456 789


In [None]:
Phonebook.show()

XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Scott Vazquez: 271 394 521
Stephanie Johnson: 075 614 883
Diana Taylor: 062 546 136
Jesus Allen: 499 848 062
Christopher Patel: 782 883 078
Chris Lee: 823 958 179
Tony Diaz: 806 778 158
Christina Thomas: 290 862 748
Katie James: 428 246 500
Emily Contreras: 123 456 789
Jennifer Guerrero: 473 679 407
Luis Thomas: 134 925 207
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Javier Allen: 438 745 580
XXXXX XXXXXX XXXXXXXXXXX
Mark Medina: 385 565 970
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Jeff Mendez: 137 306 715
Brian Martinez: 426 688 998
Matt Ortega: 457 901 892
Steven Marie: 402 868 883
Emily Contreras: 123 082 971
Oscar Mitchell: 858 347 131
XXXXX XXXXXX XXXXXXXXXXX
XXXXX XXXXXX XXXXXXXXXXX
Michelle Vega: 441 979 072
