# Data Engineering 1: Lab 06 - Solution
---------------

### Task 01: Consistent Hashing
#### Fill the parts marked with Todo for a complete implementation of Consistent Hashing

In [1]:
!pip3 install names



In [2]:
import names
import pandas as pd
import numpy as np
from collections import Counter

n_names = 100
n_servers = 3

#returns list of size n with random names
def getData(n):
    data = []
    i = 0
    while i < n:
        data.append(names.get_first_name())
        i += 1
    return data

#returns list of hash values for list of data
def getHashs(data):
    hashs = []
    for word in data:
        hashs.append(abs(hash(word)) % (10 ** 8))
    return hashs

#returns list of size m with random server names
def getServers(m):
    data = []
    i = 0
    while i < m:
        data.append('Server_' + names.get_first_name())
        i += 1
    return data

#TodO: create a DataFrame with the list created with the function getdata(100) 
# and a type column filled with the type key
data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for server in servers:
    df2 = pd.DataFrame({'name': server, 'hashvalue': abs(hash(server)) % (10 ** 8), 'type': 'server'}, index=[df.shape[0]])
    df = pd.concat([df,df2])
    #df = df.append(df2, ignore_index = True)

#Todo: add a column to the DataFrame with the angle on the circle
def getAngles(hashs):
    angles = []
    maxh = max(hashs)
    for hashi in hashs:
        angle = (hashi / maxh) * 360.0
        angles.append(angle)
    return angles
        
angles = getAngles(df['hashvalue'])
df['angle'] = angles

#Todo: add a column to the DataFrame with the corresponding server
def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    return server_names[a == a.min()][0]
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    df['server'] = server_assotiated
    return server_assotiated

server_assotiated = get_server_assotiation(df)
df['server'] = server_assotiated

df.sort_values(by=['angle'], inplace=True)
print('Unequal distribution:')
print(Counter(df.server.values))
#Todo: print the final DataFrame
df


Unequal distribution:
Counter({'Server_Cheryl': 45, 'Server_Diane': 45, 'Server_Sharon': 13})


Unnamed: 0,name,type,hashvalue,angle,server
68,Antoine,key,4165561,15.236361,Server_Cheryl
43,Gloria,key,5681668,20.781821,Server_Cheryl
86,Gloria,key,5681668,20.781821,Server_Cheryl
78,Charles,key,6722281,24.588069,Server_Cheryl
79,Charles,key,6722281,24.588069,Server_Cheryl
...,...,...,...,...,...
101,Server_Cheryl,server,94630802,346.130824,Server_Cheryl
54,Emma,key,95236095,348.344803,Server_Cheryl
73,Mariano,key,97547194,356.798103,Server_Cheryl
40,Marie,key,97607220,357.017660,Server_Cheryl


### Task 02: Balanced Consistent Hashing
#### Extend the code from Task 01 to ensure object keys are evenly distributed among servers

In [3]:
import names
import pandas as pd
from collections import Counter

#returns list of size n with random names
def getData(n):
    data = []
    i = 0
    while i < n:
        data.append(names.get_first_name())
        i += 1
    return data

#returns list of hash values for list of data
def getHashs(data, granulat):
    hashs = []
    for word in data:
        hashs.append(abs(hash(word)) % (granulat))
    return hashs

#returns list of size m with random server names
def getServers(m):
    data = []
    i = 0
    while i < m:
        data.append('Server_' + names.get_first_name())
        i += 1
    return data

n_names = 100
n_servers = 3
granulat = 10 ** 5

data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data, granulat)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for i in range(len(servers)):
    server = servers[i]
    df2 = pd.DataFrame({'name': server, 'hashvalue': int(i*granulat/n_servers + 1), 'type': 'server'}, index=[df.shape[0]])
    df = pd.concat([df,df2])

#Todo: add a column to the DataFrame with the angle on the circle
def getAngles(hashs):
    angles = []
    maxh = max(hashs)
    for hashi in hashs:
        angle = (hashi / maxh) * 360.0
        angles.append(angle)
    return angles
        
angles = getAngles(df['hashvalue'])
df['angle'] = angles

def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    return server_names[a == a.min()][0]
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    df['server'] = server_assotiated
    return server_assotiated


server_assotiated = get_server_assotiation(df)
df['server'] = server_assotiated

df.sort_values(by=['angle'], inplace=True)
print('Equal distribution:')
print(Counter(df.server.values))
#Todo: print the final DataFrame
df

Equal distribution:
Counter({'Server_Wanda': 42, 'Server_Clark': 32, 'Server_Paul': 29})


Unnamed: 0,name,type,hashvalue,angle,server
100,Server_Wanda,server,1,0.003636,Server_Wanda
86,Natalie,key,525,1.909129,Server_Wanda
52,Christina,key,550,2.000040,Server_Wanda
23,Denise,key,3221,11.712964,Server_Wanda
95,David,key,3797,13.807552,Server_Wanda
...,...,...,...,...,...
43,Ramona,key,95970,348.988868,Server_Wanda
41,Herbert,key,96599,351.276187,Server_Wanda
29,Carl,key,96876,352.283480,Server_Wanda
51,Willie,key,97561,354.774440,Server_Wanda


### Task 03: Weighted Balanced Consistent Hashing
#### Extend the code from Task 01 to ensure object keys are evenly distributed among servers but also with a weighting of the single servers.

In [4]:
n_names = 100
n_servers = 3
granulat = 10 ** 5

data = getData(n_names)
df = pd.DataFrame(data, columns=['name'])
df['type'] = 'key'

#Todo: add a column to the DataFrame with the hashs created with the getHashs() method
hashs = getHashs(data, granulat)
df['hashvalue'] = hashs

#Todo: create m servers with the getServers function and add them to the DataFrame with the type server
servers = getServers(n_servers)
for i in range(len(servers)):
    server = servers[i]
    df2 = pd.DataFrame({'name': server, 'hashvalue': int(i*granulat/n_servers + 1), 'type': 'server'}, index=[df.shape[0]])
    df = pd.concat([df,df2])
    
angles = getAngles(df['hashvalue'])
df['angle'] = angles

def get_closest_server(angle, server_angles, server_names):
    a = angle - server_angles
    a = np.abs((a + 180) % 360 - 180)
    
    output = list(server_names[a == a.min()])
    
    weights = 1/a
    weights = weights/weights.sum()
    output.extend([item for item in weights])
    return tuple(output)
    

def get_server_assotiation(df):
    server_inds = np.array(df.index[df.type != 'key'])
    server_names = np.array(df.name.values[server_inds])
    server_angles = np.array(df.angle.values[df.type != 'key'])
    server_assotiated = [get_closest_server(angle, server_angles, server_names) for angle in df.angle.values] 
    columns = ['server']
    columns.extend(list(server_names))
    df1 = pd.DataFrame(server_assotiated, columns = columns, index=df.index)
    df  = pd.concat([df,df1], axis=1)
    return df


df = get_server_assotiation(df)
#df['server'] = server_assotiated

df.sort_values(by=['angle'], inplace=True)
df.replace({np.nan:1})
print('Equal distribution:')
print(Counter(df.server.values))
#Todo: print the final DataFrame
df

Equal distribution:
Counter({'Server_Kenneth': 40, 'Server_Mary': 33, 'Server_Amy': 30})


  weights = 1/a
  weights = weights/weights.sum()


Unnamed: 0,name,type,hashvalue,angle,server,Server_Kenneth,Server_Mary,Server_Amy
100,Server_Kenneth,server,1,0.003628,Server_Kenneth,,0.000000,0.000000
73,Leroy,key,688,2.496246,Server_Kenneth,0.959960,0.020201,0.019839
50,Kenneth,key,1516,5.500448,Server_Kenneth,0.915682,0.043600,0.040718
68,Kenneth,key,1516,5.500448,Server_Kenneth,0.915682,0.043600,0.040718
28,Juanita,key,2292,8.315981,Server_Kenneth,0.877540,0.064765,0.057695
...,...,...,...,...,...,...,...,...
92,Bonita,key,92120,334.235696,Server_Kenneth,0.687444,0.120743,0.191813
71,Teresa,key,92236,334.656575,Server_Kenneth,0.691329,0.119785,0.188886
65,Hubert,key,93769,340.218704,Server_Kenneth,0.745270,0.104779,0.149951
10,Michael,key,98998,359.190897,Server_Kenneth,0.986579,0.006586,0.006835
