In [1]:
import numpy as np
import os
import string
import random
import csv

# Core Functions

In [2]:
def hash_funtion(key):
    
    ascii_sum = sum(ord(char) for char in str(key))
    #taking ASCII value of each character and then adding them and then converting it into binary format
    return '{0:016b}'.format(key)
    

class Bucket:
    
    def __init__(self,local_depth,index,empty_spaces,id):
        
        self.id = id        #bucket number
        self.local_depth = local_depth     #each bucket has local depth for overflow condition
        self.index = index                 #it is the array that stores variable
        self.empty_spaces = empty_spaces   #keeps track of empty space for overflow condition

class Directory:
    
    def  __init__(self,global_depth,directory_records):
        
        self.global_depth = global_depth,                 #each directory has global depth for overflow condition
        self.directory_records = directory_records        #keeps track of directory points to which bucket

class DirectoryRecord:
    
    def __init__(self,bucket, hash_prefix):
        
        self.hash_prefix = hash_prefix                   #it is kind of an ID of the directory
        self.value = bucket                              #it keeps track of which bucket directory is pointing to

# Hyper Parameters

In [3]:
bucket_capacity = 3     #Number of variables a bucket can store
bucket_number = 3       #Initially 2 required 3 is for overflow condition
global_depth = 1        #Initially global depth is always 1

In [4]:
# Initialization of buckets
bucket1 = Bucket(local_depth = 1, empty_spaces = bucket_capacity, index = [], id = 1)
bucket2 = Bucket(local_depth = 1, empty_spaces = bucket_capacity, index = [], id = 2)

# Initialization of directory
directory_records = list()  

#Conneting bucket to directory
directory_records.append(DirectoryRecord(hash_prefix = 0, bucket = bucket1))
directory_records.append(DirectoryRecord(hash_prefix = 1, bucket = bucket2))

#Initialized directory with global depth 1 and 2 empty buckets
directory = Directory(global_depth = 1, directory_records = directory_records)

In [5]:
directory_records

[<__main__.DirectoryRecord at 0x1087c1070>,
 <__main__.DirectoryRecord at 0x107fb2db0>]

# Insertion Algorithm

In [6]:
def insert(index):
    
    global directory
    global bucket_number
    
    hash_key = hash_funtion(int(index))      #Converting the value to be inserted into binary
    
    hash_prefix = int(hash_key[-directory.global_depth[0]:], 2)       #converting binary into number to get the directory

    bucket = directory.directory_records[hash_prefix].value           #using directory getting the bucket number
    bucket.index.append(index)                                        #Adding the number into the bucket
    bucket.empty_spaces = int(bucket.empty_spaces)-1                  #Reducing the empty space in the bucket

    if(bucket.empty_spaces < 0):                                      #Checking for overflow condition

        tempopary_memory = bucket.index                               #Saving the data of the overflown bucket
        bucket.index = []                                             #Emptying the bucket
        bucket.empty_spaces = bucket_capacity                         #Resetting the bucket capacity
        

        if (directory.global_depth[0] > bucket.local_depth):          #if global > local -> no need to double directory

            
            bucket.local_depth = bucket.local_depth + 1               #Since Overflow we increase local depth
            
            # NUMBER OF LINKED BUCKETS
            number_of_links = 2**(directory.global_depth[0] - bucket.local_depth) #Total number of links associated
            number_of_modify_links = number_of_links/2   #Divide 2 because some are already attached 

            #Creating a new bucket for overflow condition
            new_bucket = Bucket(local_depth = bucket.local_depth, index=[], empty_spaces = bucket_capacity, id = bucket_number)

            #Checking for each directory if bucket needs to be updated or not
            for directory_record in directory.directory_records:

                if(directory_record.value == bucket):     #Checking if directory was related to overflown bucket
                    if(number_of_modify_links != 0):      #Checking if there is directory that needs to be updated
                        number_of_modify_links = number_of_modify_links - 1
                    else:
                        directory_record.value = new_bucket         #Remaining point to new bucket
                        bucket_number = bucket_number + 1           #Inc bucket no, for the next overflow condition

            for i in range(len(tempopary_memory)):
                insert(tempopary_memory[i])        #Insert all the numbers that was present in overflown bucket
                

        elif (directory.global_depth[0] == bucket.local_depth):    #if global = local -> need to double directory
            
            new_directory_len = 2 * len(directory.directory_records)   #Doubling the directory
            new_directory_records = []                                 #Clearing the prev connection

            #Each directory is being associated to a bucket
            for directory_record_number in range(new_directory_len):
                new_directory_records.append(DirectoryRecord(hash_prefix=directory_record_number,bucket=Bucket(local_depth=1,index=[],empty_spaces=bucket_capacity,id=bucket_number)))
                bucket_number = bucket_number + 1
            
            new_directory = Directory(global_depth=directory.global_depth[0]+1,directory_records=new_directory_records)

            # REHASING
            #To the previous hash_prefix, we add 0 and 1, this is how new directory with different prefix is created
            for directory_record in directory.directory_records:
                haskey1 = '0'+hash_funtion(directory_record.hash_prefix)
                haskey2 = '1'+hash_funtion(directory_record.hash_prefix)
                new_index1 = int(haskey1[-directory.global_depth[0]:],2)
                new_index2 = int(haskey2[-directory.global_depth[0]:],2)
                
                #Updating the hash_prefix of a directory
                new_directory.directory_records[new_index1].value = directory_record.value
                new_directory.directory_records[new_index2].value = directory_record.value

            directory= new_directory

            for i in range(len(tempopary_memory)):               #Insert all the numbers that was present in overflown bucket
                insert(tempopary_memory[i])
    


In [7]:
def visualize():
    print(f"Global Depth: {directory.global_depth}")   #Printing global depth
    for i, record in enumerate(directory.directory_records):  #Iterating over each directory
        bucket = record.value          #Fetching the bucket associated with directory
        if bucket.empty_spaces != bucket_capacity:         #Checking if bucket is empty
            print(f"Directory Index {i}: Bucket ID {bucket.id}, Local Depth {bucket.local_depth}, Contents {bucket.index}, Empty Spaces {bucket.empty_spaces}")


# Main

### INSERTING THROUGH ARRAY

In [8]:
values_to_insert = [16, 22, 26, 20, 3, 1, 12, 11, 13, 19, 38, 47, 46]           #Values to be inserted
for value in values_to_insert:                                                  #After each value we call the visualize function
    insert(value)
    visualize()
    print("--------------------------------------------------")

Global Depth: (1,)
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [16], Empty Spaces 2
--------------------------------------------------
Global Depth: (1,)
Directory Index 0: Bucket ID 1, Local Depth 1, Contents [22], Empty Spaces 2
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [16], Empty Spaces 2
--------------------------------------------------
Global Depth: (1,)
Directory Index 0: Bucket ID 1, Local Depth 1, Contents [22, 26], Empty Spaces 1
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [16], Empty Spaces 2
--------------------------------------------------
Global Depth: (1,)
Directory Index 0: Bucket ID 1, Local Depth 1, Contents [22, 26, 20], Empty Spaces 0
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [16], Empty Spaces 2
--------------------------------------------------
Global Depth: (1,)
Directory Index 0: Bucket ID 1, Local Depth 1, Contents [22, 26, 20], Empty Spaces 0
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [16, 3], 

### INSERTING THROUGH FILE

In [9]:
# Read numbers from input.txt and store them in values_to_insert
with open('input.txt', 'r') as file:
    values_to_insert = list(map(int, file.read().strip().split()))


In [10]:
for value in values_to_insert:                                                  #After each value we call the visualize function
    insert(value)
    visualize()
    print("--------------------------------------------------")

Global Depth: (4,)
Directory Index 0: Bucket ID 1, Local Depth 2, Contents [22, 26, 13], Empty Spaces 0
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [1], Empty Spaces 2
Directory Index 2: Bucket ID 5, Local Depth 1, Contents [11, 20, 19], Empty Spaces 0
Directory Index 3: Bucket ID 6, Local Depth 3, Contents [3, 12], Empty Spaces 1
Directory Index 7: Bucket ID 14, Local Depth 1, Contents [16, 7], Empty Spaces 1
Directory Index 10: Bucket ID 25, Local Depth 1, Contents [46], Empty Spaces 2
Directory Index 11: Bucket ID 26, Local Depth 1, Contents [38, 47], Empty Spaces 1
--------------------------------------------------
Global Depth: (4,)
Directory Index 0: Bucket ID 1, Local Depth 2, Contents [22, 26, 13], Empty Spaces 0
Directory Index 1: Bucket ID 2, Local Depth 1, Contents [1], Empty Spaces 2
Directory Index 2: Bucket ID 5, Local Depth 1, Contents [11, 20, 19], Empty Spaces 0
Directory Index 3: Bucket ID 6, Local Depth 3, Contents [3, 12], Empty Spaces 1
Directory Index 

### INSERTING THROUGH USER INPUT

In [None]:
while True:
    num = int(input("Enter a number (negative to stop): "))
    if num < 0:
        break
    insert(num)
    visualize()

In [None]:
import heapq

# Merges k sorted lists
def merge_lists(output_list, k, lists):
    harr = []
    result = []

    # Create a min heap with k heap nodes.
    # Every heap node has the first element of each list
    for i in range(k):
        if lists[i]:
            heapq.heappush(harr, (lists[i][0], i))

    count = 0
    while count < k:
        # Get the minimum element and store it in the result list
        root = heapq.heappop(harr)
        result.append(root[0])

        # Find the next element that will replace the current root of the heap
        if len(lists[root[1]]) > 1:
            next_elem = lists[root[1]][1]  # Get the next element in the same list
            lists[root[1]] = lists[root[1]][1:]  # Update the list to remove the processed element
            heapq.heappush(harr, (next_elem, root[1]))
        else:
            # No more elements in this list
            count += 1

    output_list.extend(result)

# Create initial runs from input list
def create_initial_runs(input_list, run_size, num_ways):
    # Create output lists for sorted runs
    out_lists = [[] for _ in range(num_ways)]           #Array to store sorted partitions

    more_input = True                                   #keeps track of if we have input
    next_output_list = 0                                #keeps track of partition

    start = 0                                           #keeps track of how many bits/numbers are read
    while more_input:
        end = min(start + run_size, len(input_list))    #calculating end
        data = input_list[start:end]                    #storing the data
        start = end                                     #moving the pointer

        # Sort the data
        data.sort()

        # Write the records to the appropriate output list
        out_lists[next_output_list] = data

        if start >= len(input_list):                    #If all data has been read then more_input is turned to false
            more_input = False

        next_output_list += 1                           #Inc the count of partition

    return out_lists                                    #Return the array having sorted partitions

# For sorting data stored in an array
def external_sort(input_list, output_list, num_ways, run_size):
    # Create the initial runs and assign the runs to the output lists
    out_lists = create_initial_runs(input_list, run_size, num_ways)
    # We preform initial runs where we take small partation and make them sorted
    # Similar to merge sort where we break array into small parts and then merge the sorted part together
    
    # Now we have sorted partitions
    # Merge the runs using K-way merging
    merge_lists(output_list, num_ways, out_lists)


In [None]:
def main():
    # No. of Partitions
    num_ways = 10

    # The size of each partition
    run_size = 1000

    # Generate input array with 10,000 unique random numbers
    input_list = list(range(10000, 0, -1))

    # List to store the sorted output
    output_list = []

    # Perform the external sort
    external_sort(input_list, output_list, num_ways, run_size)


    # Print the first 10 elements of the sorted output
    print("First 10 sorted numbers:", output_list[:10])

if __name__ == "__main__":
    main()