# Notebook for the creation of the disk LSH hashes for both datasets

This notebook converts the extracted data in the `data/chosen_data` folder into disk LSH hashes, which are stored in `data/hashed_data/disk`.

In [18]:
# Importing necessary modules
import os
import shutil

from schemes.disk_lsh import DiskLSH
from schemes.experiments import hashing
from utils import metafile_handler as mfh

import timeit as ti
from tqdm import tqdm 

from multiprocessing import Pool

In [19]:
# Notebook-wide configuration constants.

# When True, the clean-up cells empty the output folders before new hashes are written.
SHOULD_DELETE_OLD_FILES = True

# Input folders that are read when the hashes are generated.
PORTO_DATA = "../data/chosen_data/porto/"
ROME_DATA = "../data/chosen_data/rome/"

# Output folders that receive the hash files and the copied meta files.
OUTPUT_FOLDER_PORTO = "../data/hashed_data/disk/porto/"
OUTPUT_FOLDER_ROME = "../data/hashed_data/disk/rome/"

# Latitude/longitude limits handed to the Porto DiskLSH constructor.
P_MIN_LAT, P_MAX_LAT = 41.14, 41.19
P_MIN_LON, P_MAX_LON = -8.66, -8.57

# Latitude/longitude limits handed to the Rome DiskLSH constructor.
R_MIN_LAT, R_MAX_LAT = 41.88, 41.93
R_MIN_LON, R_MAX_LON = 12.44, 12.53

## Porto LSH

Beginning with the Porto set.

In [20]:
# Run this cell to empty the Porto output folder, so that hash files from an
# earlier run cannot be mixed with the ones generated below.

# Create the folder first: on a fresh checkout it may not exist yet, and
# os.listdir would then raise FileNotFoundError.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER_PORTO):
        file_path = os.path.join(OUTPUT_FOLDER_PORTO, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Regular files and symlinks are removed directly.
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # Sub-folders are removed together with their content.
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and continue.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [21]:
# Build the DiskLSH instance that is used to hash the Porto data set.
# The four settings below are plain notebook-level names; later cells reassign them.

meta_file = "../data/chosen_data/porto/META-1000.TXT"
num_disks = 50
layers = 4
diameter = 1.5

DiskPorto = DiskLSH(
    "Porto D1",
    P_MIN_LAT, P_MAX_LAT,
    P_MIN_LON, P_MAX_LON,
    num_disks, layers, diameter,
    meta_file,
    PORTO_DATA,
)

In [22]:
# Generate the Porto hashes and write them to the output folder:
# one "<key>.txt" file per key, one hash per line.

hashes = DiskPorto.compute_dataset_hashes_with_KD_tree()

# Make sure the target folder exists before any file is opened for writing.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

for key in hashes:
    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(OUTPUT_FOLDER_PORTO, f"{key}.txt"), "w") as file:
        # `hash_value` rather than `hash`, so the builtin hash() is not shadowed.
        for hash_value in hashes[key]:
            file.write("%s\n" % hash_value)

# Copy the meta files as well, so they sit next to the hash files.
meta_files = mfh.get_meta_files(PORTO_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(PORTO_DATA, filename), OUTPUT_FOLDER_PORTO)

In [23]:
# Settings for the timing wrappers defined in this cell, which measure the performance
# of the hash computation. Note that meta_file points at META-200.TXT here, not at the
# META-1000.TXT file used when the hashes are generated.
meta_file = "../data/chosen_data/porto/META-200.TXT"
num_disks, layers, diameter = 50, 4, 1.5

def fun_wrapper_naive(x):
    """Run ``measure_hash_computation(1, 1)`` on a freshly built Porto DiskLSH.

    A new DiskLSH object is constructed on every call from the notebook-level
    names ``num_disks``, ``layers``, ``diameter`` and ``meta_file``. These are
    looked up when the function is called, so re-running a cell that reassigns
    them (for example the Rome set-up cell) changes what is measured here.

    Args:
        x: Ignored. Presumably present so the function can be used with
            ``Pool.map`` -- TODO confirm.

    Returns:
        The first element of the value returned by ``measure_hash_computation``.
    """
    porto_lsh = DiskLSH(
        "Porto D1",
        P_MIN_LAT, P_MAX_LAT,
        P_MIN_LON, P_MAX_LON,
        num_disks, layers, diameter,
        meta_file,
        PORTO_DATA,
    )
    measurement = porto_lsh.measure_hash_computation(1, 1)
    return measurement[0]

def fun_wrapper_quadrants(x):
    """Run ``measure_hash_computation_with_quad_tree(1, 1)`` on a new Porto DiskLSH.

    A new DiskLSH object is constructed on every call from the notebook-level
    names ``num_disks``, ``layers``, ``diameter`` and ``meta_file``. These are
    looked up when the function is called, so re-running a cell that reassigns
    them (for example the Rome set-up cell) changes what is measured here.

    Args:
        x: Ignored. Presumably present so the function can be used with
            ``Pool.map`` -- TODO confirm.

    Returns:
        The first element of the value returned by
        ``measure_hash_computation_with_quad_tree``.
    """
    porto_lsh = DiskLSH(
        "Porto D1",
        P_MIN_LAT, P_MAX_LAT,
        P_MIN_LON, P_MAX_LON,
        num_disks, layers, diameter,
        meta_file,
        PORTO_DATA,
    )
    measurement = porto_lsh.measure_hash_computation_with_quad_tree(1, 1)
    return measurement[0]

def fun_wrapper_KD_tree(x):
    """Run ``measure_hash_computation_with_KD_tree(1, 1)`` on a new Porto DiskLSH.

    The sibling wrappers call the ``measure_hash_computation*`` methods; this
    one previously called ``compute_dataset_hashes_with_KD_tree(1, 1)``, which
    is the hash generator (called without arguments elsewhere in this notebook)
    and not the timing method. It now calls the KD-tree timing method instead.

    A new DiskLSH object is constructed on every call from the notebook-level
    names ``num_disks``, ``layers``, ``diameter`` and ``meta_file``. These are
    looked up when the function is called, so re-running a cell that reassigns
    them (for example the Rome set-up cell) changes what is measured here.

    Args:
        x: Ignored. Presumably present so the function can be used with
            ``Pool.map`` -- TODO confirm.

    Returns:
        The first element of the value returned by
        ``measure_hash_computation_with_KD_tree``.
    """
    disk = DiskLSH("Porto D1", P_MIN_LAT, P_MAX_LAT, P_MIN_LON, P_MAX_LON, num_disks, layers, diameter, meta_file, PORTO_DATA)
    return disk.measure_hash_computation_with_KD_tree(1, 1)[0]

#runtimes, number = DiskPorto.measure_hash_computation_with_KD_tree(number=1, repeat=10)
#print(runtimes)
#print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s for {number} hashes")

## Rome LSH

Continuing with the Rome set.

In [24]:
# Run this cell to empty the Rome output folder, so that hash files from an
# earlier run cannot be mixed with the ones generated below.

# Create the folder first: on a fresh checkout it may not exist yet, and
# os.listdir would then raise FileNotFoundError.
os.makedirs(OUTPUT_FOLDER_ROME, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER_ROME):
        file_path = os.path.join(OUTPUT_FOLDER_ROME, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Regular files and symlinks are removed directly.
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # Sub-folders are removed together with their content.
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and continue.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [25]:
# Create the DiskLSH instance that is used to hash the Rome data set.
# The four settings below are plain notebook-level names shared with the Porto cells.

layers = 4
diameter = 1.5
num_disks = 50
meta_file = "../data/chosen_data/rome/META-1000.TXT"

# The label was "Porto D1", copied from the Porto cell; this object covers Rome.
# NOTE(review): assumes the first argument is only a descriptive name -- confirm in DiskLSH.
DiskRome = DiskLSH("Rome D1", R_MIN_LAT, R_MAX_LAT, R_MIN_LON, R_MAX_LON, num_disks, layers, diameter, meta_file, ROME_DATA)

In [26]:
# Generate the Rome hashes and write them to the output folder along with the
# meta files: one "<key>.txt" file per key, one hash per line.
hashes = DiskRome.compute_dataset_hashes_with_KD_tree()

# Make sure the target folder exists before any file is opened for writing.
os.makedirs(OUTPUT_FOLDER_ROME, exist_ok=True)

for key in hashes:
    # The with-statement closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(OUTPUT_FOLDER_ROME, f"{key}.txt"), "w") as file:
        # `hash_value` rather than `hash`, so the builtin hash() is not shadowed.
        for hash_value in hashes[key]:
            file.write("%s\n" % hash_value)

# Copy the meta files as well, so they sit next to the hash files.
meta_files = mfh.get_meta_files(ROME_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(ROME_DATA, filename), OUTPUT_FOLDER_ROME)

# Measuring run-times of hash generation
The cells below measure the time efficiency of the hash computation.


In [27]:
# Measuring Disk Naive over Porto
# Ten identical jobs are mapped over a process pool.
# NOTE(review): the meaning of [1000, 50, 4, 2] is defined by
# schemes.experiments.hashing and is not visible in this notebook -- confirm there.
with Pool() as pool:
    result = pool.map(hashing.fun_wrapper_p_naive, [[1000, 50, 4, 2] for _ in range(10)])
print(result)

# Each job returns its runtime wrapped in a one-element list (see the recorded output),
# so unwrap the values first; min(result) would otherwise print a list such as "[4.28125]s".
runtimes = [run[0] for run in result]
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s")

[[4.328125], [4.65625], [4.46875], [4.28125], [4.609375], [5.21875], [4.640625], [4.359375], [4.46875], [4.921875]]
Ran the hash computation 10 times. Fastest runtime: [4.28125]s


In [28]:
# Measuring Disk Quadrants over Porto
# Ten identical jobs are mapped over a process pool.
# NOTE(review): the meaning of [1000, 50, 4, 2] is defined by
# schemes.experiments.hashing and is not visible in this notebook -- confirm there.
with Pool() as pool:
    result = pool.map(hashing.fun_wrapper_p_quadrants, [[1000, 50, 4, 2] for _ in range(10)])
print(result)

# Each job returns its runtime wrapped in a one-element list (see the recorded output),
# so unwrap the values first; min(result) would otherwise print a list such as "[2.0625]s".
runtimes = [run[0] for run in result]
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s")

[[2.5625], [2.6875], [2.265625], [2.484375], [2.0625], [2.71875], [2.4375], [2.390625], [2.21875], [2.671875]]
Ran the hash computation 10 times. Fastest runtime: [2.0625]s


In [29]:
# Measuring Disk KD-tree over Porto
# Ten identical jobs are mapped over a process pool.
# NOTE(review): the meaning of [1000, 50, 4, 2] is defined by
# schemes.experiments.hashing and is not visible in this notebook -- confirm there.
with Pool() as pool:
    result = pool.map(hashing.fun_wrapper_p_KD_tree, [[1000, 50, 4, 2] for _ in range(10)])
print(result)

# Each job returns its runtime wrapped in a one-element list (see the recorded output),
# so unwrap the values first; min(result) would otherwise print a list such as "[1.109375]s".
runtimes = [run[0] for run in result]
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s")

[[1.296875], [1.34375], [1.484375], [1.546875], [1.109375], [1.234375], [1.359375], [1.390625], [1.21875], [1.46875]]
Ran the hash computation 10 times. Fastest runtime: [1.109375]s


In [30]:
# Measuring Disk Naive over Rome
# Ten identical jobs are mapped over a process pool.
# NOTE(review): the meaning of [1000, 50, 4, 2] is defined by
# schemes.experiments.hashing and is not visible in this notebook -- confirm there.
with Pool() as pool:
    result = pool.map(hashing.fun_wrapper_r_naive, [[1000, 50, 4, 2] for _ in range(10)])
print(result)

# Each job returns its runtime wrapped in a one-element list (see the recorded output),
# so unwrap the values first; min(result) would otherwise print a list such as "[8.046875]s".
runtimes = [run[0] for run in result]
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s")

[[8.078125], [8.1875], [8.59375], [8.203125], [8.5], [8.0625], [8.34375], [8.90625], [8.109375], [8.046875]]
Ran the hash computation 10 times. Fastest runtime: [8.046875]s


In [31]:
# Measuring Disk Quadrant over Rome
# Ten identical jobs are mapped over a process pool.
# NOTE(review): the meaning of [1000, 50, 4, 2] is defined by
# schemes.experiments.hashing and is not visible in this notebook -- confirm there.
with Pool() as pool:
    result = pool.map(hashing.fun_wrapper_r_quadrants, [[1000, 50, 4, 2] for _ in range(10)])
print(result)

# Each job returns its runtime wrapped in a one-element list (see the recorded output),
# so unwrap the values first; min(result) would otherwise print a list such as "[3.5625]s".
runtimes = [run[0] for run in result]
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s")

[[3.703125], [3.90625], [3.765625], [4.015625], [3.5625], [3.765625], [3.578125], [4.1875], [3.921875], [3.625]]
Ran the hash computation 10 times. Fastest runtime: [3.5625]s


In [32]:
# Measuring Disk KD-tree over Rome
# Ten identical jobs are mapped over a process pool.
# NOTE(review): the meaning of [1000, 50, 4, 2] is defined by
# schemes.experiments.hashing and is not visible in this notebook -- confirm there.
with Pool() as pool:
    result = pool.map(hashing.fun_wrapper_r_KD_tree, [[1000, 50, 4, 2] for _ in range(10)])
print(result)

# Each job returns its runtime wrapped in a one-element list (see the recorded output),
# so unwrap the values first; min(result) would otherwise print a list such as "[1.140625]s".
runtimes = [run[0] for run in result]
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s")

[[1.328125], [1.40625], [1.28125], [1.375], [1.3125], [1.375], [1.328125], [1.390625], [1.140625], [1.59375]]
Ran the hash computation 10 times. Fastest runtime: [1.140625]s
