# Notebook for creating the disk LSH hashes for both datasets

This notebook converts the extracted data in the data/chosen_data folder into hashes that are stored in data/hashed_data/disk.

In [1]:
# Importing necessary modules
import os
import shutil

from schemes.disk_lsh import DiskLSH
from utils import metafile_handler as mfh

import timeit as ti
from tqdm import tqdm 


In [2]:
# Notebook-wide configuration used by the cells below.

# When True, the "clear" cells empty the output folders before new hashes are written.
SHOULD_DELETE_OLD_FILES = True

# Input folders: passed to DiskLSH and scanned for meta files.
PORTO_DATA = "../data/chosen_data/porto/"
ROME_DATA = "../data/chosen_data/rome/"

# Output folders: receive the generated hash files and copies of the meta files.
OUTPUT_FOLDER_PORTO = "../data/hashed_data/disk/porto/"
OUTPUT_FOLDER_ROME = "../data/hashed_data/disk/rome/"

# Porto bounding box handed to DiskLSH (presumably decimal degrees — TODO confirm).
P_MIN_LAT, P_MAX_LAT = 41.14, 41.19
P_MIN_LON, P_MAX_LON = -8.66, -8.57

# Rome bounding box handed to DiskLSH (presumably decimal degrees — TODO confirm).
R_MIN_LAT, R_MAX_LAT = 41.88, 41.93
R_MIN_LON, R_MAX_LON = 12.44, 12.53

## Porto LSH

Beginning with the Porto dataset.

In [3]:
# Run this cell to clear previously generated files from the PORTO output folder.
# Nothing is removed unless SHOULD_DELETE_OLD_FILES is set. A missing output
# folder (e.g. on a fresh checkout) is treated as "nothing to clear" instead of
# letting os.listdir raise FileNotFoundError.

if SHOULD_DELETE_OLD_FILES and os.path.isdir(OUTPUT_FOLDER_PORTO):
    for filename in os.listdir(OUTPUT_FOLDER_PORTO):
        file_path = os.path.join(OUTPUT_FOLDER_PORTO, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Files and symlinks are unlinked; a symlink's target is left alone.
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and keep going.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [9]:
# Build the disk-based LSH object for the Porto dataset.
# NOTE(review): the exact meaning and units of the parameters below are defined
# by DiskLSH (schemes.disk_lsh) and are not visible in this notebook — confirm there.

meta_file = "../data/chosen_data/porto/META-200.TXT"
num_disks = 50
layers = 4
diameter = 1.5

DiskPorto = DiskLSH(
    "Porto D1",
    P_MIN_LAT, P_MAX_LAT,
    P_MIN_LON, P_MAX_LON,
    num_disks, layers, diameter,
    meta_file, PORTO_DATA,
)

In [6]:
# Generate the Porto hashes with the disk-based LSH object and save them to
# OUTPUT_FOLDER_PORTO, one "<key>.txt" file per key with one hash per line.

hashes = DiskPorto.compute_dataset_hashes()

# Make sure the output folder exists so the cell also works on a fresh checkout.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

for key in hashes:
    # The with-block closes the file; no explicit close is needed.
    with open(os.path.join(OUTPUT_FOLDER_PORTO, f"{key}.txt"), "w") as file:
        # `hash_value` rather than `hash`: a loop variable named `hash` would
        # shadow the builtin for every later cell in the kernel.
        for hash_value in hashes[key]:
            file.write("%s\n" % hash_value)

# Copy the meta files as well so the output folder is self-contained.
meta_files = mfh.get_meta_files(PORTO_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(PORTO_DATA, filename), OUTPUT_FOLDER_PORTO)

In [10]:
# Time the Porto hash computation: 10 repetitions of a single run each.
# The call returns the list of measured runtimes together with a count that is
# reported below as the number of hashes.
runtimes, number = DiskPorto.measure_hash_computation(repeat=10, number=1)

print(runtimes)
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s for {number} hashes")

[3.3125, 2.921875, 3.5625, 4.0, 3.421875, 3.3125, 3.609375, 2.453125, 3.3125, 2.8125]
Ran the hash computation 10 times. Fastest runtime: 2.453125s for 200 hashes


## Rome LSH

Continuing with the Rome dataset.

In [4]:
# Run this cell to clear previously generated files from the ROME output folder.
# Nothing is removed unless SHOULD_DELETE_OLD_FILES is set. A missing output
# folder (e.g. on a fresh checkout) is treated as "nothing to clear" instead of
# letting os.listdir raise FileNotFoundError.

if SHOULD_DELETE_OLD_FILES and os.path.isdir(OUTPUT_FOLDER_ROME):
    for filename in os.listdir(OUTPUT_FOLDER_ROME):
        file_path = os.path.join(OUTPUT_FOLDER_ROME, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Files and symlinks are unlinked; a symlink's target is left alone.
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and keep going.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [None]:
# Create the disk-based LSH hashing object for the Rome dataset.
# NOTE(review): the exact meaning and units of the parameters below are defined
# by DiskLSH (schemes.disk_lsh) and are not visible in this notebook — confirm there.

layers = 4
diameter = 1.5
num_disks = 50
meta_file = "../data/chosen_data/rome/META-50.TXT"

# The name was "Porto D1", copied over from the Porto cell; this object hashes
# the Rome data, so it is labelled accordingly.
DiskRome = DiskLSH("Rome D1", R_MIN_LAT, R_MAX_LAT, R_MIN_LON, R_MAX_LON, num_disks, layers, diameter, meta_file, ROME_DATA)

In [None]:
# Generate the Rome hashes with the disk-based LSH object and save them to
# OUTPUT_FOLDER_ROME, one "<key>.txt" file per key with one hash per line.
hashes = DiskRome.compute_dataset_hashes()

# Make sure the output folder exists so the cell also works on a fresh checkout.
os.makedirs(OUTPUT_FOLDER_ROME, exist_ok=True)

for key in hashes:
    # The with-block closes the file; no explicit close is needed.
    with open(os.path.join(OUTPUT_FOLDER_ROME, f"{key}.txt"), "w") as file:
        # `hash_value` rather than `hash`: a loop variable named `hash` would
        # shadow the builtin for every later cell in the kernel.
        for hash_value in hashes[key]:
            file.write("%s\n" % hash_value)

# Copy the meta files as well so the output folder is self-contained.
meta_files = mfh.get_meta_files(ROME_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(ROME_DATA, filename), OUTPUT_FOLDER_ROME)

In [None]:
# Time the Rome hash computation: 10 repetitions of a single run each.
# The call returns the list of measured runtimes together with a count that is
# reported below as the number of hashes.
runtimes, number = DiskRome.measure_hash_computation(repeat=10, number=1)

print(runtimes)
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s for {number} hashes")