# Notebook for the creation of the grid LSH hashes for both datasets

This notebook converts the extracted data in the `data/chosen_data` folder into hashes, which are stored in `data/hashed_data/grid`.

In [1]:
# Importing necessary modules
import os
import shutil

from schemes.grid_lsh import GridLSH
from utils import metafile_handler as mfh

import timeit as ti
from tqdm import tqdm 



In [2]:
# Global configuration shared by the cells below.

# When True, the cleanup cells empty the output folders before new hashes are written.
SHOULD_DELETE_OLD_FILES = True

# Input folders holding the chosen data sets (and their meta files).
PORTO_DATA = "../data/chosen_data/porto/"
ROME_DATA = "../data/chosen_data/rome/"

# Output folders that receive the generated hash files.
OUTPUT_FOLDER_PORTO = "../data/hashed_data/grid/porto/"
OUTPUT_FOLDER_ROME = "../data/hashed_data/grid/rome/"

# Latitude/longitude bounds handed to the GridLSH constructor for Porto.
P_MIN_LAT, P_MAX_LAT = 41.14, 41.19
P_MIN_LON, P_MAX_LON = -8.66, -8.57

# Latitude/longitude bounds handed to the GridLSH constructor for Rome.
R_MIN_LAT, R_MAX_LAT = 41.88, 41.93
R_MIN_LON, R_MAX_LON = 12.44, 12.53

---
## Porto LSH

Beginning with the Porto set

In [3]:
# Run this cell to clear previously generated files from the PORTO output folder

# Make sure the output folder exists first: on a fresh checkout os.listdir()
# would otherwise raise FileNotFoundError, and the cell that writes the hash
# files needs the folder to be present as well.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER_PORTO):
        file_path = os.path.join(OUTPUT_FOLDER_PORTO, filename)
        try:
            # Plain files and symlinks are unlinked; real directories are removed recursively.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and keep going.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [4]:
# Create the Grid LSH object for Porto

meta_file = "../data/chosen_data/porto/META-1000.TXT"
layers = 4
resolution = 0.25  # km

GridPorto = GridLSH(
    "Porto G1",
    P_MIN_LAT,
    P_MAX_LAT,
    P_MIN_LON,
    P_MAX_LON,
    resolution,
    layers,
    meta_file,
    PORTO_DATA,
)



In [5]:
# Creating the hashes and saving them to the output folder

hashes = GridPorto.compute_dataset_hashes()

# The output folder must exist before files can be opened for writing.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

# Saving the hashes to files: one "<key>.txt" per entry, one hash per line.
# The generator keeps the loop variable local, so the builtin `hash` is no
# longer shadowed for the rest of the notebook; the `with` block closes the file.
for key in hashes:
    with open(os.path.join(OUTPUT_FOLDER_PORTO, f"{key}.txt"), "w") as file:
        file.writelines("%s\n" % h for h in hashes[key])

# Copying the meta_files next to the hashes
meta_files = mfh.get_meta_files(PORTO_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(PORTO_DATA, filename), OUTPUT_FOLDER_PORTO)

In [6]:
# Cell for measuring the time needed to generate the hashes.
# measure_hash_computation returns the individual runtimes together with a
# count; the returned value rebinds `number` (it is not the number=1 argument)
# and is reported below as the number of hashes.
# NOTE(review): repeat/number presumably follow timeit's semantics
# (10 repetitions of 1 execution each) — confirm in schemes.grid_lsh.

runtimes, number = GridPorto.measure_hash_computation(repeat=10, number=1)
print(runtimes)
# Report the fastest of the measured runs.
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s for {number} hashes")


[0.2734697999985656, 0.27407850000599865, 0.28004120000696275, 0.27589330000046175, 0.2772757000057027, 0.2835670999920694, 0.27879710000706837, 0.2727778999978909, 0.3143601000047056, 0.34022929999628104]
Ran the hash computation 10 times. Fastest runtime: 0.2727778999978909s for 1000 hashes


---
## Rome LSH

Continuing with the Rome set

In [7]:
# Run this cell to clear previously generated files from the ROME output folder

# Make sure the output folder exists first: on a fresh checkout os.listdir()
# would otherwise raise FileNotFoundError, and the cell that writes the hash
# files needs the folder to be present as well.
os.makedirs(OUTPUT_FOLDER_ROME, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER_ROME):
        file_path = os.path.join(OUTPUT_FOLDER_ROME, filename)
        try:
            # Plain files and symlinks are unlinked; real directories are removed recursively.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and keep going.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [8]:
# Create the Grid LSH object for Rome (hashes are computed and saved in the next cell)

meta_file = "../data/chosen_data/rome/META-1000.TXT"
layers = 4
resolution = 0.25  # km

GridRome = GridLSH(
    "Rome G1",
    R_MIN_LAT,
    R_MAX_LAT,
    R_MIN_LON,
    R_MAX_LON,
    resolution,
    layers,
    meta_file,
    ROME_DATA,
)


In [9]:
# Generate the hashes and save them to the output folder

hashes = GridRome.compute_dataset_hashes()

# The output folder must exist before files can be opened for writing.
os.makedirs(OUTPUT_FOLDER_ROME, exist_ok=True)

# Saving the hashes to files: one "<key>.txt" per entry, one hash per line.
# The generator keeps the loop variable local, so the builtin `hash` is no
# longer shadowed for the rest of the notebook; the `with` block closes the file.
for key in hashes:
    with open(os.path.join(OUTPUT_FOLDER_ROME, f"{key}.txt"), "w") as file:
        file.writelines("%s\n" % h for h in hashes[key])

# Copying the meta_files next to the hashes
meta_files = mfh.get_meta_files(ROME_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(ROME_DATA, filename), OUTPUT_FOLDER_ROME)

In [10]:

# Cell for measuring the time needed to generate the Rome hashes.
# measure_hash_computation returns the individual runtimes together with a
# count; the returned value rebinds `number` (it is not the number=1 argument)
# and is reported below as the number of hashes.
# NOTE(review): repeat/number presumably follow timeit's semantics
# (10 repetitions of 1 execution each) — confirm in schemes.grid_lsh.
runtimes, number = GridRome.measure_hash_computation(repeat=10, number=1)
print(runtimes)
# Report the fastest of the measured runs.
print(f"Ran the hash computation {len(runtimes)} times. Fastest runtime: {min(runtimes)}s for {number} hashes")

[0.5369671999942511, 0.5320334999996703, 0.5285268000006909, 0.5288125999941258, 0.554998900013743, 0.5491621000110172, 0.6291765000059968, 0.5757102999923518, 0.6177523999940604, 0.6490723000024445]
Ran the hash computation 10 times. Fastest runtime: 0.5285268000006909s for 1000 hashes
