# Notebook for the creation of the grid lsh hashes for both datasets

Notebook that converts the extracted data in the data/chosen_data folder into hashes, which are stored in data/hashed_data/grid

In [1]:
# Importing necessary modules
import os
import shutil

from schemes.grid_lsh import GridLSH
from utils import metafile_handler as mfh

from schemes.experiments import hashing

from multiprocessing import Pool

In [2]:
# Notebook-wide configuration

# When True, the clean-up cell empties OUTPUT_FOLDER_PORTO.
SHOULD_DELETE_OLD_FILES = True

# Input folder with the chosen Porto data, and the folder the hashes are written to.
PORTO_DATA = "../data/chosen_data/subset-100000/"
OUTPUT_FOLDER_PORTO = "../data/hashed_data/grid/subset-100000/"

# Longitude / latitude bounds of the Porto area handed to GridLSH.
P_MIN_LON, P_MAX_LON = -8.72, -8.45
P_MIN_LAT, P_MAX_LAT = 41.07, 41.26

---
## Porto LSH

Beginning with the porto set

In [3]:
# Run this cell to prepare the Porto output folder and, if requested, clear its contents.

# Create the folder first: on a fresh checkout it may not exist yet, in which
# case os.listdir below would raise FileNotFoundError and the later
# hash-writing cell would have nowhere to write.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER_PORTO):
        file_path = os.path.join(OUTPUT_FOLDER_PORTO, filename)
        try:
            # Plain files and symlinks are unlinked; sub-folders are removed recursively.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the entry that could not be removed and keep going.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [4]:
# Create the Grid LSH object for Porto

resolution = 1.6  # km
layers = 5
# Built from PORTO_DATA so the meta file always comes from the same folder as
# the data; pointing the notebook at another dataset folder then needs one edit.
meta_file = os.path.join(PORTO_DATA, "META-100000.txt")

GridPorto = GridLSH(
    "Porto G1",
    P_MIN_LAT,
    P_MAX_LAT,
    P_MIN_LON,
    P_MAX_LON,
    resolution,
    layers,
    meta_file,
    PORTO_DATA,
)



29.04: The cell below took 2m 35.9s to run.

In [5]:
# Creating the hashes and saving them to the output folder

hashes = GridPorto.compute_dataset_hashes()

# Saving the hashes: one "<key>.txt" file per key, one hash per line.
# The loop variables are deliberately not called `hash` / `file`: in a notebook
# they live in the global namespace and would shadow the builtin `hash` for
# every cell executed afterwards.
for key in hashes:
    output_path = os.path.join(OUTPUT_FOLDER_PORTO, f"{key}.txt")
    # The with-statement closes the file; no explicit close() is needed.
    with open(output_path, "w") as output_file:
        for hash_value in hashes[key]:
            output_file.write("%s\n" % hash_value)

# Copying the meta files next to the hashes
meta_files = mfh.get_meta_files(PORTO_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(PORTO_DATA, filename), OUTPUT_FOLDER_PORTO)

# Measuring run-times of hash generation
The cells below are created to measure the time-efficiency of the hash computation
The time measured is the computation time only, i.e. it does not include reading from or writing to files, only the computation of the values from in-memory structures.


In [6]:
# Cell for measuring GridLSH hash generation times (only Porto is configured here)
import pandas as pd
from itertools import chain

output_folder = "../code/schemes/experiments/runtimes/"
file_name = "hashing_runtimes_grid_lsh.csv"

# Timing function to call for each dataset.
hashing_map = {"porto": hashing.fun_wrapper_p_grid}

# Argument passed to the timing function of each dataset.
# NOTE(review): 1.6 and 5 match the resolution and layer count used for
# GridPorto; the meaning of 1000 is defined by the wrapper - confirm there.
config = {"porto": [1000, 1.6, 5]}

runs = 10

# One column per run, one row per dataset.
run_labels = [f"Run_{number}" for number in range(1, runs + 1)]
df = pd.DataFrame(columns=run_labels)

for dataset, timing_function in hashing_map.items():
    # The same configuration is submitted `runs` times to the worker pool.
    arguments = [config[dataset]] * runs
    with Pool() as pool:
        timings = pool.map(timing_function, arguments)
    # Each call returns an iterable; flatten them into one row of values.
    df.loc[dataset] = list(chain.from_iterable(timings))

df.to_csv(os.path.join(output_folder, file_name))

In [7]:
# Cell for analysing the hash generation times of the grid and disk schemes

import os
import pandas as pd

folder = "../code/schemes/experiments/runtimes/"
grid_name = "hashing_runtimes_grid_lsh.csv"
disk_name = "hashing_runtimes_disk_lsh.csv"


def _runtime_summary(measurements):
    """Return the per-row average, minimum and maximum of a run-time table."""
    return pd.DataFrame(
        {
            "Average runtime": measurements.mean(axis=1),
            "Minimum runtime": measurements.min(axis=1),
            "Maximum runtime": measurements.max(axis=1),
        }
    )


# The first CSV column holds the row labels and becomes the index.
grid = pd.read_csv(os.path.join(folder, grid_name), index_col=0)
disk = pd.read_csv(os.path.join(folder, disk_name), index_col=0)

GRID = _runtime_summary(grid)
DISK = _runtime_summary(disk)

print(GRID)
print(DISK)

       Average runtime  Minimum runtime  Maximum runtime
porto          0.28294         0.279428         0.286657
                 Average runtime  Minimum runtime  Maximum runtime
porto_naive             3.680610         3.649289         3.716283
porto_quadrants         1.838062         1.708252         2.024608
porto_kd_tree           1.852066         1.792141         1.904740
