# Notebook for creating the grid LSH hashes for the Porto dataset

Notebook that converts the extracted data in the data/chosen_data folder into hashes, which are then stored in data/hashed_data/subset-'size'.

In [1]:
# Importing necessary modules
import os
import shutil

import global_variables

from schemes.grid_lsh import GridLSH
from utils import metafile_handler as mfh

from schemes.experiments import hashing

from multiprocessing import Pool

In [2]:
# Notebook-wide configuration; everything is derived from the subset chosen in
# global_variables.

# Input folder of the chosen subset (handed to GridLSH and the meta-file handler).
PORTO_DATA = "../data/chosen_data/{}/".format(global_variables.CHOSEN_SUBSET_NAME)

# Folder that receives the hash files and the copied meta files.
OUTPUT_FOLDER_PORTO = "../data/hashed_data/{}/".format(global_variables.CHOSEN_SUBSET_NAME)

# When True, the clean-up cell empties OUTPUT_FOLDER_PORTO before new hashes are written.
SHOULD_DELETE_OLD_FILES = True

---
## LSH - grid

In [4]:
# Run this cell to clear previously generated files from the Porto output folder.

# Create the folder when it is missing (fresh checkout / new subset name);
# otherwise os.listdir below raises FileNotFoundError.
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER_PORTO):
        file_path = os.path.join(OUTPUT_FOLDER_PORTO, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                # Regular files and symlinks are unlinked.
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                # Real sub-directories are removed recursively.
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the failure and continue with the next entry.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [5]:
# Create the Grid LSH object for Porto

meta_file = f"../data/chosen_data/{global_variables.CHOSEN_SUBSET_NAME}/META.txt"
layers = 5
resolution = 1.6 # km

# Arguments are positional: name, the four P_* bounds from global_variables,
# resolution, layers, meta file and data folder.
GridPorto = GridLSH(
    "Porto G1",
    global_variables.P_MIN_LAT,
    global_variables.P_MAX_LAT,
    global_variables.P_MIN_LON,
    global_variables.P_MAX_LON,
    resolution,
    layers,
    meta_file,
    PORTO_DATA,
)



29.04: The cell below took 2m 35.9s to run.

In [6]:
# Creating the hashes and saving them to the output folder

hashes = GridPorto.compute_dataset_hashes()

# Make sure the output folder exists so that open() below cannot fail on a
# missing directory (e.g. when the clean-up cell was skipped).
os.makedirs(OUTPUT_FOLDER_PORTO, exist_ok=True)

# Saving the hashes to files: one "<key>.txt" per entry, one hash per line.
# The loop variable is deliberately not called `hash`, which would shadow the
# builtin for the rest of the kernel session.
for key in hashes:
    with open(f'{OUTPUT_FOLDER_PORTO}{key}.txt', 'w') as file:
        # The with-statement closes the file; no explicit close() is needed.
        file.writelines("%s\n" % key_hash for key_hash in hashes[key])

# Copying the meta files next to the hashes
meta_files = mfh.get_meta_file(PORTO_DATA)

for filename in meta_files:
    shutil.copy(os.path.join(PORTO_DATA, filename), OUTPUT_FOLDER_PORTO)

### Calculate similarity of hashes

In [None]:
# NOTE(review): this import lives outside the import cell at the top of the
# notebook; it is kept here so the cell still runs on its own.
from utils.similarity_measures.distance import py_edit_distance_penalty_parallell as py_edp_parallell

# Compute the similarities from the hashes produced by the hashing cell.
similarities = py_edp_parallell(hashes)
output_path = f"../code/experiments/similarities/grid_porto-{global_variables.CHOSEN_SUBSET_NAME}.csv"

# to_csv does not create missing folders, so create the target folder first.
os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
similarities.to_csv(os.path.abspath(output_path))

print(f"Check ../code/experiments/similarities/, it should be a file here named grid_porto-{global_variables.CHOSEN_SUBSET_NAME}.csv which contains the similarities in the dataset.")
