# Sheet for computing the benchmarks and true similarities for the project

In [3]:
# Importing necessary modules
import os
import re
import shutil
import numpy as np
import pandas as pd


from utils import metafile_handler as mfh
from utils import file_handler as fh

from benchmarks import dtw
from benchmarks import frechet


In [4]:
# Folder passed as `data_folder` when loading the Porto trajectories; the
# META-<n>.txt meta files referenced below live under the same path.
DATA_PORTO = "../data/chosen_data/porto/"
# Folder the similarity CSV files are written to (and removed from by deleteFile).
SIM_OUT_FOLDER = "../code/benchmarks/similarities/"

# Meta file for the full Porto set (META-1000).
PORTO_FULL_SET = "../data/chosen_data/porto/META-1000.txt"

# Meta file for the smaller Porto test set (META-50).
TEST_SET_PORTO = "../data/chosen_data/porto/META-50.txt"

# Output file names, relative to SIM_OUT_FOLDER, used by the full-set runs.
PORTO_DTW_FILE = "porto-dtw.csv"
PORTO_FRECHET_FILE = "porto-frechet.csv"

# Output file names, relative to SIM_OUT_FOLDER, used by the test-set runs.
PORTO_DTW_FILE_TEST = "porto-dtw-test.csv"
PORTO_FRECHET_FILE_TEST = "porto-frechet-test.csv"

In [5]:
def deleteFile(file_name: str) -> None:
    """Remove ``file_name`` from ``SIM_OUT_FOLDER`` if it exists.

    Regular files and symbolic links are unlinked; directories are removed
    recursively. A missing entry is a no-op. Any exception raised while
    removing is caught and reported on stdout instead of being propagated.

    Args:
        file_name: Name of the entry, relative to ``SIM_OUT_FOLDER``.
    """
    target = os.path.join(SIM_OUT_FOLDER, file_name)
    try:
        # Links are checked together with files so that a link pointing at a
        # directory is unlinked rather than followed by rmtree.
        points_at_file = os.path.isfile(target) or os.path.islink(target)
        if points_at_file:
            os.unlink(target)
            return
        if os.path.isdir(target):
            shutil.rmtree(target)
    except Exception as err:
        print("Failed to remove %s. Reason: %s" % (target, err))


def portoSet(file_size: int) -> str:
    """Return the path of the Porto meta file ``META-<file_size>.txt``.

    Args:
        file_size: Number inserted into the meta file name.

    Returns:
        The relative path string under ``../data/chosen_data/porto/``.
    """
    meta_path = f"../data/chosen_data/porto/META-{file_size}.txt"
    return meta_path

---

## DTW distance computation

Cells for generating and storing the similarities using dtw

In [6]:
## Using Cython DTW, to speed things up

def generate_dtw_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute DTW similarities with ``dtw.cy_dtw`` and store them as CSV.

    The previous output is removed only after the new result has been
    computed, so a failed or interrupted computation keeps the old file
    instead of leaving nothing behind.

    Args:
        data_folder: Folder handed to ``fh.load_trajectory_files``.
        meta_file: Meta file read with ``mfh.read_meta_file``.
        file_name: Output name, joined onto ``SIM_OUT_FOLDER``.
    """
    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # Presumably a pandas DataFrame (it is written with .to_csv) — TODO confirm.
    df = dtw.cy_dtw(trajectories)

    # Clear the old entry only now that a replacement is ready to be written.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))


def generate_parallell_dtw_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute DTW similarities with ``dtw.cy_dtw_pool`` and store them as CSV.

    The previous output is removed only after the new result has been
    computed, so a failed or interrupted computation keeps the old file
    instead of leaving nothing behind.

    Args:
        data_folder: Folder handed to ``fh.load_trajectory_files``.
        meta_file: Meta file read with ``mfh.read_meta_file``.
        file_name: Output name, joined onto ``SIM_OUT_FOLDER``.
    """
    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # Presumably a pandas DataFrame (it is written with .to_csv) — TODO confirm.
    df = dtw.cy_dtw_pool(trajectories)

    # Clear the old entry only now that a replacement is ready to be written.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))

In [7]:
# DTW over PORTO (single-process variant, currently disabled)
# NOTE(review): this pairs the META-50 test set with PORTO_DTW_FILE, the output
# name of the full-set run; PORTO_DTW_FILE_TEST looks like the intended target
# if this is re-enabled — confirm.
# generate_dtw_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_DTW_FILE)

In [8]:
# Parallel DTW over the full Porto set (META-1000); result goes to porto-dtw.csv
generate_parallell_dtw_similarities(DATA_PORTO, PORTO_FULL_SET, PORTO_DTW_FILE)


In [9]:
# Parallel DTW over the Porto test set (META-50); result goes to porto-dtw-test.csv
generate_parallell_dtw_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_DTW_FILE_TEST)

---

## Frechet distance computation

Cells for generating and storing the similarities using frechet distance

In [10]:
## Frechet via the cy_* functions in benchmarks.frechet (single-process and pooled)
def generate_frechet_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute Frechet similarities with ``frechet.cy_frechet`` and store them as CSV.

    The previous output is removed only after the new result has been
    computed, so a failed or interrupted computation keeps the old file
    instead of leaving nothing behind.

    Args:
        data_folder: Folder handed to ``fh.load_trajectory_files``.
        meta_file: Meta file read with ``mfh.read_meta_file``.
        file_name: Output name, joined onto ``SIM_OUT_FOLDER``.
    """
    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # Presumably a pandas DataFrame (it is written with .to_csv) — TODO confirm.
    df = frechet.cy_frechet(trajectories)

    # Clear the old entry only now that a replacement is ready to be written.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))


def compute_parallell_frechet_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute Frechet similarities with ``frechet.cy_frechet_pool`` and store them as CSV.

    The previous output is removed only after the new result has been
    computed, so a failed or interrupted computation keeps the old file
    instead of leaving nothing behind.

    Args:
        data_folder: Folder handed to ``fh.load_trajectory_files``.
        meta_file: Meta file read with ``mfh.read_meta_file``.
        file_name: Output name, joined onto ``SIM_OUT_FOLDER``.
    """
    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # Presumably a pandas DataFrame (it is written with .to_csv) — TODO confirm.
    df = frechet.cy_frechet_pool(trajectories)

    # Clear the old entry only now that a replacement is ready to be written.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))

In [9]:
# Frechet over the Porto test set (META-50), single-process # SLOW METHOD
# Written to the test-set file: PORTO_FRECHET_FILE is the output of the
# full-set run and must not be replaced by results for the test set.
generate_frechet_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_FRECHET_FILE_TEST)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001B21C96DC70>>
Traceback (most recent call last):
  File "C:\Users\47412\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 785, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\47412\AppData\Local\Programs\Python\Python312\Lib\threading.py", line 1534, in enumerate
    def enumerate():
    
KeyboardInterrupt: 


In [None]:
# Parallel Frechet over the full Porto set (META-1000) // Took 429 minutes
compute_parallell_frechet_similarities(DATA_PORTO, portoSet(1000), PORTO_FRECHET_FILE)

In [None]:
# Parallel Frechet over the Porto test set (META-50); result goes to porto-frechet-test.csv
compute_parallell_frechet_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_FRECHET_FILE_TEST)


---

## Time efficiency measuring

Cells below are used for computing the time efficiency of DTW and Frechet

In [11]:
from multiprocessing import Pool

In [12]:

# Dispatch table from a measure name to the function that times it.
# The Python variants are used for measuring computation time fairly against
# the hash computations; the "_cy" entries are presumably the Cython
# counterparts (cf. the cy_* functions used for the similarities) — TODO confirm.

sim = {
    "dtw_py" : dtw.measure_py_dtw,
    "frechet_py" : frechet.measure_py_frechet,
    "dtw_cy" : dtw.measure_cy_dtw,
    "frechet_cy" : frechet.measure_cy_frechet
}

def measure_similarities(measure: str, data_folder: str, meta_file: str, parallell_jobs: int = 10):
    """Common method for measuring the efficiency of the similarity algorithms.

    Loads the trajectories listed in ``meta_file`` and submits
    ``parallell_jobs`` identical measurement tasks to a process pool.

    Args:
        measure: Key into ``sim`` selecting the measurement function.
        data_folder: Folder handed to ``fh.load_trajectory_files``.
        meta_file: Meta file read with ``mfh.read_meta_file``.
        parallell_jobs: Number of measurement tasks submitted to the pool.
            The pool itself keeps the ``multiprocessing`` default size.

    Returns:
        The list produced by ``pool.map``: one entry per submitted task.

    Raises:
        KeyError: If ``measure`` is not a key of ``sim``.
    """
    # Resolve the measure first so an unknown key fails before any data is loaded.
    measure_fn = sim[measure]

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # Each task is [trajectories, 1, 1]; the meaning of the two trailing values
    # is defined by the measure_* functions — TODO confirm.
    tasks = [[trajectories, 1, 1] for _ in range(parallell_jobs)]

    # NOTE(review): when there are more tasks than pool workers, the tasks do
    # not all run under the same load, which may skew the reported timings —
    # confirm before comparing individual entries.
    with Pool() as pool:
        result = pool.map(measure_fn, tasks)
    return result


In [13]:
# Test run: measure the "dtw_py" entry on the META-50 set

measure_similarities("dtw_py", DATA_PORTO, portoSet(50))


[[53.640625],
 [54.921875],
 [54.953125],
 [55.046875],
 [54.515625],
 [54.625],
 [53.0],
 [54.921875],
 [8.140625],
 [8.15625]]

In [14]:
# Measure the "dtw_cy" entry on the META-200 set
measure_similarities("dtw_cy", DATA_PORTO, portoSet(200))

In [None]:
# Takes a long time: more than five minutes for the META-50 set
measure_similarities("frechet_cy", DATA_PORTO, portoSet(50))

../data/chosen_data/porto/META-50.txt
[[211.71875], [209.15625], [211.453125], [210.890625], [208.453125], [207.828125], [210.703125], [208.859375], [211.359375], [210.40625]]


In [None]:
# FIXME: something is wrong with this run; it ends in an error that has not
# been diagnosed yet.
measure_similarities("frechet_py", DATA_PORTO, portoSet(50))