# Sheet for computing the benchmarks and true similarities for the project

In [21]:
# Importing necessary modules
import os
import re
import shutil
import numpy as np
import pandas as pd


from utils import metafile_handler as mfh
from utils import file_handler as fh

from benchmarks import dtw
from benchmarks import frechet


In [22]:
# Folders holding the chosen trajectory data, and the folder receiving the similarity CSVs
DATA_PORTO = "../data/chosen_data/porto/"
DATA_ROME = "../data/chosen_data/rome/"
SIM_OUT_FOLDER = "../code/benchmarks/similarities/"

# Meta files used as the full data sets (META-1000)
PORTO_FULL_SET = f"{DATA_PORTO}META-1000.txt"
ROME_FULL_SET = f"{DATA_ROME}META-1000.txt"

# Meta files used as the small test sets (META-50)
TEST_SET_PORTO = f"{DATA_PORTO}META-50.txt"
TEST_SET_ROME = f"{DATA_ROME}META-50.txt"

# Output file names for the full-set similarities (written inside SIM_OUT_FOLDER)
PORTO_DTW_FILE, PORTO_FRECHET_FILE = "porto-dtw.csv", "porto-frechet.csv"
ROME_DTW_FILE, ROME_FRECHET_FILE = "rome-dtw.csv", "rome-frechet.csv"

# Output file names for the test-set similarities (written inside SIM_OUT_FOLDER)
PORTO_DTW_FILE_TEST, PORTO_FRECHET_FILE_TEST = "porto-dtw-test.csv", "porto-frechet-test.csv"
ROME_DTW_FILE_TEST, ROME_FRECHET_FILE_TEST = "rome-dtw-test.csv", "rome-frechet-test.csv"



In [23]:
def deleteFile(file_name: str) -> None:
    """Remove a previous output named `file_name` from SIM_OUT_FOLDER, if it exists.

    Regular files and symbolic links are unlinked; directories are removed
    recursively. Nothing happens when the path does not exist. Any exception
    raised during removal is caught and reported with `print` rather than
    propagated (best-effort cleanup).
    """
    target = os.path.join(SIM_OUT_FOLDER, file_name)
    try:
        is_file_or_link = os.path.isfile(target) or os.path.islink(target)
        if is_file_or_link:
            os.unlink(target)
            return
        if os.path.isdir(target):
            shutil.rmtree(target)
    except Exception as error:
        print("Failed to remove %s. Reason: %s" % (target, error))


def portoSet(file_size: int) -> str:
    """Return the path of the Porto meta file named META-<file_size>.txt."""
    meta_path = f"../data/chosen_data/porto/META-{file_size}.txt"
    return meta_path


def romeSet(file_size: int) -> str:
    """Return the path of the Rome meta file named META-<file_size>.txt."""
    meta_path = f"../data/chosen_data/rome/META-{file_size}.txt"
    return meta_path

---

## DTW distance computation

Cells for generating and storing the similarities using dtw

In [3]:
## Using Cython DTW, to speed things up

def generate_dtw_similarities(data_folder: str, meta_file: str, file_name: str):
    """Compute DTW similarities for one data set and store them as CSV.

    Any existing output called `file_name` in SIM_OUT_FOLDER is removed first.
    The files listed in `meta_file` are then loaded from `data_folder`, handed
    to `dtw.cy_dtw`, and the returned object (presumably a pandas DataFrame —
    it is written with `to_csv`) is saved as SIM_OUT_FOLDER/file_name.
    """
    deleteFile(file_name)

    listed_files = mfh.read_meta_file(meta_file)
    loaded_trajectories = fh.load_trajectory_files(listed_files, data_folder)
    similarities = dtw.cy_dtw(loaded_trajectories)

    output_path = os.path.join(SIM_OUT_FOLDER, file_name)
    similarities.to_csv(output_path)


def generate_parallell_dtw_similarities(data_folder: str, meta_file: str, file_name: str):
    """Compute DTW similarities with the pooled implementation and store them as CSV.

    Same steps as the non-pooled variant, except that the loaded trajectories
    are passed to `dtw.cy_dtw_pool`: the previous output `file_name` in
    SIM_OUT_FOLDER is removed, the files listed in `meta_file` are loaded from
    `data_folder`, and the result is written to SIM_OUT_FOLDER/file_name.
    """
    deleteFile(file_name)

    loaded_trajectories = fh.load_trajectory_files(
        mfh.read_meta_file(meta_file), data_folder
    )
    similarities = dtw.cy_dtw_pool(loaded_trajectories)

    output_path = os.path.join(SIM_OUT_FOLDER, file_name)
    similarities.to_csv(output_path)

In [17]:
# DTW over PORTO
# generate_dtw_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_DTW_FILE)

In [18]:
# DTW over ROME
# generate_dtw_similarities(DATA_ROME, TEST_SET_ROME, ROME_DTW_FILE)

In [5]:
# Parallel DTW over the full Porto set; result is written to SIM_OUT_FOLDER/PORTO_DTW_FILE
generate_parallell_dtw_similarities(DATA_PORTO, PORTO_FULL_SET, PORTO_DTW_FILE)


In [6]:
# Parallel DTW over the full Rome set; result is written to SIM_OUT_FOLDER/ROME_DTW_FILE
generate_parallell_dtw_similarities(DATA_ROME, ROME_FULL_SET, ROME_DTW_FILE)

In [6]:
# Parallel DTW over the Porto and Rome test sets; results go to the *-test.csv files
generate_parallell_dtw_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_DTW_FILE_TEST)
generate_parallell_dtw_similarities(DATA_ROME, TEST_SET_ROME, ROME_DTW_FILE_TEST)

---

## Frechet distance computation

Cells for generating and storing the similarities using frechet distance

In [4]:
## Frechet distance helpers (these call the cy_* implementations in `frechet`, not the pure-Python one)
def generate_frechet_similarities(data_folder: str, meta_file: str, file_name: str):
    """Compute Frechet similarities for one data set and store them as CSV.

    Any existing output called `file_name` in SIM_OUT_FOLDER is removed first.
    The files listed in `meta_file` are then loaded from `data_folder`, handed
    to `frechet.cy_frechet`, and the returned object (presumably a pandas
    DataFrame — it is written with `to_csv`) is saved as
    SIM_OUT_FOLDER/file_name.
    """
    deleteFile(file_name)

    listed_files = mfh.read_meta_file(meta_file)
    loaded_trajectories = fh.load_trajectory_files(listed_files, data_folder)
    similarities = frechet.cy_frechet(loaded_trajectories)

    output_path = os.path.join(SIM_OUT_FOLDER, file_name)
    similarities.to_csv(output_path)


def compute_parallell_frechet_similarities(data_folder: str, meta_file: str, file_name: str):
    """Compute Frechet similarities with the pooled implementation and store them as CSV.

    Removes the previous output `file_name` from SIM_OUT_FOLDER, loads the
    files listed in `meta_file` from `data_folder`, passes them to
    `frechet.cy_frechet_pool`, and writes the result to
    SIM_OUT_FOLDER/file_name.
    """
    deleteFile(file_name)

    loaded_trajectories = fh.load_trajectory_files(
        mfh.read_meta_file(meta_file), data_folder
    )
    similarities = frechet.cy_frechet_pool(loaded_trajectories)

    output_path = os.path.join(SIM_OUT_FOLDER, file_name)
    similarities.to_csv(output_path)

In [None]:
# Frechet over Porto using the non-pooled helper # SLOW METHOD
# NOTE(review): this runs on TEST_SET_PORTO but writes to PORTO_FRECHET_FILE, the same
# file name the full-set parallel run uses — confirm PORTO_FRECHET_FILE_TEST was not intended.
generate_frechet_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_FRECHET_FILE)

In [None]:
# Frechet over Rome using the non-pooled helper # SLOW METHOD
# NOTE(review): this runs on TEST_SET_ROME but writes to ROME_FRECHET_FILE, the same
# file name the full-set parallel run uses — confirm ROME_FRECHET_FILE_TEST was not intended.
generate_frechet_similarities(DATA_ROME, TEST_SET_ROME, ROME_FRECHET_FILE)

In [None]:
# Parallel Frechet over the Porto META-1000 set // Took 429 minutes
compute_parallell_frechet_similarities(DATA_PORTO, portoSet(1000), PORTO_FRECHET_FILE)

In [None]:
# Parallel Frechet over the Rome META-1000 set // Took 635 minutes
compute_parallell_frechet_similarities(DATA_ROME, romeSet(1000), ROME_FRECHET_FILE)

In [8]:
# Parallel Frechet over the Porto and Rome test sets; results go to the *-test.csv files
compute_parallell_frechet_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_FRECHET_FILE_TEST)
compute_parallell_frechet_similarities(DATA_ROME, TEST_SET_ROME, ROME_FRECHET_FILE_TEST)


---

## Time efficiency measuring

Cells below are used for computing the time efficiency of DTW and Frechet

In [25]:
from multiprocessing import Pool

In [26]:

# Using Python, for measuring computation time fairly against the hash computations

# Lookup table from measure name to the timing function in the benchmark modules;
# measure_similarities indexes it with its `measure` argument.
sim = dict(
    dtw_py=dtw.measure_py_dtw,
    frechet_py=frechet.measure_py_frechet,
    dtw_cy=dtw.measure_cy_dtw,
    frechet_cy=frechet.measure_cy_frechet,
)

def measure_similarities(measure: str, data_folder: str, meta_file: str, parallell_jobs: int = 10):
    """Common method for measuring the efficiency of the similarity algorithms.

    Loads the trajectories listed in `meta_file` from `data_folder` and submits
    `parallell_jobs` identical timing jobs to a multiprocessing pool.

    Args:
        measure: Key into the module-level `sim` table selecting the timing
            function ("dtw_py", "frechet_py", "dtw_cy" or "frechet_cy").
        data_folder: Folder the trajectory files are loaded from.
        meta_file: Meta file listing the trajectory files to load.
        parallell_jobs: Number of timing jobs to run (default 10). This value
            is now honoured; it used to be ignored in favour of a hard-coded 10.

    Returns:
        A list with one entry per job, each being whatever the selected timing
        function returns.

    Raises:
        KeyError: If `measure` is not a key of `sim`.
    """
    # Resolve the timing function first so an unknown measure fails before any data is loaded.
    timing_function = sim[measure]

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # NOTE(review): the two trailing 1s are interpreted by the benchmark modules'
    # measure_* functions; their meaning is not visible here — confirm there.
    job_arguments = [[trajectories, 1, 1] for _ in range(parallell_jobs)]

    with Pool() as pool:
        return pool.map(timing_function, job_arguments)


In [None]:
# Smoke test: time the "dtw_py" measure on the Porto META-50 set

measure_similarities("dtw_py", DATA_PORTO, portoSet(50))


[[17.15625], [17.5], [17.953125], [17.671875], [17.265625], [17.390625], [18.28125], [17.90625], [18.328125], [17.34375]]


In [None]:
# Time the "dtw_cy" measure on the Porto META-200 set
measure_similarities("dtw_cy", DATA_PORTO, portoSet(200))

[[6.109375], [6.21875], [6.4375], [6.484375], [6.078125], [5.765625], [6.328125], [6.609375], [5.84375], [5.703125]]


In [None]:
# Time the "frechet_cy" measure on the Porto META-50 set.
# Takes a long time: more than five minutes for this set.
measure_similarities("frechet_cy", DATA_PORTO, portoSet(50))

../data/chosen_data/porto/META-50.txt
[[211.71875], [209.15625], [211.453125], [210.890625], [208.453125], [207.828125], [210.703125], [208.859375], [211.359375], [210.40625]]


In [None]:
# FIXME: timing the "frechet_py" measure currently fails with an error that has not been investigated yet
measure_similarities("frechet_py", DATA_PORTO, portoSet(50))

In [27]:
# Measuring the computation times of true similarities

# Experiment configuration
runs = 10                          # number of timing jobs per data-set size
data_sets = range(500,501,100)     # yields only size 500 with these bounds
output_folder = "../code/experiments/timing/"
file_name = "similarity_runtimes_true_dtw_rome_500.csv"

# Ensure the output folder exists before the measurement starts, so a missing
# folder cannot make the final to_csv fail after the computation has finished.
os.makedirs(output_folder, exist_ok=True)

# One row per run, one column per data-set size
df = pd.DataFrame(index=[f"run_{x+1}" for x in range(runs)], columns=list(data_sets))

for size in data_sets:
    print(f"Computing size {size}", end="\r")
    execution_times = measure_similarities("dtw_py", DATA_ROME, romeSet(size), parallell_jobs=runs)
    # Each result is a one-element list; keep only the measured value
    df[size] = [element[0] for element in execution_times]

df.to_csv(os.path.join(output_folder, file_name))
df

Computing size 500

Unnamed: 0,500
run_1,5859.484375
run_2,5837.28125
run_3,5836.828125
run_4,5850.578125
run_5,5833.421875
run_6,5841.53125
run_7,5857.796875
run_8,5819.328125
run_9,5842.859375
run_10,5835.4375


In [28]:
# Merge the separately measured size-500 runtimes into the combined Rome DTW runtime table.
# The files are addressed relative to the notebook (the same folder the timing cell writes
# to) instead of through absolute, machine-specific paths, so the cell runs on any checkout.
timing_folder = "../code/experiments/timing/"
combined_path = os.path.join(timing_folder, "similarity_runtimes_true_dtw_rome.csv")
size_500_path = os.path.join(timing_folder, "similarity_runtimes_true_dtw_rome_500.csv")

of = pd.read_csv(combined_path, index_col=0)
f1 = pd.read_csv(size_500_path, index_col=0)

# Column labels read back from CSV are strings, hence "500" rather than 500
of["500"] = f1["500"]
#of["700"] = f1["700"]

of.to_csv(combined_path)
20


20