# Sheet for computing the benchmarks and true similarities for the project

In [1]:
# Importing necessary modules
import os
import re
import shutil
import numpy as np
import pandas as pd

from utils import metafile_handler as mfh
from utils import file_handler as fh

from benchmarks import dtw
from benchmarks import frechet



In [2]:
# Folder the Porto trajectory files are loaded from.
DATA_PORTO = "../data/chosen_data/porto/"
# Folder the computed similarity CSV files are written to.
SIM_OUT_FOLDER = "../code/benchmarks/similarities/"

# Meta files live inside DATA_PORTO; derive their paths from it so the folder
# is spelled out in one place only.
PORTO_FULL_SET = os.path.join(DATA_PORTO, "META-1000.txt")

TEST_SET_PORTO = os.path.join(DATA_PORTO, "META-50.txt")

# Output file names (relative to SIM_OUT_FOLDER) for the full-set runs.
PORTO_DTW_FILE = "porto-dtw.csv"
PORTO_FRECHET_FILE = "porto-frechet.csv"

# Output file names (relative to SIM_OUT_FOLDER) for the test-set runs.
PORTO_DTW_FILE_TEST = "porto-dtw-test.csv"
PORTO_FRECHET_FILE_TEST = "porto-frechet-test.csv"

In [3]:
def deleteFile(file_name: str) -> None:
    """Remove ``file_name`` from ``SIM_OUT_FOLDER`` if it is present.

    Regular files and symbolic links are unlinked, directories are removed
    recursively, and a path that matches none of these is left alone. Any
    exception raised while removing is caught and reported with ``print``
    instead of being propagated.
    """
    target = os.path.join(SIM_OUT_FOLDER, file_name)
    try:
        unlinkable = os.path.isfile(target) or os.path.islink(target)
        if unlinkable:
            os.unlink(target)
            return
        if os.path.isdir(target):
            shutil.rmtree(target)
    except Exception as err:
        print("Failed to remove %s. Reason: %s" % (target, err))


def portoSet(file_size: int) -> str:
    """Return the relative path of the Porto meta file ``META-<file_size>.txt``."""
    meta_path = f"../data/chosen_data/porto/META-{file_size}.txt"
    return meta_path

---

## DTW distance computation

Cells for generating and storing the similarities using dtw

In [5]:
## Using Cython DTW, to speed things up

def generate_dtw_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute DTW similarities with ``dtw.cy_dtw`` and store them as CSV.

    Args:
        data_folder: Folder the trajectory files are loaded from.
        meta_file: Meta file naming the trajectory files to load.
        file_name: Name of the CSV file to write inside ``SIM_OUT_FOLDER``.

    The value returned by ``dtw.cy_dtw`` must provide ``to_csv``
    (presumably a pandas DataFrame -- TODO confirm in the benchmarks module).
    """
    # Create the destination up front, so a missing folder is not first
    # discovered at write time, after the computation has already run.
    os.makedirs(SIM_OUT_FOLDER, exist_ok=True)

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    df = dtw.cy_dtw(trajectories)

    # Drop the previous result only once the new one exists, so a failed or
    # interrupted computation does not leave us without any output file.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))


def generate_parallell_dtw_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute DTW similarities with ``dtw.cy_dtw_pool`` and store them as CSV.

    Args:
        data_folder: Folder the trajectory files are loaded from.
        meta_file: Meta file naming the trajectory files to load.
        file_name: Name of the CSV file to write inside ``SIM_OUT_FOLDER``.

    The value returned by ``dtw.cy_dtw_pool`` must provide ``to_csv``
    (presumably a pandas DataFrame -- TODO confirm in the benchmarks module).
    """
    # Create the destination up front, so a missing folder is not first
    # discovered at write time, after the computation has already run.
    os.makedirs(SIM_OUT_FOLDER, exist_ok=True)

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    df = dtw.cy_dtw_pool(trajectories)

    # Drop the previous result only once the new one exists, so a failed or
    # interrupted computation does not leave us without any output file.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))

In [5]:
# DTW over PORTO
# generate_dtw_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_DTW_FILE)

In [6]:
# Parallel DTW over the full Porto set (PORTO_FULL_SET, META-1000.txt);
# the result is written to PORTO_DTW_FILE ("porto-dtw.csv") in SIM_OUT_FOLDER.
generate_parallell_dtw_similarities(DATA_PORTO, PORTO_FULL_SET, PORTO_DTW_FILE)


In [7]:
# Parallel DTW over the Porto test set (TEST_SET_PORTO, META-50.txt);
# the result is written to PORTO_DTW_FILE_TEST ("porto-dtw-test.csv") in SIM_OUT_FOLDER.
generate_parallell_dtw_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_DTW_FILE_TEST)

---

## Frechet distance computation

Cells for generating and storing the similarities using frechet distance

In [10]:
## Frechet via frechet.cy_frechet (the non-pool variant)
def generate_frechet_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute Frechet similarities with ``frechet.cy_frechet`` and store them as CSV.

    Args:
        data_folder: Folder the trajectory files are loaded from.
        meta_file: Meta file naming the trajectory files to load.
        file_name: Name of the CSV file to write inside ``SIM_OUT_FOLDER``.

    The value returned by ``frechet.cy_frechet`` must provide ``to_csv``
    (presumably a pandas DataFrame -- TODO confirm in the benchmarks module).
    """
    # Create the destination up front, so a missing folder is not first
    # discovered at write time, after the computation has already run.
    os.makedirs(SIM_OUT_FOLDER, exist_ok=True)

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    df = frechet.cy_frechet(trajectories)

    # Drop the previous result only once the new one exists, so a failed or
    # interrupted computation does not leave us without any output file.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))


def compute_parallell_frechet_similarities(data_folder: str, meta_file: str, file_name: str) -> None:
    """Compute Frechet similarities with ``frechet.cy_frechet_pool`` and store them as CSV.

    Args:
        data_folder: Folder the trajectory files are loaded from.
        meta_file: Meta file naming the trajectory files to load.
        file_name: Name of the CSV file to write inside ``SIM_OUT_FOLDER``.

    The value returned by ``frechet.cy_frechet_pool`` must provide ``to_csv``
    (presumably a pandas DataFrame -- TODO confirm in the benchmarks module).
    """
    # Create the destination up front, so a missing folder is not first
    # discovered at write time, after the computation has already run.
    os.makedirs(SIM_OUT_FOLDER, exist_ok=True)

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    df = frechet.cy_frechet_pool(trajectories)

    # Drop the previous result only once the new one exists, so a failed or
    # interrupted computation (this run can take hours) does not leave us
    # without any output file.
    deleteFile(file_name)
    df.to_csv(os.path.join(SIM_OUT_FOLDER, file_name))

In [9]:
# Frechet over the Porto test set (TEST_SET_PORTO) # SLOW METHOD // idun (1node, 6cores, 128GB): 3min
# Written to the *test* file name: this run uses the test set, so it must not
# replace the full-set result stored in PORTO_FRECHET_FILE. The parallel
# test-set cell further down writes the same file.
generate_frechet_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_FRECHET_FILE_TEST)

In [10]:
# Parallel Frechet over the Porto set META-1000.txt; result goes to PORTO_FRECHET_FILE.
# Recorded run times: 429 minutes // amalie: 357 // idun: (not recorded)
compute_parallell_frechet_similarities(DATA_PORTO, portoSet(1000), PORTO_FRECHET_FILE)

Cy Pool Frehet: 0/1000
Cy Pool Frehet: 1/1000
Cy Pool Frehet: 2/1000
Cy Pool Frehet: 3/1000
Cy Pool Frehet: 4/1000
Cy Pool Frehet: 5/1000
Cy Pool Frehet: 6/1000
Cy Pool Frehet: 7/1000
Cy Pool Frehet: 8/1000
Cy Pool Frehet: 9/1000
Cy Pool Frehet: 10/1000
Cy Pool Frehet: 11/1000
Cy Pool Frehet: 12/1000
Cy Pool Frehet: 13/1000
Cy Pool Frehet: 14/1000
Cy Pool Frehet: 15/1000
Cy Pool Frehet: 16/1000
Cy Pool Frehet: 17/1000
Cy Pool Frehet: 18/1000
Cy Pool Frehet: 19/1000
Cy Pool Frehet: 20/1000
Cy Pool Frehet: 21/1000
Cy Pool Frehet: 22/1000
Cy Pool Frehet: 23/1000
Cy Pool Frehet: 24/1000
Cy Pool Frehet: 25/1000
Cy Pool Frehet: 26/1000
Cy Pool Frehet: 27/1000
Cy Pool Frehet: 28/1000
Cy Pool Frehet: 29/1000
Cy Pool Frehet: 30/1000
Cy Pool Frehet: 31/1000
Cy Pool Frehet: 32/1000
Cy Pool Frehet: 33/1000
Cy Pool Frehet: 34/1000
Cy Pool Frehet: 35/1000
Cy Pool Frehet: 36/1000
Cy Pool Frehet: 37/1000
Cy Pool Frehet: 38/1000
Cy Pool Frehet: 39/1000
Cy Pool Frehet: 40/1000
Cy Pool Frehet: 41/1000
Cy

KeyboardInterrupt: 

In [None]:
# Parallel Frechet over the Porto test set (TEST_SET_PORTO, META-50.txt);
# the result is written to PORTO_FRECHET_FILE_TEST ("porto-frechet-test.csv") in SIM_OUT_FOLDER.
compute_parallell_frechet_similarities(DATA_PORTO, TEST_SET_PORTO, PORTO_FRECHET_FILE_TEST)


Cy Pool Frehet: 0/1000
Cy Pool Frehet: 1/1000
Cy Pool Frehet: 2/1000
Cy Pool Frehet: 3/1000
Cy Pool Frehet: 4/1000
Cy Pool Frehet: 5/1000
Cy Pool Frehet: 6/1000
Cy Pool Frehet: 7/1000
Cy Pool Frehet: 8/1000
Cy Pool Frehet: 9/1000
Cy Pool Frehet: 10/1000
Cy Pool Frehet: 11/1000
Cy Pool Frehet: 12/1000
Cy Pool Frehet: 13/1000
Cy Pool Frehet: 14/1000
Cy Pool Frehet: 15/1000
Cy Pool Frehet: 16/1000
Cy Pool Frehet: 17/1000
Cy Pool Frehet: 18/1000
Cy Pool Frehet: 19/1000
Cy Pool Frehet: 20/1000
Cy Pool Frehet: 21/1000
Cy Pool Frehet: 22/1000
Cy Pool Frehet: 23/1000
Cy Pool Frehet: 24/1000
Cy Pool Frehet: 25/1000
Cy Pool Frehet: 26/1000
Cy Pool Frehet: 27/1000
Cy Pool Frehet: 28/1000
Cy Pool Frehet: 29/1000
Cy Pool Frehet: 30/1000
Cy Pool Frehet: 31/1000
Cy Pool Frehet: 32/1000
Cy Pool Frehet: 33/1000
Cy Pool Frehet: 34/1000
Cy Pool Frehet: 35/1000
Cy Pool Frehet: 36/1000
Cy Pool Frehet: 37/1000
Cy Pool Frehet: 38/1000
Cy Pool Frehet: 39/1000
Cy Pool Frehet: 40/1000
Cy Pool Frehet: 41/1000
Cy

---

## Time efficiency measuring

Cells below are used for computing the time efficiency of DTW and Frechet

In [11]:
from multiprocessing import Pool

In [12]:

# Dispatch table of timing functions, keyed by "<algorithm>_<implementation>".
# The "_py" entries are the Python versions, used for measuring computation time
# fairly against the hash computations; the "_cy" entries are presumably the
# Cython counterparts (TODO confirm in the benchmarks package).

sim = {
    "dtw_py" : dtw.measure_py_dtw,
    "frechet_py" : frechet.measure_py_frechet,
    "dtw_cy" : dtw.measure_cy_dtw,
    "frechet_cy" : frechet.measure_cy_frechet
}

def measure_similarities(measure: str, data_folder: str, meta_file: str, parallell_jobs: int = 10):
    """Common method for measuring the efficiency of the similarity algorithms.

    Loads the trajectories named in ``meta_file`` and submits
    ``parallell_jobs`` identical timing jobs to a process pool.

    Args:
        measure: Key into ``sim`` selecting the timing function to run.
        data_folder: Folder the trajectory files are loaded from.
        meta_file: Meta file naming the trajectory files to load.
        parallell_jobs: Number of identical jobs to submit (default 10).

    Returns:
        The list returned by ``Pool.map``: one entry per job, in submission order.

    Raises:
        KeyError: If ``measure`` is not a key of ``sim``.
    """
    # Fail fast on a bad key, before spending time on loading trajectories.
    if measure not in sim:
        raise KeyError(f"Unknown measure {measure!r}; expected one of {sorted(sim)}")
    measure_fn = sim[measure]

    files = mfh.read_meta_file(meta_file)
    trajectories = fh.load_trajectory_files(files, data_folder)

    # Every job receives the same argument list: the trajectories plus two 1s.
    # NOTE(review): the meaning of the two trailing integers is defined by the
    # measure_* functions in the benchmarks package -- confirm there.
    job_args = [[trajectories, 1, 1] for _ in range(parallell_jobs)]

    with Pool() as pool:
        result = pool.map(measure_fn, job_args)
    return result


In [13]:
# Test run: time the "dtw_py" entry of `sim` on portoSet(50) (META-50.txt)

measure_similarities("dtw_py", DATA_PORTO, portoSet(50))


[[15.373772226],
 [15.343935642],
 [15.28037576],
 [15.339583458],
 [15.292721019],
 [15.661133506],
 [15.284736379],
 [15.201094405],
 [15.270890839],
 [15.227085964999999]]

In [14]:
# Time the "dtw_cy" entry of `sim` on portoSet(200). Note that the "dtw_py" run
# above uses portoSet(50), so the two sets of timings use different inputs.
measure_similarities("dtw_cy", DATA_PORTO, portoSet(200))

[[4.981131684],
 [4.976493147],
 [4.963856002],
 [5.036284585],
 [5.0507470990000005],
 [4.981072051],
 [4.9820235120000005],
 [4.981873599],
 [4.987114651000001],
 [4.9832983330000005]]

In [15]:
# Time the "frechet_cy" entry of `sim` on portoSet(50).
# Takes a long time, more than five minutes for portoSet(50)
measure_similarities("frechet_cy", DATA_PORTO, portoSet(50))

[[159.975121728],
 [158.253995097],
 [158.049499556],
 [158.100317227],
 [158.504733155],
 [158.358057334],
 [158.57250038499998],
 [158.440759303],
 [158.125782362],
 [158.175755072]]

In [16]:
# FIXME: timing the "frechet_py" entry of `sim` currently fails with
# "TypeError: list indices must be integers or slices, not float".
# NOTE(review): the failure presumably originates inside
# frechet.measure_py_frechet -- confirm in the benchmarks package.
measure_similarities("frechet_py", DATA_PORTO, portoSet(50))

TypeError: list indices must be integers or slices, not float