# Sheet for extracting raw data from the Rome dataset

Itererer gjennom listen med datapunkter og oppretter trajectories.

Det opprettes en trajectory, der alle påfølgende punkter legges inn, så lenge det er samme sjåfør, innenfor et tidsvindu og innenfor et bounding rectangle.

Et trajectory blir lagret dersom det er minst MIN_LEN punkter langt, har en reell lengde mellom MIN_DIST og MAX_DIST, samt at sporets reelle lengde er mindre enn 2.5 ganger avstanden mellom start- og sluttpunkt.

Sporene blir konstruert slik at siste punkt i et spor kan ha sitt neste (naturlige) punkt i det neste genererte sporet, såfremt det tilfredsstiller de andre kravene under generering. Dette bør ikke bli et problem, men er greit å opplyse om i oppgaven. De genererte sporene fra Roma-settet vil trolig ha mer kompliserte data enn Porto, da det ikke er gitt om de genererte sporene er faktiske taxiturer, deler av taxiturer, snirkling - det kan være hva som helst, og mest sannsynlig en blanding av flere kategorier av kjøretyper.

In [12]:
# Importing necessary modules

import pandas as pd
from tqdm import tqdm
import math
import re
import numpy as np

from datetime import datetime, timedelta
import os, shutil

# From utils
from utils.alphabetical_incrementer import increment_alphabetical
from utils.trajectory_distance import calculate_trajectory_distance
from utils.metafile_handler import create_meta_files, get_meta_files, delete_meta_files

In [2]:
# Configuration for this sheet: file locations, bounding rectangle and trajectory filters

# When True, the contents of OUTPUT_FOLDER are removed before new files are written
SHOULD_DELETE_OLD_FILES = True

RAW_DATA_FILE = "../data/raw_data/taxi_february.txt"
OUTPUT_FOLDER = "../data/chosen_data/rome/"

# Set to True for printing/debug during data extraction
LOG = False

# Bounding rectangle: latitude and longitude limits for accepted points
MIN_LAT, MAX_LAT = 41.88, 41.93
MIN_LON, MAX_LON = 12.44, 12.53

# Trajectory filters
MIN_LEN = 40  # Minimum number of points in a trajectory (perhaps change this one)
MIN_DIST, MAX_DIST = 4, 6  # Allowed trajectory length, in km

# Maximum time difference between two consecutive trackpoints in a trajectory
MAX_TIME_DIFF_SECONDS = 32

# Extraction stops once this many trajectories have been saved
NUMBER_OF_TRACES = 1000


In [3]:
# Side lengths of the bounding rectangle, both measured from its (MIN_LAT, MIN_LON) corner

origin_corner = (MIN_LAT, MIN_LON)

# X runs along the MIN_LON edge (latitude varies), Y along the MIN_LAT edge (longitude varies)
X = calculate_trajectory_distance([origin_corner, (MAX_LAT, MIN_LON)])
Y = calculate_trajectory_distance([origin_corner, (MIN_LAT, MAX_LON)])

print(f"Bounded rectangle have sidelengths (X,Y): {X, Y} km")

Bounded rectangle have sidelengths (X,Y): (5.559754011676299, 7.451072531046803) km


In [4]:
# Load the semicolon-separated raw file into a dataframe.
# The column names are supplied explicitly, so the first line of the file is read as data.

raw_df = pd.read_csv(
    RAW_DATA_FILE,
    sep=";",
    names=["index", "timestamp", "location"],
)

In [5]:
# Keep the first 1 500 000 raw rows, ordered by "index" and then by "timestamp".
# The result is copied because the "location" column is rewritten further down.
# Will probably take some time

df = (
    raw_df
    .head(1500000)
    .sort_values(by=["index", "timestamp"])
    .copy()
)

# The raw frame is not needed any more; drop the reference for performance reasons
del raw_df


In [6]:
# Reduce every "location" value to the text inside its first pair of parentheses,
# with the spaces in that text replaced by commas
# Will probably take some time to finish

def _strip_to_coordinates(raw_location):
    # group(0) still includes the surrounding parentheses, hence the [1:-1] slice
    parenthesised = re.search(r'\(.*?\)', raw_location).group(0)
    return parenthesised[1:-1].replace(" ", ",")


df["location"] = df["location"].apply(_strip_to_coordinates)


In [7]:
# Run this cell to clear everything inside OUTPUT_FOLDER (only when SHOULD_DELETE_OLD_FILES is set)

if SHOULD_DELETE_OLD_FILES:
    for entry_name in os.listdir(OUTPUT_FOLDER):
        file_path = os.path.join(OUTPUT_FOLDER, entry_name)
        try:
            # Real directories are removed recursively; symlinks (also those pointing
            # at a directory) and regular files are only unlinked
            if os.path.isdir(file_path) and not os.path.islink(file_path):
                shutil.rmtree(file_path)
            elif os.path.islink(file_path) or os.path.isfile(file_path):
                os.unlink(file_path)
        except Exception as e:
            # Best effort: report the failure and carry on with the next entry
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [8]:
# Some helper functions for controlling and saving the trajectories

#Save function
def save_current_trajectory( file_name: str, trajectory: list[tuple[float, float]], output_folder: "str | None" = None ) -> None:
    """Write a trajectory to ``R_<file_name>.txt``, one ``lat, lon`` pair per line.

    Args:
        file_name: Identifier placed after the ``R_`` prefix in the file name.
        trajectory: (lat, lon) pairs, written in the given order.
        output_folder: Directory to write into. When omitted, the module-level
            OUTPUT_FOLDER is used (looked up at call time).

    An existing file with the same name is overwritten; an empty trajectory
    produces an empty file.
    """
    if output_folder is None:
        output_folder = OUTPUT_FOLDER

    # os.path.join avoids a doubled separator when the folder already ends in "/"
    file_path = os.path.join(output_folder, f"R_{file_name}.txt")

    # The with-statement closes the file, so no explicit close() is needed
    with open(file_path, "w") as file:
        file.writelines(f"{lat}, {lon}\n" for lat, lon in trajectory)


# Control function:
def control_current_trajectory( trajectory: list[tuple[float]] ) -> None:
    """Placeholder for a trajectory check; currently does nothing and returns None."""
    # TODO: implement the control
    return None



In [9]:
# Cell to extract traces that are of a minimum length of N locations and within a given distance window
#
# Walks the rows of df in order and grows a trajectory from consecutive in-bounds points.
# The trajectory is started over when the driver id changes, when a timestamp cannot be
# parsed, or when more than MAX_TIME_DIFF_SECONDS have passed since the previously accepted
# row. Rows outside the bounding rectangle are skipped without updating last_timestamp, so
# a long stay outside the rectangle shows up as a time gap once the driver is back inside.
counter = 0           # Number of trajectories saved so far
name_counter = "AAA"  # Name used for the next saved trajectory file

last_driver_id = None
last_timestamp = None
last_location = None
trajectory = []

for index, row in df.iterrows():

    current_driver_id = row["index"]
    current_timestamp = row["timestamp"]

    current_location = row["location"].split(",")
    lat, lon = current_location

    # On first run or change in driver_id: remember this row and start a new trajectory
    if (not last_timestamp) or (last_driver_id != current_driver_id):
        last_timestamp = current_timestamp
        last_location = current_location
        last_driver_id = current_driver_id
        trajectory.clear()
        continue

    # Location outside bounded rectangle, go to next row
    if ( not ( MIN_LON <= float(lon) <= MAX_LON )) or ( not ( MIN_LAT <= float(lat) <= MAX_LAT )):
        if LOG: print("Location outside bounded rectangle: ", name_counter, lat,lon)
        continue

    try:
        # Only the part before the first "." is parsed (fractional seconds and whatever
        # follows are dropped)
        t_last = datetime.strptime(last_timestamp[0:last_timestamp.index(".")], "%Y-%m-%d %H:%M:%S")
        t_current = datetime.strptime(current_timestamp[0:current_timestamp.index(".")], "%Y-%m-%d %H:%M:%S")

    except (ValueError, AttributeError, TypeError):
        # ValueError: no "." in the timestamp or text that does not match the format.
        # AttributeError/TypeError: the timestamp is not a string.
        # Either way the two rows cannot be compared, so start over from the current row.
        last_timestamp = current_timestamp
        last_location = current_location
        last_driver_id = current_driver_id
        trajectory.clear()
        continue

    # Checking whether time between two consecutive points are greater than a given const
    if t_current - t_last <= timedelta(seconds=MAX_TIME_DIFF_SECONDS):
        if LOG: print("Timedelta is small enough")

        trajectory.append((float(lat), float(lon)))
    else:
        # Gap too large: the points collected so far do not continue into this row,
        # so they are discarded instead of being joined with later points.
        # The current row only serves as the time reference for the next one,
        # mirroring what happens on a change of driver.
        trajectory.clear()

    if MIN_LEN <= len(trajectory):
        trajectory_distance = calculate_trajectory_distance(trajectory)
        shortest_distance = calculate_trajectory_distance([trajectory[0],trajectory[-1]])

        # Keep the trace when its length is inside [MIN_DIST, MAX_DIST] and it is less than
        # 2.5 times as long as the distance between its first and last point
        if  MIN_DIST <= trajectory_distance <= MAX_DIST and trajectory_distance < shortest_distance * 2.5:
            # TODO Ensure that all points in trajectory are valid

            # Saving trajectory and updating variables
            save_current_trajectory(name_counter, trajectory=trajectory)
            if LOG: print(f"Total distance: {name_counter, trajectory_distance, shortest_distance, len(trajectory)}")

            counter +=1
            name_counter = increment_alphabetical(name_counter)
            trajectory.clear()

        elif trajectory_distance > MAX_DIST:
            # Too long to ever qualify; drop it and start collecting again
            trajectory.clear()

    # If enough trajectories are generated:
    if counter >= NUMBER_OF_TRACES:
        break

    last_timestamp = current_timestamp
    last_driver_id = current_driver_id
    last_location = current_location

print(f"Created {counter} trajectories")

Created 1000 trajectories


In [13]:
# (Re)create the metafiles that will function as index files for the datasets

# Delete metafiles already present in the output folder before writing new ones
existing_meta_files = get_meta_files(OUTPUT_FOLDER)
if existing_meta_files:
    delete_meta_files(OUTPUT_FOLDER)

create_meta_files(
    path_to_files=OUTPUT_FOLDER,
    data_prefix="R_",
    create_test_set=True,
)