# Sheet for extracting raw-data from rome dataset

In [43]:
# Importing necessary modules

import pandas as pd
from tqdm import tqdm
import math
import re
import numpy as np
from haversine import haversine, Unit
from datetime import datetime, timedelta
import os, shutil


In [41]:
# Declaring variables and constants for this sheet

SHOULD_DELETE_OLD_FILES= True

OUTPUT_FOLDER = "../data/chosen_data/rome/"
RAW_DATA_FILE = "../data/raw_data/taxi_february.txt"

LOG = False # Set to true for printing during data extraction

MAX_LON = 12.53
MIN_LON = 12.44
MAX_LAT = 41.93
MIN_LAT = 41.88
MIN_LEN = 40 # Perhaps change this one

MAX_TIME_DIFF_SECONDS = 60 # Maximum time difference between two consecutive trackpoints in a trajectory

NUMBER_OF_TRACES = 1000


In [5]:
# Reading the dataset into a dataframe

raw_df = pd.read_csv(RAW_DATA_FILE, delimiter=";", names=["index", "timestamp", "location"])

In [6]:
# Helper function to compute distance between a list of coordinates (Trajectory distance)
# Haversine distance used

def calculate_trajectory_distance(positions: list[tuple[float]]) -> float:
    distance = 0
    for i in range(1, len(positions)):
        from_location = positions[i-1]
        to_location = positions[i]

        distance += haversine(from_location, to_location, unit=Unit.KILOMETERS)
    return distance

#calculate_trajectory_distance([(63.40788858583628, 10.406207470471212),(59.92062511800243, 10.719254233646412),(62.48483421400412, 17.058008125391442)])

In [7]:
# Printing numner of unique drivers in set
print(raw_df["index"].unique().__len__())

316


In [8]:
# Must sort the values according to their index and timestamp 
# Also copying the dataframe as we will touch the values in the "location" column

raw_df.head(1000000).sort_values(by=["index", "timestamp"]).loc[raw_df['index'] == 2]

df = raw_df.copy()


In [9]:
# Cell that translates the location column of the dataframe to only the locations

df["location"] = df["location"].apply(lambda x: re.search(r'\(.*?\)', x).group(0)[1:-1].replace(" ", ","))


In [10]:
df.loc[df['index']==2].head(1548)

Unnamed: 0,index,timestamp,location
33,2,2014-02-01 00:00:10.168741+01,"41.9081300994706,12.5043668987191"
89,2,2014-02-01 00:00:25.774039+01,"41.9086630527271,12.5066009228874"
138,2,2014-02-01 00:00:40.26196+01,"41.9090395026945,12.5079114632066"
191,2,2014-02-01 00:00:56.451894+01,"41.9090395026945,12.5079114632066"
240,2,2014-02-01 00:01:09.895936+01,"41.9090395026945,12.5079114632066"
...,...,...,...
39913,2,2014-02-01 06:08:00.070977+01,"41.8880515893667,12.3763677048198"
39948,2,2014-02-01 06:08:14.330084+01,"41.8879860192086,12.3763204528913"
244650,2,2014-02-01 21:05:10.951712+01,"41.8956181485921,12.4182874889476"
244651,2,2014-02-01 21:05:10.952329+01,"41.8956181485921,12.4182874889476"


In [55]:
# Run this cell to clear the chosen files in the PORTO folder

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER):
        file_path = os.path.join(OUTPUT_FOLDER, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [56]:
# Some helper functions for controling and saving the trajectories

#Save function
def save_current_trajectory(driver_id, trajectory: list[tuple[float]]) -> None:
    with open(f'{OUTPUT_FOLDER}/{str(trajectory[0][0])[3:]+driver_id}.txt', "w") as file:
        for coordinate in trajectory:
            lon, lat = coordinate
            file.write("%s, %s\n" % (lon, lat))
        file.close()
        return


# Control function:
def control_current_trajectory(driver_id, trajectory: list[tuple[float]]) -> None:
    pass
    # TODO


save_current_trajectory("12", [(12.12223,22.12223),(13.1111,31.1111),(33.2345,33)])

In [46]:
# Cell to extract traces that are of a minimum length of N locations and within a given distance window
# For each "found" trajectory that satisfies the requirements, a file containing the locations of the trajecotry should be genereated 

counter = 0
last_driver_id = None
last_timestamp = None
trajectory = []

for index, row in df.loc[df['index']==2].head(1548).head(3).iterrows():
    current_driver_id = row["index"]
    current_timestamp = row["timestamp"]
    
    print(current_timestamp) 

    if last_timestamp:

        # Checking whether time between two consecutive points are greater than a given const 
            t_last = datetime.strptime(last_timestamp[0:last_timestamp.index("+")], "%Y-%m-%d %H:%M:%S.%f")
            t_current = datetime.strptime(current_timestamp[0:current_timestamp.index("+")], "%Y-%m-%d %H:%M:%S.%f")
            
            # Case: Too big timedelta between points
            if t_current - t_last > timedelta(seconds=MAX_TIME_DIFF_SECONDS):
                if LOG: print("Timedelta is greater than treshold")
                # TODO

    # If no other 


    # At end of each iteration: 
    last_timestamp = row["timestamp"]
    last_driver_id = current_driver_id

IndentationError: expected an indented block after 'for' statement on line 11 (1953614413.py, line 15)