# Sheet for extracting raw data from the Porto dataset

In [37]:
# Importing necessary modules

import pandas as pd
import os, shutil


# From utils
from utils.alphabetical_number import increment_alphabetical
from utils.trajectory_distance import calculate_trajectory_distance
from utils.metafile_handler import create_meta_files, get_meta_files, delete_meta_files


In [38]:
# Variables and constants used throughout this sheet

# Input CSV and the folder that receives the generated files
RAW_DATA_FILE = "../data/raw_data/subset-100000-6-percent.csv"
OUTPUT_FOLDER = "../data/chosen_data/subset-100000/"

# When True, the cleanup cell empties OUTPUT_FOLDER before extraction
SHOULD_DELETE_OLD_FILES = True

# NOTE(review): not referenced by the cells visible in this sheet;
# presumably the number of traces in the subset -- TODO confirm
NUMBER_OF_TRACES = 100_000

LOG = False  # Set to true for printing during data extraction

In [39]:
# Load the raw CSV file (RAW_DATA_FILE) into a pandas dataframe

raw_df = pd.read_csv(filepath_or_buffer=RAW_DATA_FILE)

In [40]:
# Run this cell to clear the chosen files in the PORTO folder.
#
# The output folder is created first when it is missing: os.listdir() would
# otherwise raise FileNotFoundError on a first run, and the extraction cell
# needs the folder to exist before it can write trajectory files into it.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

if SHOULD_DELETE_OLD_FILES:
    for filename in os.listdir(OUTPUT_FOLDER):
        file_path = os.path.join(OUTPUT_FOLDER, filename)
        try:
            # Plain files and symlinks are unlinked; real directories are
            # removed together with everything below them.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and
            # continue with the remaining entries.
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [41]:
# This cell reads the POLYLINE column of raw_df and writes one text file per
# trajectory into OUTPUT_FOLDER. Files are named P_<counter>.txt, where the
# counter starts at "AAAA" and is advanced with increment_alphabetical().
# Each output line is "<lat>, <lon>" for one point of the trajectory.
#
# NOTE(review): no filtering on trace length or on a bounding rectangle is
# done here; rows with an empty polyline are skipped, every other row is
# written. Presumably any such filtering happened when the subset CSV was
# produced -- TODO confirm.

name_counter = "AAAA"

# Only the POLYLINE column is needed, so iterate it directly instead of
# building a full row object per record with iterrows().
for polyline in raw_df["POLYLINE"]:
    # The slicing strips the outer "[[" and "]]" of a string shaped like
    # "[[lon,lat],[lon,lat],...]". An empty polyline ("[]") leaves nothing
    # behind and would otherwise fail when unpacking lon/lat below.
    points = polyline[2:-2]
    if not points:
        continue

    trace = points.split("],[")

    # The with-statement closes the file; no explicit close() is required.
    with open(os.path.join(OUTPUT_FOLDER, f"P_{name_counter}.txt"), "w") as file:
        for coordinate in trace:
            # Input order is lon,lat; the output swaps it to lat, lon.
            lon, lat = coordinate.split(",")
            file.write("%s, %s\n" % (lat, lon))

    # Advance the counter only after a file was actually written so the
    # generated file names stay contiguous.
    name_counter = increment_alphabetical(name_counter)


In [42]:
# Rebuild the metafiles that function as index files for the datasets

# If get_meta_files() reports anything for the output folder, delete the
# existing metafiles before new ones are created.
existing_meta_files = get_meta_files(OUTPUT_FOLDER)
if existing_meta_files:
    delete_meta_files(OUTPUT_FOLDER)

create_meta_files(
    path_to_files=OUTPUT_FOLDER,
    data_prefix="P_",
    create_test_set=True,
)