# Notebook for extracting raw data from the Porto dataset

In [148]:
# Importing necessary modules

import pandas as pd
from tqdm import tqdm
import os, shutil

In [149]:
# Containning variables and constants for this spreadsheet

SHOULD_DELETE_OLD_FILES= True
OUTPUT_FOLDER = "../data/chosen_data/porto/"
RAW_DATA_FILE = "../data/raw_data/train.csv"

LOG = False # Set to true for printing during data extraction

MAX_LON = -8.57
MIN_LON = -8.66
MAX_LAT = 41.19
MIN_LAT = 41.14
MIN_LEN = 40

NUMBER_OF_TRACES = 1000

In [150]:
# Load the complete raw CSV file into a single pandas dataframe

raw_df = pd.read_csv(filepath_or_buffer=RAW_DATA_FILE)

In [151]:
# Run this cell to clear the chosen files in the PORTO folder.
# Nothing happens when SHOULD_DELETE_OLD_FILES is False or when the folder
# does not exist yet (e.g. on a fresh checkout), so the cell never fails on listing.

if SHOULD_DELETE_OLD_FILES and os.path.isdir(OUTPUT_FOLDER):
    for filename in os.listdir(OUTPUT_FOLDER):
        file_path = os.path.join(OUTPUT_FOLDER, filename)
        try:
            # Regular files and symlinks are unlinked; real sub-directories are removed recursively
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the entry that could not be removed and carry on with the rest
            print("Failed to remove %s. Reason: %s" % (file_path, e))



In [152]:

# This cell reads the data and writes one file per accepted trajectory to the output directory.
# A trace is accepted when it is not flagged as missing data, contains at least MIN_LEN points
# and every point lies inside the rectangle [MIN_LON, MAX_LON] x [MIN_LAT, MAX_LAT].
# Extraction stops as soon as NUMBER_OF_TRACES files have been written.

# Make sure the destination exists so that writing does not fail on a fresh checkout
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

counter = 0

for _, row in raw_df.iterrows():
    trace_id = row["TRIP_ID"]

    # If row is missing data: ignore row
    if row["MISSING_DATA"] == True:
        if LOG: print(trace_id, "is missing data")
        continue

    # Strip the two leading and two trailing bracket characters, then split into "lon,lat" items
    trace = row["POLYLINE"][2:-2].split("],[")

    # If trace length is less than MIN_LEN: ignore row
    if len(trace) < MIN_LEN:
        if LOG: print(trace_id, "is less than preferred length")
        continue

    # Split every point once; the raw strings are kept so that the coordinates
    # are written to file exactly as they appear in the CSV
    points = [coordinate.split(",") for coordinate in trace]

    # If any point is outside the bounded rectangle (longitude OR latitude): ignore row
    inside_rectangle = all(
        MIN_LON <= float(lon) <= MAX_LON and MIN_LAT <= float(lat) <= MAX_LAT
        for lon, lat in points
    )
    if not inside_rectangle:
        if LOG: print(trace_id, "is outside bounded rectangle")
        continue

    # Everything is good --> write trajectory to file (the with-block closes the file)
    with open(os.path.join(OUTPUT_FOLDER, f"{trace_id}.txt"), "w") as file:
        for lon, lat in points:
            file.write("%s, %s\n" % (lon, lat))

    counter += 1

    if counter >= NUMBER_OF_TRACES:
        break

print("Number of trajectories written to file:", counter)

Number of trajectories written to file: 1000
