# Sheet for extracting raw data from the Porto dataset

In [2]:
# Importing necessary modules

import pandas as pd
from tqdm import tqdm
import os, shutil

# From utils
from utils.alphabetical_incrementer import increment_alphabetical


In [11]:
# Variables and constants used throughout this notebook

# Input CSV and the folder that receives one text file per chosen trajectory
RAW_DATA_FILE = "../data/raw_data/train.csv"
OUTPUT_FOLDER = "../data/chosen_data/porto/"

# When True, the clean-up cell empties OUTPUT_FOLDER before extraction
SHOULD_DELETE_OLD_FILES = True

# Set to True for printing during data extraction
LOG = True

# Bounding rectangle: a trace is kept only if every point lies inside it
MIN_LON, MAX_LON = -8.66, -8.57
MIN_LAT, MAX_LAT = 41.14, 41.19

# Minimum number of points a trace must have to be kept
MIN_LEN = 40

# Extraction stops once this many traces have been written
NUMBER_OF_TRACES = 1000

In [4]:
# Load the raw CSV into a dataframe.
# The extraction cell reads the TRIP_ID, MISSING_DATA and POLYLINE columns.

raw_df = pd.read_csv(filepath_or_buffer=RAW_DATA_FILE)

In [12]:
# Run this cell to clear the chosen files in the PORTO folder

# Skip the clean-up when the folder does not exist yet: os.listdir would
# otherwise raise FileNotFoundError on a first run.
if SHOULD_DELETE_OLD_FILES and os.path.isdir(OUTPUT_FOLDER):
    for filename in os.listdir(OUTPUT_FOLDER):
        file_path = os.path.join(OUTPUT_FOLDER, filename)
        try:
            # Files and symlinks are unlinked; real directories are removed recursively
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            # Best effort: report the entry that could not be removed and keep going
            print("Failed to remove %s. Reason: %s" % (file_path, e))

In [13]:
# This cell reads the data and generates a file for each accepted trajectory in the output directory.
# A trace is accepted when it is not flagged as missing data, has at least MIN_LEN points and
# every point lies inside the [MIN_LON, MAX_LON] x [MIN_LAT, MAX_LAT] rectangle.

counter = 0  # Number of trajectories written so far
name_counter = "AAA"  # Suffix of the next output file name, advanced after each write

for index, row in raw_df.iterrows():
    trace_id = row["TRIP_ID"]

    # If row is missing data: ignore row
    if row["MISSING_DATA"] == True:
        if LOG:
            print(trace_id, "is missing data")
        continue

    # Drop the two leading and two trailing characters, then split on "],["
    # (assumes POLYLINE looks like "[[lon,lat],[lon,lat],...]" -- TODO confirm)
    trace = row["POLYLINE"][2:-2].split("],[")

    # If trace length is less than MIN_LEN: ignore row
    if len(trace) < MIN_LEN:
        if LOG:
            print(trace_id, "is less than preferred length")
        continue

    # Validate every point and collect the output lines in the same pass,
    # so each coordinate string is split only once.
    output_lines = []
    for coordinate in trace:
        lon, lat = coordinate.split(",")
        if not (MIN_LON <= float(lon) <= MAX_LON and MIN_LAT <= float(lat) <= MAX_LAT):
            # If any point is outside the bounded rectangle: ignore row
            if LOG:
                print(trace_id, "is outside bounded rectangle")
            break
        # The original coordinate text is written unchanged (no float round-trip)
        output_lines.append("%s, %s\n" % (lon, lat))
    else:
        # for/else: reached only when no point triggered the break above,
        # i.e. the whole trace is inside the rectangle --> write trajectory to file
        with open(os.path.join(OUTPUT_FOLDER, f"P_{name_counter}.txt"), "w") as file:
            file.writelines(output_lines)

        counter += 1
        name_counter = increment_alphabetical(name_counter)

        # Stop reading rows once enough trajectories have been written
        if counter >= NUMBER_OF_TRACES:
            break

print("Number of trajectories written to file:", counter)

1372636858620000589 is less than preferred length
1372637303620000596 is less than preferred length
1372637091620000337 is less than preferred length
1372636965620000231 is less than preferred length
1372637210620000456 is less than preferred length
1372637299620000011 is less than preferred length
1372637274620000403 is less than preferred length
1372637905620000320 is less than preferred length
1372636875620000233 is less than preferred length
1372637984620000520 is outside bounded rectangle
1372637343620000571 is less than preferred length
1372638595620000233 is less than preferred length
1372638151620000231 is less than preferred length
1372639135620000570 is less than preferred length
1372637482620000005 is less than preferred length
1372639181620000089 is less than preferred length
1372638161620000423 is less than preferred length
1372637254620000657 is outside bounded rectangle
1372638502620000320 is less than preferred length
1372639960620000309 is less than preferred length
13