In [1]:
import numpy as np
import pandas as pd
import itertools
from datetime import datetime
from ast import literal_eval
import argparse

In [4]:
# Run configuration. These values were originally exposed as argparse options
# (--dataset_directory, default below; --window_size, default 15) and are
# fixed here so the notebook runs without command-line arguments.
dataset_directory = "/local/data1/users/anadiri/collision/"  # joined by plain concatenation, so keep the trailing "/"
window_size = 15  # size of the collision window; other candidates: 30, 60, 120, 300
dataset_name = "rome_res7.csv"  # other candidates: "rome_res8.csv", "rome_res9.csv"
time_interval = 7  # The time interval between each record in the dataset

dataset_path = dataset_directory + dataset_name  # Construct the full dataset path

# Load the dataset, parsing only the columns the pipeline uses. `usecols`
# returns columns in file order, so re-select to pin the expected order.
kept_columns = ['taxi_id', 'timestamp', 'count']
data = pd.read_csv(dataset_path, usecols=kept_columns)[kept_columns]


In [5]:

# Normalize timestamps to start from 0: parse each timestamp string into POSIX
# seconds, then subtract the earliest value so the first record sits at 0.
timestamp_format = "%Y-%m-%d %H:%M:%S.%f%z"
epoch_seconds = data['timestamp'].map(
    lambda stamp: datetime.strptime(stamp, timestamp_format).timestamp())
# Series.min() is vectorized; the built-in min() would iterate in Python.
start_time = epoch_seconds.min()
data['timestamp'] = epoch_seconds - start_time

In [6]:

# Function to process each row, expanding 'count' into time windows
def process(row):
    """Expand one record into a list of ``(location, window)`` tuples.

    ``row['count']`` is a string literal of a list whose entries hold a
    location at index 0 and a repeat count at index 1. Each repeat is treated
    as one sample, spaced ``time_interval`` apart starting at
    ``row['timestamp']``. Samples are bucketed into windows of ``window_size``;
    when several samples land in the same window, the last one wins.

    Reads the module-level ``time_interval`` and ``window_size``.
    """
    start = row['timestamp']
    # One entry per sample: repeat each location as many times as its count.
    locations = [entry[0]
                 for entry in literal_eval(row['count'])
                 for _ in range(entry[1])]
    windows = (int((start + step * time_interval) / window_size)
               for step in range(len(locations)))
    # Later samples overwrite earlier ones for the same window; dict order
    # keeps windows in first-seen order.
    latest_by_window = {w: (loc, w) for w, loc in zip(windows, locations)}
    return list(latest_by_window.values())


In [7]:

# Replace each record's 'count' with its list of (location, window) tuples
windowed = data.apply(process, axis=1)

# Give every tuple its own row and renumber the index from 0
data = data.assign(count=windowed).explode('count', ignore_index=True)


In [8]:
# Split each (location, window) tuple into 'location' and 'time' columns,
# discard the columns that are no longer needed, and store taxi IDs as ints
data = (
    data
    .assign(location=data['count'].map(lambda pair: pair[0]),
            time=data['count'].map(lambda pair: pair[1]))
    .drop(columns=['count', 'timestamp'])
    .astype({'taxi_id': int})
)

In [9]:
# Group by time and location to identify potential collisions.
# Several records of the same taxi can land in the same (time, location)
# group; drop those repeats first so each list holds distinct taxi IDs and a
# taxi is never counted as colliding with itself.
data = (
    data
    .drop_duplicates(subset=['time', 'location', 'taxi_id'])
    .groupby(['time', 'location'])
    .agg(list)
    .reset_index()
)

In [10]:

# Keep only the (time, location) groups whose taxi list has more than one entry
has_multiple = data['taxi_id'].map(len).gt(1)
data = data[has_multiple]
data

Unnamed: 0,time,location,taxi_id
2,0,871e80501ffffff,"[187, 197, 224]"
3,0,871e80503ffffff,"[248, 368]"
5,0,871e80505ffffff,"[37, 105, 79, 291, 352]"
9,0,871e80513ffffff,"[343, 321]"
14,0,871e80528ffffff,"[122, 58]"
...,...,...,...
1613803,172814,871e8052affffff,"[42, 22, 113]"
1613807,172815,871e8052affffff,"[42, 22, 113]"
1613811,172816,871e8052affffff,"[42, 22, 113]"
1613815,172817,871e8052affffff,"[42, 22, 113]"


In [11]:


# Generate pairs of taxi IDs for each potential collision.
# Every 2-combination of a group's taxi IDs becomes one pair, stored as
# (smaller_id, larger_id) so the same two taxis always give the same tuple.
# `assign` builds a new frame instead of writing a column into the filtered
# frame, which can emit SettingWithCopyWarning on pandas versions without
# copy-on-write.
data = data.assign(
    pairs=data['taxi_id'].map(
        lambda ids: [tuple(sorted(pair)) for pair in itertools.combinations(ids, 2)]))

In [19]:

# Summarize the generated pairs without touching `data`.
# 'pairs' holds a list of (smaller_id, larger_id) tuples per row; flatten it
# into one tuple per entry on a local Series so this cell works on a fresh
# kernel and leaves the frame intact for the export cell.
pair_occurrences = data['pairs'].explode()
# Keep the larger taxi ID of each pair.
larger_ids = pair_occurrences.map(max)
# (number of distinct larger IDs, total number of pair occurrences)
larger_ids.nunique(), larger_ids.shape[0]

(314, 15885801)

In [None]:
# Remove the helper 'pairs' column, report the window size and row count,
# then write the collision table into the dataset directory
data = data.drop(columns='pairs')
print(window_size, len(data))
# Finalize the dataset and write to a new CSV file
output_path = f"{dataset_directory}collision_{window_size}_{dataset_name}"
data.to_csv(output_path, index=False)
