In [58]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import numpy as np
from collections import defaultdict
from datetime import timedelta, datetime
import math
from pprint import pprint

def get_step_trip_list(
    path, step=15, earliest_step=0, max_steps=None, resize_factor=1
):
    """Read trip csv file and return list of trips for each time step.
    The list of trips is saved in a .npy file in the same directory
    of the .csv for fast processing.

    Parameters
    ----------
    path : str
        Trip list .csv file
    step : int, optional
        Time step (min) to aggregate trips, by default 15
    earliest_step : int, optional
        Trip list starts from earliest step, by default 0
    max_steps : int, optional
        Total number of steps from earliest step, by default None
    resize_factor: float
        Percentage of trips sampled in each time step. Used to create
        smaller test cases.

    Returns
    -------
    list if trip info list
        List of trip tuples (time, count, o, d) occuring in each time
        step. 
    """
    
    # Processed trip data (list of trips) is saved in a .npy file
    # for faster reading
    path_npy = (
        f"{path.split('.')[0]}_"
        f"increment={step:02}min_"
        f"earlieststep={earliest_step:02}_"
        f"maxsteps={(f'{max_steps:02}' if max_steps else '--')}_"
        f"resize={resize_factor:02}.npy"
    )

    
    try:
        print(f"Trying to load processed trip data from '{path_npy}'")
        t1 = time.time()
        step_trip_list = np.load(path_npy)
        print(f"Trip list loaded (took {time.time() - t1:10.6f} seconds)")
    
    except:
        print(f"Loading .npy failed. Processing trip data...")
        t1 = time.time()
        df = pd.read_csv(path, index_col="pickup_datetime", parse_dates=True)

        # List of list of trip info (time, passenger count, o_id, d_id)
        step_trip_list = []

        # Time increment
        step_timedelta = timedelta(minutes=step)

        # Earliest time window
        from_datetime = df.index[0]

        # Earliest time
        from_datetime = from_datetime + earliest_step * step_timedelta
        limit_datetime = df.index[-1]

        if max_steps:
            limit_datetime = from_datetime + max_steps * step_timedelta

        while True:
            # Right time window
            to_datetime = from_datetime + step_timedelta
            df_slice = df[from_datetime:to_datetime]

            # Trips associated to timestep
            trip_list = []

            placement_first = df_slice.index[0]
            for i in range(0, len(df_slice) - 1):
                # What time trip has arrived into the system
                placement_time = df_slice.index[i]

                # How many passengers
                passenger_count = df_slice.iloc[i]["passenger_count"]

                # Origin id
                pk_id = df_slice.iloc[i]["pk_id"]

                # Destination id
                dp_id = df_slice.iloc[i]["dp_id"]

                # Trip info tuple is added to step
                trip_list.append(
                    (placement_time, int(passenger_count), int(pk_id), int(dp_id))
                )

            # Update time windows
            from_datetime = to_datetime

            # Sample trips in step
            if resize_factor < 1:
                sample_size = math.ceil(resize_factor * len(trip_list))
                trip_list = random.sample(trip_list, k=sample_size)

            step_trip_list.append(trip_list)
            
            # Finished processing trips
            if from_datetime >= limit_datetime:
                break
        
        print(f"Processed finished {time.time()-t1:10.6f} seconds. Saving...")
        t2 = time.time()
        np.save(path_npy, step_trip_list)
        print(f"Saved in {time.time()-t2:10.6f} seconds.")
    
    return step_trip_list

path = "C:/Users/LocalAdmin/OneDrive/leap_forward/phd_project/reb/code/mod/data/input/nyc/trips_2011-01-04-enriched.csv"
trips = get_step_trip_list(path)
print(trips[0])

Trying to load processed trip data from 'C:/Users/LocalAdmin/OneDrive/leap_forward/phd_project/reb/code/mod/data/input/nyc/trips_2011-01-04-enriched_increment=15min_earlieststep=00_maxsteps=--_resize=01.npy'
Trip list loaded (took   0.909417 seconds)
[(Timestamp('2011-01-04 00:00:00'), 1, 1363, 3139), (Timestamp('2011-01-04 00:00:00'), 5, 35, 1691), (Timestamp('2011-01-04 00:00:00'), 1, 1671, 1221), (Timestamp('2011-01-04 00:00:00'), 2, 3850, 4998), (Timestamp('2011-01-04 00:00:00'), 2, 1942, 2448), (Timestamp('2011-01-04 00:00:00'), 5, 1287, 166), (Timestamp('2011-01-04 00:00:00'), 1, 1722, 5090), (Timestamp('2011-01-04 00:00:00'), 2, 2694, 3193), (Timestamp('2011-01-04 00:00:00'), 1, 365, 2045), (Timestamp('2011-01-04 00:00:00'), 1, 3778, 2210), (Timestamp('2011-01-04 00:00:00'), 1, 1728, 728), (Timestamp('2011-01-04 00:00:00'), 1, 3076, 3710), (Timestamp('2011-01-04 00:00:00'), 1, 4251, 3153), (Timestamp('2011-01-04 00:00:00'), 4, 4239, 209), (Timestamp('2011-01-04 00:00:00'), 2, 39

In [11]:
d = datetime.strptime("2011-02-01 00:00", "%Y-%m-%d %H:%M")

a = {1:[3,4,5], 2:[1,2]}
np.save("dic.npy", a)

(Timestamp('2011-01-04 00:15:00'), 3, 523, 3159), (Timestamp('2011-01-04 00:15:00')

In [14]:
b = np.load("dic.npy")
b.item()

{1: [3, 4, 5], 2: [1, 2]}

In [33]:
times = [[(datetime.strptime('2011-01-04 00:15:00', "%Y-%m-%d %H:%M:%S"), 1, 2505, 2841), (datetime.strptime('2011-01-04 00:15:00', "%Y-%m-%d %H:%M:%S"), 1, 3046, 4907), (datetime.strptime('2011-01-04 00:15:00', "%Y-%m-%d %H:%M:%S"), 1, 3046, 4901)], [(datetime.strptime('2011-01-04 00:15:00', "%Y-%m-%d %H:%M:%S"), 1, 2505, 2841), (datetime.strptime('2011-01-04 00:15:00', "%Y-%m-%d %H:%M:%S"), 1, 3046, 4908)]]
np.save("dic.npy", times)

In [35]:
b = np.load("dic.npy")
for step, trips in enumerate(b):
    for t in trips:
        time, a, b, c = t
        print(step, time, a, b, c)


0 2011-01-04 00:15:00 1 2505 2841
0 2011-01-04 00:15:00 1 3046 4907
0 2011-01-04 00:15:00 1 3046 4901
1 2011-01-04 00:15:00 1 2505 2841
1 2011-01-04 00:15:00 1 3046 4908


In [38]:
import time

t1 = time.time()
b = np.load("dic.npy")
print(f"Loading all trips took {time.time()-t1}.")

Loading all trips took 0.9605541229248047.


In [55]:
path = "C:/Users/LocalAdmin/OneDrive/leap_forward/phd_project/reb/code/mod/data/input/nyc/trips_2011-01-04-enriched.csv"
step=15
earliest_step=0
max_steps=5
resize_factor=1
step_trip_path = (
    f"{path.split('.')[0]}_"
    f"increment={step:02}min_"
    f"earlieststep={earliest_step:02}_"
    f"maxsteps={(f'{max_steps:02}' if max_steps else '--')}_"
    f"resize={resize_factor:02}.npy"
)
step_trip_path

'C:/Users/LocalAdmin/OneDrive/leap_forward/phd_project/reb/code/mod/data/input/nyc/trips_2011-01-04-enriched_increment=15min_earlieststep=00_maxsteps=05_resize=01.npy'