<a href="https://colab.research.google.com/github/Yasharzf/NGSIM_preprocess/blob/master/Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
""" Colab notebook to preprocess NGSIM dataset to extract leading and following
vehicle information. 
"""
import numpy as np
import pandas as pd
import bisect
import json
import time 

In [0]:
#######################  Loading  NGSIM ########################
def load_data(download=True, path_to_file=None):
    """Load the NGSIM trajectory dataset into a pandas dataframe.

    Google Drive is mounted at ``/content/drive`` on every call, whatever the
    value of ``download``.

    Parameters
    ----------
    download : boolean
        - True: the NGSIM csv is downloaded with ``wget``; ``path_to_file``
        is then overwritten with ``'/content/'``.
        - False: the csv is read from ``path_to_file``, which is required.
    path_to_file : str
        Directory prefix that is concatenated directly with the csv file
        name, so it must end with a path separator (e.g. ``'/content/'``).
        Required when ``download`` is False.

    Returns
    -------
    data : pandas.DataFrame
        NGSIM dataset with rows duplicated on (Vehicle_ID, Global_Time)
        removed (first occurrence kept).

    Raises
    ------
    ValueError
        If ``download`` is False and ``path_to_file`` is None.
    subprocess.CalledProcessError
        If ``wget`` exits with a non-zero status.
    """
    import subprocess

    # Fail fast, before mounting Drive, instead of hitting an opaque
    # ``None + str`` TypeError further down.
    if not download and path_to_file is None:
        raise ValueError("path_to_file is required when download is False")

    print("#### Mounting google colab ###########")
    from google.colab import drive
    drive.mount('/content/drive')

    if download:
        print("#### Downloading NGSIM dataset #######")
        # Plain-Python replacement for the IPython ``!wget`` escape. Passing
        # an argument list also keeps the ``?`` in the URL away from shell
        # globbing.
        subprocess.run(
            [
                "wget",
                "--content-disposition",
                "https://data.transportation.gov/api/views/8ect-6jqj/rows.csv?accessType=DOWNLOAD",
            ],
            check=True,
        )
        # NOTE(review): wget writes into the current working directory; this
        # assumes that directory is /content (the Colab default) - confirm.
        path_to_file = '/content/'

    data = pd.read_csv(path_to_file + "Next_Generation_Simulation__NGSIM__Vehicle_Trajectories_and_Supporting_Data.csv")

    # drop duplicate, RAW NGSIM HAS DUPLICATED ROWS
    # example: data[(data.Vehicle_ID==515) & (data.Global_Time==1118848075000)]
    data = data.drop_duplicates(subset=['Vehicle_ID', 'Global_Time'])

    return data

In [0]:
###################### Lead Info ################################

def convert_to_dict(data):
    """Build a nested lookup dictionary from the NGSIM dataframe.

    Parameters:
    -----------
    data : pandas.Dataframe
        pandas dataframe containing NGSIM dataset. The columns Vehicle_ID,
        Global_Time, Local_X, Local_Y, v_length, v_Vel, v_Acc, Lane_ID and
        Space_Headway are read.

    Returns:
    --------
    data_dict : dict
        Two-level dictionary: ``data_dict[Vehicle_ID][Global_Time]`` is the
        tuple ``(Local_X, Local_Y, v_length, v_Vel, v_Acc, Lane_ID,
        Space_Headway)`` of that row.
    """
    print("=== Coverting relavent data to dictionary ===")

    # Order matters: consumers unpack the tuple positionally.
    record_fields = (
        "Local_X",
        "Local_Y",
        "v_length",
        "v_Vel",
        "v_Acc",
        "Lane_ID",
        "Space_Headway",
    )

    data_dict = {}
    for index, row in data.iterrows():
        # Progress report keyed on the dataframe index label.
        if index % 50000 == 0:
            print('index: ', index)

        per_vehicle = data_dict.setdefault(row["Vehicle_ID"], {})
        per_vehicle[row["Global_Time"]] = tuple(
            row[field] for field in record_fields)

    return data_dict


def load_data_dict(data=None, convert=False, path_to_file=None):
    """Return the NGSIM lookup dictionary, building or loading it.

    Parameters
    ----------
    data : pandas.Dataframe
        pandas dataframe containing NGSIM dataset. Only used when convert
        is True.
    convert : boolean
        - True: build the dictionary from ``data`` with convert_to_dict and
        save it as json to ``path_to_file``.
        - False: load the dictionary from the json file at ``path_to_file``.
    path_to_file : str
        path to the json file (written when convert is True, read otherwise)

    Returns
    -------
    data_dict : dict
        a dictionary containing NGSIM data. When convert is True this is the
        in-memory dictionary as built; when loaded from json, the object keys
        are strings (json object keys are always strings).
    """
    # Load branch first: nothing to build, just read the file back.
    if not convert:
        with open(path_to_file, 'r') as f:
            return json.load(f)

    data_dict = convert_to_dict(data)
    serialized = json.dumps(data_dict)
    with open(path_to_file, 'w') as f:
        f.write(serialized)
    return data_dict


def _lookup_vehicle_record(data_dict, vehicle_id, global_time):
    """Return the record of ``vehicle_id`` at ``global_time``, or None."""
    # Keys are looked up in their string form, as produced by a json
    # round-trip of the dictionary.
    per_vehicle = data_dict.get('{}'.format(vehicle_id))
    if per_vehicle is None:
        return None
    return per_vehicle.get('{}'.format(global_time))


def collect_lead_lag_info(data, data_dict_lead_info):
    """Add leading and following vehicle columns to the dataframe.

    For every row, the vehicles named in the ``Preceding`` and ``Following``
    columns are looked up at the row's ``Global_Time``. Rows whose
    ``Preceding``/``Following`` is 0, or whose neighbour has no record at
    that time, are left without values for the corresponding columns.

    Parameters
    ----------
    data : pandas.Dataframe
        pandas dataframe containing NGSIM dataset. It is modified in place.
    data_dict_lead_info : dict
        Nested dictionary ``{vehicle id: {global time: record}}`` with string
        keys, where a record is the 7-item sequence (Local_X, Local_Y,
        v_length, v_Vel, v_Acc, Lane_ID, Space_Headway).

    Returns
    -------
    data : pandas.Dataframe
        The same dataframe object, with these columns filled where available:
          - Local_X_leader : Local_X of the leading vehicle
          - Local_Y_leader : Local_Y of the leading vehicle
          - v_length_leader : v_length (length) of the leading vehicle
          - v_Vel_leader : v_Vel (speed) of the leading vehicle
          - v_Acc_leader : v_Acc (acceleration) of the leading vehicle
          - Lane_ID_leader : Lane_ID of the leading vehicle
          - Gap_leader : bumper to bumper gap to the leading vehicle
          - Local_X_follower, Local_Y_follower, v_length_follower,
            v_Vel_follower, v_Acc_follower, Lane_ID_follower,
            Space_Headway_follower : same quantities for the following
            vehicle
          - Gap_follower : bumper to bumper gap from the following vehicle
    """
    initial_time = time.time()

    print("=== Collecting lead and lag info ===")

    total_length = data.shape[0]
    for count, (index, data_i) in enumerate(data.iterrows()):
        if count % 50000 == 0:
            print('index: {}, count: {}, of: {}, time_passed: {}'.format(
                    index, count, total_length, time.time()-initial_time))

        # Lead vehicle info
        if data_i.Preceding != 0:
            leader = _lookup_vehicle_record(
                data_dict_lead_info, data_i.Preceding, data_i.Global_Time)
            # A missing record is expected (best effort); anything else is a
            # real error and is no longer silently swallowed.
            if leader is not None:
                x_l, y_l, l_l, v_l, a_l, lane_l, _ = leader
                data.at[index, 'Local_X_leader'] = x_l
                data.at[index, 'Local_Y_leader'] = y_l
                data.at[index, 'v_length_leader'] = l_l
                data.at[index, 'v_Vel_leader'] = v_l
                data.at[index, 'v_Acc_leader'] = a_l
                data.at[index, 'Lane_ID_leader'] = lane_l

                # Compute bumper to bumper gap (space headway minus the
                # leader's length); only for a positive headway.
                if data_i["Space_Headway"] > 0:
                    data.at[index, 'Gap_leader'] = data_i['Space_Headway'] - l_l

        # Following vehicle info
        if data_i.Following != 0:
            follower = _lookup_vehicle_record(
                data_dict_lead_info, data_i.Following, data_i.Global_Time)
            if follower is not None:
                x_f, y_f, l_f, v_f, a_f, lane_f, sh_f = follower
                data.at[index, 'Local_X_follower'] = x_f
                data.at[index, 'Local_Y_follower'] = y_f
                data.at[index, 'v_length_follower'] = l_f
                data.at[index, 'v_Vel_follower'] = v_f
                data.at[index, 'v_Acc_follower'] = a_f
                data.at[index, 'Lane_ID_follower'] = lane_f
                data.at[index, 'Space_Headway_follower'] = sh_f

                # Compute bumper to bumper gap (the follower's space headway
                # minus this vehicle's length).
                if sh_f > 0:
                    data.at[index, 'Gap_follower'] = sh_f - data_i['v_length']
    return data

In [0]:
def main_prepare():
    """Download the NGSIM data and write one csv file per location.

    The full dataset is loaded with ``load_data(download=True)`` and the rows
    of each selected location are saved to ``<location>.csv`` in the current
    working directory.
    """
    print("=== Importing NGSIM data ===")
    ngsim = load_data(download=True)

    # Only a subset of locations is exported, for memory purposes.
    # Locations named in the original list:
    # ['peachtree', 'lankershim', 'us-101', 'i-80']
    for location in ('us-101', 'i-80'):
        print("++++++++++++ Location: {} +++++++++++++++ ".format(location))
        location_rows = ngsim[ngsim.Location == location]
        location_rows.to_csv('{}.csv'.format(location), index=False)

def main_to_dict_lead():
    """Convert each per-location csv into a json lookup dictionary.

    For every location, ``<location>.csv`` is read and handed to
    ``load_data_dict`` with ``convert=True``, which writes
    ``<location>-lead.json``. Nothing is returned.
    """
    for location in ('us-101', 'i-80'):
        print("++++++++++++ Location: {} +++++++++++++++ ".format(location))

        # Per-location dataset written earlier as '<location>.csv'.
        location_data = pd.read_csv("{}.csv".format(location))

        # Build the dictionary and save it as json; the returned dictionary
        # is not needed here.
        json_path = '{}-lead.json'.format(location)
        load_data_dict(location_data, convert=True, path_to_file=json_path)


def main_lead_lag_data():
    """Attach leader/follower information and export the result per location.

    For every location, ``<location>.csv`` and ``<location>-lead.json`` are
    loaded, ``collect_lead_lag_info`` adds the leader and follower columns,
    and the dataframe is written to ``processed_<location>.csv``. Nothing is
    returned.
    """
    for location in ('us-101', 'i-80'):
        print("++++++++++++ Location: {} +++++++++++++++ ".format(location))

        # Inputs saved by the earlier stages.
        print("=== Loading saved information. ===")
        location_data = pd.read_csv("{}.csv".format(location))
        lookup = load_data_dict(
            None, convert=False,
            path_to_file='{}-lead.json'.format(location))

        # lead and following vehicle info
        print("=== Collecting leading and following vehicle info ===")
        location_data = collect_lead_lag_info(location_data, lookup)

        print("=== Exporting processed data ===")
        # FIXME (marker carried over from the original code; not explained)
        location_data.to_csv('processed_{}.csv'.format(location), index=False)

def main_split_data(locations=None, headway_threshold=656, input_dir='',
                    output_dir='/content/drive/My Drive/NGSIM_Aimsun'):
    """Split the data set into free flow and car-following regimes.

    A row is car-following when it has a leading vehicle (``Preceding`` is
    not 0) and its ``Space_Headway`` is below the threshold. A row is free
    flow when its ``Space_Headway`` is at or above the threshold or it has no
    leading vehicle. Both subsets are sorted by vehicle id and time and
    written to csv.

    Parameters
    ----------
    locations : list of str, optional
        Locations to process; ``processed_<location>.csv`` is read for each.
        Defaults to ``['us-101', 'i-80']``.
    headway_threshold : float, optional
        Space headway separating the two regimes, in the units of the
        ``Space_Headway`` column. The default 656 corresponds to 200 m
        expressed in feet.
    input_dir : str, optional
        Directory containing the ``processed_<location>.csv`` files. The
        default ``''`` reads from the current working directory.
    output_dir : str, optional
        Existing directory that receives ``processed_free_flow_<location>.csv``
        and ``processed_car_following_<location>.csv``.
    """
    import os

    if locations is None:
        locations = ['us-101', 'i-80']

    for location in locations:
        print("++++++++++++ Location: {} +++++++++++++++ ".format(location))

        # Load the dataset of the specific location.
        print("=== Loading saved information. ===")
        data = pd.read_csv(
            os.path.join(input_dir, 'processed_{}.csv'.format(location)))

        # Split: free flow if the headway is at least the threshold or if
        # there is no leading vehicle; car following otherwise.
        # NOTE: comparisons with NaN are False, so a row with a leader but a
        # missing Space_Headway lands in neither subset.
        headway = data["Space_Headway"]
        data_cf = data[(headway < headway_threshold) & (data["Preceding"] != 0)]
        data_ff = data[(headway >= headway_threshold) | (data["Preceding"] == 0)]

        # sort data by vehicle id and time
        data_cf = data_cf.sort_values(by=['Vehicle_ID', 'Global_Time'])
        data_ff = data_ff.sort_values(by=['Vehicle_ID', 'Global_Time'])

        print("=== Exporting processed data ===")
        data_ff.to_csv(
            os.path.join(output_dir,
                         'processed_free_flow_{}.csv'.format(location)),
            index=False)
        data_cf.to_csv(
            os.path.join(output_dir,
                         'processed_car_following_{}.csv'.format(location)),
            index=False)

In [7]:
# Stage 1: import the NGSIM data and write the per-location csv files,
# reporting the wall-clock time taken.
t0 = time.time()
main_prepare()
print("Done! Took %.3f seconds" % (time.time() - t0,))

=== Importing NGSIM data ===
#### Mounting google colab ###########
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
++++++++++++ Location: us-101 +++++++++++++++ 
++++++++++++ Location: i-80 +++++++++++++++ 
Done! Took 235.506 seconds


In [8]:
# Stage 2: build and save the json lookup dictionaries, timing the run.
# main_to_dict_lead has no return statement, so `data` is bound to None.
t1 = time.time()
data = main_to_dict_lead()
print("Done! Took %.3f seconds" % (time.time() - t1,))

++++++++++++ Location: us-101 +++++++++++++++ 
=== Coverting relavent data to dictionary ===
index:  0
index:  50000
index:  100000
index:  150000
index:  200000
index:  250000
index:  300000
index:  350000
index:  400000
index:  450000
index:  500000
index:  550000
index:  600000
index:  650000
index:  700000
index:  750000
index:  800000
index:  850000
index:  900000
index:  950000
index:  1000000
index:  1050000
index:  1100000
index:  1150000
index:  1200000
index:  1250000
index:  1300000
index:  1350000
index:  1400000
index:  1450000
index:  1500000
index:  1550000
index:  1600000
index:  1650000
index:  1700000
index:  1750000
index:  1800000
index:  1850000
index:  1900000
index:  1950000
index:  2000000
index:  2050000
index:  2100000
index:  2150000
index:  2200000
index:  2250000
index:  2300000
index:  2350000
index:  2400000
index:  2450000
index:  2500000
index:  2550000
index:  2600000
index:  2650000
index:  2700000
index:  2750000
index:  2800000
index:  2850000
index

In [9]:
# Stage 3: add leader/follower columns and export the processed csv files,
# timing the run. main_lead_lag_data has no return statement, so `data` is
# bound to None.
t2 = time.time()
data = main_lead_lag_data()
print("Done! Took %.3f seconds" % (time.time() - t2,))

++++++++++++ Location: us-101 +++++++++++++++ 
=== Loading saved information. ===
=== Collecting leading and following vehicle info ===
=== Collecting lead and lag info ===
index: 0, count: 0, of: 4098933, time_passed: 3.9248156547546387
index: 50000, count: 50000, of: 4098933, time_passed: 26.326353311538696
index: 100000, count: 100000, of: 4098933, time_passed: 43.08476734161377
index: 150000, count: 150000, of: 4098933, time_passed: 62.79113149642944
index: 200000, count: 200000, of: 4098933, time_passed: 84.45963978767395
index: 250000, count: 250000, of: 4098933, time_passed: 107.28442859649658
index: 300000, count: 300000, of: 4098933, time_passed: 130.3248643875122
index: 350000, count: 350000, of: 4098933, time_passed: 153.52926921844482
index: 400000, count: 400000, of: 4098933, time_passed: 177.49186897277832
index: 450000, count: 450000, of: 4098933, time_passed: 200.78222489356995
index: 500000, count: 500000, of: 4098933, time_passed: 226.34674906730652
index: 550000, cou

In [28]:
# Stage 4: split the processed data into free-flow and car-following csv
# files, timing the run. main_split_data has no return statement, so `data`
# is bound to None.
t3 = time.time()
data = main_split_data()
print("Done! Took %.3f seconds" % (time.time() - t3,))

++++++++++++ Location: us-101 +++++++++++++++ 
=== Loading saved information. ===
=== Exporting processed data ===
++++++++++++ Location: i-80 +++++++++++++++ 
=== Loading saved information. ===
=== Exporting processed data ===
Done! Took 408.911 seconds
