### Preprocess for Dataset
Read the data from the CSV files, process it, split the processed data into training, validation and test sets, and store them in a `.pkl` file.
- Data Cleaning
- Feature Engineering
- Data Transformation

#### Loading Dataset

In [1]:
import sys
import os
import numpy as np
import pickle
import pandas as pd
from IPython.display import clear_output
from matplotlib.animation import FuncAnimation
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

# add current path to system path
sys.path.append(os.getcwd())
# sys.path.append(os.path.dirname(os.path.abspath(__file__)))

In [2]:
# Root directory of the NGSIM datasets.
# "~" is expanded explicitly: os.path.join() does not expand it, so without this
# the paths only work with consumers that expand "~" themselves.
dataset_path = os.path.expanduser('~/hw/cs7641/project/Vehicle-Prediction/dataset')

# Dataset collected on Peachtree Street, Atlanta, GA
peachtree_folder = 'Peachtree-Street-Atlanta-GA'
peachtree_csv = 'NGSIM_Peachtree_Vehicle_Trajectories.csv'
peachtree_csv_path = os.path.join(dataset_path, peachtree_folder, peachtree_csv)

# Dataset collected on US-101, Los Angeles, CA (three consecutive 15-minute recordings)
US101_folder = 'US-101-LosAngeles-CA/us-101-vehicle-trajectory-data/vehicle-trajectory-data'
US101_time1_csv = '0750am-0805am/trajectories-0750am-0805am.csv'
US101_time2_csv = '0805am-0820am/trajectories-0805am-0820am.csv'
US101_time3_csv = '0820am-0835am/trajectories-0820am-0835am.csv'
US101_time1_csv_path = os.path.join(dataset_path, US101_folder, US101_time1_csv)
US101_time2_csv_path = os.path.join(dataset_path, US101_folder, US101_time2_csv)
US101_time3_csv_path = os.path.join(dataset_path, US101_folder, US101_time3_csv)

# NOTE: despite the `I80_*` names, these variables point at the Lankershim
# Boulevard (Los Angeles, CA) dataset. The names are kept unchanged because
# later cells refer to them.
I80_folder = 'Lankershim-Boulevard-LosAngeles-CA'
I80_csv = 'NGSIM__Lankershim_Vehicle_Trajectories.csv'
I80_csv_path = os.path.join(dataset_path, I80_folder, I80_csv)

In [3]:
def read_dataset(dataset_path, folder_path, file_path):
    """
    Load one trajectory CSV file into a DataFrame.

    The file location is built as ``dataset_path/folder_path/file_path``.

    Input:
        dataset_path: str
            root directory that contains every dataset
        folder_path: str
            sub-folder of one dataset (one folder per recording site)
        file_path: str
            CSV file name, relative to ``folder_path``
    Output:
        df: Dataframe
            the parsed content of the CSV file
    """
    return pd.read_csv(os.path.join(dataset_path, folder_path, file_path))

#### Read csv files one by one

In [4]:
# Read one CSV per recording site, in the order Peachtree -> US-101 -> Lankershim.
# Only the first US-101 recording (07:50-08:05) is loaded here.
# NOTE: `I80_df` holds the Lankershim Boulevard data (see `I80_folder`).
peachtree_df, us101_df, I80_df = [
    read_dataset(dataset_path, folder_name, csv_name)
    for folder_name, csv_name in (
        (peachtree_folder, peachtree_csv),
        (US101_folder, US101_time1_csv),
        (I80_folder, I80_csv),
    )
]

#### Data Cleaning

In [5]:
# For each raw frame: drop every row that contains a NaN, then keep a single
# record per (vehicle, timestamp) pair so a vehicle cannot appear twice in the
# same frame (drop_duplicates keeps the first occurrence by default).
peachtree_df = peachtree_df.dropna().drop_duplicates(subset=['Vehicle_ID', 'Global_Time'])
us101_df = us101_df.dropna().drop_duplicates(subset=['Vehicle_ID', 'Global_Time'])
I80_df = I80_df.dropna().drop_duplicates(subset=['Vehicle_ID', 'Global_Time'])

#### Sort Data


In [6]:
# Order each frame by vehicle and then chronologically (ascending is the default).
us101_df = us101_df.sort_values(by=['Vehicle_ID', 'Global_Time'])
# Peachtree additionally uses Lane_ID as the last sort key.
peachtree_df = peachtree_df.sort_values(by=['Vehicle_ID', 'Global_Time', 'Lane_ID'])
I80_df = I80_df.sort_values(by=['Vehicle_ID', 'Global_Time'])

# Preview the sorted frames, then list the lane ids that occur in US-101.
display(us101_df, peachtree_df, I80_df)
print(us101_df['Lane_ID'].unique())

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_Length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Preceeding,Following,Space_Hdwy,Time_Hdwy
0,2,13,437,1118846980200,16.467,35.381,6451137.641,1873344.962,14.5,4.9,2,40.00,0.00,2,0,0,0.00,0.00
1,2,14,437,1118846980300,16.447,39.381,6451140.329,1873342.000,14.5,4.9,2,40.00,0.00,2,0,0,0.00,0.00
2,2,15,437,1118846980400,16.426,43.381,6451143.018,1873339.038,14.5,4.9,2,40.00,0.00,2,0,0,0.00,0.00
3,2,16,437,1118846980500,16.405,47.380,6451145.706,1873336.077,14.5,4.9,2,40.00,0.00,2,0,0,0.00,0.00
4,2,17,437,1118846980600,16.385,51.381,6451148.395,1873333.115,14.5,4.9,2,40.00,0.00,2,0,0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2783,8698,760,1118847848700,55.108,1901.744,6452497.662,1872062.617,17.5,5.4,2,35.64,-3.67,5,2775,2808,51.11,1.43
1048571,2783,8699,760,1118847848800,55.032,1905.296,6452500.375,1872060.416,17.5,5.4,2,35.33,-3.67,5,2775,2808,51.26,1.45
1048572,2783,8700,760,1118847848900,55.044,1908.787,6452503.036,1872058.143,17.5,5.4,2,35.09,-0.48,5,2775,2808,51.49,1.47
1048573,2783,8701,760,1118847849000,55.133,1912.289,6452505.651,1872055.808,17.5,5.4,2,35.05,0.00,5,2775,2808,51.71,1.48


Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,2,5,33,1163019100,34.735,2014.005,2230830.163,1377495.497,15.8,7.2,...,215,5,0,3,1,0,0,0.0,0.0,peachtree
1,2,6,33,1163019200,34.762,2015.005,2230830.037,1377496.489,15.8,7.2,...,215,5,0,3,1,0,0,0.0,0.0,peachtree
2,2,7,33,1163019300,32.789,2016.058,2230827.926,1377497.228,15.8,7.2,...,215,5,0,3,1,0,0,0.0,0.0,peachtree
3,2,8,33,1163019400,29.817,2017.138,2230824.824,1377497.840,15.8,7.2,...,215,5,0,3,1,0,0,0.0,0.0,peachtree
4,2,9,33,1163019500,26.345,2018.231,2230821.226,1377498.389,15.8,7.2,...,215,5,0,3,1,0,0,0.0,0.0,peachtree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873882,1810,2705,36,1163300800,-38.763,2070.137,2230751.588,1377522.631,16.1,6.5,...,215,5,0,4,3,0,0,0.0,0.0,peachtree
873883,1810,2706,36,1163300900,-39.158,2069.871,2230751.239,1377522.308,16.1,6.5,...,215,5,0,4,3,0,0,0.0,0.0,peachtree
873884,1810,2707,36,1163301000,-39.549,2069.609,2230750.892,1377521.989,16.1,6.5,...,215,5,0,4,3,0,0,0.0,0.0,peachtree
873885,1810,2708,36,1163301100,-39.938,2069.350,2230750.548,1377521.674,16.1,6.5,...,215,5,0,4,3,0,0,0.0,0.0,peachtree


Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,...,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
705294,2,1,965,1118936700000,40.583,63.617,6451972.059,1872848.822,15.7,6.4,...,208,1,0,2,3,0,0,0.0,0.0,lankershim
705295,2,2,965,1118936700100,39.190,65.217,6451971.382,1872850.833,15.7,6.4,...,208,1,0,2,3,0,0,0.0,0.0,lankershim
705296,2,3,965,1118936700200,38.296,66.782,6451971.153,1872852.621,15.7,6.4,...,208,1,0,2,3,0,0,0.0,0.0,lankershim
705297,2,4,965,1118936700300,37.402,68.348,6451970.924,1872854.409,15.7,6.4,...,208,1,0,2,3,0,0,0.0,0.0,lankershim
705298,2,5,965,1118936700400,36.508,69.913,6451970.695,1872856.197,15.7,6.4,...,208,1,0,2,3,0,0,0.0,0.0,lankershim
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607314,1603,5510,178,1118937250900,42.529,1273.409,6452476.070,1873947.779,15.2,7.2,...,206,0,4,1,2,0,0,0.0,0.0,lankershim
1607315,1603,5511,178,1118937251000,44.436,1272.763,6452477.624,1873946.615,15.2,7.2,...,206,0,4,1,2,0,0,0.0,0.0,lankershim
1607316,1603,5512,178,1118937251100,46.627,1271.907,6452479.506,1873944.983,15.2,7.2,...,206,0,4,1,2,0,0,0.0,0.0,lankershim
1607317,1603,5513,178,1118937251200,48.079,1270.967,6452480.602,1873943.645,15.2,7.2,...,206,0,4,1,2,0,0,0.0,0.0,lankershim


[2 1 7 6 5 4 3 8]


#### Batch processing of datasets

In [7]:
all_csv_path = [US101_time1_csv_path, US101_time2_csv_path, US101_time3_csv_path, I80_csv_path, peachtree_csv_path]
scenario = ["us101_time1","us101_time2","us101_time3","I80","peachtree"]
df_dict = {}

# Build one preprocessed DataFrame per scenario and store it in df_dict[name].
# NOTE: the 'I80' scenario is read from the Lankershim Boulevard CSV (see I80_csv_path).
for name, csv_path in zip(scenario, all_csv_path):
    # loading data
    df = pd.read_csv(csv_path)

    # Drop the columns we do not need. The datasets do not share the same
    # schema, so the columns are aligned per dataset below.
    # NOTE(review): some column names still differ afterwards (US-101 uses
    # 'v_Length' / 'Space_Hdwy' / 'Time_Hdwy', the other two use 'v_length' /
    # 'Space_Headway' / 'Time_Headway'); they are left as-is because later
    # cells may rely on the current names.
    df = df.drop(['Global_X','Global_Y','Total_Frames','Following'], axis=1)
    if name == 'I80' or name == 'peachtree':
        df = df.drop(['O_Zone','D_Zone','Direction','Preceding'], axis=1)
        df.insert(14, 'Action', 1)       # new feature: 1: maintaining speed, 2: slow down, 3: accelerate; still needs to be updated
    else:
        df = df.drop('Preceeding', axis=1)
        # US101 is a straight highway without intersections, so 'Int_ID',
        # 'Section_ID', 'Movement' and 'Location' are added manually to keep
        # the column layout consistent with the other datasets.
        df.insert(11, 'Int_ID', 0)         # "0" means the vehicle is not in the immediate vicinity of an intersection
        df.insert(12, 'Section_ID', 101)   # 101 is an arbitrary marker for US101 ("no intersection"); "0" would mean the vehicle is inside an intersection
        df.insert(13, 'Movement', 1)       # placeholder value, meant to be updated later
        df.insert(14, 'Action', 1)       # new feature: 1: maintaining speed, 2: slow down, 3: accelerate; still needs to be updated
        df.insert(df.shape[1], 'Location', name)

    # Keep a single record per (vehicle, timestamp) pair
    df = df.drop_duplicates(subset=['Vehicle_ID', 'Global_Time'])

    # Lane filter. The original code called df.query(...) without using the
    # returned frame, so no row was ever removed; the filter is applied here.
    #   1..last_lane : regular lanes
    #   11 and 12    : left-turn lanes
    #   0            : inside an intersection
    # NOTE(review): 'peachtree' keeps lanes 1-2 here while generate_neighbors
    # treats lane 3 as its rightmost lane - confirm which one is intended.
    if name == 'I80':
        first_lane, last_lane, special_lanes = 1, 6, [0, 11, 12]
    elif name == 'peachtree':
        first_lane, last_lane, special_lanes = 1, 2, [0, 11, 12]
    else:
        first_lane, last_lane, special_lanes = 1, 5, []
    lane_ids = df['Lane_ID']
    # .copy() detaches the result from the unfiltered frame so later writes
    # do not act on a slice of it.
    df = df[lane_ids.between(first_lane, last_lane) | lane_ids.isin(special_lanes)].copy()

    # Initialize the column that stores the neighbor grid: one independent dict
    # per row. It is created after filtering so no dict is built for dropped rows.
    # NOTE(review): add_neighbors writes indices base_index + 0..12 with base
    # indices 1, 14 and 27, i.e. up to 39, which is larger than the 33 slots
    # pre-filled here; keys 33-39 are created on demand. Confirm the grid size.
    max_grid_size = 27 + 6  # 33 pre-filled slots
    empty_dict_template = {i: None for i in range(max_grid_size)}
    df.insert(15, 'Grid_Neighbors', [empty_dict_template.copy() for _ in range(len(df))])

    df_dict[name] = df
    display(df)
    

Unnamed: 0,Vehicle_ID,Frame_ID,Global_Time,Local_X,Local_Y,v_Length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Int_ID,Section_ID,Movement,Action,Grid_Neighbors,Space_Hdwy,Time_Hdwy,Location
0,2,13,1118846980200,16.467,35.381,14.5,4.9,2,40.00,0.00,2,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time1
1,2,14,1118846980300,16.447,39.381,14.5,4.9,2,40.00,0.00,2,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time1
2,2,15,1118846980400,16.426,43.381,14.5,4.9,2,40.00,0.00,2,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time1
3,2,16,1118846980500,16.405,47.380,14.5,4.9,2,40.00,0.00,2,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time1
4,2,17,1118846980600,16.385,51.381,14.5,4.9,2,40.00,0.00,2,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,2783,8698,1118847848700,55.108,1901.744,17.5,5.4,2,35.64,-3.67,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",51.11,1.43,us101_time1
1048571,2783,8699,1118847848800,55.032,1905.296,17.5,5.4,2,35.33,-3.67,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",51.26,1.45,us101_time1
1048572,2783,8700,1118847848900,55.044,1908.787,17.5,5.4,2,35.09,-0.48,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",51.49,1.47,us101_time1
1048573,2783,8701,1118847849000,55.133,1912.289,17.5,5.4,2,35.05,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",51.71,1.48,us101_time1


Unnamed: 0,Vehicle_ID,Frame_ID,Global_Time,Local_X,Local_Y,v_Length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Int_ID,Section_ID,Movement,Action,Grid_Neighbors,Space_Hdwy,Time_Hdwy,Location
0,1,270,1118847869000,51.164,112.878,47.0,8.5,3,19.89,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time2
1,1,271,1118847869100,51.153,114.878,47.0,8.5,3,19.89,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time2
2,1,272,1118847869200,51.143,116.878,47.0,8.5,3,19.89,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time2
3,1,273,1118847869300,51.244,118.815,47.0,8.5,3,19.89,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time2
4,1,274,1118847869400,51.234,120.814,47.0,8.5,3,19.89,1.07,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,1914,7369,1118848578900,6.751,949.584,16.5,6.4,2,14.89,0.31,1,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",35.13,2.36,us101_time2
1048571,1914,7370,1118848579000,6.763,951.072,16.5,6.4,2,14.93,0.64,1,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",34.97,2.34,us101_time2
1048572,1914,7371,1118848579100,6.777,952.570,16.5,6.4,2,14.98,0.34,1,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",34.82,2.32,us101_time2
1048573,1914,7372,1118848579200,6.789,954.072,16.5,6.4,2,15.00,0.00,1,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",34.66,2.31,us101_time2


Unnamed: 0,Vehicle_ID,Frame_ID,Global_Time,Local_X,Local_Y,v_Length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Int_ID,Section_ID,Movement,Action,Grid_Neighbors,Space_Hdwy,Time_Hdwy,Location
0,1,137,1118848770700,52.660,73.916,15.0,6.4,2,25.00,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time3
1,1,138,1118848770800,52.648,76.416,15.0,6.4,2,25.00,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time3
2,1,139,1118848770900,52.635,78.915,15.0,6.4,2,25.00,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time3
3,1,140,1118848771000,52.622,81.416,15.0,6.4,2,25.00,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time3
4,1,141,1118848771100,52.609,83.916,15.0,6.4,2,25.00,0.00,5,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.00,0.00,us101_time3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,1317,6107,1118849367700,38.320,162.991,16.0,5.4,2,12.67,-0.03,4,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",48.87,3.86,us101_time3
1048571,1317,6108,1118849367800,38.353,164.261,16.0,5.4,2,12.64,-0.57,4,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",48.53,3.84,us101_time3
1048572,1317,6109,1118849367900,38.393,165.524,16.0,5.4,2,12.57,-0.91,4,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",48.19,3.83,us101_time3
1048573,1317,6110,1118849368000,38.418,166.779,16.0,5.4,2,12.47,-1.22,4,0,101,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",47.86,3.84,us101_time3


Unnamed: 0,Vehicle_ID,Frame_ID,Global_Time,Local_X,Local_Y,v_length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Int_ID,Section_ID,Movement,Action,Grid_Neighbors,Space_Headway,Time_Headway,Location
0,63,3,1118935680200,44.589,67.404,14.2,7.0,2,0.00,0.00,2,1,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
1,63,4,1118935680300,44.589,67.404,14.2,7.0,2,0.00,0.00,0,1,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
2,63,5,1118935680400,44.624,67.904,14.2,7.0,2,0.00,0.00,0,1,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
3,63,6,1118935680500,44.659,68.402,14.2,7.0,2,0.00,0.00,0,1,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
4,63,7,1118935680600,44.140,69.129,14.2,7.0,2,0.00,0.00,0,1,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1607314,1603,5510,1118937250900,42.529,1273.409,15.2,7.2,2,19.52,12.83,0,0,4,2,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
1607315,1603,5511,1118937251000,44.436,1272.763,15.2,7.2,2,20.30,7.89,0,0,4,2,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
1607316,1603,5512,1118937251100,46.627,1271.907,15.2,7.2,2,20.30,0.00,0,0,4,2,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim
1607317,1603,5513,1118937251200,48.079,1270.967,15.2,7.2,2,20.30,0.00,0,0,4,2,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,lankershim


Unnamed: 0,Vehicle_ID,Frame_ID,Global_Time,Local_X,Local_Y,v_length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,Int_ID,Section_ID,Movement,Action,Grid_Neighbors,Space_Headway,Time_Headway,Location
0,2,5,1163019100,34.735,2014.005,15.8,7.2,2,0.00,0.00,2,5,0,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
1,2,6,1163019200,34.762,2015.005,15.8,7.2,2,0.00,0.00,0,5,0,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
2,2,7,1163019300,32.789,2016.058,15.8,7.2,2,0.00,0.00,0,5,0,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
3,2,8,1163019400,29.817,2017.138,15.8,7.2,2,0.00,0.00,0,5,0,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
4,2,9,1163019500,26.345,2018.231,15.8,7.2,2,0.00,0.00,0,5,0,1,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873882,1810,2705,1163300800,-38.763,2070.137,16.1,6.5,2,4.78,-0.52,0,5,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
873883,1810,2706,1163300900,-39.158,2069.871,16.1,6.5,2,4.73,-0.51,0,5,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
873884,1810,2707,1163301000,-39.549,2069.609,16.1,6.5,2,4.73,0.00,0,5,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree
873885,1810,2708,1163301100,-39.938,2069.350,16.1,6.5,2,4.73,0.00,0,5,0,3,1,"{0: None, 1: None, 2: None, 3: None, 4: None, ...",0.0,0.0,peachtree


#### Feature Engineering
- Neighbors: We need to obtain neighbor vehicles in the lanes to the left and right of the ego vehicle or in the same lane within the next 15 seconds and the past 5 seconds.
- Vehicle Behavior: Lane change to the right;  Lane change to the left; Lane Keeping; accelerate; decelerate; maintaining speed
- The vehicle's turning angle (theta) : We may not be able to obtain this feature accurately (it may not be used)

In [8]:
def generate_movement(df, veh_traj, current_idx, window=40):
    """
    Update the 'Movement' label of one row from the lane ids around it.

    The US101 dataset has no movement information, so it is derived by
    comparing the lane at the current row with the lane `window` rows later and
    `window` rows earlier in the same trajectory (clamped to the trajectory
    ends). Frames are 0.1 s apart, so the default of 40 rows covers the next
    and the past 4 s.

    Rows are located by position inside `veh_traj`, not by adding to the index
    label, so the lookup still works when the trajectory index has gaps (for
    example after duplicate or lane filtering).

    Input:
        df: Dataframe that owns the 'Movement' column; modified in place
        veh_traj: trajectory of a single vehicle, ordered by time, containing
            'Lane_ID'; its index must be unique and contain `current_idx`
        current_idx: index label of the row to update
        window: number of rows to look ahead and behind
    Output:
        df: the same Dataframe, with 'Movement' set to
            3 (turning right), 2 (turning left) or 1 (lane keeping)
    """
    lane_ids = veh_traj['Lane_ID']
    position = veh_traj.index.get_loc(current_idx)
    last_position = len(veh_traj) - 1

    current_lane = lane_ids.iloc[position]
    future_lane = lane_ids.iloc[min(position + window, last_position)]
    past_lane = lane_ids.iloc[max(position - window, 0)]

    # A larger lane id is treated as "further right". The right-turn test is
    # evaluated first, so it wins when both directions would match.
    if future_lane > current_lane or current_lane > past_lane:
        movement = 3  # turning right
    elif future_lane < current_lane or current_lane < past_lane:
        movement = 2  # turning left
    else:
        movement = 1  # lane keeping

    df.at[current_idx, 'Movement'] = movement
    return df

In [9]:
def add_neighbors(df, frame, ego_local_x, ego_local_y, current_idx, base_index, intersection = False):
    """
    Write the vehicles of `frame` into the neighbor grid of the ego row.

    The grid is the dict stored in df.at[current_idx, 'Grid_Neighbors'] and is
    updated in place. Only vehicles whose longitudinal distance to the ego is
    strictly below 90 are considered; the slot is
    ``lane_base + round((y_diff + 90) / 15)``, i.e. lane_base + 0..12.
    A later vehicle that maps to an already used slot overwrites it.

    Lane mode (intersection=False): every vehicle uses `base_index` as lane_base.
    Intersection mode (intersection=True): `base_index` is ignored and the lane
    base is chosen from the lateral offset to the ego:
        |x_diff| < 5        -> 14 (same lane)
        -13 < x_diff <= -5  -> 1  (left)
        5 <= x_diff < 13    -> 27 (right)
    Vehicles outside these bands are skipped.

    NOTE(review): with lane base 27 the slot index can reach 39, which is above
    the 33 slots pre-filled when 'Grid_Neighbors' is created - confirm the
    intended grid size.

    Input:
        df: the data frame of the current dataset
        frame: vehicles to add (needs 'Vehicle_ID' and 'Local_Y', plus
            'Local_X' in intersection mode); may be None or empty
        ego_local_x: lateral position of the ego vehicle
        ego_local_y: longitudinal position of the ego vehicle
        current_idx: the row index of the ego vehicle in df
        base_index: first grid slot of the lane in lane mode
        intersection: select intersection mode
    Output:
        df: the same data frame
    """
    if frame is None or frame.empty:
        return df

    grid = df.at[current_idx, 'Grid_Neighbors']
    vehicle_ids = frame['Vehicle_ID']
    y_diffs = frame['Local_Y'] - ego_local_y

    if not intersection:
        for vehicle_id, y_diff in zip(vehicle_ids, y_diffs):
            if abs(y_diff) < 90:
                grid[int(base_index + round((y_diff + 90) / 15))] = vehicle_id
        return df

    x_diffs = frame['Local_X'] - ego_local_x
    for vehicle_id, x_diff, y_diff in zip(vehicle_ids, x_diffs, y_diffs):
        if not abs(y_diff) < 90:
            continue

        if abs(x_diff) < 5:
            lane_base = 14
        elif -13 < x_diff <= -5:
            lane_base = 1
        elif 5 <= x_diff < 13:
            # The original condition was `x_diff > 5 and x_diff < -13`, which
            # can never be true, so right-hand neighbors were never stored.
            lane_base = 27
        else:
            continue

        grid[int(lane_base + round((y_diff + 90) / 15))] = vehicle_id

    return df

def generate_neighbors(df, name, veh_time, ego_lane_id, ego_intersection_id, ego_local_x, ego_local_y, current_idx):
    """
    Feature Engineering 1: generate Neighbors

    Selects, among the vehicles present at the same timestamp, those in the
    lane left of the ego, in the ego lane and in the lane right of the ego,
    and passes each group to add_neighbors with grid base index 1, 14 and 27
    respectively. When the ego is inside an intersection (lane id 0) all
    vehicles inside the same intersection are passed in a single call and
    add_neighbors assigns them by lateral offset.

    Lane conventions used here: lane 1 is the leftmost regular lane; lanes 11
    and 12 are turn lanes located left of lane 1 (11 left of 12); the rightmost
    regular lane is 6 for 'I80', 3 for 'peachtree' and 5 otherwise.

    Input:
        df: the data frame of current dataset
        name: scenario name, used to pick the rightmost lane
        veh_time: Dataframe with all vehicles present at the ego's timestamp
        ego_lane_id: current lane id of ego
        ego_intersection_id: if ego is inside an intersection, the id of that intersection
        ego_local_x: lateral distance
        ego_local_y: the longitudinal distance
        current_idx: the row idx of ego in df
    Output:
        df
    """
    lane_ids = veh_time['Lane_ID']

    def lane_frame(lane_id):
        # all vehicles of this timestamp that drive in `lane_id`
        return veh_time[lane_ids == lane_id]

    if ego_lane_id == 0:  # intersection
        # Each comparison needs its own parentheses: `&` binds tighter than
        # `==`, so the unparenthesised form raises "truth value of a Series
        # is ambiguous".
        same_intersection = (lane_ids == 0) & (veh_time['Int_ID'] == ego_intersection_id)
        return add_neighbors(df, veh_time[same_intersection], ego_local_x, ego_local_y,
                             current_idx, 1, intersection = True)

    farthest_left_lane = 1
    if name == 'I80':
        farthest_right_lane = 6
    elif name == 'peachtree':
        farthest_right_lane = 3
    else:
        farthest_right_lane = 5

    ego_frame = lane_frame(ego_lane_id)
    left_frame = None
    right_frame = None

    if ego_lane_id > farthest_left_lane and ego_lane_id < farthest_right_lane:
        left_frame = lane_frame(ego_lane_id - 1)
        right_frame = lane_frame(ego_lane_id + 1)
    elif ego_lane_id == 1:
        # left of lane 1 is turn lane 12, or 11 when nobody is in lane 12
        left_frame = lane_frame(12)
        if left_frame.empty:
            left_frame = lane_frame(11)
        right_frame = lane_frame(ego_lane_id + 1)
    elif ego_lane_id == 11:
        # right of lane 11 is turn lane 12, or lane 1 when nobody is in lane 12
        right_frame = lane_frame(12)
        if right_frame.empty:
            right_frame = lane_frame(1)
    elif ego_lane_id == 12:
        left_frame = lane_frame(ego_lane_id - 1)
        right_frame = lane_frame(1)
    elif ego_lane_id >= farthest_right_lane:
        # Rightmost lane of the scenario (or any lane beyond it): only a left
        # neighbor lane exists. The original compared against a hard-coded 6,
        # so the rightmost lane of the other scenarios (e.g. lane 5 on US-101)
        # never received its left neighbors.
        left_frame = lane_frame(ego_lane_id - 1)

    df = add_neighbors(df, left_frame, ego_local_x, ego_local_y, current_idx, 1, intersection = False)
    df = add_neighbors(df, ego_frame, ego_local_x, ego_local_y, current_idx, 14, intersection = False)
    df = add_neighbors(df, right_frame, ego_local_x, ego_local_y, current_idx, 27, intersection = False)
    return df


In [10]:
final = {}
for name, df in df_dict.items():
    # Trajectory of every vehicle, ordered by time. Only needed by the
    # (currently disabled) movement update below.
    vehicle_traj_dict = {
        vehicle_id: group.sort_values('Global_Time')
        for vehicle_id, group in df.groupby('Vehicle_ID')
    }

    # All vehicles that are present at each timestamp, ordered by vehicle id.
    vehicle_time_dict = {
        time: group.sort_values('Vehicle_ID')
        for time, group in df.groupby('Global_Time')
    }

    # Iterate only over the columns that describe the ego vehicle. itertuples()
    # avoids building a full Series per row, which makes the pass over the
    # (very large) frame considerably cheaper than iterrows().
    ego_rows = df[['Vehicle_ID', 'Global_Time', 'Lane_ID', 'Int_ID', 'Local_X', 'Local_Y']]
    for row in ego_rows.itertuples():
        # The row's own index label identifies the ego row in df. Each
        # (Vehicle_ID, Global_Time) pair is unique after drop_duplicates, so
        # there is no need to search the trajectory for the current timestamp.
        current_idx = row.Index
        veh_time = vehicle_time_dict[row.Global_Time]

        # Update movement for US101 (disabled)
        # if name != 'I80' and name != 'peachtree':
        #     df = generate_movement(df, vehicle_traj_dict[row.Vehicle_ID], current_idx)

        # add neighbor feature for each dataset
        df = generate_neighbors(df, name, veh_time, row.Lane_ID, row.Int_ID,
                                row.Local_X, row.Local_Y, current_idx)

    # Store the frame with the filled neighbor grids
    df_dict[name] = df

KeyboardInterrupt: 

#### Data Visualization

#### Visualization

In [11]:
def data_visualization(df, save_as_gif=False, gif_filename="trajectory_visualization.gif"):
    """
    Dynamic display of vehicle trajectory data.

    Builds one animation frame per distinct 'Frame_ID' in `df`. Each frame
    draws the vehicles whose 'Local_Y' lies inside the displayed road section
    as rectangles (sized by vehicle length and width, colored by 'v_Class')
    labelled with the vehicle id.

    Input:
        df: Data frame with 'Frame_ID', 'Vehicle_ID', 'Local_X', 'Local_Y',
            'v_Width', 'v_Class' and the vehicle length in either 'v_Length'
            or 'v_length' (the datasets in this notebook use both spellings)
        save_as_gif: if True the animation is written to `gif_filename`,
            otherwise it is rendered inline as JavaScript/HTML
        gif_filename: Filename for the gif
    Output:
        None
    """
    # Displayed road section along Local_Y, in feet (adjustable)
    section_limits = [50, 600]

    # Limits of x-y axis
    x_limit = [section_limits[0] - 5, section_limits[1] + 5]
    y_limit = [-170, 250]

    # The length column is spelled differently across the datasets
    length_column = 'v_Length' if 'v_Length' in df.columns else 'v_length'

    # Get unique frames
    frames = df['Frame_ID'].unique()

    # Restrict to the displayed section once instead of once per frame
    section_df = df[(df['Local_Y'] >= section_limits[0]) &
                    (df['Local_Y'] <= section_limits[1])]

    # Set up the figure for animation
    fig, ax = plt.subplots()

    def finish_if_last(frame):
        # After the final frame: stop the timer and close the figure
        if frame == frames[-1]:
            ani.event_source.stop()  # Stop the animation
            plt.close(fig)

    # Function to plot the current frame
    def plot_frame(frame):
        ax.clear()

        frame_data = section_df[section_df['Frame_ID'] == frame]

        # Nothing to draw when no vehicle is inside the section in this frame.
        # The end-of-animation handling must still run, otherwise the figure
        # stays open when the last frame happens to be empty.
        if frame_data.empty:
            finish_if_last(frame)
            return

        # Get needed fields
        lateral_pos = frame_data['Local_X'].values  # Lateral position
        longitude_pos = frame_data['Local_Y'].values  # Longitude position
        vehicle_id = frame_data['Vehicle_ID'].values  # Vehicle ID
        length = frame_data[length_column].values  # Vehicle length
        width = frame_data['v_Width'].values  # Vehicle width
        vehicle_class = frame_data['v_Class'].values  # Vehicle class

        # Set title
        ax.set_title(f'NGSIM trajectories - frame: {int(frame_data["Frame_ID"].iloc[0])}')

        # Plot road boundaries
        ax.plot(x_limit, [-60, -60], color='red', linestyle='--')
        ax.plot(x_limit, [60, 60], color='red', linestyle='--')

        # Plot vehicle bounding boxes based on vehicle class
        for i in range(len(frame_data)):
            # Bounding box centered on the vehicle position:
            # (lower-left x, lower-left y, extent along x, extent along y)
            bounding_box = [longitude_pos[i] - length[i] / 2, lateral_pos[i] - width[i] / 2, length[i], width[i]]

            if vehicle_class[i] == 1:
                color = 'orange'  # Motorcycle
            elif vehicle_class[i] == 2:
                color = 'blue'  # Auto
            else:
                color = 'green'  # Truck

            rect = patches.Rectangle((bounding_box[0], bounding_box[1]), bounding_box[2], bounding_box[3],
                                    linewidth=1, edgecolor=color, facecolor=color)
            ax.add_patch(rect)

            # Add vehicle id to each vehicle
            ax.text(longitude_pos[i] - 2 * length[i] / 3, lateral_pos[i], str(int(vehicle_id[i])),
                    color='blue', fontsize=8, clip_on=True)

        # Custom legend for vehicle classes
        legend_patches = [patches.Patch(color='orange', label='Motorcycle'),
                        patches.Patch(color='blue', label='Auto'),
                        patches.Patch(color='green', label='Truck')]
        ax.legend(handles=legend_patches, loc='upper right', fontsize=14)

        # Set limits and labels (ax.clear() resets them on every frame)
        ax.set_xlim(x_limit)
        ax.set_ylim(y_limit)
        ax.set_xlabel('Longitude (feet)')
        ax.set_ylabel('Lateral (feet)')

        # Invert x-axis
        ax.invert_xaxis()
        ax.grid(True)

        finish_if_last(frame)

    # Create the animation
    ani = FuncAnimation(fig, plot_frame, frames=frames, interval=10)

    if save_as_gif:
        # Save the animation as a gif
        ani.save(gif_filename, writer='pillow', fps=10)
        print(f"GIF saved as {gif_filename}")
    else:
        # plt.show()
        display(HTML(ani.to_jshtml()))

In [None]:
# Animate only the first 300 frame ids (0 <= Frame_ID < 300) of the US-101 data.
visual_df = us101_df.query('0 <= Frame_ID < 300')
data_visualization(visual_df, save_as_gif=False)