In [2]:
# By Armin. Slightly modified by me. 

import os
import glob
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split # splitting the data
from sklearn.preprocessing import StandardScaler # normalization
from sklearn.ensemble import RandomForestRegressor # feature importance
from sklearn.experimental import enable_iterative_imputer # filling nan
from sklearn.impute import IterativeImputer # filling nan
from scipy import stats # finding outliers

#### Get the path of the parent directory to preprocess all of the csv files at once 

In [3]:
# Resolve the "data" folder that sits inside the parent of the current working directory.
PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/data"
print(PATH)

# Base names of the attributes that appear twice in every row:
# once with an "_X" suffix and once with a "_Y" suffix.
_PAIRED_FIELDS = (
    'localx', 'localy', 'globalx', 'globaly', 'vlength', 'vwidth', 'vclass',
    'vel', 'acc', 'laneID', 'ozone', 'dzone', 'int', 'section', 'direction',
    'movement', 'preceeding', 'following', 'space_headway', 'time_headway',
    'location',
)

# Column order: both IDs, every "_X" field, every "_Y" field, then 'duration'.
HEADINGS = (
    ['ID_X', 'ID_Y']
    + [field + '_X' for field in _PAIRED_FIELDS]
    + [field + '_Y' for field in _PAIRED_FIELDS]
    + ['duration']
)

/Users/JunxiChen/Documents/data


#### Define some functions that will help with preprocessing 

In [377]:
# function to get X and Y values from the dataFrames
def getXandY(dataFrame):
    """Split a 2-D table into train/test feature and target arrays.

    The last column is treated as the target (Y) and every column before it
    as a feature (X). Rows are divided 80/20 by ``train_test_split``; no
    ``random_state`` is passed, so the split differs from call to call.

    Parameters
    ----------
    dataFrame : pandas.DataFrame or array-like
        Two-dimensional data whose final column is the target.

    Returns
    -------
    tuple of two lists
        ``([trainX, trainY], [testX, testY])`` as NumPy arrays.
    """
    # `.values` is a property, not a method; np.asarray yields the same array
    # for a DataFrame and also accepts data that is already an ndarray.
    dataFrame_np = np.asarray(dataFrame)
    train, test = train_test_split(dataFrame_np, test_size=0.2)

    # The slices below are plain ndarrays, so no further `.values` access is needed.
    trainX, trainY = train[:, :-1], train[:, -1]
    testX, testY = test[:, :-1], test[:, -1]
    return [trainX, trainY], [testX, testY]


# function that will Normalize the data using Standard Scaling
def normalize(data):
    """Return ``data`` transformed by a ``StandardScaler`` fitted on ``data`` itself."""
    # A fresh scaler is created on every call; its fitted statistics are not kept.
    return StandardScaler().fit_transform(data)

# function that will fill in the NaN values using MICE imputation method
def mvImp(df_incomplete):
    """Impute missing (NaN) entries with scikit-learn's ``IterativeImputer``.

    Parameters
    ----------
    df_incomplete : pandas.DataFrame or array-like
        Numeric data in which missing entries are NaN.

    Returns
    -------
    The output of ``imputer.transform(df_incomplete)`` with the missing
    entries filled in.
    """
    # NaN is used as the missing-value marker directly. Filling NaNs with a
    # sentinel such as 9999 first would make any genuine 9999 in the data
    # indistinguishable from a missing entry, and it would be overwritten too.
    imputer = IterativeImputer(missing_values=np.nan, max_iter=30, random_state=0,
                               add_indicator=False, estimator=None,
                               imputation_order='ascending', initial_strategy='mean',
                               max_value=None, min_value=None,
                               n_nearest_features=None,
                               tol=0.001, verbose=0)
    # fit the object on our incomplete data
    imputer.fit(df_incomplete)
    # transform the data and return the completed version
    imputedData = imputer.transform(df_incomplete)

    return imputedData

# function that will remove outliers
def removeOutliers(data, threshold=3):
    """Drop every row that has at least one column-wise |z-score| >= ``threshold``.

    Parameters
    ----------
    data : numpy.ndarray or pandas.DataFrame
        Two-dimensional numeric data; z-scores are computed per column.
    threshold : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    Same type as ``data``
        The rows of ``data`` in which no value reaches the threshold.
    """
    # A constant column has zero spread, so its z-scores are NaN; silence the
    # resulting floating-point warnings instead of flooding the notebook.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = np.abs(stats.zscore(data))

    # Flag rows by "any value exceeds" rather than keeping rows where "all
    # values are below": NaN compares False either way, so NaN z-scores now
    # leave a row untouched instead of causing every row to be dropped.
    is_outlier = (z >= threshold).any(axis=1)
    return data[~is_outlier]

# function that will save the npArray into either the train or test directory and if it is X or Y
def saveNumpy(npArray, train_or_test, XorY):
    """Write ``npArray`` as comma-delimited text to ``PATH/<train_or_test>/<XorY>.csv``.

    ``PATH`` is the module-level data directory. The target folder is not
    created here.
    """
    destination = '/'.join([PATH, train_or_test, XorY]) + '.csv'
    np.savetxt(destination, npArray, delimiter=',')

# function that converts each location name to a numeric code (a plain mapping, not a one-hot vector)
def convert_location(dataFrame):
    """Return a copy of ``dataFrame`` with the known location names replaced by float codes.

    The replacement is applied to every column, so a matching string anywhere
    in the frame is converted. The input frame is left unmodified.
    """
    return dataFrame.replace({
        'lankershim': 0.0,
        'us-101': 1.0,
        'peachtree': 2.0,
        'i-80': 3.0,
    })

# function that prints the random-forest feature importances for analysis
def rf_feature_importance(dataFrame, fileName, random_state=0):
    """Fit a random forest on ``dataFrame`` and report its feature importances.

    A fixed list of ID, zone, preceding/following, headway and ``localx_Y``
    columns is dropped, ``duration`` is used as the regression target, and the
    remaining columns are the features.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain ``duration`` and every column named in the drop list.
    fileName : str
        Currently unused; kept so existing callers keep working. The earlier
        code built a ``(path, 'w')`` tuple from it but never opened a file.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestRegressor`` so repeated runs give the
        same ranking.

    Returns
    -------
    list of (importance, column_name) tuples
        Importances rounded to 4 decimals, sorted in descending order. The
        same list is printed.
    """
    # `drop` returns a new frame, so the caller's frame is not modified by the pop below.
    dataFrame = dataFrame.drop(['ID_X','ID_Y','ozone_X','ozone_Y','preceeding_X','preceeding_Y','following_X','following_Y',
                                'space_headway_Y','space_headway_X','time_headway_X', 'time_headway_Y', 'localx_Y'], axis=1)

    Y = dataFrame.pop('duration')
    X = dataFrame
    # Read the labels only after the target has been removed, so they line up
    # one-to-one with the importances wherever 'duration' sat in the frame.
    names = X.columns

    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X, Y)

    ranked = sorted(zip((round(x, 4) for x in rf.feature_importances_), names), reverse=True)
    print(ranked)
    return ranked

# Pre Processing 

#### Add headings to the data

In [378]:

# Collect every CSV in the data directory and load each one under its base file name.
all_files = glob.glob(PATH + '/*.csv')
i = 0  # not referenced again in the cells visible here
data_frame_holder = {}

for filename in all_files:
    key = os.path.basename(filename)
    print(key)

    # HEADINGS is passed as the column names for the loaded frame.
    value = pd.read_csv(filename, names=HEADINGS)
    data_frame_holder[key] = value

Peach_50m.csv


#### 1) Convert the location 2) impute missing values 3) normalize

In [379]:
for name, dataFrame in data_frame_holder.items():
    print(dataFrame.columns)
    # Pipeline: location names -> numeric codes, then imputation, then standard scaling.
    dataFrame = normalize(mvImp(convert_location(dataFrame)))
    # Overwrite the entry in place; the key set is unchanged, so this is safe while iterating.
    data_frame_holder[name] = dataFrame
    print(dataFrame.shape)
    

Index(['ID_X', 'ID_Y', 'localx_X', 'localy_X', 'globalx_X', 'globaly_X',
       'vlength_X', 'vwidth_X', 'vclass_X', 'vel_X', 'acc_X', 'laneID_X',
       'ozone_X', 'dzone_X', 'int_X', 'section_X', 'direction_X', 'movement_X',
       'preceeding_X', 'following_X', 'space_headway_X', 'time_headway_X',
       'location_X', 'localx_Y', 'localy_Y', 'globalx_Y', 'globaly_Y',
       'vlength_Y', 'vwidth_Y', 'vclass_Y', 'vel_Y', 'acc_Y', 'laneID_Y',
       'ozone_Y', 'dzone_Y', 'int_Y', 'section_Y', 'direction_Y', 'movement_Y',
       'preceeding_Y', 'following_Y', 'space_headway_Y', 'time_headway_Y',
       'location_Y', 'duration'],
      dtype='object')
(10791, 45)


# Feature Importance

#### I would like to see what values account for the most variance by using a random forest regressor

In [380]:
for fileName in data_frame_holder:
    dataFrame = data_frame_holder[fileName]
    # Wrap the stored values in a DataFrame and restore the column labels
    # before ranking the features.
    holder = pd.DataFrame(dataFrame)
    holder.columns = HEADINGS
    print(holder.columns)
    rf_feature_importance(holder, fileName)

Index(['ID_X', 'ID_Y', 'localx_X', 'localy_X', 'globalx_X', 'globaly_X',
       'vlength_X', 'vwidth_X', 'vclass_X', 'vel_X', 'acc_X', 'laneID_X',
       'ozone_X', 'dzone_X', 'int_X', 'section_X', 'direction_X', 'movement_X',
       'preceeding_X', 'following_X', 'space_headway_X', 'time_headway_X',
       'location_X', 'localx_Y', 'localy_Y', 'globalx_Y', 'globaly_Y',
       'vlength_Y', 'vwidth_Y', 'vclass_Y', 'vel_Y', 'acc_Y', 'laneID_Y',
       'ozone_Y', 'dzone_Y', 'int_Y', 'section_Y', 'direction_Y', 'movement_Y',
       'preceeding_Y', 'following_Y', 'space_headway_Y', 'time_headway_Y',
       'location_Y', 'duration'],
      dtype='object')
[(0.4583, 'vel_Y'), (0.2222, 'acc_Y'), (0.125, 'dzone_X'), (0.1111, 'localy_Y'), (0.0833, 'vel_X'), (0.0, 'vwidth_Y'), (0.0, 'vwidth_X'), (0.0, 'vlength_Y'), (0.0, 'vlength_X'), (0.0, 'vclass_Y'), (0.0, 'vclass_X'), (0.0, 'section_Y'), (0.0, 'section_X'), (0.0, 'movement_Y'), (0.0, 'movement_X'), (0.0, 'location_Y'), (0.0, 'location_X'), (0

