# Random Forest: Classification by Automation Condition (By Participant Sampling)
- Predicts automation usage using all features and the top 20 features
- Models are trained and tested separately for each automation condition
    - The data from all participants in a condition are grouped together

## Necessary Libraries

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import glob as glob
import os as os
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

## Reading and processing the data

### Choose if running on Agave

In [9]:
# Set to True when running on Agave. The flag switches the data folder, the
# upper bound of the tree-depth search range and the sampling fractions.
AGAVE = False


### Choose automation condition

In [188]:
# Automation conditions that can be analysed; `choose_condition` is the
# position (in this list) of the one used for the current run.
conditions = ['SH', 'SL', 'FH', 'FL', 'ALL']
choose_condition = 0
cond = conditions[choose_condition]

### Path to all data files

In [189]:
# Files
# Folder that holds the per-participant CSV files of the chosen condition
if AGAVE:
    files_path = '../../../NewFeatures/' + cond             # Agave
else:
    files_path = '../../../features_data_risk/' + cond      # Local

# glob returns its matches in arbitrary, platform-dependent order. The
# participants are later assigned to cross-validation folds by their position
# in this list, so the list is sorted to get the same folds on every machine.
all_files = sorted(glob.glob(os.path.join(files_path, "*.csv")))

### Features that offer extra information

In [190]:
# Columns that carry extra information about the target (they are basically
# the same signal as boolAuto), so they are removed before training.
features_extra = [
    'boolHand', 'boolTake', 'brakeOshp',
    'sumAuto', 'sumHand', 'sumTake', 'sumTogg',
]

### Features that are related to the operator's actions

In [191]:
# Actions: features related to what the operator does
features_internal = [
    # Angular / linear acceleration
    'accAngOshpX', 'accAngOshpY', 'accAngOshpZ',
    'accLinOshpX', 'accLinOshpY', 'accLinOshpZ',
    # Buttons and violation flags
    'boolButnA', 'boolButnB',
    'boolViolButn', 'boolViolLane', 'boolViolLead', 'boolViolPeds',
    'boolViolRang', 'boolViolTraf', 'boolViolVehs',
    # Orientation and position
    'oriOshpX', 'oriOshpY', 'oriOshpZ',
    'psnOshpLane', 'psnOshpLaneAbs', 'psnOshpLaneLft',
    'psnOshpX', 'psnOshpY', 'psnOshpZ',
    # Steering and violation counters
    'steerOshp',
    'sumViolButn', 'sumViolLane', 'sumViolLead', 'sumViolPeds',
    'sumViolRang', 'sumViolTraf', 'sumViolVehs',
    # Throttle and reaction time
    'throtOshp', 'timeReact',
    # Angular / linear velocity
    'velAngOshpX', 'velAngOshpY', 'velAngOshpZ',
    'velLinOshp', 'velLinOshpLane', 'velLinOshpLaneAbs', 'velLinOshpLaneLft',
    'velLinOshpRang', 'velLinOshpX', 'velLinOshpY', 'velLinOshpZ',
]

### Read the data files

In [192]:
# Pre-process data
# Assign the features of the current time-step to a future use of automation.
# The idea is to predict future use of automation based on current behavior.
SAMPLE_RATE_HZ = 60     # Sampling rate of the recordings (samples per second)
delay = 1/60            # How much time in advance we want to predict automation usage (in seconds)
# Turn the time into a row offset. The product is rounded before converting
# because float error can leave it just below a whole number, and it is
# clamped at 0 because a negative offset (e.g. delay = 0) would slice the
# labels from the end of the recording.
h = max(int(round(delay * SAMPLE_RATE_HZ)) - 1, 0)
data_list = []
for file in all_files:
    data_participant = pd.read_csv(file)
    n_rows = data_participant.shape[0]
    # Time stamp of every row, reconstructed from the sampling rate
    data_participant.insert(0, "time", np.arange(n_rows) / SAMPLE_RATE_HZ,
                            allow_duplicates=True)
    # Labels shifted h rows into the future
    data_auto = np.array(data_participant['boolAuto'].iloc[h:])
    # Keep only the rows that still have a label h rows ahead
    data_participant = data_participant.drop('boolAuto', axis=1)
    data_participant = data_participant.head(n_rows - h)
    # Put the shifted label back as the last column
    data_participant.insert(data_participant.shape[1], "boolAuto", data_auto,
                            allow_duplicates=True)
    data_list.append(data_participant)

### Results of feature selection

In [193]:
# Top-20 features found by feature selection, one list per automation
# condition. The order inside each list is kept exactly as it was recorded.
SH_feats = [
    'psnOshpRang', 'score', 'velLinOshpX', 'velLinOshp', 'rrisk',
    'sumViolLane', 'velLinLead', 'throtOshp', 'sumViolRang', 'odomRoad',
    'velLinOshpRang', 'sumViolPeds', 'ttcLead', 'sumViolButn', 'accLinOshpX',
    'psnTrafPrxY', 'psnTrafPrxX', 'boolStatRang', 'oriTrafPrxZ', 'oriOshpY',
]

SL_feats = [
    'psnOshpRang', 'rrisk', 'sumViolAwrd', 'velLinOshp', 'throtOshp',
    'velLinOshpX', 'sumViolRang', 'score', 'accLinOshpX', 'sumViolButn',
    'oriOshpY', 'velLinLead', 'odomRoad', 'sumViolLane', 'velLinOshpRang',
    'psnTrafPrxX', 'psnTrafPrxY', 'accAngOshpY', 'oriLeadZ', 'velAngOshpY',
]

FH_feats = [
    'velLinOshpX', 'psnOshpRang', 'score', 'rrisk', 'velLinOshp',
    'throtOshp', 'sumViolRang', 'velLinLead', 'odomRoad', 'sumViolButn',
    'sumViolLane', 'accLinOshpX', 'velLinOshpRang', 'psnOshpLaneLft', 'oriTrafPrxZ',
    'velAngLeadZAbs', 'psnTrafPrxX', 'oriOshpY', 'psnTrafPrxY', 'oriOshpZ',
]

FL_feats = [
    'psnOshpRang', 'sumViolAwrd', 'velLinOshpX', 'rrisk', 'throtOshp',
    'velLinOshp', 'sumViolLane', 'score', 'accLinOshpX', 'odomRoad',
    'sumViolRang', 'oriTrafPrxZ', 'sumViolButn', 'psnTrafPrxX', 'velLinLead',
    'sumViolPeds', 'psnLeadY', 'timeReact', 'psnTrafPrxY', 'psnRoadY',
]

# Feature lists in the order SH, SL, FH, FL (positions 0-3)
top_features_all = [SH_feats, SL_feats, FH_feats, FL_feats]
# Only these four conditions have a feature-selection result. Any other
# position (including a negative one, which Python would silently resolve from
# the end of the list) is rejected with an explicit message.
if not 0 <= choose_condition < len(top_features_all):
    raise IndexError(
        'No top-feature list for choose_condition=' + str(choose_condition)
        + '; feature-selection results exist only for positions 0-3 (SH, SL, FH, FL)')
top_feat_cond = top_features_all[choose_condition]


## Evaluate the classification models

In [261]:
# Empty containers for the results.

# Performance metrics: one column per (data split, metric) pair
metric_names = ['Acc', 'BalAcc', 'Prec', 'Rec', 'Spec', 'AUC']
data_scores = {split + '_' + metric: []
               for split in ('Train', 'Test')
               for metric in metric_names}
perf_scores = pd.DataFrame(data=data_scores)

# ROC curve data: the common false-positive-rate grid plus one
# true-positive-rate column per fold (5 KFold)
roc_columns = ['mean_fpr'] + ['tpr_' + str(fold) for fold in range(1, 6)]
data_roc = pd.DataFrame({column: [] for column in roc_columns})

# Best set of parameters found for the model
best_parameters = pd.DataFrame(
    {name: [] for name in ('n_estimators', 'max_depth', 'max_features')})

### Random Forest

#### Random Grid Search

In [1]:
# Candidate values for the random hyper-parameter search

# Number of trees in the random forest: 10 evenly spaced values from 10 to 1000
n_estimators = np.linspace(start=10, stop=1000, num=10).astype(int).tolist()

# Maximum number of levels in a tree: 15 evenly spaced values starting at 100,
# plus None. The upper bound is ten times smaller when running on Agave.
max_num_depth = 6000
if AGAVE:
    max_num_depth = max_num_depth/10
max_depth = np.linspace(100, max_num_depth, num=15).astype(int).tolist() + [None]

# Maximum number of features
max_features = [4, 5, 8, 10, 15]

# Create the random grid
random_grid = dict(n_estimators=n_estimators,
                   max_depth=max_depth,
                   max_features=max_features)

NameError: name 'np' is not defined

In [227]:
# Random search over `random_grid`: every sampled parameter combination is
# scored by its mean ROC AUC over participant-wise cross-validation folds, and
# the best combination is kept in `best_params`.

# Number of iterations (number of parameter combinations to try)
n_iter = 40
# Indices of participants
# NOTE(review): assumes exactly 16 participant files were loaded into
# data_list; idx_label is only used to stratify the folds - confirm what the
# two groups of 8 represent.
idx_part = list(range(0, 16))
idx_label = [0]*8 + [1]*8

SEED = 10
# Save the best parameters found
best_params = {'n_estimators': 0,
               'max_depth': 0,
               'max_features': 0,
               'auc': 0.0}

# Dedicated seeded generator: the sampled combinations are reproducible and
# the global `random` state is left untouched
param_rng = random.Random(SEED)

# Fraction of each fold's rows used during the search
search_frac = 0.15 if AGAVE else 0.1

# The folds depend neither on the iteration nor on the sampled parameters
# (fixed split, fixed sampling seed), so they are prepared once instead of
# being rebuilt n_iter times.
cv = StratifiedKFold(n_splits=5)
search_folds = []
for train, test in cv.split(idx_part, idx_label):
    # Group the data of the participants used for training and for testing
    df_data_train = pd.concat([data_list[index] for index in train], ignore_index=True)
    df_data_test = pd.concat([data_list[index] for index in test], ignore_index=True)
    # Delete the features that offer extra information, and the inserted time
    df_data_train = df_data_train.drop(features_extra + ['time'], axis=1)
    df_data_test = df_data_test.drop(features_extra + ['time'], axis=1)

    # Work on a random subset of the rows
    sample_train = df_data_train.sample(frac=search_frac, random_state=SEED)
    sample_test = df_data_test.sample(frac=search_frac, random_state=SEED)

    # Scale to [0,1] using the range of the TRAINING rows only; fitting the
    # scaler on train+test would leak test information into preprocessing.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(sample_train[top_feat_cond].to_numpy())
    X_test = scaler.transform(sample_test[top_feat_cond].to_numpy())
    y_train = sample_train['boolAuto']
    y_test = sample_test['boolAuto']
    search_folds.append((X_train, y_train, X_test, y_test))

for i in range(0, n_iter):
    # Choose a random combination of parameters
    n_estim_rs = param_rng.choice(random_grid['n_estimators'])
    max_depth_rs = param_rng.choice(random_grid['max_depth'])
    max_feat_rs = param_rng.choice(random_grid['max_features'])
    # Create the classifier with the chosen parameters
    classifier = RandomForestClassifier(n_estimators=n_estim_rs, max_depth=max_depth_rs,
                                        max_features=max_feat_rs, n_jobs=-1,
                                        random_state=SEED)
    # Scoring metric
    auc_vals = []
    # Apply cross-validation
    for X_train, y_train, X_test, y_test in search_folds:
        # Fit the model with the given training set
        classifier.fit(X_train, y_train)

        # Area under the ROC curve (the closer to 1, the better the estimator).
        # It is computed from the predicted probability of class 1: with hard
        # 0/1 predictions the "curve" has a single operating point.
        pos_col = list(classifier.classes_).index(1)
        y_score_rs = classifier.predict_proba(X_test)[:, pos_col]
        auc_vals.append(roc_auc_score(y_test, y_score_rs))

    mean_auc = np.mean(auc_vals)
    if mean_auc > best_params['auc']:
        best_params['auc'] = mean_auc
        best_params['n_estimators'] = n_estim_rs
        best_params['max_depth'] = max_depth_rs
        best_params['max_features'] = max_feat_rs



#### Cross Validation

In [259]:
# Participant-wise 5-fold cross-validation of the random forest built with the
# best parameters from the random search. Train and test metrics of every fold
# are collected in the lists defined below.

# Get the best parameters from the random search
best_n_estim = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_max_features = best_params['max_features']

# Indices of participants
# NOTE(review): assumes exactly 16 participant files were loaded into
# data_list; idx_label is only used to stratify the folds - confirm what the
# two groups of 8 represent.
idx_part = list(range(0, 16))
idx_label = [0]*8 + [1]*8

# Create the k-fold
cv = StratifiedKFold(n_splits=5)
# Create the classifier (seeded so the reported scores are reproducible)
classifier = RandomForestClassifier(n_estimators=best_n_estim, max_depth=best_max_depth,
                                    max_features=best_max_features, n_jobs=-1,
                                    random_state=SEED)
# Save performance metrics
tprs = []; aucs = []; acc = []; acc_bal = []; prec = []; rec = []; spec = []; AUC_v = []
acc_tr = []; acc_bal_tr = []; prec_tr = []; rec_tr = []; spec_tr = []; AUC_v_tr = []
# Common false-positive-rate grid on which every fold's ROC curve is resampled
mean_fpr = np.linspace(0, 1, 100)

# Fraction of each fold's rows that is used (frac=1 only shuffles the rows)
cv_frac = 1 if AGAVE else 0.1

# Apply cross-validation
for train, test in cv.split(idx_part, idx_label):
    # Group the data of the participants used for training and for testing
    df_data_train = pd.concat([data_list[index] for index in train], ignore_index=True)
    df_data_test = pd.concat([data_list[index] for index in test], ignore_index=True)
    # Delete the features that offer extra information, and the inserted time
    df_data_train = df_data_train.drop(features_extra + ['time'], axis=1)
    df_data_test = df_data_test.drop(features_extra + ['time'], axis=1)

    sample_train = df_data_train.sample(frac=cv_frac, random_state=SEED)
    sample_test = df_data_test.sample(frac=cv_frac, random_state=SEED)

    # Scale to [0,1] using the range of the TRAINING rows only; fitting the
    # scaler on train+test would leak test information into preprocessing.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(sample_train[top_feat_cond].to_numpy())
    X_test = scaler.transform(sample_test[top_feat_cond].to_numpy())
    y_train = sample_train['boolAuto']
    y_test = sample_test['boolAuto']

    # Fit the model with the given training set
    classifier.fit(X_train, y_train)

    # Hard class predictions, used by the threshold-based metrics
    y_pred = classifier.predict(X_test)
    y_pred_tr = classifier.predict(X_train)
    # Predicted probability of class 1, used by the ROC curve and the AUC:
    # with hard 0/1 predictions the curve has a single operating point.
    pos_col = list(classifier.classes_).index(1)
    y_score = classifier.predict_proba(X_test)[:, pos_col]
    y_score_tr = classifier.predict_proba(X_train)[:, pos_col]

    # ROC curve metrics, resampled on the common grid
    fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=1)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0   # Force the curve to start at the origin
    tprs.append(interp_tpr)

    # Other performance metrics
    # Testing
    acc.append(accuracy_score(y_test, y_pred))
    acc_bal.append(balanced_accuracy_score(y_test, y_pred))
    prec.append(precision_score(y_test, y_pred))
    rec.append(recall_score(y_test, y_pred, pos_label=1))
    spec.append(recall_score(y_test, y_pred, pos_label=0))   # Specificity = recall of class 0
    AUC_v.append(roc_auc_score(y_test, y_score))
    # Training
    acc_tr.append(accuracy_score(y_train, y_pred_tr))
    acc_bal_tr.append(balanced_accuracy_score(y_train, y_pred_tr))
    prec_tr.append(precision_score(y_train, y_pred_tr))
    rec_tr.append(recall_score(y_train, y_pred_tr, pos_label=1))
    spec_tr.append(recall_score(y_train, y_pred_tr, pos_label=0))
    AUC_v_tr.append(roc_auc_score(y_train, y_score_tr))


#### Save the results

In [262]:
# Fill the dataframes with all the results

# Per-fold performance metrics, one column per metric
fold_metrics = {
    'Train_Acc': acc_tr,
    'Train_BalAcc': acc_bal_tr,
    'Train_Prec': prec_tr,
    'Train_Rec': rec_tr,
    'Train_Spec': spec_tr,
    'Train_AUC': AUC_v_tr,
    'Test_Acc': acc,
    'Test_BalAcc': acc_bal,
    'Test_Prec': prec,
    'Test_Rec': rec,
    'Test_Spec': spec,
    'Test_AUC': AUC_v,
}
for column_name, fold_values in fold_metrics.items():
    perf_scores[column_name] = fold_values

# Best parameters (a single row)
best_parameters['n_estimators'] = [best_n_estim]
best_parameters['max_depth'] = [best_max_depth]
best_parameters['max_features'] = [best_max_features]

# Data of the ROC curves: column 0 is the common grid, columns 1-5 hold the
# interpolated true-positive rates of folds 1-5
data_roc['mean_fpr'] = mean_fpr
for fold_col in range(1, 6):
    data_roc.iloc[:, fold_col] = tprs[fold_col - 1]

In [212]:
# Save the results to .csv files
folder = 'Results_CM_ByPar'
# to_csv does not create missing directories, so make sure the folder exists
os.makedirs(folder, exist_ok=True)
perf_scores.to_csv(os.path.join(folder, 'RF_CBPS_performance_' + cond + '.csv'), index=False)
data_roc.to_csv(os.path.join(folder, 'RF_CBPS_ROC_' + cond + '.csv'), index=False)
best_parameters.to_csv(os.path.join(folder, 'RF_CBPS_BestParm_' + cond + '.csv'), index=False)