# 91 vs 20 features: Classification by Automation Condition (Random Sampling)
- Prediction of automation usage using all features and the top 20
- We train and test by automation condition
    - Grouping the data from all participants in a condition

## Necessary Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import glob as glob
import os as os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from joblib import Parallel, delayed
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *

## Reading and processing the data

### Choose if running on Agave

In [2]:
AGAVE = False


### Choose automation condition

In [178]:
choose_condition  = 3
conditions = ['SH','SL','FH','FL','ALL']
cond = conditions[choose_condition]

### Path to all data files

In [179]:
# Files
if(AGAVE==True):
    files_path = '../../../NewFeatures/' + cond             # Agave
else:
    files_path = '../../../features_data_risk/' + cond      # Local

all_files = glob.glob(os.path.join(files_path, "*.csv"))

### Features that offer extra information

In [180]:
# Features that offer extra information (they're basically the same as boolAuto)
features_extra = ['boolHand','boolTake','brakeOshp','sumAuto','sumHand','sumTake','sumTogg']

### Features that are related to the operator's actions

In [181]:
# Actions
features_internal = ['accAngOshpX','accAngOshpY','accAngOshpZ','accLinOshpX',
					'accLinOshpY','accLinOshpZ','boolButnA','boolButnB',
					'boolViolButn','boolViolLane','boolViolLead','boolViolPeds',
					'boolViolRang','boolViolTraf','boolViolVehs',
					'oriOshpX','oriOshpY','oriOshpZ','psnOshpLane','psnOshpLaneAbs',
					'psnOshpLaneLft','psnOshpX','psnOshpY','psnOshpZ',
					'steerOshp','sumViolButn','sumViolLane','sumViolLead','sumViolPeds',
					'sumViolRang','sumViolTraf','sumViolVehs','throtOshp','timeReact',
					'velAngOshpX','velAngOshpY','velAngOshpZ','velLinOshp','velLinOshpLane',
					'velLinOshpLaneAbs','velLinOshpLaneLft','velLinOshpRang','velLinOshpX',
					'velLinOshpY','velLinOshpZ']

### Read the data files

In [182]:
# Pre-process data
# Assign features from current time-step to future use of automation,
# The idea is to predict future use of automation based on current behavior
delay = 1/60            # How much time in advance we want to predict automation usage (in seconds)
h = (int)(delay*60 - 1) # Turn the time into index (Considering a sampling rate of 60 Hz)
data_list = []
for file in all_files:
    data_participant = pd.read_csv(file)
    data_participant.insert(0,"time",[(i*1/60) for i in range(0,data_participant.shape[0])],True)
    data_auto = data_participant['boolAuto']
    data_auto = np.array(data_auto.iloc[h:])
    data_participant = data_participant.drop('boolAuto', axis=1)
    data_participant = data_participant.head(data_participant.shape[0]-h)
    data_participant.insert(data_participant.shape[1],"boolAuto",data_auto,True)
    data_list.append(data_participant)

In [183]:
# Put the data from all participants together
df_data = pd.concat(data_list, ignore_index=True) # Read the data from all participants

### Delete features that offer extra information

In [184]:
# Delete the features that offer extra information from the dataset
# The idea is to determine which features are more related to automation usage
df_data = df_data.drop(features_extra,axis = 1)
df_data = df_data.drop('time',axis=1)               # Drop the inserted time too
# df_data = df_data.drop(features_internal,axis = 1)

In [185]:
# Take just a sample of the data for speed
# Change this when running on Agave
SEED = 10
if(AGAVE==True):
    sample_data = df_data.sample(frac=1, random_state=SEED);
else:
    sample_data = df_data.sample(frac=0.1, random_state=SEED);

In [186]:
X_data = sample_data.drop('boolAuto',axis = 1)
y_data = sample_data['boolAuto']

### Results of feature selection

In [187]:
SH_feats = ['psnOshpRang'    ,
'score'          ,
'velLinOshpX'    ,
'velLinOshp'     ,
'rrisk'          ,
'sumViolLane'    ,
'velLinLead'     ,
'throtOshp'      ,
'sumViolRang'    ,
'odomRoad'       ,
'velLinOshpRang' ,
'sumViolPeds'    ,
'ttcLead'        ,
'sumViolButn'    ,
'accLinOshpX'    ,
'psnTrafPrxY'    ,
'psnTrafPrxX'    ,
'boolStatRang'   ,
'oriTrafPrxZ'    ,
'oriOshpY'       ]

SL_feats = ['psnOshpRang'    ,
'rrisk'          ,
'sumViolAwrd'    ,
'velLinOshp'     ,
'throtOshp'      ,
'velLinOshpX'    ,
'sumViolRang'    ,
'score'          ,
'accLinOshpX'    ,
'sumViolButn'    ,
'oriOshpY'       ,
'velLinLead'     ,
'odomRoad'       ,
'sumViolLane'    ,
'velLinOshpRang' ,
'psnTrafPrxX'    ,
'psnTrafPrxY'    ,
'accAngOshpY'    ,
'oriLeadZ'       ,
'velAngOshpY'    ]

FH_feats = ['velLinOshpX'    ,
'psnOshpRang'    ,
'score'          ,
'rrisk'          ,
'velLinOshp'     ,
'throtOshp'      ,
'sumViolRang'    ,
'velLinLead'     ,
'odomRoad'       ,
'sumViolButn'    ,
'sumViolLane'    ,
'accLinOshpX'    ,
'velLinOshpRang' ,
'psnOshpLaneLft' ,
'oriTrafPrxZ'    ,
'velAngLeadZAbs' ,
'psnTrafPrxX'    ,
'oriOshpY'       ,
'psnTrafPrxY'    ,
'oriOshpZ'       ]

FL_feats = ['psnOshpRang' ,
'sumViolAwrd' ,
'velLinOshpX' ,
'rrisk'       ,
'throtOshp'   ,
'velLinOshp'  ,
'sumViolLane' ,
'score'       ,
'accLinOshpX' ,
'odomRoad'    ,
'sumViolRang' ,
'oriTrafPrxZ' ,
'sumViolButn' ,
'psnTrafPrxX' ,
'velLinLead'  ,
'sumViolPeds' ,
'psnLeadY'    ,
'timeReact'   ,
'psnTrafPrxY' ,
'psnRoadY'    ]

top_features_all = [SH_feats,SL_feats,FH_feats,FL_feats]
top_feat_cond = top_features_all[choose_condition]


In [188]:
# Update the dataset to consider only the top num_feats features
X_data_cut = X_data[top_feat_cond]
# Scale the data (0,1)
X_data_cut = MinMaxScaler().fit_transform(X_data_cut)
y_data_cut = np.array(y_data.to_list())
# X_data = X_data[top_feat_cond]

## Split the data into training and testing

In [189]:
# Split the data into training and testing
frac = 0.8 # Choose fraction of data to use for training
n_samples = len(X_data_cut)
idx_split = (int)(np.round(frac*n_samples))

In [190]:
# Split data into features and target
train_labels = np.array(y_data_cut[0:idx_split])
test_labels = np.array(y_data_cut[idx_split:n_samples]) 
train_features = np.array(X_data_cut[0:idx_split,:])
test_features = np.array(X_data_cut[idx_split:n_samples,:])

In [191]:
# Check sizes
print('Training Data Shape:', train_features.shape)
print('Testing Data Shape:', test_features.shape)

Training Data Shape: (55266, 20)
Testing Data Shape: (13816, 20)


## Evaluate the classification models

In [192]:
# Save the performance results
data_scores = {'Train_Acc':[],'Train_BalAcc':[],'Train_Prec':[],'Train_Rec':[],'Train_Spec':[],
                'Train_AUC':[],'Test_Acc':[],'Test_BalAcc':[],'Test_Prec':[],'Test_Rec':[],
                'Test_Spec':[],'Test_AUC':[]}
perf_scores = pd.DataFrame(data=data_scores)
# Save the ROC curve data
data_roc = pd.DataFrame({'mean_fpr':[], 'tpr_1':[], 'tpr_2':[], 'tpr_3':[], 'tpr_4':[], 
                        'tpr_5':[]}) # 5 KFold
# Save the best set of parameter for the model
best_parameters = pd.DataFrame({'n_estimators':[], 'max_depth':[], 'max_features':[]})

### Random Forest

#### Cross Validation (5-fold)

In [193]:
# Cross Validation when the best parameters have been selected
best_n_estim = 120
best_max_depth = 5157
best_max_features = 5

cv = StratifiedKFold(n_splits=5)
# classifier = KNeighborsClassifier(n_jobs=-1)
classifier = make_pipeline(StandardScaler(),LinearSVC())

tprs = []; aucs = []; acc = []; acc_bal = []; prec = []; rec = []; spec = []; AUC_v = []
acc_tr = []; acc_bal_tr = []; prec_tr = []; rec_tr = []; spec_tr = []; AUC_v_tr = []
mean_fpr = np.linspace(0, 1, 100)

for i, (train, test) in enumerate(cv.split(X_data_cut, y_data_cut)):
    # Classifier training and testing
    y_train = y_data_cut[train]
    classifier.fit(X_data_cut[train], y_train)
    y_test = y_data_cut[test]
    y_pred = classifier.predict(X_data_cut[test])
    y_pred_tr = classifier.predict(X_data_cut[train])

    # ROC curve metrics
    fpr, tpr, _ = roc_curve(y_data_cut[test], y_pred, pos_label=1)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)

    # Other performance metrics
    # Testing
    acc.append(accuracy_score(y_test,y_pred))
    acc_bal.append(balanced_accuracy_score(y_test,y_pred))
    prec.append(precision_score(y_test,y_pred))
    rec.append(recall_score(y_test,y_pred, pos_label=1))
    spec.append(recall_score(y_test,y_pred, pos_label=0))
    AUC_v.append(roc_auc_score(y_test,y_pred))
    # Training
    acc_tr.append(accuracy_score(y_train,y_pred_tr))
    acc_bal_tr.append(balanced_accuracy_score(y_train,y_pred_tr))
    prec_tr.append(precision_score(y_train,y_pred_tr))
    rec_tr.append(recall_score(y_train,y_pred_tr, pos_label=1))
    spec_tr.append(recall_score(y_train,y_pred_tr, pos_label=0))
    AUC_v_tr.append(roc_auc_score(y_train,y_pred_tr))



#### Results

In [194]:
# Fill the dataframe with all the results
perf_scores.Train_Acc = acc_tr
perf_scores.Train_BalAcc = acc_bal_tr
perf_scores.Train_Prec = prec_tr
perf_scores.Train_Rec = rec_tr
perf_scores.Train_Spec = spec_tr
perf_scores.Train_AUC = AUC_v_tr
perf_scores.Test_Acc = acc
perf_scores.Test_BalAcc	 = acc_bal
perf_scores.Test_Prec = prec
perf_scores.Test_Rec = rec
perf_scores.Test_Spec = spec
perf_scores.Test_AUC = AUC_v
# Best parameters
best_parameters.n_estimators = [best_n_estim]
best_parameters.max_depth = [best_max_depth]
best_parameters.max_features = [best_max_features]
# Data ROC curves
data_roc.mean_fpr = mean_fpr
for i in range(1,6):
    data_roc.iloc[:,i] = tprs[i-1]

In [195]:
perf_scores

Unnamed: 0,Train_Acc,Train_BalAcc,Train_Prec,Train_Rec,Train_Spec,Train_AUC,Test_Acc,Test_BalAcc,Test_Prec,Test_Rec,Test_Spec,Test_AUC
0,0.754221,0.653617,0.703143,0.381357,0.925878,0.653617,0.757038,0.655175,0.716515,0.379477,0.930874,0.655175
1,0.754202,0.653728,0.702726,0.381816,0.92564,0.653728,0.757038,0.658148,0.707865,0.390496,0.925801,0.658148
2,0.754279,0.653907,0.702754,0.382253,0.925561,0.653907,0.754632,0.654166,0.704017,0.382319,0.926012,0.654166
3,0.754786,0.656337,0.699228,0.389887,0.922786,0.656337,0.749059,0.64768,0.687817,0.373364,0.921996,0.64768
4,0.75589,0.656863,0.704335,0.388876,0.924849,0.656863,0.755284,0.658362,0.69697,0.396006,0.920719,0.658362


### 91 Features

In [196]:
# Update the dataset to consider only all the features
X_data_all = X_data
# Scale the data (0,1)
X_data_all = MinMaxScaler().fit_transform(X_data_all)
y_data_all = np.array(y_data.to_list())

In [197]:
# Split the data into training and testing
frac = 0.8 # Choose fraction of data to use for training
n_samples = len(X_data_all)
idx_split = (int)(np.round(frac*n_samples))

In [198]:
# Split data into features and target
train_labels = np.array(y_data_all[0:idx_split])
test_labels = np.array(y_data_all[idx_split:n_samples]) 
train_features = np.array(X_data_all[0:idx_split,:])
test_features = np.array(X_data_all[idx_split:n_samples,:])

In [199]:
# Check sizes
print('Training Data Shape:', train_features.shape)
print('Testing Data Shape:', test_features.shape)

Training Data Shape: (55266, 91)
Testing Data Shape: (13816, 91)


In [200]:
# Cross Validation when the best parameters have been selected
best_n_estim = 120
best_max_depth = 5157
best_max_features = 5

cv = StratifiedKFold(n_splits=5)
# classifier = KNeighborsClassifier(n_jobs=-1)
classifier = make_pipeline(StandardScaler(),LinearSVC())

tprs = []; aucs = []; acc = []; acc_bal = []; prec = []; rec = []; spec = []; AUC_v = []
acc_tr = []; acc_bal_tr = []; prec_tr = []; rec_tr = []; spec_tr = []; AUC_v_tr = []
mean_fpr = np.linspace(0, 1, 100)

for i, (train, test) in enumerate(cv.split(X_data_all, y_data_all)):
    # Classifier training and testing
    y_train = y_data_all[train]
    classifier.fit(X_data_all[train], y_train)
    y_test = y_data_all[test]
    y_pred = classifier.predict(X_data_all[test])
    y_pred_tr = classifier.predict(X_data_all[train])

    # ROC curve metrics
    fpr, tpr, _ = roc_curve(y_data_all[test], y_pred, pos_label=1)
    interp_tpr = np.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)

    # Other performance metrics
    # Testing
    acc.append(accuracy_score(y_test,y_pred))
    acc_bal.append(balanced_accuracy_score(y_test,y_pred))
    prec.append(precision_score(y_test,y_pred))
    rec.append(recall_score(y_test,y_pred, pos_label=1))
    spec.append(recall_score(y_test,y_pred, pos_label=0))
    AUC_v.append(roc_auc_score(y_test,y_pred))
    # Training
    acc_tr.append(accuracy_score(y_train,y_pred_tr))
    acc_bal_tr.append(balanced_accuracy_score(y_train,y_pred_tr))
    prec_tr.append(precision_score(y_train,y_pred_tr))
    rec_tr.append(recall_score(y_train,y_pred_tr, pos_label=1))
    spec_tr.append(recall_score(y_train,y_pred_tr, pos_label=0))
    AUC_v_tr.append(roc_auc_score(y_train,y_pred_tr))



In [201]:
# Fill the dataframe with all the results
perf_scores.Train_Acc = acc_tr
perf_scores.Train_BalAcc = acc_bal_tr
perf_scores.Train_Prec = prec_tr
perf_scores.Train_Rec = rec_tr
perf_scores.Train_Spec = spec_tr
perf_scores.Train_AUC = AUC_v_tr
perf_scores.Test_Acc = acc
perf_scores.Test_BalAcc	 = acc_bal
perf_scores.Test_Prec = prec
perf_scores.Test_Rec = rec
perf_scores.Test_Spec = spec
perf_scores.Test_AUC = AUC_v
# Best parameters
best_parameters.n_estimators = [best_n_estim]
best_parameters.max_depth = [best_max_depth]
best_parameters.max_features = [best_max_features]
# Data ROC curves
data_roc.mean_fpr = mean_fpr
for i in range(1,6):
    data_roc.iloc[:,i] = tprs[i-1]

In [202]:
perf_scores

Unnamed: 0,Train_Acc,Train_BalAcc,Train_Prec,Train_Rec,Train_Spec,Train_AUC,Test_Acc,Test_BalAcc,Test_Prec,Test_Rec,Test_Spec,Test_AUC
0,0.795513,0.717112,0.766757,0.504936,0.929287,0.717112,0.796844,0.717501,0.773578,0.502755,0.932248,0.717501
1,0.794662,0.715763,0.765797,0.502239,0.929287,0.715763,0.795759,0.71708,0.768369,0.504132,0.930029,0.71708
2,0.794032,0.71629,0.760614,0.505883,0.926697,0.71629,0.795961,0.717306,0.768719,0.504478,0.930134,0.717306
3,0.795299,0.719027,0.759956,0.512598,0.925455,0.719027,0.79227,0.713681,0.757902,0.501033,0.926329,0.713681
4,0.796457,0.719069,0.766422,0.509643,0.928496,0.719069,0.793935,0.71848,0.753955,0.514233,0.922727,0.71848
