In [None]:
! git clone https://github.com/cggcaio/Anomaly-Detection-for-Driver-Identification.git

Cloning into 'Anomaly-Detection-for-Driver-Identification'...
remote: Enumerating objects: 105, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 572 (delta 35), reused 102 (delta 32), pack-reused 467[K
Receiving objects: 100% (572/572), 61.02 MiB | 8.52 MiB/s, done.
Resolving deltas: 100% (171/171), done.
Checking out files: 100% (55/55), done.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest # Importação Isolation Forest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import time
from tqdm import tqdm
from sklearn.svm import OneClassSVM # IMPORTAÇÃO OnClassSVM

In [None]:
# Schema of the results table filled in by the main experiment loop.
columns = [
    'ID',
    'Method',
    'Time_Window',
    'Driver',
    'Driver_B',
    'Features',
    'Method_Parameters',
    'Accuracy_Train',
    'Accuracy_Test',
    'Accuracy_Anomaly',
    'Inference_Time',
]

# Start from an empty table that already carries the schema above.
results = pd.DataFrame(columns=columns)

# new_row = ["ON-SVM", 32, "A", "Teste", "Teste", 95.3]
# Analysis_df = Analysis_df.append(pd.DataFrame([new_row],columns=columns))

# Features:
# Iap - Intake_air_pressure
# Est - Engine_soacking_time
# LTFTB - Long_Term_Fuel_Trim_Bank1
# Tof - Torque_of_friction
# Ect - Engine_coolant_temperature
# Swp - Steering_wheel_speed                      

In [None]:
def split_data(driver_id, window_size,
               data_root='/content/Anomaly-Detection-for-Driver-Identification/Data_Bases/Organized_DB2'):
  """Load one driver's windowed data and split it into train / validation parts.

  Parameters
  ----------
  driver_id : str
      Driver identifier used in the dataset folder and file names (e.g. 'A').
  window_size : int
      Time-window length; selects the file
      ``<data_root>/Driver_<id>/driver_<id>_block_<window_size>s``.
  data_root : str, optional
      Folder that contains the ``Driver_<id>`` sub-folders. Defaults to the
      location of the cloned repository on Colab.

  Returns
  -------
  dict
      ``{'x_train', 'x_val', 'y_train', 'y_val'}`` with an 80/20 split. The
      labels are all ones, one per row of the file.

  Side effects: prints which driver / window was selected.
  """
  path = '{0}/Driver_{1}/driver_{1}_block_{2}s'.format(data_root, driver_id, window_size)
  df = pd.read_csv(path, sep=",")
  print("")
  print("DRIVER "+str(driver_id)+" SELECTED IN THE TIME WINDOW "+str(window_size))

  # Every row of this file is labelled 1.
  labels = np.ones((df.shape[0], 1))

  # Randomly shuffle all entries to avoid anomalous data blocks.
  # A single uniform permutation is enough: repeating it does not make the
  # order "more random". This shuffle is what lets repeated calls produce
  # different splits, because train_test_split below uses a fixed random_state.
  df = df.iloc[np.random.permutation(len(df))]

  # scikit-learn's train_test_split divides the data between train and validation.
  x_train, x_val, y_train, y_val = train_test_split(df, labels, test_size=0.2, random_state=42)
  return {'x_train': x_train, 'x_val': x_val, 'y_train': y_train, 'y_val': y_val}

## Isolation Forest

In [None]:
def IFTraining(x_train,max_samples=256,n_estimators=1000,contamination=0.002):
  """Create an Isolation Forest with the given settings and fit it on `x_train`.

  Parameters
  ----------
  x_train : training samples passed unchanged to ``fit``.
  max_samples : number of samples each tree is built on (forwarded as-is).
  n_estimators : number of trees in the forest (forwarded as-is).
  contamination : expected share of anomalies in the data (forwarded as-is).

  Returns
  -------
  The fitted ``IsolationForest`` instance.
  """
  forest_settings = {
      'n_estimators': n_estimators,
      'max_samples': max_samples,
      'contamination': contamination,
      # Fixed seed for the forest's own random number generator.
      'random_state': 42,
  }
  model = IsolationForest(**forest_settings)
  model.fit(x_train)
  return model

In [None]:
def compute_threshold(iso_forest,x_val):
    """Find a decision-score cut-off that rejects at most 0.1% of `x_val`.

    Starting at 0, the cut-off is lowered in steps of 0.001 until the share of
    samples whose ``decision_function`` score is NOT strictly above it drops to
    0.001 or less.

    Parameters
    ----------
    iso_forest : fitted model exposing ``decision_function``.
    x_val : samples used to calibrate the cut-off.

    Returns
    -------
    The cut-off: ``0`` if no lowering was needed, otherwise a negative float.
    """
    scores = iso_forest.decision_function(x_val)
    step = 0.001
    max_rejected_share = 0.001

    def rejected_share(cutoff):
        # Samples that fail the "score > cutoff" test count as rejected.
        rejected = ~(scores > cutoff)
        return np.sum(rejected) / len(rejected)

    cutoff = 0
    while rejected_share(cutoff) > max_rejected_share:
        cutoff = cutoff - step
    return cutoff

In [None]:
def executeIF(isolation_forest,x_train,x_val,new_driver,t,
              data_root='/content/Anomaly-Detection-for-Driver-Identification/Data_Bases/Organized_DB2'):
  """Evaluate a fitted Isolation Forest on its own driver and on another driver.

  Parameters
  ----------
  isolation_forest : fitted model; it is used for the threshold and all scores.
  x_train : training samples of the modelled driver (also used to calibrate
      the threshold).
  x_val : held-out samples of the modelled driver.
  new_driver : str
      Identifier of the other driver whose data should be flagged as anomalous.
  t : int
      Time-window length; selects the file
      ``<data_root>/Driver_<id>/driver_<id>_block_<t>s``.
  data_root : str, optional
      Folder that contains the ``Driver_<id>`` sub-folders. Defaults to the
      location of the cloned repository on Colab.

  Returns
  -------
  dict with
      'acc_train' : share of `x_train` not flagged as anomalous,
      'acc_test'  : share of `x_val` not flagged as anomalous,
      'acc_an'    : share of the other driver's rows flagged as anomalous,
      'inf_time'  : per-row scoring time measured on the other driver's data.
  """
  # Calibrate the threshold with the model that was passed in. Relying on a
  # notebook-level variable here would silently evaluate one model with
  # another model's threshold (or fail if that variable does not exist).
  threshold = compute_threshold(isolation_forest, x_train)

  # Accuracy on the modelled driver's own data.
  acc_train = get_accuracy_iso(x_train, isolation_forest, threshold)
  acc_test = get_accuracy_iso(x_val, isolation_forest, threshold)

  # Score the other driver's data with the same threshold.
  path = '{0}/Driver_{1}/driver_{1}_block_{2}s'.format(data_root, new_driver, t)
  dfb = pd.read_csv(path, sep=",")
  acc_an = get_accuracy_iso(dfb.to_numpy(), isolation_forest, threshold)

  return {
      'acc_train': acc_train['acc'],
      'acc_test': acc_test['acc'],
      # For the other driver, every row flagged as anomalous is a correct detection.
      'acc_an': 1 - acc_an['acc'],
      'inf_time': acc_an['inf_time'],
  }


def get_accuracy_iso(x,isolation_forest,threshold):
  """Score `x` and report the share of rows that are NOT flagged as anomalous.

  A row is flagged when its ``decision_function`` score is strictly below
  `threshold`.

  Returns
  -------
  dict with
      'acc'      : 1 - (flagged rows / all rows),
      'inf_time' : wall-clock time of the scoring call divided by ``x.shape[0]``.
  """
  started = time.time()
  scores = isolation_forest.decision_function(x)
  elapsed = time.time() - started

  flagged = scores < threshold
  flagged_share = np.sum(flagged) / len(flagged)
  return {'acc': 1 - flagged_share, 'inf_time': elapsed / x.shape[0]}


## OC-SVM

In [None]:
def OCSVMTraining(x_train, kernel, nu):
  """Create a One-Class SVM with the given `kernel` and `nu` and fit it on `x_train`.

  Returns the fitted ``OneClassSVM`` instance.
  """
  model = OneClassSVM(nu=nu, kernel=kernel)
  model.fit(x_train)
  return model

In [None]:
def executeOCSVM(ocsvm,x_train,x_val,new_driver,t,
                 data_root='/content/Anomaly-Detection-for-Driver-Identification/Data_Bases/Organized_DB2'):
  """Evaluate a fitted One-Class SVM on its own driver and on another driver.

  Parameters
  ----------
  ocsvm : fitted model used for all predictions.
  x_train : training samples of the modelled driver.
  x_val : held-out samples of the modelled driver.
  new_driver : str
      Identifier of the other driver whose data should be rejected.
  t : int
      Time-window length; selects the file
      ``<data_root>/Driver_<id>/driver_<id>_block_<t>s``.
  data_root : str, optional
      Folder that contains the ``Driver_<id>`` sub-folders. Defaults to the
      location of the cloned repository on Colab.

  Returns
  -------
  dict with
      'acc_train' : share of `x_train` predicted as +1,
      'acc_test'  : share of `x_val` predicted as +1,
      'acc_an'    : share of the other driver's rows NOT predicted as +1,
      'inf_time'  : per-row prediction time measured on the other driver's data.
  """
  # Accuracy on the modelled driver's own data.
  acc_train = get_accuracy_ocsvm(ocsvm, x_train)
  acc_test = get_accuracy_ocsvm(ocsvm, x_val)

  # Predict on the other driver's data.
  path = '{0}/Driver_{1}/driver_{1}_block_{2}s'.format(data_root, new_driver, t)
  dfb = pd.read_csv(path, sep=",")
  acc_an = get_accuracy_ocsvm(ocsvm, dfb.to_numpy())

  return {
      'acc_train': acc_train['acc'],
      'acc_test': acc_test['acc'],
      # For the other driver, every row that is not accepted is a correct detection.
      'acc_an': 1 - acc_an['acc'],
      'inf_time': acc_an['inf_time'],
  }

def get_accuracy_ocsvm(ocsvm, x):
  """Predict on `x` and report the share of rows predicted as +1.

  Returns
  -------
  dict with
      'acc'      : rows predicted as 1 divided by ``x.shape[0]``,
      'inf_time' : wall-clock time of the ``predict`` call divided by ``x.shape[0]``.
  """
  n_rows = x.shape[0]

  started = time.time()
  predictions = ocsvm.predict(x)
  elapsed = time.time() - started

  accepted = sum(1 for row in range(n_rows) if predictions[row] == 1)
  return {'acc': accepted / n_rows, 'inf_time': elapsed / n_rows}


## Main Function

In [None]:
from tqdm import tqdm

id_ = 0  # NOTE(review): never read below; the 'ID' column stores the repetition index `j`.
features = ['Iap','Est', 'LTFTB', 'Tof', 'Ect', 'Swp']

methods = ['OCSVM']
time_window = [30]
driver = ['A','B','C','D','E','F','G','H']
nexp = 5  # repetitions per (driver, parameter set)


# Isolation Forest parameter grid
max_samples = [256]
n_estimators = [1000]
contamination = [0.002]

# One-Class SVM parameter grid
kernels = ['sigmoid'] #rbf
nu = [0.0001]


def _store_rows(rows):
  """Add `rows` to the global `results` table and checkpoint it to results.csv.

  Rows are merged with ``pd.concat`` because ``DataFrame.append`` no longer
  exists in pandas 2.x. The table gets a running index.
  """
  global results
  if not rows:
    return
  batch = pd.DataFrame(rows, columns=columns)
  # Skip the concat while `results` is still empty: concatenating an empty
  # frame is deprecated behaviour in recent pandas versions.
  results = batch if results.empty else pd.concat([results, batch], ignore_index=True)
  results.to_csv('results.csv')


def _result_row(repetition, method, window, target, other, params, response):
  """Build one row in the order of `columns`."""
  # ['ID','Method','Time_Window','Driver','Driver_B','Features','Method_Parameters',
  #  'Accuracy_Train','Accuracy_Test','Accuracy_Anomaly','Inference_Time']
  return [repetition, method, window, target, other, str(features), str(params),
          response['acc_train'], response['acc_test'], response['acc_an'], response['inf_time']]


for m in methods:
  for tw in time_window:
    for d in tqdm(driver):
      # Every other driver plays the role of the "anomalous" driver once.
      other_drivers = [b for b in driver if b != d]

      if m == 'IF':
        for ms in max_samples:
          for ne in n_estimators:
            for c in contamination:
              params = {'max_samples':ms,'n_estimators':ne,'contamination':c}
              for j in range(nexp):

                # Get data
                data = split_data(d,tw)

                # Model training. The fitted model stays bound to the
                # module-level name `iso_forest`, which other cells may
                # look up by that name.
                iso_forest = IFTraining(data['x_train'],ms,ne,c)

                # Evaluate against every other driver
                rows = []
                for driver_b in other_drivers:
                  response = executeIF(iso_forest,data['x_train'],data['x_val'],driver_b,tw)
                  rows.append(_result_row(j, m, tw, d, driver_b, params, response))

                # Checkpoint once per fitted model so a long run can be inspected or resumed.
                _store_rows(rows)

      if m == 'OCSVM':
        for k in kernels:
          for n in nu:
            params = {'kernels':k, 'nu':n}
            for j in range(nexp):
              # Get data
              data = split_data(d,tw)

              # Model training
              onclass = OCSVMTraining(data['x_train'],k,n)

              # Evaluate against every other driver
              rows = []
              for driver_b in other_drivers:
                response = executeOCSVM(onclass,data['x_train'],data['x_val'], driver_b, tw)
                rows.append(_result_row(j, m, tw, d, driver_b, params, response))

              # Checkpoint once per fitted model so a long run can be inspected or resumed.
              _store_rows(rows)


                  
