This is the code adapted from the paper entitled "LCCDE: A Decision-Based Ensemble Framework for Intrusion Detection in The Internet of Vehicles" accepted in 2022 IEEE Global Communications Conference (GLOBECOM).
Authors: Li Yang (lyang339@uwo.ca), Abdallah Shami (Abdallah.Shami@uwo.ca), Gary Stevens, and Stephen de Rusett
Organization: The Optimized Computing and Communications (OC2) Lab, ECE Department, Western University, Ontario, Canada; S2E Technologies, St. Jacobs, Ontario, Canada

L. Yang, A. Shami, G. Stevens, and S. DeRusett, "LCCDE: A Decision-Based Ensemble Framework for Intrusion Detection in The Internet of Vehicles," in 2022 IEEE Global Communications Conference (GLOBECOM), 2022, pp. 1-6.

In [None]:
!pip install --upgrade pandas
!pip install catboost
!pip install river

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb
import time
from river import stream
from statistics import mode
from sklearn import preprocessing

In [None]:
# be careful of ignoring warnings

# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# mount Google Drive to save models later
import gc
import pickle
from google.colab import drive

drive.mount("/content/drive")

In [None]:
def return_next_int():
  """Generator that yields 0, 1, 2, ... without end (one value per next() call)."""
  value = -1
  while True:
    value += 1
    yield value

g = return_next_int()  # counter that hands a distinct int to each mode constant below


# settings for dataset mode
# (each constant takes the next value from g, so the order of these assignments fixes the values)
MODE_CAR_HACKING = next(g)  # use the Car-Hacking dataset
MODE_ROAD = next(g)  # use the ROAD dataset
MODE_TRAIN_CAR_HACKING_TEST_ROAD = next(g)  # train on the Car-Hacking dataset and test on the ROAD dataset
MODE_CAR_HACKING_BINARY = next(g)  # change Car-Hacking labels to two classes (to compare with MODE_TRAIN_CAR_HACKING_TEST_ROAD)

In [None]:
# set mode
dataset_mode = MODE_CAR_HACKING

def dataset_mode_error(invalid_dataset_mode: int):
  """Report an unsupported dataset mode.

  :param invalid_dataset_mode: the mode value that no branch handles
  :raises ValueError: always, with the offending value in the message
  """
  message = "Unsupported dataset mode: {m}".format(m=invalid_dataset_mode)
  raise ValueError(message)

In [None]:
# paths for datasets
PATH_CAR_HACKING = "/content/drive/MyDrive/car_hacking_with_header.csv"
PATH_ROAD = "/content/drive/MyDrive/road.csv"

In [None]:
class LCCDE_Model:
  """Plain record for one base learner of the LCCDE: the learner itself, a
  display name, its evaluation metrics, and its prediction for the data point
  currently being classified.
  """
  def __init__(self, model, name):
    # model: base learner object that should offer predict() and predict_proba() (e.g. XGBClassifier())
    # name: string name used when printing results
    self.model, self.name = model, name
    # evaluation metrics, empty until store_eval_metrics() is called
    self.accuracy = self.precision = self.recall = None
    self.f1_avg = None  # weighted average of the F1 scores
    self.f1 = None  # F1 score of each individual class
    # prediction state for a single data point
    self.predicted_class = None  # class predicted by this learner
    self.highest_predicted_prob = None  # highest class probability (confidence) reported by this learner

  def __repr__(self):
    return "LCCDE_Model({m}, {n})".format(n=self.name, m=self.model)

  def store_eval_metrics(self, y_test, y_pred):
    """Compute the evaluation metrics for the given labels and keep them on this object.

    :param y_test: ground truth labels
    :param y_pred: predicted labels, aligned with y_test
    """
    self.accuracy = accuracy_score(y_test, y_pred)
    # these three are weighted averages over the classes
    weighted_metrics = (
      ("precision", precision_score),
      ("recall", recall_score),
      ("f1_avg", f1_score),
    )
    for attribute, scorer in weighted_metrics:
      setattr(self, attribute, scorer(y_test, y_pred, average='weighted'))
    # average=None keeps one F1 score per class
    self.f1 = f1_score(y_test, y_pred, average=None)

In [None]:
# select base learners to use
USE_LG = True  # LightGBM
USE_XG = True  # XGBoost
USE_ADA = True  # AdaBoost
USE_CB = True  # CatBoost

In [None]:
# create the models that we're using and wrap them in LCCDE_Model class
# (the names lg / xg / ada / cb are only bound when the matching USE_* flag is True,
# so every later use of them has to be guarded by the same flag)
ensemble_models = []

if USE_LG:
  import lightgbm as lgb
  lg = LCCDE_Model(lgb.LGBMClassifier(), "LightGBM")  # constructed with no arguments
  ensemble_models.append(lg)
if USE_XG:
  import xgboost as xgb
  xg = LCCDE_Model(xgb.XGBClassifier(), "XGBoost")  # constructed with no arguments
  ensemble_models.append(xg)
if USE_ADA:
  from sklearn.ensemble import AdaBoostClassifier
  ada = LCCDE_Model(AdaBoostClassifier(), "AdaBoost")  # constructed with no arguments
  ensemble_models.append(ada)
if USE_CB:
  import catboost as cbt
  # the only learner constructed with explicit options (verbose output, 'Plain' boosting type)
  cb = LCCDE_Model(cbt.CatBoostClassifier(verbose=True,boosting_type='Plain'), "CatBoost")
  ensemble_models.append(cb)

In [None]:
# read files into dataframes

dtype_dict = {"Time": float, "Id": str,
              "Byte1": str, "Byte2": str, "Byte3": str, "Byte4": str,
              "Byte5": str, "Byte6": str, "Byte7": str, "Byte8": str,
              "Label": str}  # maps column names to their types


def _read_dataset_csv(path: str) -> pd.DataFrame:
  """Read one dataset CSV using the column types in dtype_dict and on_bad_lines="warn"."""
  return pd.read_csv(path, dtype=dtype_dict, on_bad_lines="warn")


def read_car_hacking() -> pd.DataFrame:
  """Read the Car-Hacking dataset from PATH_CAR_HACKING."""
  return _read_dataset_csv(PATH_CAR_HACKING)


def read_road() -> pd.DataFrame:
  """Read the ROAD dataset from PATH_ROAD."""
  return _read_dataset_csv(PATH_ROAD)


# load the dataframe(s) needed by the selected mode; df is always defined afterwards
if dataset_mode == MODE_CAR_HACKING or dataset_mode == MODE_CAR_HACKING_BINARY:
  df = read_car_hacking()

elif dataset_mode == MODE_ROAD:
  df = read_road()

elif dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD:
  # df_test is only defined in this mode: df is the training data, df_test the testing data
  df = read_car_hacking()
  df_test = read_road()

else:
  dataset_mode_error(dataset_mode)

In [None]:
# for transferring between datasets, turn it into a binary classification problem

def make_non_benign_malicious(data_frame):
  """Reduce the 'Label' column to two classes, in place.

  Every row whose Label differs from 'Benign' gets the Label 'Malicious';
  the data frame is mutated and nothing is returned.
  """
  is_not_benign = data_frame["Label"].ne("Benign")
  data_frame.loc[is_not_benign, ["Label"]] = "Malicious"


# only the two binary modes relabel; the other modes keep their original labels
if dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD or dataset_mode == MODE_CAR_HACKING_BINARY:
  make_non_benign_malicious(df)  # change labels for Car-Hacking dataframe
  print("Changed labels for Car-Hacking dataframe")
  if dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD:
    # the test dataframe must use the same two labels as the training dataframe
    make_non_benign_malicious(df_test)  # change labels for ROAD dataframe
    print("Changed labels for ROAD dataframe")

In [None]:
df

In [None]:
df["Label"].unique()

In [None]:
# replace the string labels by integer codes; label_encoder_name_mapping records
# which original label string was mapped to which integer code
if dataset_mode == MODE_CAR_HACKING or dataset_mode == MODE_ROAD or dataset_mode == MODE_CAR_HACKING_BINARY:
  # encode labels
  label_encoder = preprocessing.LabelEncoder()
  df['Label'] = label_encoder.fit_transform(df['Label'])
  label_encoder_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

elif dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD:
  # combine dataframes because they have the same columns
  # (one encoder is fitted on the combined labels, so train and test share a single encoding)
  df_len = len(df)  # number of training rows, used below to split the combined frame again
  temp = pd.concat([df, df_test], ignore_index=True)
  # encode labels
  label_encoder = preprocessing.LabelEncoder()
  temp['Label'] = label_encoder.fit_transform(temp['Label'])
  label_encoder_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
  # separate back into train and test dataframes
  df = temp.iloc[:df_len].reset_index(drop=True)
  df_test = temp.iloc[df_len:].reset_index(drop=True)

else:
  dataset_mode_error(dataset_mode)

In [None]:
df

In [None]:
label_encoder_name_mapping

In [None]:
df.Label.value_counts()

In [None]:
df.dtypes

In [None]:
# drop rows containing NA values from Car-Hacking
# (df holds the Car-Hacking data in all three of these modes; df_test is not touched here)
if dataset_mode == MODE_CAR_HACKING or dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD or dataset_mode == MODE_CAR_HACKING_BINARY:
  # dropna() removes whole rows, so count the rows (axis=1) that contain at least one NA;
  # axis=0 would count the columns that contain an NA instead
  count = df.isna().any(axis=1).sum()
  print("Dropping {c} rows with NA values...".format(c=count))
  df.dropna(inplace=True)
  print("Done")

In [None]:
def get_X_and_y(data_frame: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
  """Prepares X (feature matrix) and y (the corresponding ground truth labels).

  X is a new data frame without the 'Label', 'Time' and 'Id' columns, in which
  every text column has been parsed as hexadecimal integers. y is the 'Label'
  column. The passed data frame itself is not modified.

  :param data_frame: data frame containing at least the 'Label', 'Time' and 'Id' columns
  :return: the tuple (X, y)
  :raises ValueError: if a text column holds a value that is not a hexadecimal number
  """
  X = data_frame.drop(['Label', 'Time', 'Id'], axis=1)
  y = data_frame['Label']
  # convert text columns (hex strings) to ints
  for col in X.columns:
    column = X[col]
    # text may be stored as object dtype or as pandas' dedicated string dtype;
    # checking only for `object` would leave the latter unconverted
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
      X[col] = column.apply(lambda x: int(x, 16))
  return X, y


# build X_train / y_train / X_test / y_test for the selected mode
if dataset_mode == MODE_CAR_HACKING or dataset_mode == MODE_ROAD or dataset_mode == MODE_CAR_HACKING_BINARY:
  # take train-test split of df
  X, y = get_X_and_y(df)
  # due to resource limitations, we've set the test size to 4% (train size will be its complement, 96%)
  # stratify on the labels; random_state is fixed so the split is repeatable
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.04, random_state = 0, stratify=df["Label"]) # shuffle=False

elif dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD:
  # use the entire Car-Hacking dataset for training
  X_train, y_train = get_X_and_y(df)
  # use the ROAD dataset for testing
  # due to resource limitations, we've set the test size to 4% (train size will be its complement, 96%)
  X, y = get_X_and_y(df_test)
  # only the 4% test part of the ROAD split is kept; the 96% train part is discarded (the `_` targets)
  _, X_test, _, y_test = train_test_split(X, y, test_size = 0.04, random_state = 0, stratify=df_test["Label"]) # shuffle=False

else:
  dataset_mode_error(dataset_mode)

In [None]:
X_train

In [None]:
X_test

In [None]:
y_train

In [None]:
y_test

In [None]:
X_test.dtypes

In [None]:
pd.Series(y_train).value_counts()

In [None]:
pd.Series(y_test).value_counts()

In [None]:
from imblearn.over_sampling import SMOTE


# oversampler for the training set; stays None when the selected mode does not oversample
smote = None

if dataset_mode == MODE_CAR_HACKING:
  # about 14 million Benign and 472k to 628k of other classes, we aren't oversampling here to save time
  pass

elif dataset_mode == MODE_ROAD:
  # NOTE(review): the keys 2, 4 and 3 are presumably encoded labels (see label_encoder_name_mapping)
  # and the values the sample counts to reach for them — confirm against the ROAD label encoding
  # NOTE(review): no random_state is passed here, unlike train_test_split — results may vary between runs
  smote = SMOTE(sampling_strategy={2: 40000, 4: 40000, 3: 10000})

elif dataset_mode == MODE_TRAIN_CAR_HACKING_TEST_ROAD:
  # about 15 million Benign and 2 million Malicious, we aren't oversampling here to save time
  pass

elif dataset_mode == MODE_CAR_HACKING_BINARY:
  # same as MODE_CAR_HACKING
  pass

else:
  dataset_mode_error(dataset_mode)

In [None]:
if smote is not None:
  X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
pd.Series(y_train).value_counts()

In [None]:
pd.Series(y_test).value_counts()

In [None]:
def evaluate_model(model, y_test=None, y_pred=None):
  """Print the evaluation metrics stored on a model.

  When both y_test and y_pred are given, a classification report is printed
  first and a confusion matrix heatmap is shown at the end.

  :param model: object with the attributes name, accuracy, precision, recall, f1_avg and f1
  :param y_test: ground truth labels (optional)
  :param y_pred: predicted labels (optional)
  """
  have_predictions = not (y_test is None or y_pred is None)

  if have_predictions:
    print(classification_report(y_test, y_pred))

  # (message template, attribute holding the value), in print order
  metric_lines = (
    ("Accuracy of {m}: {e}", "accuracy"),
    ("Precision of {m}: {e}", "precision"),
    ("Recall of {m}: {e}", "recall"),
    ("Average F1 of {m}: {e}", "f1_avg"),
    ("F1 of {m} for each type of attack: {e}", "f1"),
  )
  for template, attribute in metric_lines:
    print(template.format(m=model.name, e=str(getattr(model, attribute))))

  if not have_predictions:
    return

  # Plot the confusion matrix
  matrix = confusion_matrix(y_test, y_pred)
  _, axes = plt.subplots(figsize=(5,5))
  sns.heatmap(matrix, annot=True, linewidth=0.5, linecolor="red", fmt=".0f", ax=axes)
  plt.xlabel("y_pred")
  plt.ylabel("y_true")
  plt.show()

In [None]:
%%time
if USE_LG:
  # Train the LightGBM algorithm
  lg.model.fit(X_train, y_train)
  y_pred_lg = lg.model.predict(X_test)

  # keep the metrics on the wrapper (the leader selection reads lg.f1 later), then print and plot them
  lg.store_eval_metrics(y_test, y_pred_lg)
  evaluate_model(lg, y_test, y_pred_lg)

In [None]:
%%time
if USE_XG:
  # Train the XGBoost algorithm
  # (fitted and evaluated on the bare NumPy values of the data frames)
  X_train_x = X_train.values
  X_test_x = X_test.values
  xg.model.fit(X_train_x, y_train)
  y_pred_xg = xg.model.predict(X_test_x)

  # keep the metrics on the wrapper (the leader selection reads xg.f1 later), then print and plot them
  xg.store_eval_metrics(y_test, y_pred_xg)
  evaluate_model(xg, y_test, y_pred_xg)

In [None]:
%%time
if USE_ADA:
  # Train the AdaBoost algorithm on the bare NumPy values of the data frame
  ada.model.fit(X_train.values, y_train)
  # predict on NumPy values as well, so the input has the same form the model was fitted on
  # (passing the data frame here would mix an input with feature names into a model fitted without them)
  y_pred_ada = ada.model.predict(X_test.values)

  # keep the metrics on the wrapper (the leader selection reads ada.f1 later), then print and plot them
  ada.store_eval_metrics(y_test, y_pred_ada)
  evaluate_model(ada, y_test, y_pred_ada)

In [None]:
%%time
if USE_CB:
  # Train the CatBoost algorithm
  cb.model.fit(X_train, y_train)
  y_pred_cb = cb.model.predict(X_test)

  # keep the metrics on the wrapper (the leader selection reads cb.f1 later), then print and plot them
  cb.store_eval_metrics(y_test, y_pred_cb)
  evaluate_model(cb, y_test, y_pred_cb)

In [None]:
# per-class F1 scores of every model, in the same order as ensemble_models
f1_scores = [m.f1 for m in ensemble_models]


# maps each class (an int index into the per-class F1 scores) to its leader model:
# the base learner with the highest F1 score for that class
leader_models = dict()

# all f1_scores elements should have the same length because all models were used on the same labels
for i in range(len(f1_scores[0])):  # for each class...
  # best F1 achieved by any model for this class
  best_f1 = max(scores[i] for scores in f1_scores)
  # the first model (in ensemble_models order) reaching that score becomes the leader
  for candidate in ensemble_models:
    if candidate.f1[i] == best_f1:
      leader_models[i] = candidate.model
      break

In [None]:
leader_models

In [None]:
# LCCDE helper functions
from itertools import groupby


def all_equal(iterable):
    """Tell whether every value of an iterable is the same
    (approach from https://stackoverflow.com/a/3844832).

    An empty iterable counts as all-equal.
    """
    runs = groupby(iterable)
    # consume the first run of equal values, if there is one
    next(runs, None)
    # a second run exists exactly when some value differs from the first
    return next(runs, None) is None


def all_unique(lst):
    """Tell whether no value occurs more than once in a list
    (based on https://www.geeksforgeeks.org/python-check-if-list-contains-all-unique-elements/).

    An empty list counts as all-unique.
    """
    # numpy's unique() drops duplicates, so the element count only stays the same
    # when every value occurs exactly once
    return np.unique(lst).size == np.size(lst)

In [None]:
# this cell defines a helper function LCCDE_predict_class(), and the ensemble prediction function LCCDE()
from statistics import mode


def LCCDE_predict_class(xi, models: list[LCCDE_Model], leader_models: dict):
    """Classifies a data record using the LCCDE model.
    Returns the model's predicted class.

    Every element of models must already carry its prediction for xi in
    predicted_class and highest_predicted_prob.

    Decision rule:
    1. all models predict the same class -> that class
    2. all models predict different classes -> the prediction of the single model that is
       the leader of its own predicted class; if there is no such model, or more than one,
       the prediction of the most confident model
    3. otherwise -> the prediction of the leader model of the majority class

    :param xi: features for a data record, in the form expected by predict()
    :param models: list of LCCDE_Model objects
    :param leader_models: a dictionary with Labels as keys, the value of each key should be m.model for any m in models
    :raises KeyError: if a predicted class is looked up that has no entry in leader_models
    """
    predicted_classes = [m.predicted_class for m in models]

    if all_equal(predicted_classes):
        # if all models predict the same class, use that as final predicted class
        return models[0].predicted_class

    if all_unique(predicted_classes):
        # if all models predict a different class, choose final predicted class based on class leaders

        # find models that are the leader for their predicted class
        # (identity check: the question is whether it is the very same learner object)
        matching_models = [m for m in models if leader_models[m.predicted_class] is m.model]
        if len(matching_models) == 1:
            # if only one model is the leader for its predicted class, then use its prediction
            return matching_models[0].predicted_class
        # otherwise, use the prediction of the model with highest confidence
        # (max() returns the first maximal element, so a tie goes to the first such model)
        return max(models, key=lambda m: m.highest_predicted_prob).predicted_class

    # if some models agree and some don't, use the leader of the majority class as the final predicted class
    majority_class = mode(predicted_classes)  # if there's a tie, mode() will pick the first one
    leader = leader_models[majority_class]
    for m in models:
        if m.model is leader:
            # the leader's prediction for xi is already stored, no need to run it again
            return m.predicted_class
    # leader is not one of the given models: ask it directly; ravel() copes with
    # predict() implementations that return a 2-D array for a single record
    return np.ravel(leader.predict(xi))[0]


def LCCDE(X_test, y_test, models: list[LCCDE_Model], leader_models, verbose=False) -> tuple[list, list]:
    """Uses the Leader Class and Confidence Decision Ensemble (LCCDE) to
    classify records in a feature matrix. Casts predicted labels to ints.
    Returns a tuple containing ground truth labels and predicted labels.

    The records are classified one at a time. As a side effect, predicted_class and
    highest_predicted_prob of every element of models are overwritten for each
    record, so afterwards they hold the values for the last record.

    :param X_test: feature matrix for testing (pandas DataFrame)
    :param y_test: ground truth Labels corresponding to X_test
    :param models: a list of the LCCDE_Model objects
    :param leader_models: a dictionary with Labels as keys, the value of each key should be m.model for some m in models
    :param verbose: if True, print a progress message after every 1000 records
    :return: (y_actual, y_predicted), two lists with one entry per record, in iteration order
    """
    y_actual = []  # ground truth label of each record, collected in iteration order
    y_predicted = []  # the values predicted by the ensemble for each xi in X_test

    # predict each label based on the features
    for count, (xi, yi) in enumerate(stream.iter_pandas(X_test, y_test), start=1):
        # turn the record (a dict of feature values) into a single-row 2-D array
        xi = np.array(list(xi.values())).reshape(1, -1)

        # for each model, predict class based on feature values xi
        for m in models:
            # flatten before taking the first element: predict() may return a 1-D or a 2-D
            # array for one record, and int() needs a scalar in both cases
            m.predicted_class = int(np.ravel(m.model.predict(xi))[0])  # predicted class for this data point xi
            predicted_probs = m.model.predict_proba(xi)  # prediction probability confidence list
            m.highest_predicted_prob = np.max(predicted_probs)  # max of prediction probability confidence list

        # use the ensemble to predict the class of xi
        # (flattened for the same reason: the result may come straight from a predict() call)
        final_pred_class = int(np.ravel(LCCDE_predict_class(xi, models, leader_models))[0])

        # save the actual and predicted y-values
        y_actual.append(yi)
        y_predicted.append(final_pred_class)

        if verbose and count % 1000 == 0:
            print("Progress update: LCCDE has predicted {n} values".format(n=count))

    return y_actual, y_predicted

In [None]:
X_test

In [None]:
ensemble_models

In [None]:
%%time
import warnings
# silences every warning from here on (not only the ones raised while LCCDE() runs)
warnings.filterwarnings('ignore')

# run LCCDE() to predict classes for test set
verbose = True  # print a progress message every 1000 predicted records
y_actual, y_predicted = LCCDE(X_test, y_test, ensemble_models, leader_models, verbose=verbose)

In [None]:
# The performance of the proposed LCCDE model
# the wrapper is only used to hold and print the ensemble's metrics, so it has no model (None)
lccde_ensemble = LCCDE_Model(None, "LCCDE")
lccde_ensemble.store_eval_metrics(y_actual, y_predicted)
# no labels are passed, so only the stored metrics are printed (no classification report, no confusion matrix)
evaluate_model(lccde_ensemble)

In [None]:
# comparison of F1 scores
for model in ensemble_models + [lccde_ensemble]:
  print("F1 of {m} for each type of attack: {f}".format(m=model.name, f=model.f1))

In [None]:
import pickle


models_to_save = dict()  # maps LCCDE_Model wrappers to the file name to save their learner in
if USE_LG:
  models_to_save[lg] = "lg.sav"
if USE_XG:
  models_to_save[xg] = "xg.sav"
if USE_ADA:
  models_to_save[ada] = "ada.sav"

# save to the root of your Google Drive (MyDrive)
drive_path_prefix = "/content/drive/MyDrive/"

# save the models (only the wrapped learner is pickled, not the LCCDE_Model wrapper)
for wrapper, file_name in models_to_save.items():
  # the with-block closes the file even if pickle.dump() raises
  with open(drive_path_prefix + file_name, "wb") as pickle_file:
    pickle.dump(wrapper.model, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
if USE_CB:
  # CatBoost is stored through its own save_model() instead of pickle
  # NOTE(review): "CatboostModel" is a relative path without the Google Drive prefix used for the
  # pickled models, so the file goes to the current working directory — confirm this is intended
  cb.model.save_model("CatboostModel")