# Testing on CICIoT2023

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
#warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
import shap
from joblib import load, dump
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# The dataset folder is resolved relative to the current working directory.
# The trailing slash is kept because file paths are later built by plain
# string concatenation (DATASET_DIRECTORY + file name).
BASE = os.getcwd()
DATASET_DIRECTORY = os.path.join(BASE, './CICIoT2023/')

## Data

### Importing Dataset

In [None]:
# All CSV files of the dataset folder in sorted file-name order; the first
# 80% of the files are used for training and the remaining ones for testing.
df_sets = sorted(k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv'))
training_sets, test_sets = (
    df_sets[:int(len(df_sets)*.8)],
    df_sets[int(len(df_sets)*.8):],
)

In [None]:
# Names of the feature columns read from every CSV file (used as model input).
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
    'Rate', 'Srate', 'Drate',
    # flag indicators
    'fin_flag_number', 'syn_flag_number', 'rst_flag_number',
    'psh_flag_number', 'ack_flag_number', 'ece_flag_number',
    'cwr_flag_number',
    # flag counters
    'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count',
    # protocol indicators
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
    'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
    # aggregated statistics
    'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
    'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight',
]
# Name of the column holding the class label.
y_column = 'label'

### Scaling

In [None]:
# Shared feature scaler: it is fitted on the training files below and then
# applied (scaler.transform) to every training and test file before use.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler incrementally, one CSV file at a time.
# partial_fit accumulates the running mean/variance over all files, whereas
# fit() restarts from scratch on each call and would leave the scaler holding
# only the statistics of the last training file.
for train_set in tqdm(training_sets):
    scaler.partial_fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

### Classification: 2 (1+1)

In [None]:
# Binary relabelling table: every dataset label is mapped to 'Attack',
# except 'BenignTraffic', which is mapped to 'Benign'.
dict_2classes = {
    **dict.fromkeys(
        (
            # DDoS
            'DDoS-RSTFINFlood', 'DDoS-PSHACK_Flood', 'DDoS-SYN_Flood',
            'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-ICMP_Flood',
            'DDoS-SynonymousIP_Flood', 'DDoS-ACK_Fragmentation',
            'DDoS-UDP_Fragmentation', 'DDoS-ICMP_Fragmentation',
            'DDoS-SlowLoris', 'DDoS-HTTP_Flood',
            # DoS
            'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-TCP_Flood',
            'DoS-HTTP_Flood',
            # Mirai
            'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain',
            # Recon
            'Recon-PingSweep', 'Recon-OSScan', 'Recon-PortScan',
            'VulnerabilityScan', 'Recon-HostDiscovery',
            # Spoofing
            'DNS_Spoofing', 'MITM-ArpSpoofing',
        ),
        'Attack',
    ),
    'BenignTraffic': 'Benign',
    **dict.fromkeys(
        (
            # Web-based
            'BrowserHijacking', 'Backdoor_Malware', 'XSS',
            'Uploading_Attack', 'SqlInjection', 'CommandInjection',
            # Brute force
            'DictionaryBruteForce',
        ),
        'Attack',
    ),
}

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Models to train and their display names (parallel lists, same order).
ML_models = [LogisticRegression(n_jobs=-1)]
ML_neams = ["LogisticRegression"]

# NOTE(review): fit() is called once per CSV file. Without warm_start or an
# incremental estimator each call presumably retrains from scratch, so the
# final model would reflect only the last training file -- confirm intent.
for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])

    # Collapse the original labels to a binary target: 1 = 'Attack', 0 = otherwise.
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = [1 if label == 'Attack' else 0 for label in new_y]

    for model in ML_models:
        model.fit(d[X_columns], d[y_column])
    del d

In [None]:
# Ground-truth labels and per-model predictions accumulated over all test files.
y_test = []
preds = {idx: [] for idx in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    # Same binary relabelling as in training: 1 = 'Attack', 0 = otherwise.
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = [1 if label == 'Attack' else 0 for label in new_y]
    y_test.extend(d_test[y_column].values)

    for i, model in enumerate(ML_models):
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i].extend(y_pred)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Print accuracy / recall / precision / F1 (macro-averaged) for every model.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round swaps the reported precision and recall.
for k, v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (2 classes) #####")
    print('accuracy_score: ', accuracy_score(y_test, y_pred))
    print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()

In [None]:
 # Persist the model currently bound to `model` (set by the loops above)
 # next to the notebook.
dump(
    model,
    os.path.join(BASE, "LogisticRegression_CiCioT2023.joblib"),
)

## Important Functions

In [None]:
def features_importance(model_file, n):
    '''
    Rank the features of a saved model by importance and return the best n.

    The loaded model must expose ``feature_importances_`` (e.g. a forest model).

    :param model_file: name of the saved model (joblib file)
    :param n: number of top features to return
    :return: DataFrame with columns 'Feature' and 'Importance', highest first
    '''
    fitted_model = load(model_file)
    ranking = pd.DataFrame({
        'Feature': X_columns,
        'Importance': fitted_model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    return ranking.head(n)

In [None]:
def classify_api(target, features, threshold=0.5, type='hard'):
    '''
    Load a previously saved model from the current working directory and use
    it to classify one or more samples (one sample per row).

    : target: file name of the saved model; its ``predict`` method is used
    : features: feature vector(s) to classify in pd Dataset format
    : threshold: predictions <= threshold are class 0 (benign), others class 1 (malicious)
    : type: soft -> Series of (prediction, class) tuples; hard -> Series with only the class
    : return: the Series described above, or -1 when the model file is missing
              or ``type`` is neither 'soft' nor 'hard'
    '''
    model_path = os.path.join(os.getcwd(), target)
    if not os.path.exists(model_path):
        print('Model file not found!')
        return -1

    prediction = load(model_path).predict(features)

    # 0 = benign, 1 = malicious
    classes = [0 if score <= threshold else 1 for score in prediction]

    if type == 'soft':
        return pd.Series(list(zip(prediction, classes)), dtype=object)
    if type == 'hard':
        return pd.Series(classes, dtype=np.int64)
    return -1  # unknown output type

In [None]:
def shap_values_importance(shap_values_1, shap_values_2, k=15):
    '''
    Average Explainability-Based Feature Agreement (EBFA) of two SHAP matrices.

    For every instance present in both inputs, the k features with the largest
    absolute SHAP value are selected in each matrix; the instance score is the
    size of the overlap divided by k. The mean score is printed and returned.

    :param shap_values_1: SHAP values of the first model, one row per instance
    :param shap_values_2: SHAP values of the second model, one row per instance
    :param k: number of top features compared per instance
    :return: mean EBFA over the compared instances
    '''
    def _top_k(instance_values):
        # indices of the k largest |SHAP| values
        return set(np.argsort(-np.abs(instance_values))[:k])

    # Only the instances available in both inputs are compared.
    n_instances = min(len(shap_values_1), len(shap_values_2))
    ebfa_scores = [
        len(_top_k(shap_values_1[i]) & _top_k(shap_values_2[i])) / k
        for i in range(n_instances)
    ]

    average_ebfa = np.mean(ebfa_scores)
    print('Average EBFA:', average_ebfa)

    return average_ebfa

In [None]:
def linear_interpolation(data, num_samples, lambda_):
    '''
    Create synthetic rows as convex combinations of two randomly chosen rows.

    :param data: DataFrame whose rows are combined (needs at least two rows)
    :param num_samples: number of combinations drawn (duplicates are removed afterwards)
    :param lambda_: weight of the first row; when falsy (None, 0) a weight is
                    drawn uniformly from [0, 1) for every combination
    :return: numpy array holding the distinct synthetic rows
    '''
    n_rows = len(data)
    generated = []

    for _ in range(num_samples):
        # two different row positions
        first, second = np.random.choice(n_rows, 2, replace=False)
        row_a = data.iloc[first]
        row_b = data.iloc[second]

        weight = lambda_ if lambda_ else np.random.uniform(0, 1)
        generated.append(tuple(weight * row_a + (1 - weight) * row_b))

    # set() drops repeated rows
    return np.array(list(set(generated)))

In [None]:
def gaussian_noise(data, noise_level=0.01):
    '''
    Return a copy of ``data`` perturbed with zero-mean Gaussian noise.

    :param data: array-like / DataFrame of samples (one row per sample)
    :param noise_level: noise standard deviation, expressed as a fraction of
                        each column's standard deviation
    :return: ``data`` plus the sampled noise (same shape as ``data``)
    '''
    per_column_scale = noise_level * np.std(data, axis=0)
    return data + np.random.normal(loc=0, scale=per_column_scale, size=data.shape)

In [None]:
# Creating clusters to use as background data in our attack
from sklearn.cluster import KMeans

def process_and_scale_data(X, y, n_clusters, n_init=10):
    '''
    Generate normalized centroids, separately for class 0 and class 1.

    :param X: DataFrame of features
    :param y: labels (0 / 1) assigned to X as a temporary 'label' column
    :param n_clusters: number of KMeans clusters fitted per class
    :param n_init: KMeans ``n_init`` value
    :return: (centroids of class 0, centroids of class 1), each a DataFrame
             min-max scaled on its own
    '''
    labelled = X.copy()
    labelled['label'] = y

    def _scaled_centroids(class_value):
        # rows of the requested class, without the helper column
        members = labelled[labelled['label'] == class_value].drop('label', axis=1)
        # cluster the class and keep only the centroids
        clustering = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=25).fit(members)
        centroids = pd.DataFrame(clustering.cluster_centers_, columns=members.columns)
        # normalise the centroids of this class
        return pd.DataFrame(MinMaxScaler().fit_transform(centroids), columns=centroids.columns)

    return _scaled_centroids(0), _scaled_centroids(1)

In [None]:
def substitute_model_our_attack(target, X, y, augment_count=2, string='aa', lambda_=0.7, ebfa_limit=0.6, mode='li', surrogate_algo='RandomForestClassifier', threshold=0.5):
    '''
    Train a surrogate (substitute) model that imitates a saved target model.

    Each iteration draws a balanced batch of real samples, labels it by
    querying the target through classify_api, augments it with synthetic
    samples, fits the surrogate and compares the SHAP explanations of target
    and surrogate (EBFA). Training stops as soon as EBFA >= ebfa_limit; the
    batch grows by 200 samples per class otherwise.

    :Param target: file name of the saved target model (joblib)
    :Param X: Attack data (DataFrame of features)
    :Param y: Attack label-data (Series); rows labelled -1 are dropped
    :Param augment_count: in 'li' mode, synthetic samples requested per class = batch size * augment_count
    :Param string: prefix of the file name under which the surrogate is saved
    :Param lambda_: interpolation factor ('li') or noise level ('gn')
    :Param ebfa_limit: EBFA value at which the surrogate is accepted and saved
    :Param mode: 'li' (linear interpolation) or 'gn' (gaussian noise)
    :Param surrogate_algo: name of the surrogate algorithm (see surrogate_factories)
    :Param threshold: labels/predictions <= threshold are class 0, others class 1
    :Return: (number of queries, EBFA) on convergence; -1 on invalid options
             or when convergence is not achieved
    '''

    # config
    max_iter = 20  # max iterations to avoid an eternal loop
    num_samples = 200  # starting number of samples per class
    file_name = string+'ebfa_limit_'+str(ebfa_limit)+'_surrogate_model_our_attack_'+surrogate_algo+'_'+target+'_'+mode
    substitute_model_path = os.path.join(os.getcwd(), file_name)
    intermediate_path = os.path.join(os.getcwd(), "surrogate_model_intermediate_our_attack.joblib")

    surrogate_factories = {
        'RandomForestClassifier': lambda: RandomForestClassifier(random_state=38),
        'DecisionTreeClassifier': lambda: DecisionTreeClassifier(criterion='log_loss'),
        'KNNClassifier': lambda: KNeighborsClassifier(n_neighbors=5),
        'QDAClassifier': lambda: QuadraticDiscriminantAnalysis(),
        'ADABoostClassifier': lambda: AdaBoostClassifier(random_state=38),
        'LogisticRegression': lambda: LogisticRegression(random_state=38),
        'MLPClassifier': lambda: MLPClassifier(hidden_layer_sizes=(154, 154), activation='relu', solver='sgd', alpha=1e-4, learning_rate_init=0.001, max_iter=100, random_state=38, verbose=True),
    }

    # Validate the options before doing any expensive work.
    if surrogate_algo not in surrogate_factories:
        print(f"Option {surrogate_algo} isn't within algorithms options.")
        return -1
    if mode not in ('li', 'gn'):
        print("Incorrect option.")
        return -1

    target_model = load(target)
    surrogate_model = surrogate_factories[surrogate_algo]()

    # Drop unlabeled rows and rebuild a 0..n-1 index so that positional
    # (iloc / np.where) and label-based (drop) indexing refer to the same rows.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    label_index = (y != -1)
    X_train = X[label_index].reset_index(drop=True)
    y_train = y[label_index].reset_index(drop=True)

    # creating subsamples (index)
    index_0 = (y_train <= threshold)
    index_1 = (y_train > threshold)

    # test samples to be used in all of the validations, including to calculate the prediction value of the target model
    test_index_0 = np.random.choice(np.where(index_0)[0], 500, replace=False)
    test_index_1 = np.random.choice(np.where(index_1)[0], 500, replace=False)
    test_sample = np.concatenate((test_index_0, test_index_1))
    np.random.shuffle(test_sample)
    X_test = X_train.iloc[test_sample]
    y_test = y_train.iloc[test_sample]

    # Removing test data from original dataset, avoiding data leak
    X_train = X_train.drop(test_sample, axis=0).reset_index(drop=True)
    y_train = y_train.drop(test_sample, axis=0).reset_index(drop=True)

    # re-calculating the index without the test samples
    index_0 = (y_train <= threshold)
    index_1 = (y_train > threshold)
    available_0 = int(index_0.sum())  # class-0 rows that can be sampled
    available_1 = int(index_1.sum())  # class-1 rows that can be sampled

    # Shap config: small stratified subsets keep the SHAP computation tractable
    X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, train_size=0.01, stratify=y_train)
    _, X_test_sample, _, _ = train_test_split(X_test, y_test, test_size=0.1, stratify=y_test)
    X_test_sample = pd.DataFrame(X_test_sample, columns=X.columns)

    # calculating shap-sampling to the target model; the background data is
    # the union of the class-0 and class-1 centroids
    print('Calculating Shap values of the target model')
    background_target = pd.concat(list(process_and_scale_data(X_train, y_train, n_clusters=10, n_init=10)), ignore_index=True)
    explainer_1 = shap.SamplingExplainer(target_model.predict, background_target)
    shap_values_1 = explainer_1.shap_values(X_test_sample)

    # Background data for the surrogate explainer does not change between
    # iterations, so it is computed once.
    background_surrogate = pd.concat(list(process_and_scale_data(X_train_sample, y_train_sample, n_clusters=10, n_init=10)), ignore_index=True)

    for iter_count in tqdm(range(max_iter), desc='Substitute model training'):
        print(f'Iteration {iter_count + 1}: Number of samples = {num_samples * 2}')
        balanced_sample_0 = np.random.choice(np.where(index_0)[0], num_samples, replace=False)
        balanced_sample_1 = np.random.choice(np.where(index_1)[0], num_samples, replace=False)
        balanced_sample = np.concatenate((balanced_sample_0, balanced_sample_1))
        np.random.shuffle(balanced_sample)

        # creating REAL-DATA subsampling, labelled by querying the target model
        X_subsample_real = X_train.iloc[balanced_sample]
        y_subsample_real = classify_api(target, X_subsample_real, threshold=threshold, type='hard')

        # creating synthetic-data subsampling
        if mode == 'li':  # linear interpolation
            n_synthetic = int(round(num_samples * augment_count))
            synthetic_data_0 = linear_interpolation(X_train.iloc[balanced_sample_0], n_synthetic, lambda_)
            synthetic_data_1 = linear_interpolation(X_train.iloc[balanced_sample_1], n_synthetic, lambda_)
            print(f'Creating {int(len(synthetic_data_0)) + int(len(synthetic_data_1))} synthetic samples using Linear Interpolation')
        else:  # 'gn': gaussian noise
            synthetic_data_0 = gaussian_noise(X_train.iloc[balanced_sample_0], noise_level=lambda_)
            synthetic_data_1 = gaussian_noise(X_train.iloc[balanced_sample_1], noise_level=lambda_)
            print(f'Creating {int(len(synthetic_data_0)) + int(len(synthetic_data_1))} syntetic samples using Gaussian noise')

        # Concatenating real samples with synthetic samples
        X_subsample = np.concatenate((X_subsample_real, synthetic_data_0, synthetic_data_1))
        X_subsample_df = pd.DataFrame(X_subsample, columns=X.columns)  # changing to pd dataframe

        # Synthetic samples inherit the class of the rows they were built from.
        synthetic_y_0 = np.zeros(len(synthetic_data_0), dtype=int)
        synthetic_y_1 = np.ones(len(synthetic_data_1), dtype=int)
        y_subsample = np.concatenate((y_subsample_real, synthetic_y_0, synthetic_y_1))
        y_subsample_df = pd.Series(y_subsample)

        print(f'Total of {len(X_subsample_df)} samples')
        surrogate_model.fit(X_subsample_df, y_subsample_df)
        dump(surrogate_model, intermediate_path)
        print('Calculating EBFA...')
        # Explain the SURROGATE here; explaining the target again would
        # compare the target with itself.
        explainer_2 = shap.SamplingExplainer(surrogate_model.predict, background_surrogate)
        shap_values_2 = explainer_2.shap_values(X_test_sample)
        ebfa_models = shap_values_importance(shap_values_1, shap_values_2, k=15)

        print("=======================================================================================================")
        print(f'Queries: {(num_samples*2):.4f}')
        print(f'Explainability-Based Feature Agreement: {ebfa_models:.4f}')
        print("=======================================================================================================")

        if os.path.exists(intermediate_path):
            os.remove(intermediate_path)

        if abs(ebfa_models) >= ebfa_limit:
            print("Saving substitute model")
            dump(surrogate_model, substitute_model_path)

            print(f'Convergence achieved after {iter_count +1}. Saving model at: {substitute_model_path}')
            return (num_samples * 2, ebfa_models)

        # if continuing, the number of samples is enlarged
        num_samples += 200
        # Stop when a class no longer has enough rows to draw without replacement.
        if num_samples > available_0 or num_samples > available_1:
            print(f"Process finished after {iter_count +1}. The convergence wasn't achieved.")
            return -1

    # max_iter reached without convergence
    print(f"Process finished after {max_iter}. The convergence wasn't achieved.")
    return -1

## Attacking Results

In [None]:
# Using a normalization different from the one used on the original training data
Ascaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    Ascaler.fit_transform(d_test[X_columns]),
    columns=X_columns,
)

In [None]:
# Target: logistic regression. Surrogate: random forest, linear interpolation, EBFA limit 0.5.
substitute_model_our_attack(
    "LogisticRegression_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.02,
    ebfa_limit=0.5,
    mode='li',
    surrogate_algo='RandomForestClassifier',
)

In [None]:
# Target: logistic regression. Surrogate: random forest, linear interpolation, EBFA limit 0.8.
substitute_model_our_attack(
    "LogisticRegression_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.02,
    ebfa_limit=0.8,
    mode='li',
    surrogate_algo='RandomForestClassifier',
)

In [None]:
# Target: logistic regression. Surrogate: decision tree, gaussian noise, EBFA limit 0.8.
substitute_model_our_attack(
    "LogisticRegression_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.6,
    ebfa_limit=0.8,
    mode='gn',
    surrogate_algo='DecisionTreeClassifier',
)
                        

In [None]:
# Target: logistic regression. Surrogate: decision tree, linear interpolation, EBFA limit 0.8.
substitute_model_our_attack(
    "LogisticRegression_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.6,
    ebfa_limit=0.8,
    mode='li',
    surrogate_algo='DecisionTreeClassifier',
)

## Training a Second model to compare the features - RandomForestClassifier as Surrogate Model


In [None]:
# Models to train and their display names (parallel lists, same order).
ML_models = [RandomForestClassifier(random_state=45)]
ML_neams = ["RandomForestClassifier"]

# NOTE(review): fit() is called once per CSV file. Without warm_start each
# call presumably rebuilds the forest from scratch, so the final model would
# reflect only the last training file -- confirm intent.
for train_set in tqdm(training_sets):
    d = pd.read_csv(DATASET_DIRECTORY + train_set)
    d[X_columns] = scaler.transform(d[X_columns])

    # Collapse the original labels to a binary target: 1 = 'Attack', 0 = otherwise.
    new_y = [dict_2classes[k] for k in d[y_column]]
    d[y_column] = [1 if label == 'Attack' else 0 for label in new_y]

    for model in ML_models:
        model.fit(d[X_columns], d[y_column])
    del d

In [None]:
# Ground-truth labels and per-model predictions accumulated over all test files.
y_test = []
preds = {idx: [] for idx in range(len(ML_models))}
for test_set in tqdm(test_sets):
    d_test = pd.read_csv(DATASET_DIRECTORY + test_set)
    d_test[X_columns] = scaler.transform(d_test[X_columns])

    # Same binary relabelling as in training: 1 = 'Attack', 0 = otherwise.
    new_y = [dict_2classes[k] for k in d_test[y_column]]
    d_test[y_column] = [1 if label == 'Attack' else 0 for label in new_y]
    y_test.extend(d_test[y_column].values)

    for i, model in enumerate(ML_models):
        y_pred = list(model.predict(d_test[X_columns]))
        preds[i].extend(y_pred)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Print accuracy / recall / precision / F1 (macro-averaged) for every model.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them the
# other way round swaps the reported precision and recall.
for k, v in preds.items():
    y_pred = v
    print(f"##### {ML_neams[k]} (2 classes) #####")
    print('accuracy_score: ', accuracy_score(y_test, y_pred))
    print('recall_score: ', recall_score(y_test, y_pred, average='macro'))
    print('precision_score: ', precision_score(y_test, y_pred, average='macro'))
    print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
    print()
    print()
    print()

In [None]:
 # Persist the model currently bound to `model` (set by the loops above)
 # next to the notebook.
dump(
    model,
    os.path.join(BASE, "RandomForestClassify_CiCioT2023.joblib"),
)

In [None]:
# Target: random forest. Surrogate: random forest, linear interpolation, EBFA limit 0.4.
substitute_model_our_attack(
    "RandomForestClassify_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.02,
    ebfa_limit=0.4,
    mode='li',
    surrogate_algo='RandomForestClassifier',
)

In [None]:
# Target: random forest. Surrogate: decision tree, linear interpolation, EBFA limit 0.6.
substitute_model_our_attack(
    "RandomForestClassify_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.2,
    ebfa_limit=0.6,
    mode='li',
    surrogate_algo='DecisionTreeClassifier',
)

In [None]:
# Target: random forest. Surrogate: decision tree, linear interpolation, EBFA limit 0.7.
substitute_model_our_attack(
    "RandomForestClassify_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.2,
    ebfa_limit=0.7,
    mode='li',
    surrogate_algo='DecisionTreeClassifier',
)

###### substitute_model_our_attack_v2("RandomForestClassify_CiCioT2023.joblib", X_scaled, d_test[y_column], lambda_=0.02, ebfa_limit=0.6, mode='li', surrogate_algo='RandomForestClassifier')

In [None]:
# Target: random forest. Surrogate: random forest, linear interpolation (lambda 0.5), EBFA limit 0.5.
substitute_model_our_attack(
    "RandomForestClassify_CiCioT2023.joblib",
    X_scaled,
    d_test[y_column],
    lambda_=0.5,
    ebfa_limit=0.5,
    mode='li',
    surrogate_algo='RandomForestClassifier',
)

In [None]:
# Top 15 features of the saved random-forest target model.
features_importance(
    "RandomForestClassify_CiCioT2023.joblib",
    15,
)

In [None]:
# Top 15 features of the saved surrogate (random forest, EBFA limit 0.5, 'li' mode).
features_importance(
    "aaebfa_limit_0.5_surrogate_model_our_attack_RandomForestClassifier_RandomForestClassify_CiCioT2023.joblib_li",
    15,
)

In [None]:
# Top 15 features of the saved random-forest target model (shown again for comparison).
features_importance(
    "RandomForestClassify_CiCioT2023.joblib",
    15,
)

In [None]:
# Top 25 features of the saved surrogate (random forest, EBFA limit 0.6, 'li' mode).
features_importance(
    "aaebfa_limit_0.6_surrogate_model_our_attack_RandomForestClassifier_RandomForestClassify_CiCioT2023.joblib_li",
    25,
)

In [None]:
# Using the same kind of scaler (StandardScaler) as the trained model, but
# fitted on the attack data itself rather than reusing the training scaler.
Sscaler = StandardScaler()
X_scaled_S = Sscaler.fit_transform(d_test[X_columns])
# Pass the flat list of names: wrapping it in another list ([X_columns]) makes
# pandas build MultiIndex columns, which breaks column-name based access.
X_scaled_S = pd.DataFrame(X_scaled_S, columns=X_columns)

In [None]:
# Target: random forest, standard-scaled attack data.
# Surrogate: random forest, gaussian noise, EBFA limit 0.5.
substitute_model_our_attack(
    "RandomForestClassify_CiCioT2023.joblib",
    X_scaled_S,
    d_test[y_column],
    lambda_=0.02,
    ebfa_limit=0.5,
    mode='gn',
    surrogate_algo='RandomForestClassifier',
)

In [None]:
# Target: random forest, standard-scaled attack data.
# Surrogate: random forest, linear interpolation, EBFA limit 0.5.
substitute_model_our_attack(
    "RandomForestClassify_CiCioT2023.joblib",
    X_scaled_S,
    d_test[y_column],
    lambda_=0.02,
    ebfa_limit=0.5,
    mode='li',
    surrogate_algo='RandomForestClassifier',
)