In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pickle
import os
import joblib
from tensorflow.keras.models import load_model
import tensorflow as tf
import matplotlib.pyplot as plt
tf.random.set_seed(42)

In [None]:
def individual_to_params(individual):
    """Translate a GA individual into ``DecisionTreeClassifier`` keyword arguments.

    ``individual`` must unpack into exactly ten genes, in this order:
    criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
    min_weight_fraction_leaf, max_features, max_leaf_nodes,
    min_impurity_decrease, ccp_alpha. Any other length raises ``ValueError``
    during unpacking.

    Returns a dict mapping each hyper-parameter name to its gene value.
    """
    (
        criterion,
        splitter,
        max_depth,
        min_samples_split,
        min_samples_leaf,
        min_weight_fraction_leaf,
        max_features,
        max_leaf_nodes,
        min_impurity_decrease,
        ccp_alpha,
    ) = individual

    return dict(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        ccp_alpha=ccp_alpha,
    )

In [None]:
def createModel(individual, X_train, y_train):
    """Build a decision tree from a GA individual and fit it.

    The individual is converted to keyword arguments by
    ``individual_to_params``; ``random_state`` is always fixed to 42.

    Returns the fitted ``DecisionTreeClassifier``.
    """
    hyper_params = individual_to_params(individual)
    tree = DecisionTreeClassifier(**hyper_params, random_state=42)
    tree.fit(X_train, y_train)
    return tree

In [None]:
def load_and_preprocess(filepath):
    """Read a CSV and split it into a feature frame and a label series.

    The first CSV column is used as the index. Only the nine feature columns
    listed below plus 'Label' are kept, in that order; a missing column raises
    ``KeyError``. The shape of the reduced frame and a progress message are
    printed.

    Returns ``(X, y, df)``: every column but the last, the last column
    ('Label'), and the reduced frame itself.
    """
    selected_columns = [
        'SrcWin', 'sHops', 'sTtl', 'dTtl', 'SrcBytes',
        'DstBytes', 'Dur', 'TotBytes', 'Rate', 'Label',
    ]
    raw_frame = pd.read_csv(filepath, index_col=[0])
    df = raw_frame[selected_columns]
    print(df.shape)
    print("loading data")
    # 'Label' is last in selected_columns, so positional slicing separates it.
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    return features, target, df


In [None]:
# Paths to the ISCX training and testing CSVs, relative to the notebook.
data_path='../data/'
train_file = os.path.join(data_path, 'ISCX_training.csv')
test_file = os.path.join(data_path, 'ISCX_Testing.csv')
# Each call returns (features, label, reduced frame) and prints the frame shape.
X_train, y_train,train_df = load_and_preprocess(train_file)
X_test, y_test,test_df = load_and_preprocess(test_file)
# Scale the features: fit the scaler on the training split only, then apply
# the same fitted transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hold out 1% of the scaled training data. The *_val_base split is later
# passed to get_q_hat as the conformal calibration set.
X_train_base, X_val_base, y_train_base, y_val_base = train_test_split(X_train_scaled,y_train, test_size=0.01, random_state=2, shuffle=True)

In [None]:
# Directory from which the previously saved decision-tree model is loaded.
model_path='../optimization/information_feature_selection/'

In [None]:
# best_ind=['gini', 'random', 18, 6, 6, 0, None, 90, 0.0, 0.0]
# clf = createModel(best_ind, X_train_base, y_train_base)


In [None]:
# Load the previously saved model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
clf=joblib.load(model_path+'best_decision_tree_multiiscx.pkl')

In [None]:
# Baseline: score the loaded model on the scaled test set.
predictions = clf.predict(X_test_scaled)
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Precision: ", precision_score(y_test, predictions))
print("Recall: ", recall_score(y_test, predictions))
print("F1 score: ", f1_score(y_test, predictions))
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))

In [None]:
def get_q_hat(cal_smx,cal_labels,alpha):
    """Compute the conformal score threshold ``q_hat`` from calibration data.

    Parameters
    ----------
    cal_smx : array-like, shape (n, n_classes)
        Predicted class probabilities for the calibration samples.
    cal_labels : array-like, shape (n,)
        True class index of each calibration sample; used as a column index
        into ``cal_smx`` (integer-valued floats are accepted).
    alpha : float
        Target miscoverage rate, e.g. 0.1 for 90% coverage.

    Returns
    -------
    float
        The ``ceil((n + 1) * (1 - alpha)) / n`` quantile (capped at 1) of the
        scores ``1 - p(true class)``, taken with the 'higher' method. The
        value is also printed.

    Raises
    ------
    ValueError
        If the calibration set is empty.
    """
    cal_smx = np.asarray(cal_smx)
    # Labels are used for fancy indexing, which requires an integer dtype.
    cal_labels = np.asarray(cal_labels).astype(int)
    n = cal_smx.shape[0]
    if n == 0:
        raise ValueError("calibration set is empty; cannot compute q_hat")
    # 1: conformal score = 1 - probability assigned to the true class
    cal_scores = 1 - cal_smx[np.arange(n), cal_labels]
    # 2: get adjusted quantile. For small n the finite-sample correction can
    # push the level above 1, which np.quantile rejects, so cap it at 1.0.
    q_level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
    try:
        qhat = np.quantile(cal_scores, q_level, method='higher')
    except TypeError:
        # Older NumPy releases only know the (now deprecated) keyword.
        qhat = np.quantile(cal_scores, q_level, interpolation='higher')
    print(f'q_hat:{qhat}')
    return qhat

In [None]:
# Class probabilities of the loaded model on the held-out split; these are the
# calibration inputs for get_q_hat.
probabilities = clf.predict_proba(X_val_base)


In [None]:
# Class probabilities of the loaded model on the scaled test set; passed to
# conformity_helper.get_conformity_result.
test_probabilities = clf.predict_proba(X_test_scaled)


In [None]:
# Project-local helper. Its calculate_metrics, get_conformity_result and
# coverage_modified_value methods drive the conformal evaluation cells.
from conformity_helper_new import conform_helper
conformity_helper=conform_helper()

In [None]:
# Conformal evaluation of the loaded model at alpha = 0.05.
# Calibrate q_hat on the held-out split.
iscx_q_hat=get_q_hat(probabilities,y_val_base.astype(int).values,0.05)
test_f1, test_precision, test_recall, test_accuracy, test_cm = conformity_helper.calculate_metrics(y_test,predictions)
# NOTE(review): the indexing assumes test_cm is a 2x2 matrix laid out as
# [[TN, FP], [FN, TP]] — confirm against conformity_helper_new.
test_TN, test_FP, test_FN, test_TP = test_cm[0][0], test_cm[0][1], test_cm[1][0], test_cm[1][1]
# NOTE(review): the 'lgb_' prefix looks like a leftover name; the model
# evaluated here is the decision tree loaded into `clf`.
test_tb_c, test_tm_c, test_fb_c, test_fm_c, test_tb_Nc, test_tm_Nc, test_fb_Nc, test_fm_Nc, original_test_df, lgb_test_final_prediction_sets = conformity_helper.get_conformity_result(test_TN, test_TP, test_FN, test_FP,test_probabilities ,y_test,predictions,iscx_q_hat)
test_coverage_count, test_non_coverage_count, test_coverage_per, test_non_coverage_per = conformity_helper.coverage_modified_value(lgb_test_final_prediction_sets,y_test.astype(int))
print(f"Test| Coverage count| {test_coverage_count} | Non-coverage count| {test_non_coverage_count} | Coverage percentage| {test_coverage_per} | Non-coverage percentage| {test_non_coverage_per}")

In [None]:
# Conformal evaluation of the loaded model at alpha = 0.1. This rebinds
# iscx_q_hat, so later cells see the alpha = 0.1 threshold.
iscx_q_hat=get_q_hat(probabilities,y_val_base.astype(int).values,0.1)
test_f1, test_precision, test_recall, test_accuracy, test_cm = conformity_helper.calculate_metrics(y_test,predictions)
# NOTE(review): the indexing assumes test_cm is a 2x2 matrix laid out as
# [[TN, FP], [FN, TP]] — confirm against conformity_helper_new.
test_TN, test_FP, test_FN, test_TP = test_cm[0][0], test_cm[0][1], test_cm[1][0], test_cm[1][1]
# NOTE(review): the 'lgb_' prefix looks like a leftover name; the model
# evaluated here is the decision tree loaded into `clf`.
test_tb_c, test_tm_c, test_fb_c, test_fm_c, test_tb_Nc, test_tm_Nc, test_fb_Nc, test_fm_Nc, original_test_df, lgb_test_final_prediction_sets = conformity_helper.get_conformity_result(test_TN, test_TP, test_FN, test_FP,test_probabilities ,y_test,predictions,iscx_q_hat)
test_coverage_count, test_non_coverage_count, test_coverage_per, test_non_coverage_per = conformity_helper.coverage_modified_value(lgb_test_final_prediction_sets,y_test.astype(int))
print(f"Test| Coverage count| {test_coverage_count} | Non-coverage count| {test_non_coverage_count} | Coverage percentage| {test_coverage_per} | Non-coverage percentage| {test_non_coverage_per}")

In [None]:
# One pickle per attacked feature is read; each holds a list of sample groups
# under the key 'adversarial_samples_list'.
features_to_attack = ['SrcWin', 'sHops', 'sTtl', 'dTtl', 'SrcBytes', 'DstBytes', 'Dur', 'TotBytes', 'Rate']

all_adversarial_samples = []

for feature in features_to_attack:
    filename = "../output_iscx_new/" + str(feature) + "_data.pkl"

    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file — only load pickles produced by a trusted pipeline.
    with open(filename, "rb") as file:
        loaded_data = pickle.load(file)

    adversarial_samples_list = loaded_data['adversarial_samples_list']

    # Collect every non-empty group. len() works for both lists and NumPy
    # arrays, whereas a bare truth test raises ValueError on an array with
    # more than one element.
    for samples in adversarial_samples_list:
        if samples is not None and len(samples) > 0:
            all_adversarial_samples.extend(samples)

# Convert to numpy array
all_adversarial_samples_array = np.array(all_adversarial_samples)

print(all_adversarial_samples_array)

In [None]:
# Predict a label for every collected adversarial sample with the loaded model.
adversarial_prediction =clf.predict(all_adversarial_samples_array)

In [None]:
def count_ones_zero(adversarial_prediction):
    """Print how many predictions equal 0 and how many equal 1.

    ``adversarial_prediction`` is compared element-wise against 0. and 1.;
    the two tallies are printed, nothing is returned.
    """
    is_zero = adversarial_prediction == 0.
    is_one = adversarial_prediction == 1.
    count_0s = np.count_nonzero(is_zero)
    count_1s = np.count_nonzero(is_one)
    print(f"Number of 0s: {count_0s}")
    print(f"Number of 1s: {count_1s}")


In [None]:
# Tally how many adversarial samples the loaded model labels 0 versus 1.
count_ones_zero(adversarial_prediction)

In [None]:
# Adversarial samples should be predicted as malware (label 1), so keep only
# the ones the model predicted as benign (label 0).
samples_with_prediction_0 = all_adversarial_samples_array[adversarial_prediction == 0]

In [None]:
# Display the shape of the array of samples predicted as 0.
samples_with_prediction_0.shape

In [None]:
def adverserial_conformal(clf,samples_with_prediction_0,iscx_q_hat):
    """Print how many samples get an unambiguous conformal prediction set.

    A class belongs to a sample's prediction set when its predicted
    probability is at least ``1 - iscx_q_hat``. A sample counts as 'Conform'
    when exactly one of the first two classes is in the set, and as
    'No-Conformity' when both or neither are. Both tallies are printed;
    nothing is returned.
    """
    class_proba = clf.predict_proba(samples_with_prediction_0)
    in_prediction_set = class_proba >= (1 - iscx_q_hat)

    # Equal membership flags mean the set holds both classes or none.
    verdicts = [
        'No-Conformity' if row[0] == row[1] else 'Conform'
        for row in in_prediction_set
    ]
    count_no_conformity = verdicts.count('No-Conformity')
    count_conform = len(verdicts) - count_no_conformity

    print(f"Number of 'No-Conformity': {count_no_conformity}")
    print(f"Number of 'Conform': {count_conform}")


In [None]:
# Conformal check of the evading samples with the loaded model. When the
# notebook runs top to bottom, iscx_q_hat is the alpha = 0.1 threshold here.
adverserial_conformal(clf,samples_with_prediction_0,iscx_q_hat)

In [None]:
# Every retained adversarial sample is given label 1 for retraining.
# np.ones yields floats, so these labels are 1.0.
y_adverserial=np.ones(samples_with_prediction_0.shape[0])
print(y_adverserial.shape)

In [None]:
# Append the adversarial labels to the training labels.
combine_y=np.concatenate((y_train_base,y_adverserial),axis=0)


In [None]:
# Append the evading adversarial samples to the scaled training features.
# NOTE(review): assumes the pickled samples are already in the same scaled
# feature space as X_train_base — confirm against the attack pipeline.
combine_x_train = np.concatenate((X_train_base, samples_with_prediction_0), axis=0)

In [None]:
# Generate a shuffled index from a seeded generator so the row order, and
# therefore every model trained on it, is reproducible across runs.
shuffled_index = np.random.default_rng(42).permutation(len(combine_y))

# Apply the same shuffled index to both arrays so rows and labels stay aligned.
shuffled_x_train = combine_x_train[shuffled_index]
shuffled_y = combine_y[shuffled_index]

In [None]:
# Refit a tree with a fixed hyper-parameter set on the shuffled, adversarially
# augmented training data.
best_ind=['gini', 'random', 18, 6, 6, 0, None, 90, 0.0, 0.0]
retrain_clf = createModel(best_ind, shuffled_x_train,shuffled_y)

In [None]:
# Score the retrained model on the scaled test set.
# NOTE: this rebinds `predictions`, which previously held the loaded model's
# test predictions.
predictions = retrain_clf.predict(X_test_scaled)
print("Accuracy: ", accuracy_score(y_test, predictions))
print("Precision: ", precision_score(y_test, predictions))
print("Recall: ", recall_score(y_test, predictions))
print("F1 score: ", f1_score(y_test, predictions))
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))

In [None]:
# Re-predict the samples that the loaded model labelled 0, now with the retrained model.
adversarial_prediction_retrain=retrain_clf.predict(samples_with_prediction_0)

In [None]:
# Tally the retrained model's labels on those samples.
count_ones_zero(adversarial_prediction_retrain)

In [None]:
from deap import creator, base, tools, algorithms

In [None]:
import random

In [None]:
# Genetic-algorithm run settings.
POPULATION_SIZE = 100
P_CROSSOVER = 0.5
P_MUTATION = 0.5
NUM_GENERATIONS = 100
HALL_OF_FAME_SIZE = 10

# Fitness has two maximised objectives; evalModel returns them as (f1, accuracy).
creator.create("FitnessMulti", base.Fitness, weights=(1.0,1.0))
creator.create("Individual", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()

# Search space: one list of candidate values per DecisionTreeClassifier
# hyper-parameter, in the gene order expected by individual_to_params.
# Example individual: ['entropy', 'random', None, 11, 2, 0, None, 90, 0.0, 0.0]
CRITERION = ["gini", "entropy"]
SPLITTER = ["best", "random"]
MAX_DEPTH = [None] + list(range(3, 51, 3))
MIN_SAMPLES_SPLIT = list(range(2, 21))
MIN_SAMPLES_LEAF = list(range(1, 21))
MIN_WEIGHT_FRACTION_LEAF = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
# "auto" is deliberately not offered: for DecisionTreeClassifier it was an
# alias of "sqrt", was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it makes fit() fail.
MAX_FEATURES = ["sqrt", "log2", None]
MAX_LEAF_NODES = [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
MIN_IMPURITY_DECREASE = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
CCP_ALPHA = [0.0, 0.01, 0.02, 0.03, 0.04, 0.05]

# Attribute generators: each call draws one value for the matching gene.
toolbox.register("attr_criterion", random.choice, CRITERION)
toolbox.register("attr_splitter", random.choice, SPLITTER)
toolbox.register("attr_max_depth", random.choice, MAX_DEPTH)
toolbox.register("attr_min_samples_split", random.choice, MIN_SAMPLES_SPLIT)
toolbox.register("attr_min_samples_leaf", random.choice, MIN_SAMPLES_LEAF)
toolbox.register("attr_min_weight_fraction_leaf", random.choice, MIN_WEIGHT_FRACTION_LEAF)
toolbox.register("attr_max_features", random.choice, MAX_FEATURES)
toolbox.register("attr_max_leaf_nodes", random.choice, MAX_LEAF_NODES)
toolbox.register("attr_min_impurity_decrease", random.choice, MIN_IMPURITY_DECREASE)
toolbox.register("attr_ccp_alpha", random.choice, CCP_ALPHA)

# Structure initializers: an individual is one pass (n=1) over the ten
# generators; a population is a list of such individuals.
toolbox.register("individual", tools.initCycle, creator.Individual, (toolbox.attr_criterion, toolbox.attr_splitter, toolbox.attr_max_depth, toolbox.attr_min_samples_split, toolbox.attr_min_samples_leaf, toolbox.attr_min_weight_fraction_leaf, toolbox.attr_max_features, toolbox.attr_max_leaf_nodes, toolbox.attr_min_impurity_decrease, toolbox.attr_ccp_alpha), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Selection and crossover operators.
toolbox.register("select", tools.selTournament, tournsize=5)
toolbox.register("mate", tools.cxUniform, indpb=0.5)


def custom_mutate(individual):
    """Mutate an individual in place by redrawing one randomly chosen gene.

    A gene position in 0..9 is picked uniformly and replaced with a fresh
    value from that position's attribute generator on ``toolbox``.

    Returns a one-element tuple holding the same (mutated) individual.
    """
    # Generators listed in gene order, so the position indexes them directly.
    gene_generators = (
        toolbox.attr_criterion,
        toolbox.attr_splitter,
        toolbox.attr_max_depth,
        toolbox.attr_min_samples_split,
        toolbox.attr_min_samples_leaf,
        toolbox.attr_min_weight_fraction_leaf,
        toolbox.attr_max_features,
        toolbox.attr_max_leaf_nodes,
        toolbox.attr_min_impurity_decrease,
        toolbox.attr_ccp_alpha,
    )
    gene = random.randint(0, 9)  # which parameter to mutate
    individual[gene] = gene_generators[gene]()
    return individual,

# Use custom_mutate as the GA's mutation operator.
toolbox.register("mutate", custom_mutate)

In [None]:
def evalModel(individual, X_train, y_train, X_test, y_test):
    """Fitness of one individual: fit a tree and score it.

    A model built from ``individual`` is fitted on ``(X_train, y_train)`` and
    used to predict ``X_test``.

    Returns the tuple ``(f1, accuracy)`` of those predictions against ``y_test``.
    """
    fitted_tree = createModel(individual, X_train, y_train)
    y_pred = fitted_tree.predict(X_test)
    return f1_score(y_test, y_pred), accuracy_score(y_test, y_pred)

In [None]:

def makeEvalModel(X_train, y_train, X_test, y_test):
    """Bind the datasets and return a one-argument fitness function.

    The returned callable takes an individual and forwards it, together with
    the four captured datasets, to ``evalModel``.
    """
    return lambda individual: evalModel(individual, X_train, y_train, X_test, y_test)


In [None]:
# Initialize a population and evolve it
toolbox.register("evaluate", makeEvalModel(shuffled_x_train,shuffled_y, X_test_scaled, y_test))
pop = toolbox.population(n=POPULATION_SIZE)
hof = tools.HallOfFame(1)
#     stats = tools.Statistics(lambda ind: ind.fitness.values)
#     stats.register("avg", np.mean)
#     stats.register("std", np.std)
#     stats.register("min", np.min)
#     stats.register("max", np.max)
stats = tools.Statistics(lambda ind: ind.fitness.values[0]) # for the first objective
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

stats2 = tools.Statistics(lambda ind: ind.fitness.values[1]) # for the second objective
stats2.register("avg", np.mean)
stats2.register("std", np.std)
stats2.register("min", np.min)
stats2.register("max", np.max)

mstats = tools.MultiStatistics(fitness=stats, fitness2=stats2)


pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=P_CROSSOVER, mutpb=P_MUTATION, 
                                    ngen=NUM_GENERATIONS, stats=mstats, halloffame=hof, verbose=True)

# Get the best individual from the Hall of Fame
best_ind = hof[0]
print("Best individual: %s\nwith fitness: %s" % (best_ind, best_ind.fitness))



In [None]:
# NOTE(review): this hard-coded individual replaces the `best_ind` taken from
# the hall of fame in the previous cell — presumably pasted from an earlier
# run; confirm it is still the intended configuration.
best_ind=['entropy', 'best', 45, 9, 1, 0, 'sqrt', None, 0.0, 0.0]

In [None]:
# Train and test the best individual on the full data
re_optimized_clf = createModel(best_ind,shuffled_x_train,shuffled_y)
predictions_reoptimized = re_optimized_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, predictions_reoptimized)

# Save the best model
dataset_name='iscx'
joblib.dump(clf, 're_optimized_adverserial_dt_model'+dataset_name+'.pkl')
print("Accuracy: ", accuracy_score(y_test, predictions_reoptimized))
print("Precision: ", precision_score(y_test, predictions_reoptimized))
print("Recall: ", recall_score(y_test, predictions_reoptimized))
print("F1 score: ", f1_score(y_test, predictions_reoptimized))
print("Confusion Matrix: \n", confusion_matrix(y_test, predictions_reoptimized))


In [None]:

# Score the re-optimized model on the augmented data it was trained on.
train_predictions = re_optimized_clf.predict(shuffled_x_train)
print("Accuracy: ", accuracy_score(shuffled_y, train_predictions))
print("Precision: ", precision_score(shuffled_y, train_predictions))
print("Recall: ", recall_score(shuffled_y, train_predictions))
print("F1 score: ", f1_score(shuffled_y, train_predictions))
print("Confusion Matrix: \n", confusion_matrix(shuffled_y, train_predictions))

In [None]:
# Re-predict the samples that the loaded model labelled 0, now with the re-optimized model.
adversarial_prediction_re_optimized=re_optimized_clf.predict(samples_with_prediction_0)

In [None]:
# Tally the re-optimized model's labels on those samples.
count_ones_zero(adversarial_prediction_re_optimized)

In [None]:
# Held-out-split probabilities of the re-optimized model, used to recalibrate q_hat.
val_probabilities_re_optimized = re_optimized_clf.predict_proba(X_val_base)

In [None]:
# Test-set probabilities of the re-optimized model for the conformal evaluation.
predictions_reoptimized_prob = re_optimized_clf.predict_proba(X_test_scaled)

In [None]:
# Conformal evaluation of the re-optimized model at alpha = 0.1, with q_hat
# recalibrated from that model's held-out-split probabilities.
iscx_q_hat=get_q_hat(val_probabilities_re_optimized,y_val_base.astype(int).values,0.1)
test_f1, test_precision, test_recall, test_accuracy, test_cm = conformity_helper.calculate_metrics(y_test,predictions_reoptimized)
# NOTE(review): the indexing assumes test_cm is a 2x2 matrix laid out as
# [[TN, FP], [FN, TP]] — confirm against conformity_helper_new.
test_TN, test_FP, test_FN, test_TP = test_cm[0][0], test_cm[0][1], test_cm[1][0], test_cm[1][1]
# NOTE(review): the 'lgb_' prefix looks like a leftover name; the model
# evaluated here is the decision tree in `re_optimized_clf`.
test_tb_c, test_tm_c, test_fb_c, test_fm_c, test_tb_Nc, test_tm_Nc, test_fb_Nc, test_fm_Nc, original_test_df, lgb_test_final_prediction_sets = conformity_helper.get_conformity_result(test_TN, test_TP, test_FN, test_FP,predictions_reoptimized_prob,y_test,predictions_reoptimized,iscx_q_hat)
test_coverage_count, test_non_coverage_count, test_coverage_per, test_non_coverage_per = conformity_helper.coverage_modified_value(lgb_test_final_prediction_sets,y_test.astype(int))
print(f"Test| Coverage count| {test_coverage_count} | Non-coverage count| {test_non_coverage_count} | Coverage percentage| {test_coverage_per} | Non-coverage percentage| {test_non_coverage_per}")