In [None]:
# Colab-only: mount Google Drive so the dataset files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Library Importation

In [None]:
import pandas as pd
import warnings
from sklearn.preprocessing import normalize
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
import scipy.stats
from scipy.stats import norm
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

warnings.filterwarnings('ignore')

# Create Datasets Transformers

## Import Data

In [None]:
def raw_data_filter(data):
    """Name the NSL-KDD columns, keep the TCP rows and binarise the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Header-less frame with exactly 43 columns in the order listed in
        ``columns`` below. It is NOT modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``protocol_type`` is ``'tcp'``, without the columns listed in
        ``unused_columns`` and without ``protocol_type``. ``attack`` is 0 for
        ``'normal'`` and 1 for anything else. The original row index is kept.

    Raises
    ------
    ValueError
        If ``data`` does not have 43 columns (raised by the column assignment).
    """
    columns = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
        'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate', 'attack', 'level',
    ]
    # Features (plus the difficulty 'level') that the analysis does not use.
    unused_columns = [
        'service', 'flag', 'src_bytes', 'dst_bytes', 'urgent', 'num_failed_logins',
        'num_compromised', 'root_shell', 'su_attempted', 'num_root',
        'num_file_creations', 'num_shells', 'num_access_files', 'is_host_login',
        'is_guest_login', 'num_outbound_cmds', 'wrong_fragment', 'level',
    ]

    # Work on a copy: renaming or dropping on the caller's frame would make the
    # cell that calls this function impossible to re-run safely.
    named = data.copy()
    named.columns = columns

    tcp_rows = named.loc[named['protocol_type'] == 'tcp']
    # drop() returns a fresh frame, so the assignment below never writes into a
    # slice of `named` (no SettingWithCopyWarning).
    filtered = tcp_rows.drop(columns=unused_columns + ['protocol_type'])
    return filtered.assign(attack=(filtered['attack'] != 'normal').astype('int64'))

In [None]:
# Import the raw data: both files are read without a header row, so the columns
# are positional (0..N-1) at this point.
path1 = "/content/drive/MyDrive/Dataset/KDDTrain+.txt"
d_raw_train = pd.read_csv(path1, header=None)
path2 = "/content/drive/MyDrive/Dataset/KDDTest+.txt"
d_raw_test = pd.read_csv(path2, header=None)


## Normalization

In [None]:
def normalizing(data):
    """Return a copy of ``data`` with every non-binary column scaled to unit L2 norm.

    A column is left untouched when it has exactly two distinct values (treated
    as a binary flag). Every other column is divided by its Euclidean length
    taken over all rows; a column whose length is (numerically) zero is divided
    by 1 instead, so it stays all-zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric feature frame. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and column labels as ``data``.
    """
    if data.shape[1] == 0:
        return data.copy()

    # Computed once; calling data.nunique() inside the loop rescans the whole
    # frame for every column.
    unique_counts = data.nunique().to_numpy()
    zero_norm_cutoff = 10 * np.finfo(float).eps

    columns = []
    for position, n_unique in enumerate(unique_counts):
        column = data.iloc[:, position]
        if n_unique != 2:
            values = column.to_numpy(dtype=float)
            length = np.sqrt(np.dot(values, values))
            if length < zero_norm_cutoff:
                length = 1.0  # avoid 0/0 on an all-zero column
            column = pd.Series(values / length, index=data.index, name=column.name)
        columns.append(column)

    # Build a new frame instead of assigning floats into the caller's (possibly
    # integer) columns; the input therefore keeps its raw values.
    result = pd.concat(columns, axis=1)
    result.columns = data.columns
    return result


In [None]:
# this cell is for test:
#  d_raw_train = normalizing(d_raw_train)
#  d_raw_train.head()

## Principal Component Analysis

In [None]:
def PCA_transformation(data, dim = None, fit_data = None, variance = 0.999):
    """Project ``data`` onto its leading principal components.

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array
        Numeric features to transform.
    dim : int, optional
        Number of components to keep. When falsy (``None`` or ``0``) the number
        is chosen as the smallest count whose cumulative explained-variance
        ratio reaches ``variance``.
    fit_data : pandas.DataFrame or 2-D array, optional
        Data the PCA is fitted on. Defaults to ``data`` itself. Pass the
        training frame here when transforming a test frame so that both are
        expressed on the same axes.
    variance : float, optional
        Cumulative explained-variance target for the automatic choice.

    Returns
    -------
    pandas.DataFrame
        When ``dim`` is given.
    (pandas.DataFrame, int)
        When ``dim`` is not given; the int is the number of components kept.

    Notes
    -----
    Frames that are compared column by column afterwards must be reduced with
    the same ``dim`` (and ideally the same ``fit_data``).
    """
    def _project(n_components):
        # Fit on `fit_data` when supplied, otherwise on `data` itself.
        pca = PCA(n_components)
        if fit_data is None:
            return pca.fit_transform(data)
        return pca.fit(fit_data).transform(data)

    if dim:
        # Explicit size requested: no need to fit a full PCA first.
        return pd.DataFrame(_project(dim))

    basis = data if fit_data is None else fit_data
    cum_vars = np.cumsum(PCA().fit(basis).explained_variance_ratio_)
    # First position whose cumulative ratio is >= the target. A float `==` test
    # would practically never match. +1 turns the index into a count.
    n_components = int(np.searchsorted(cum_vars, variance)) + 1
    n_components = min(n_components, len(cum_vars))
    return pd.DataFrame(_project(n_components)), n_components

In [None]:
# this cell is for test:
#  d_raw_train = PCA_transformation(d_raw_train)
#  d_raw_train.shape

## Feature Gaussian Mixture Probability Model

In [None]:
def GMM_Row_Transform(data, values, threshold):
    """Upper-tail probability (in percent) of each value of one row.

    For every column of ``data`` a single Gaussian is described by the column's
    mean and population standard deviation. ``values[k]`` is converted to a
    z-score against column ``k`` (by position) and the result is
    ``P(Z > z) * 100``.

    Parameters
    ----------
    data : pandas.DataFrame
        Reference frame providing the per-column mean and standard deviation.
    values : sequence
        One row (list, array or Series). Only the first ``data.shape[1]``
        entries are used, taken by position.
    threshold : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of float
        One probability per column of ``data``. A zero standard deviation gives
        ``nan``, 0 or 100 following IEEE division rules.
    """
    reference = np.asarray(data, dtype=float)
    n_features = reference.shape[1]
    # Positional access; integer indexing into a label-indexed Series is ambiguous.
    row = np.asarray(values, dtype=float)[:n_features]
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = (row - reference.mean(axis=0)) / reference.std(axis=0)
    # norm.sf is the survival function 1 - cdf, without the cancellation error.
    return list(norm.sf(z_scores) * 100)

In [None]:
def GMM_Matrix_Transform(origin_data, data, threshold):
    """Convert every value of ``data`` into an upper-tail probability in percent.

    Column ``k`` of ``data`` is standardised with the mean and population
    standard deviation of column ``k`` of ``origin_data`` (matched by
    position), and each z-score becomes ``P(Z > z) * 100``.

    Parameters
    ----------
    origin_data : pandas.DataFrame
        Reference frame providing the per-column statistics.
    data : pandas.DataFrame
        Frame to transform; needs at least as many columns as ``origin_data``.
        Extra trailing columns are ignored.
    threshold : any
        Unused; kept so existing callers keep working.

    Returns
    -------
    list of list of float
        One inner list per row of ``data``.

    Raises
    ------
    ValueError
        If ``data`` has fewer columns than ``origin_data``.
    """
    reference = np.asarray(origin_data, dtype=float)
    n_features = reference.shape[1]
    samples = np.asarray(data, dtype=float)
    if samples.ndim != 2 or samples.shape[1] < n_features:
        raise ValueError("data must have at least as many columns as origin_data")

    # The column statistics are computed once for the whole matrix; recomputing
    # them for every row makes the transform quadratic in the number of rows.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = (samples[:, :n_features] - reference.mean(axis=0)) / reference.std(axis=0)
    return (norm.sf(z_scores) * 100).tolist()

In [None]:
def GMM_vote(data, values, threshold):
    """Count the features of one row whose upper-tail probability is <= ``threshold``.

    Each entry of ``values`` is standardised with the mean and population
    standard deviation of the same-position column of ``data`` and turned into
    ``P(Z > z) * 100``.

    Parameters
    ----------
    data : pandas.DataFrame
        Reference frame providing the per-column statistics.
    values : sequence
        One row (list, array or Series); only the first ``data.shape[1]``
        entries are used, taken by position.
    threshold : float
        Probability cut-off in percent.

    Returns
    -------
    int
        Number of features at or below the threshold. ``nan`` probabilities
        (zero-variance column) are never counted.
    """
    reference = np.asarray(data, dtype=float)
    n_features = reference.shape[1]
    row = np.asarray(values, dtype=float)[:n_features]
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = (row - reference.mean(axis=0)) / reference.std(axis=0)
    probabilities = norm.sf(z_scores) * 100
    return int(np.count_nonzero(probabilities <= threshold))

In [None]:
# test GMM_vote with 70%:
# values = d_raw_train.iloc[0,:]
# GMM_vote(d_raw_train, values, 70)



# Create 8 Datasets

### d_raw

In [None]:
# Filter the raw data: name the columns, keep the TCP rows, drop the unused
# features and turn `attack` into a 0/1 label (see raw_data_filter).
# NOTE(review): the names are rebound to the filtered frames, so this cell cannot
# be run twice without reloading the raw files first.
d_raw_train = raw_data_filter(d_raw_train)
d_raw_test = raw_data_filter(d_raw_test)

In [None]:
# Detach the 0/1 label from each frame: pop() returns the `attack` column and
# removes it from the frame in the same step.
train_target = d_raw_train.pop('attack')

test_target = d_raw_test.pop('attack')

In [None]:
# Benign-only training subset (attack == 0), used as the reference population
# for the voting and KM-D models. Selecting with the label mask avoids adding
# and removing a temporary `attack` column on d_raw_train, and .copy() makes the
# subset an independent frame rather than a slice of d_raw_train.
d_raw_train_normal = d_raw_train.loc[train_target == 0].copy()
d_raw_train_normal.head()

Unnamed: 0,duration,land,hot,logged_in,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,...,150,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0
3,0,0,0,1,5,5,0.2,0.2,0.0,0.0,...,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01
4,0,0,0,1,30,32,0.0,0.0,0.0,0.0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0,0,0,1,3,7,0.0,0.0,0.0,0.0,...,8,219,1.0,0.0,0.12,0.03,0.0,0.0,0.0,0.0
16,0,0,0,1,8,9,0.0,0.11,0.0,0.0,...,91,255,1.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0


### d_raw_pca

In [None]:
# PCA of the raw features: the component count `dim` is chosen on the training
# frame and reused for the test frame.
# NOTE(review): with only (data, dim) passed, PCA_transformation fits a separate
# PCA on the test frame, so train and test are projected onto different axes.
d_raw_pca_train, dim = PCA_transformation(d_raw_train)
d_raw_pca_test = PCA_transformation(d_raw_test, dim)

### d_raw_norm

In [None]:
# Normalise copies so that d_raw_train / d_raw_test keep their raw values for
# the datasets and models that are built from them further down.
d_raw_norm_train = normalizing(d_raw_train.copy())
d_raw_norm_test = normalizing(d_raw_test.copy())

### d_raw_norm_pca

In [None]:
# Normalise (on copies, so the raw frames stay raw) and then PCA-reduce; the
# component count chosen on the training frame is reused for the test frame.
# NOTE(review): with only (data, dim) passed, PCA_transformation fits a separate
# PCA on the test frame, so train and test are projected onto different axes.
d_raw_norm_pca_train, dim = PCA_transformation(normalizing(d_raw_train.copy()))
d_raw_norm_pca_test = PCA_transformation(normalizing(d_raw_test.copy()), dim)

## d_raw_probs

In [None]:
# Replace every feature value by its upper-tail probability (in percent) under
# the per-column statistics of the training frame; the same training statistics
# are used for the test frame. The third argument (50) is passed as `threshold`,
# which the transform does not use.
d_raw_probs_train = pd.DataFrame(GMM_Matrix_Transform(d_raw_train, d_raw_train, 50))
d_raw_probs_test =  pd.DataFrame(GMM_Matrix_Transform(d_raw_train, d_raw_test, 50))


100%|██████████| 102689/102689 [40:33<00:00, 42.19it/s]
100%|██████████| 18880/18880 [07:23<00:00, 42.56it/s]


In [None]:
# Benign-only variant: probabilities of the benign training rows under their own
# per-column statistics.
d_raw_probs_train_normal = pd.DataFrame(GMM_Matrix_Transform(d_raw_train_normal, d_raw_train_normal, 50))

100%|██████████| 53600/53600 [16:09<00:00, 55.27it/s]


## d_raw_pca_probs

In [None]:
# PCA-reduce the raw features first ...
d_raw_pca_feats_train, dim = PCA_transformation(d_raw_train)
d_raw_pca_feats_test = PCA_transformation(d_raw_test, dim)

# ... then turn every component value into its upper-tail probability. Both
# frames are scored against the statistics of the training *components*; the
# reference must not be the already-transformed probability frame.
d_raw_pca_probs_train = pd.DataFrame(GMM_Matrix_Transform(d_raw_pca_feats_train, d_raw_pca_feats_train, 50))
d_raw_pca_probs_test = pd.DataFrame(GMM_Matrix_Transform(d_raw_pca_feats_train, d_raw_pca_feats_test, 50))

100%|██████████| 102689/102689 [37:19<00:00, 45.85it/s]
100%|██████████| 18880/18880 [06:51<00:00, 45.93it/s]


In [None]:
# Benign-only variant. The number of components is pinned to the width of
# d_raw_pca_probs_test because the two frames are later compared column by
# column (voting, KM-D) and must therefore have the same dimensionality.
d_raw_pca_probs_train_normal = PCA_transformation(d_raw_train_normal, d_raw_pca_probs_test.shape[1])
d_raw_pca_probs_train_normal = pd.DataFrame(GMM_Matrix_Transform(d_raw_pca_probs_train_normal, d_raw_pca_probs_train_normal, 50))

100%|██████████| 53600/53600 [13:39<00:00, 65.37it/s]


## d_norm_probs

In [None]:
# Normalise copies of the raw frames (the raw frames stay untouched) ...
d_norm_feats_train = normalizing(d_raw_train.copy())
d_norm_feats_test = normalizing(d_raw_test.copy())

# ... then convert to probabilities. Both frames are scored against the
# statistics of the normalised training *features*; the reference must not be
# the already-transformed probability frame.
d_norm_probs_train = pd.DataFrame(GMM_Matrix_Transform(d_norm_feats_train, d_norm_feats_train, 50))
d_norm_probs_test = pd.DataFrame(GMM_Matrix_Transform(d_norm_feats_train, d_norm_feats_test, 50))

100%|██████████| 102689/102689 [40:06<00:00, 42.68it/s]
100%|██████████| 18880/18880 [07:10<00:00, 43.90it/s]


In [None]:
# Benign-only variant; normalise a copy so d_raw_train_normal keeps its raw values.
d_norm_probs_train_normal = normalizing(d_raw_train_normal.copy())
d_norm_probs_train_normal = pd.DataFrame(GMM_Matrix_Transform(d_norm_probs_train_normal, d_norm_probs_train_normal, 50))

100%|██████████| 53600/53600 [14:58<00:00, 59.63it/s]


## d_norm_pca_probs

In [None]:
# Normalise copies of the raw frames, then PCA-reduce. The component count
# chosen on the training frame is reused for the test frame so both have the
# same number of columns.
d_norm_pca_feats_train, dim = PCA_transformation(normalizing(d_raw_train.copy()))
d_norm_pca_feats_test = PCA_transformation(normalizing(d_raw_test.copy()), dim)

# Convert to probabilities. Both frames are scored against the statistics of
# the training *components*, not of the already-transformed probability frame.
d_norm_pca_probs_train = pd.DataFrame(GMM_Matrix_Transform(d_norm_pca_feats_train, d_norm_pca_feats_train, 50))
d_norm_pca_probs_test = pd.DataFrame(GMM_Matrix_Transform(d_norm_pca_feats_train, d_norm_pca_feats_test, 50))

 81%|████████▏ | 83527/102689 [30:18<07:43, 41.37it/s]

In [None]:
# Benign-only variant: normalise a copy (d_raw_train_normal stays raw), reduce to
# the same number of components as d_norm_pca_probs_test (the two frames are
# compared column by column later), then convert to probabilities.
d_norm_pca_probs_train_noraml = normalizing(d_raw_train_normal.copy())
d_norm_pca_probs_train_noraml = PCA_transformation(d_norm_pca_probs_train_noraml, d_norm_pca_probs_test.shape[1])
d_norm_pca_probs_train_noraml = pd.DataFrame(GMM_Matrix_Transform(d_norm_pca_probs_train_noraml, d_norm_pca_probs_train_noraml, 50))

# **Models**

In [None]:
def voting(data_train, data_test, min_abnormal_features = 10, threshold = 50):
    """Label each test row by counting its low-probability features.

    Every value of ``data_test`` is standardised with the mean and population
    standard deviation of the same-position column of ``data_train`` and turned
    into the upper-tail probability ``P(Z > z) * 100``. A row is predicted 1
    when MORE than ``min_abnormal_features`` of its features have a probability
    <= ``threshold``; otherwise 0.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Reference rows providing the per-column statistics.
    data_test : pandas.DataFrame
        Rows to classify; needs at least as many columns as ``data_train``
        (extra trailing columns are ignored).
    min_abnormal_features : int
        Vote count that must be exceeded for a row to be labelled 1.
    threshold : float
        Probability cut-off in percent.

    Returns
    -------
    list of int
        One 0/1 prediction per row of ``data_test``.

    Raises
    ------
    ValueError
        If ``data_test`` has fewer columns than ``data_train``.
    """
    reference = np.asarray(data_train, dtype=float)
    n_features = reference.shape[1]
    samples = np.asarray(data_test, dtype=float)
    if samples.ndim != 2 or samples.shape[1] < n_features:
        raise ValueError("data_test must have at least as many columns as data_train")

    # Column statistics are computed once, not once per test row.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = (samples[:, :n_features] - reference.mean(axis=0)) / reference.std(axis=0)
    probabilities = norm.sf(z_scores) * 100

    # nan probabilities (zero-variance column) compare False and are not counted.
    votes = np.count_nonzero(probabilities <= threshold, axis=1)
    return (votes > min_abnormal_features).astype(int).tolist()

In [None]:
def K_means_Distance(test_idx, test_data, model):
    """Euclidean distance from one row of ``test_data`` to the model's first centre.

    ``test_idx`` is the positional row number; ``model`` must expose
    ``cluster_centers_`` and only its entry 0 is used.
    """
    sample = np.array(test_data.iloc[test_idx, :])
    offset = sample - model.cluster_centers_[0]
    return np.sqrt(np.dot(offset.T, offset))

In [None]:
def kmd_model(test_data, train_data, model, threshold_dis):
    """Distance-based anomaly labels for every row of ``test_data``.

    A row is labelled 1 when its distance to the model's first cluster centre
    (see ``K_means_Distance``) is greater than ``threshold_dis``, otherwise 0.
    ``train_data`` is accepted but not used.
    """
    return [
        1 if K_means_Distance(row, test_data, model) > threshold_dis else 0
        for row in tqdm(range(len(test_data)))
    ]

In [None]:
def SVM_model(train_data, test_data):
    """Fit a default-parameter SVC on ``train_data`` and predict ``test_data``.

    The training labels are read from the notebook-level ``train_target``, so
    ``train_data`` must have one row per entry of ``train_target``.
    """
    classifier = SVC()
    classifier.fit(train_data, train_target)
    return classifier.predict(test_data)

In [None]:
def kmean_C_model(train_data, test_data):
    """Cluster with 2-means and return 0/1 class predictions for ``test_data``.

    KMeans numbers its clusters arbitrarily, so the raw cluster id cannot be
    compared with the 0/1 labels directly. Each cluster is therefore mapped to
    the majority class of its training members, using the notebook-level
    ``train_target`` (like the supervised helpers in this notebook do).

    Returns
    -------
    numpy.ndarray
        One 0/1 label per test row. If ``train_target`` does not have one entry
        per training row, the raw cluster ids are returned unchanged.
    """
    # Fixed seed so the clustering (and every metric derived from it) is reproducible.
    kmeans = KMeans(n_clusters=2, n_init="auto", random_state=0).fit(train_data)
    test_clusters = kmeans.predict(test_data)

    labels = np.asarray(train_target)
    if len(labels) != len(kmeans.labels_):
        return test_clusters

    cluster_to_class = np.zeros(2, dtype=int)
    for cluster in range(2):
        members = labels[kmeans.labels_ == cluster]
        # A cluster is called "attack" when at least half of its training rows are attacks.
        if members.size and members.mean() >= 0.5:
            cluster_to_class[cluster] = 1
    return cluster_to_class[test_clusters]

In [None]:
def dt_model(train_data, test_data):
    """Fit a decision tree on ``train_data`` and predict ``test_data``.

    The training labels are read from the notebook-level ``train_target``.
    ``random_state`` is fixed (same seed as ``mlp_model``) so that repeated runs
    give the same tree and therefore the same metrics.
    """
    model = tree.DecisionTreeClassifier(random_state=42)
    model.fit(train_data, train_target)
    preds = model.predict(test_data)
    return preds

In [None]:


def mlp_model(train_data, test_data):
    """Fit a one-hidden-layer MLP (100 units) on ``train_data`` and predict ``test_data``.

    The training labels are read from the notebook-level ``train_target``.
    """
    classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
    classifier.fit(train_data, train_target)
    return classifier.predict(test_data)

# Voting

## Voting for d_raw

In [None]:
# Voting on d_raw: a test row is labelled 1 when more than 10 of its features have
# a probability <= 50 under the statistics of the benign training rows.
voting_d_raw_preds = voting(d_raw_train_normal, d_raw_test, min_abnormal_features = 10, threshold = 50)

In [None]:

# Voting on d_raw_probs (benign training probabilities vs. test probabilities).
voting_d_raw_preds1 = voting(d_raw_probs_train_normal, d_raw_probs_test, min_abnormal_features = 10, threshold = 50)

## Voting for d_raw_pca

In [None]:
# PCA of the benign training rows, reduced to the same number of components as
# d_raw_pca_test: the two frames are compared column by column by the voting and
# KM-D models, so their dimensionality has to match.
d_raw_pca_train_normal = PCA_transformation(d_raw_train_normal, d_raw_pca_test.shape[1])

In [None]:
# Voting on d_raw_pca (benign training components vs. test components).
voting_d_raw_pac_preds = voting(d_raw_pca_train_normal, d_raw_pca_test, min_abnormal_features = 10, threshold = 50)

In [None]:

# Voting on d_raw_pca_probs.
voting_d_raw_pac_preds2 = voting(d_raw_pca_probs_train_normal, d_raw_pca_probs_test, min_abnormal_features = 10, threshold = 50)

## Voting for d_raw_norm

In [None]:
# Normalised benign training rows; a copy is normalised so d_raw_train_normal
# keeps its raw values.
d_raw_norm_train_normal = normalizing(d_raw_train_normal.copy())

In [None]:
# Voting on d_raw_norm.
voting_d_raw_norm_preds = voting(d_raw_norm_train_normal, d_raw_norm_test, min_abnormal_features = 10, threshold = 50)

In [None]:

# Voting on d_norm_probs.
voting_d_raw_norm_preds3 = voting(d_norm_probs_train_normal, d_norm_probs_test, min_abnormal_features = 10, threshold = 50)

## Voting for d_raw_norm_pca

In [None]:
# Normalised + PCA-reduced benign training rows. A copy is normalised so
# d_raw_train_normal stays raw, and the component count is pinned to the width
# of d_raw_norm_pca_test because the two frames are compared column by column.
d_raw_norm_pca_train_noraml = PCA_transformation(normalizing(d_raw_train_normal.copy()), d_raw_norm_pca_test.shape[1])

In [None]:
# Voting on d_raw_norm_pca. Like every other voting run, the reference population
# is the benign-only training frame built in the previous cell, not the full
# training set (which also contains attacks).
voting_d_raw_norm_pac_preds = voting(d_raw_norm_pca_train_noraml, d_raw_norm_pca_test, min_abnormal_features = 10, threshold = 50)

In [None]:

# Voting on d_norm_pca_probs.
voting_d_raw_norm_pac_preds4 = voting(d_norm_pca_probs_train_noraml, d_norm_pca_probs_test, min_abnormal_features = 10, threshold = 50)

# KM-D

## KM-D for d_raw

In [None]:
# KM-D on d_raw: a one-cluster KMeans yields the centroid of the benign training
# rows; test rows farther than 0.8 from it are labelled 1.
# NOTE(review): 0.8 is a hard-coded distance threshold; confirm how it was chosen.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_raw_train_normal)
kmd_d_raw_preds = kmd_model(d_raw_test, d_raw_train_normal,kmeans, 0.8)

## KM-D for d_raw_pca

In [None]:
# KM-D on d_raw_pca: centroid of the benign training components, distance threshold 0.8.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_raw_pca_train_normal)
kmd_d_raw_pca_preds = kmd_model(d_raw_pca_test, d_raw_pca_train_normal,kmeans,0.8)

## KM-D for d_raw_norm

In [None]:
# KM-D on d_raw_norm: centroid of the normalised benign training rows, distance threshold 0.5.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_raw_norm_train_normal)
kmd_d_raw_norm_preds = kmd_model(d_raw_norm_test, d_raw_norm_train_normal,kmeans,0.5)
# Uncomment to inspect the 0/1 split of the predictions:
# np.unique(kmd_d_raw_norm_preds, return_counts=True)

## KM-D for d_raw_norm_pca

In [None]:
# KM-D on d_raw_norm_pca: centroid of the benign training components, distance threshold 0.5.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_raw_norm_pca_train_noraml)
kmd_d_raw_norm_pca_preds = kmd_model(d_raw_norm_pca_test, d_raw_norm_pca_train_noraml,kmeans,0.5)

## KM-D for d_raw_probs

In [None]:
# KM-D on d_raw_probs: centroid of the benign training probabilities, distance threshold 150.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_raw_probs_train_normal)
kmd_d_raw_probs_preds = kmd_model(d_raw_probs_test, d_raw_probs_train_normal,kmeans,150)

## KM-D for d_raw_pca_probs

In [None]:
# KM-D on d_raw_pca_probs: centroid of the benign training probabilities, distance threshold 165.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_raw_pca_probs_train_normal)
kmd_d_raw_pca_probs_preds = kmd_model(d_raw_pca_probs_test, d_raw_pca_probs_train_normal,kmeans,165)

## KM-D for d_norm_probs

In [None]:
# KM-D on d_norm_probs: centroid of the benign training probabilities, distance threshold 150.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_norm_probs_train_normal)
kmd_d_raw_norm_probs_preds = kmd_model(d_norm_probs_test, d_norm_probs_train_normal,kmeans,150)

## KM-D for d_norm_pca_probs

In [None]:
# KM-D on d_norm_pca_probs: centroid of the benign training probabilities, distance threshold 176.
kmeans = KMeans(n_clusters=1, random_state=0, n_init="auto").fit(d_norm_pca_probs_train_noraml)
kmd_d_raw_norm_pca_probs_preds = kmd_model(d_norm_pca_probs_test, d_norm_pca_probs_train_noraml,kmeans,176)

# SVM

## SVM for d_raw

In [None]:
# SVM on d_raw: fit on d_raw_train (labels: train_target), predict d_raw_test.
svm_d_raw_preds = SVM_model(d_raw_train, d_raw_test)

## SVM for d_raw_pca

In [None]:
# SVM on d_raw_pca: fit on d_raw_pca_train, predict d_raw_pca_test.
svm_d_raw_pca_preds = SVM_model(d_raw_pca_train, d_raw_pca_test)

## SVM for d_raw_norm

In [None]:
# SVM on d_raw_norm: fit on d_raw_norm_train, predict d_raw_norm_test.
svm_d_raw_norm_preds = SVM_model(d_raw_norm_train, d_raw_norm_test)

## SVM for d_raw_norm_pca

In [None]:
# SVM on d_raw_norm_pca: fit on d_raw_norm_pca_train, predict d_raw_norm_pca_test.
svm_d_raw_norm_pca_preds = SVM_model(d_raw_norm_pca_train, d_raw_norm_pca_test)

## SVM for d_raw_probs

In [None]:
# SVM on d_raw_probs: fit on d_raw_probs_train, predict d_raw_probs_test.
svm_d_raw_probs_preds = SVM_model(d_raw_probs_train, d_raw_probs_test)

## SVM for d_raw_pca_probs

In [None]:
# SVM on d_raw_pca_probs: fit on d_raw_pca_probs_train, predict d_raw_pca_probs_test.
svm_d_raw_pca_probs_preds = SVM_model(d_raw_pca_probs_train, d_raw_pca_probs_test)

## SVM for d_norm_probs

In [None]:
# SVM on d_norm_probs: fit on d_norm_probs_train, predict d_norm_probs_test.
svm_d_norm_probs_preds = SVM_model(d_norm_probs_train, d_norm_probs_test)

## SVM for d_norm_pca_probs

In [None]:
# SVM on d_norm_pca_probs: fit on d_norm_pca_probs_train, predict d_norm_pca_probs_test.
svm_d_norm_pca_probs_preds = SVM_model(d_norm_pca_probs_train, d_norm_pca_probs_test)

# KM-C

## KM-C for d_raw

In [None]:
# KM-C on d_raw: fit kmean_C_model on d_raw_train and predict d_raw_test.
kmc_d_raw_preds = kmean_C_model(d_raw_train, d_raw_test)

## KM-C for d_raw_pca

In [None]:
# KM-C on d_raw_pca: fit on d_raw_pca_train, predict d_raw_pca_test.
kmc_d_raw_pca_preds = kmean_C_model(d_raw_pca_train, d_raw_pca_test)

## KM-C for d_raw_norm

In [None]:
# KM-C on d_raw_norm: fit on d_raw_norm_train, predict d_raw_norm_test.
kmc_d_raw_norm_preds = kmean_C_model(d_raw_norm_train, d_raw_norm_test)

## KM-C for d_raw_norm_pca

In [None]:
# KM-C on d_raw_norm_pca: fit on d_raw_norm_pca_train, predict d_raw_norm_pca_test.
kmc_d_raw_norm_pca_preds = kmean_C_model(d_raw_norm_pca_train, d_raw_norm_pca_test)

## KM-C for d_raw_probs

In [None]:
# KM-C on d_raw_probs: fit on d_raw_probs_train, predict d_raw_probs_test.
kmc_d_raw_probs_preds = kmean_C_model(d_raw_probs_train, d_raw_probs_test)

## KM-C for d_raw_pca_probs

In [None]:
# KM-C on d_raw_pca_probs: fit on d_raw_pca_probs_train, predict d_raw_pca_probs_test.
kmc_d_raw_pca_probs_preds = kmean_C_model(d_raw_pca_probs_train, d_raw_pca_probs_test)

## KM-C for d_norm_probs

In [None]:
# KM-C on d_norm_probs: fit on d_norm_probs_train, predict d_norm_probs_test.
kmc_d_norm_probs_preds = kmean_C_model(d_norm_probs_train, d_norm_probs_test)

## KM-C for d_norm_pca_probs

In [None]:
# KM-C on d_norm_pca_probs: fit on d_norm_pca_probs_train, predict d_norm_pca_probs_test.
kmc_d_norm_pca_probs_preds = kmean_C_model(d_norm_pca_probs_train, d_norm_pca_probs_test)

# DT

## DT for d_raw

In [None]:
# Decision tree on d_raw: fit on d_raw_train (labels: train_target), predict d_raw_test.
dt_d_raw_preds = dt_model(d_raw_train, d_raw_test)

## DT for d_raw_pca

In [None]:
# Decision tree on d_raw_pca: fit on d_raw_pca_train, predict d_raw_pca_test.
dt_d_raw_pca_preds = dt_model(d_raw_pca_train, d_raw_pca_test)

## DT for d_raw_norm

In [None]:
# Decision tree on d_raw_norm: fit on d_raw_norm_train, predict d_raw_norm_test.
dt_d_raw_norm_preds = dt_model(d_raw_norm_train, d_raw_norm_test)

## DT for d_raw_norm_pca

In [None]:
# Decision tree on d_raw_norm_pca: fit on d_raw_norm_pca_train, predict d_raw_norm_pca_test.
dt_d_raw_norm_pca_preds = dt_model(d_raw_norm_pca_train, d_raw_norm_pca_test)

## DT for d_raw_probs

In [None]:
# Decision tree on d_raw_probs: fit on d_raw_probs_train, predict d_raw_probs_test.
dt_d_raw_probs_preds = dt_model(d_raw_probs_train, d_raw_probs_test)

## DT for d_raw_pca_probs

In [None]:
# Decision tree on d_raw_pca_probs: fit on d_raw_pca_probs_train, predict d_raw_pca_probs_test.
dt_d_raw_pca_probs_preds = dt_model(d_raw_pca_probs_train, d_raw_pca_probs_test)

## DT for d_norm_probs

In [None]:
# Decision tree on d_norm_probs: fit on d_norm_probs_train, predict d_norm_probs_test.
dt_d_norm_probs_preds = dt_model(d_norm_probs_train, d_norm_probs_test)

## DT for d_norm_pca_probs

In [None]:
# Decision tree on d_norm_pca_probs: fit on d_norm_pca_probs_train, predict d_norm_pca_probs_test.
dt_d_norm_pca_probs_preds = dt_model(d_norm_pca_probs_train, d_norm_pca_probs_test)

# MLP

## MLP for d_raw

In [None]:
# MLP on d_raw: fit on d_raw_train (labels: train_target), predict d_raw_test.
mlp_d_raw_preds = mlp_model(d_raw_train, d_raw_test)

## MLP for d_raw_pca

In [None]:
# MLP on d_raw_pca: fit on d_raw_pca_train, predict d_raw_pca_test.
mlp_d_raw_pca_preds = mlp_model(d_raw_pca_train, d_raw_pca_test)

## MLP for d_raw_norm

In [None]:
# MLP on d_raw_norm: fit on d_raw_norm_train, predict d_raw_norm_test.
mlp_d_raw_norm_preds = mlp_model(d_raw_norm_train, d_raw_norm_test)

## MLP for d_raw_norm_pca

In [None]:
# MLP on d_raw_norm_pca: fit on d_raw_norm_pca_train, predict d_raw_norm_pca_test.
mlp_d_raw_norm_pca_preds = mlp_model(d_raw_norm_pca_train, d_raw_norm_pca_test)

## MLP for d_raw_probs

In [None]:
# MLP on d_raw_probs: fit on d_raw_probs_train, predict d_raw_probs_test.
mlp_d_raw_probs_preds = mlp_model(d_raw_probs_train, d_raw_probs_test)

## MLP for d_raw_pca_probs

In [None]:
# MLP on d_raw_pca_probs: fit on d_raw_pca_probs_train, predict d_raw_pca_probs_test.
mlp_d_raw_pca_probs_preds = mlp_model(d_raw_pca_probs_train, d_raw_pca_probs_test)

## MLP for d_norm_probs

In [None]:
# MLP on d_norm_probs: fit on d_norm_probs_train, predict d_norm_probs_test.
mlp_d_norm_probs_preds = mlp_model(d_norm_probs_train, d_norm_probs_test)

## MLP for d_norm_pca_probs

In [None]:
# MLP on d_norm_pca_probs: fit on d_norm_pca_probs_train, predict d_norm_pca_probs_test.
mlp_d_norm_pca_probs_preds = mlp_model(d_norm_pca_probs_train, d_norm_pca_probs_test)

# Evaluation

In [None]:
def f1(y_true, y_pred):
    """F1 score of ``y_pred`` against ``y_true`` (thin wrapper around ``f1_score``)."""
    score = f1_score(y_true, y_pred)
    return score

In [None]:
# F1 of the MLP on the d_raw test set (shown as the cell output).
f1(test_target, mlp_d_raw_preds)

In [None]:
# F1 of KM-D on the d_raw test set.
f1(test_target, kmd_d_raw_preds )

In [None]:
# F1 of the SVM on the d_raw test set.
f1(test_target, svm_d_raw_preds )

In [None]:
# F1 of KM-C on the d_raw test set.
f1(test_target, kmc_d_raw_preds )

In [None]:
# F1 of the decision tree on the d_raw test set.
f1(test_target, dt_d_raw_preds )

In [None]:
# F1 of the voting model on the d_raw test set.
f1(test_target, voting_d_raw_preds   )

In [None]:
# F1 score of each of the six models on the d_raw dataset, one row per model.
model_names = ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    voting_d_raw_preds,
    kmd_d_raw_preds,
    svm_d_raw_preds,
    kmc_d_raw_preds,
    dt_d_raw_preds,
    mlp_d_raw_preds,
]
results_df = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})

# Show the results table.
print(results_df)

In [None]:
def sensitivity(y_true, y_pred):
    """Recall of the positive class: ``TP / (TP + FN)``.

    Parameters
    ----------
    y_true, y_pred : sequence of 0/1
        True and predicted labels in the same order. Lists, numpy arrays and
        pandas Series are all accepted; they are compared by position, never by
        index label.

    Returns
    -------
    float
        The sensitivity, or 0.0 when ``y_true`` contains no positive sample.
    """
    # Convert first: with a plain list, `y_pred == 1` is a single bool rather
    # than an element-wise comparison, which silently yields a score of 0.
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    positives = truth == 1
    tp = int(np.count_nonzero(positives & (predicted == 1)))
    fn = int(np.count_nonzero(positives & (predicted == 0)))

    return tp / (tp + fn) if (tp + fn) > 0 else 0.0

In [None]:
# Sensitivity (recall on the attack class) of the MLP on the d_raw test set.
sensitivity(test_target, mlp_d_raw_preds)

In [None]:
# Sensitivity of each of the six models on the d_raw dataset, one row per model.
model_names = ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    voting_d_raw_preds,
    kmd_d_raw_preds,
    svm_d_raw_preds,
    kmc_d_raw_preds,
    dt_d_raw_preds,
    mlp_d_raw_preds,
]
results_df_sens1 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# Show the results table.
print(results_df_sens1)

In [None]:
from sklearn.metrics import auc, roc_curve

def calculate_cap(y_true, y_prob):
    """Points of the cumulative accuracy profile (CAP) curve.

    Samples are ranked by decreasing ``y_prob``; the curve gives, after each
    sample, the fraction of all positives captured so far.

    Parameters
    ----------
    y_true : sequence of 0/1
        True labels (list, array or Series; used by position).
    y_prob : sequence of float
        Scores, higher meaning "more likely positive"; same order as ``y_true``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``cum_total`` (1..n, number of samples inspected) and ``cap`` (fraction
        of positives captured). ``cap`` is all zeros when there is no positive.
    """
    # Positional arrays: indexing a Series with argsort output would look the
    # positions up as index labels.
    truth = np.asarray(y_true)
    ranking = np.argsort(np.asarray(y_prob))[::-1]

    cum_positive = np.cumsum(truth[ranking])
    cum_total = np.arange(1, len(truth) + 1)

    total_positive = truth.sum()
    if total_positive == 0:
        cap = np.zeros(len(truth))
    else:
        cap = cum_positive / total_positive
    return cum_total, cap

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the MLP on d_raw. The score is computed from the hard 0/1
# predictions, not from class probabilities.
auc_score = roc_auc_score(test_target, mlp_d_raw_preds)
print(f"AUC Score: {auc_score}")



In [None]:
import pandas as pd
from sklearn.metrics import roc_auc_score

# ROC AUC of every model on d_raw, computed from the hard 0/1 predictions.
auc_mlp, auc_svm, auc_dt, auc_kmc, auc_kmd, auc_voting = [
    roc_auc_score(test_target, preds)
    for preds in (
        mlp_d_raw_preds,
        svm_d_raw_preds,
        dt_d_raw_preds,
        kmc_d_raw_preds,
        kmd_d_raw_preds,
        voting_d_raw_preds,
    )
]

# One row per model; the 'CAP' column holds the ROC AUC values computed above.
results_df_cap1 = pd.DataFrame({
    'Model': ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP'],
    'CAP': [auc_voting, auc_kmd, auc_svm, auc_kmc, auc_dt, auc_mlp]
})

# Show the results table.
print(results_df_cap1)


In [None]:
# Combine the three metric tables into one table with a single 'Model' column.
# Joining on 'Model' avoids the three duplicate 'Model' columns that a
# column-wise concat produces, and keeps the rows matched by model name.
final_result_df = (
    results_df_cap1
    .merge(results_df_sens1, on='Model')
    .merge(results_df, on='Model')
)

# Show the final results table.
print(final_result_df)



In [None]:
# Evaluation tables for the d_raw dataset (five models, no Voting row):
# F1, sensitivity and ROC AUC, one row per model.
model_names = ['KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    kmd_d_raw_preds,
    svm_d_raw_preds,
    kmc_d_raw_preds,
    dt_d_raw_preds,
    mlp_d_raw_preds,
]

results_df = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_kmd, auc_svm, auc_kmc, auc_dt, auc_mlp = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_kmd, auc_svm, auc_kmc, auc_dt, auc_mlp],
})

# Show the three results tables.
print(results_df)
print(results_df_sens)
print(results_df_cap)

In [None]:
# Evaluation tables for the d_raw_pca dataset: F1, sensitivity and ROC AUC.
# The Voting row uses the voting predictions made on d_raw_pca itself, so all
# six rows describe the same dataset.
model_names = ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    voting_d_raw_pac_preds,
    kmd_d_raw_pca_preds,
    svm_d_raw_pca_preds,
    kmc_d_raw_pca_preds,
    dt_d_raw_pca_preds,
    mlp_d_raw_pca_preds,
]

results_df2 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens2 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_voting2, auc_kmd2, auc_svm2, auc_kmc2, auc_dt2, auc_mlp2 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap2 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_voting2, auc_kmd2, auc_svm2, auc_kmc2, auc_dt2, auc_mlp2],
})

# Show the three results tables.
print(results_df2)
print(results_df_sens2)
print(results_df_cap2)

In [None]:
# Evaluation tables for the d_raw_norm dataset (five models, no Voting row):
# F1, sensitivity and ROC AUC, one row per model.
model_names = ['KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    kmd_d_raw_norm_preds,
    svm_d_raw_norm_preds,
    kmc_d_raw_norm_preds,
    dt_d_raw_norm_preds,
    mlp_d_raw_norm_preds,
]

results_df3 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens3 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_kmd3, auc_svm3, auc_kmc3, auc_dt3, auc_mlp3 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap3 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_kmd3, auc_svm3, auc_kmc3, auc_dt3, auc_mlp3],
})

# Show the three results tables.
print(results_df3)
print(results_df_sens3)
print(results_df_cap3)

In [None]:
# Evaluation tables for the d_raw_norm_pca dataset: F1, sensitivity and ROC AUC.
# The Voting row uses the voting predictions made on d_raw_norm_pca itself, so
# all six rows describe the same dataset.
model_names = ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    voting_d_raw_norm_pac_preds,
    kmd_d_raw_norm_pca_preds,
    svm_d_raw_norm_pca_preds,
    kmc_d_raw_norm_pca_preds,
    dt_d_raw_norm_pca_preds,
    mlp_d_raw_norm_pca_preds,
]

results_df4 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens4 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_voting4, auc_kmd4, auc_svm4, auc_kmc4, auc_dt4, auc_mlp4 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap4 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_voting4, auc_kmd4, auc_svm4, auc_kmc4, auc_dt4, auc_mlp4],
})

# Show the three results tables.
print(results_df4)
print(results_df_sens4)
print(results_df_cap4)

In [None]:
# Evaluation tables for the d_raw_probs dataset (five models, no Voting row):
# F1, sensitivity and ROC AUC, one row per model.
model_names = ['KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    kmd_d_raw_probs_preds,
    svm_d_raw_probs_preds,
    kmc_d_raw_probs_preds,
    dt_d_raw_probs_preds,
    mlp_d_raw_probs_preds,
]

results_df5 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens5 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_kmd5, auc_svm5, auc_kmc5, auc_dt5, auc_mlp5 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap5 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_kmd5, auc_svm5, auc_kmc5, auc_dt5, auc_mlp5],
})

# Show the three results tables.
print(results_df5)
print(results_df_sens5)
print(results_df_cap5)

In [None]:
# Evaluation tables for the d_raw_pca_probs dataset: F1, sensitivity and ROC AUC.
# The Voting row uses the voting predictions made on d_raw_pca_probs itself, so
# all six rows describe the same dataset.
model_names = ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    voting_d_raw_pac_preds2,
    kmd_d_raw_pca_probs_preds,
    svm_d_raw_pca_probs_preds,
    kmc_d_raw_pca_probs_preds,
    dt_d_raw_pca_probs_preds,
    mlp_d_raw_pca_probs_preds,
]

results_df6 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens6 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_voting6, auc_kmd6, auc_svm6, auc_kmc6, auc_dt6, auc_mlp6 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap6 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_voting6, auc_kmd6, auc_svm6, auc_kmc6, auc_dt6, auc_mlp6],
})

# Show the three results tables.
print(results_df6)
print(results_df_sens6)
print(results_df_cap6)

In [None]:
# Evaluation tables for the d_norm_probs dataset (five models, no Voting row):
# F1, sensitivity and ROC AUC, one row per model.
model_names = ['KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    kmd_d_raw_norm_probs_preds,
    svm_d_norm_probs_preds,
    kmc_d_norm_probs_preds,
    dt_d_norm_probs_preds,
    mlp_d_norm_probs_preds,
]

results_df7 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})
results_df_sens7 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_kmd7, auc_svm7, auc_kmc7, auc_dt7, auc_mlp7 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap7 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_kmd7, auc_svm7, auc_kmc7, auc_dt7, auc_mlp7],
})

# Show the three results tables.
print(results_df7)
print(results_df_sens7)
print(results_df_cap7)

In [None]:
# Evaluation tables for the d_norm_pca_probs dataset: F1, sensitivity and ROC AUC,
# one row per model.
model_names = ['Voting', 'KMD', 'SVM', 'KMC', 'Decision Tree', 'MLP']
model_preds = [
    voting_d_raw_norm_pac_preds4,
    kmd_d_raw_norm_pca_probs_preds,
    svm_d_norm_pca_probs_preds,
    kmc_d_norm_pca_probs_preds,
    dt_d_norm_pca_probs_preds,
    mlp_d_norm_pca_probs_preds,
]

results_df8 = pd.DataFrame({
    'Model': model_names,
    'F1 Score': [f1(test_target, preds) for preds in model_preds],
})

results_df_sens8 = pd.DataFrame({
    'Model': model_names,
    'sensitivity': [sensitivity(test_target, preds) for preds in model_preds],
})

# ROC AUC per model, computed from the hard 0/1 predictions.
auc_voting8, auc_kmd8, auc_svm8, auc_kmc8, auc_dt8, auc_mlp8 = [
    roc_auc_score(test_target, preds) for preds in model_preds
]

# The 'CAP' column holds the ROC AUC values computed above.
results_df_cap8 = pd.DataFrame({
    'Model': model_names,
    'CAP': [auc_voting8, auc_kmd8, auc_svm8, auc_kmc8, auc_dt8, auc_mlp8],
})

# Show the three results tables.
print(results_df8)
print(results_df_sens8)
print(results_df_cap8)
