In [None]:
# Extract dataset_binary.zip from /content/drive/MyDrive into /content/dataset (-q: quiet).
!unzip -q /content/drive/MyDrive/master\ SDIA/S3/Advanced\ Topics\ \ Data\ Science/dataset_binary.zip \
 -d /content/dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Feature tables of the three splits.
X_train = pd.read_csv('/content/dataset/X_train.csv')
X_val = pd.read_csv('/content/dataset/X_val.csv')
X_test = pd.read_csv('/content/dataset/X_test.csv')

# Matching label tables, one per split.
y_train = pd.read_csv('/content/dataset/y_train.csv')
y_val = pd.read_csv('/content/dataset/y_val.csv')
y_test = pd.read_csv('/content/dataset/y_test.csv')

In [None]:
# Row counts of the train / test / validation feature tables.
len(X_train), len(X_test), len(X_val)

(5076507, 1087824, 1087824)

### **Encodage du label (obligatoire)**

In [None]:
# Integer ids of the two traffic classes.
label_map = {
    'Benign': 0,
    'Malicious': 1
}


def _encode_labels(labels):
    """Return a copy of ``labels`` whose first column holds int64 class ids.

    Parameters
    ----------
    labels : pandas.DataFrame
        Label table; only its first column is encoded.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``labels``, first column cast to int64.

    Raises
    ------
    ValueError
        If the first column contains a value that is not a key of
        ``label_map`` (``Series.map`` would silently turn it into NaN).
    """
    first_col = labels.columns[0]
    raw = labels.iloc[:, 0]

    # Re-running the cell must be harmless: mapping already-encoded 0/1 values
    # through label_map a second time would turn every label into NaN.
    if raw.isin(list(label_map.values())).all():
        encoded = raw
    else:
        encoded = raw.map(label_map)
        if encoded.isna().any():
            unknown = sorted(set(raw[encoded.isna()].astype(str)))
            raise ValueError(f"Unmapped label values: {unknown}")

    out = labels.copy()
    # Explicit integer dtype instead of an object column that happens to hold ints.
    out[first_col] = encoded.astype('int64')
    return out


y_train = _encode_labels(y_train)
y_val = _encode_labels(y_val)
y_test = _encode_labels(y_test)


### **Normalisation des features (TRÈS IMPORTANT)**

On fit le scaler UNIQUEMENT sur le train
(et on applique sur val/test)

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# 1) Fit on the training split only: the per-feature statistics learned here
#    are the ones applied to every split afterwards (partial_fit returns the scaler).
scaler = StandardScaler().partial_fit(X_train)

# 2️⃣ Fonction de scaling par batch
def scale_in_batches(X, scaler, batch_size=50_000):
    """Run ``scaler.transform`` over ``X`` in row chunks and collect a float32 array.

    Parameters
    ----------
    X : array or object exposing ``.values`` (e.g. a DataFrame)
        Rows to transform; ``.values`` is used when the attribute exists.
    scaler : object with a ``transform`` method
        Called once per chunk of at most ``batch_size`` rows.
    batch_size : int
        Number of rows handed to ``scaler.transform`` at a time.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as ``X``.
    """
    data = X.values if hasattr(X, "values") else X
    scaled = np.empty(data.shape, dtype=np.float32)

    # Transform one chunk at a time so only a chunk-sized temporary is created.
    for lo in range(0, data.shape[0], batch_size):
        rows = slice(lo, lo + batch_size)
        scaled[rows] = scaler.transform(data[rows])

    return scaled

# 3️⃣ Scaling sécurisé
# Every split goes through the same scaler (fitted on the training data).
X_train_scaled = scale_in_batches(X_train, scaler=scaler)
X_val_scaled = scale_in_batches(X_val, scaler=scaler)
X_test_scaled = scale_in_batches(X_test, scaler=scaler)




In [None]:
import gc

import torch

# Drop the unscaled frames (scaled copies were built from them) and reclaim memory.
del X_train, X_val, X_test
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Sanity check: feature/label shapes per split and the label values present in train.
print("Train :", X_train_scaled.shape, y_train.shape)
print("Val   :", X_val_scaled.shape, y_val.shape)
print("Test  :", X_test_scaled.shape, y_test.shape)
print("Labels uniques :", y_train.iloc[:, 0].unique())

Train : (5076507, 77) (5076507, 1)
Val   : (1087824, 77) (1087824, 1)
Test  : (1087824, 77) (1087824, 1)
Labels uniques : [1 0]


### **L’équilibrage se fait UNIQUEMENT sur le TRAIN**

In [None]:
import pandas as pd
from sklearn.utils import resample

# IMPORTANT: build the frame from X_train_scaled (the scaled features);
# the label column is appended last, after the feature columns.
train_df = pd.concat([
    pd.DataFrame(X_train_scaled),
    y_train.reset_index(drop=True)
], axis=1)

In [None]:
# The label is the last column of train_df; split the rows by class id.
label_col = train_df.columns[-1]
df_benign = train_df.loc[train_df[label_col] == 0]
df_malicious = train_df.loc[train_df[label_col] == 1]


In [None]:
# Class distribution of the (still unbalanced) training frame.
print(train_df.iloc[:, -1].value_counts())


Label
1    5075604
0        903
Name: count, dtype: int64


In [None]:
import gc

# train_df was already split into df_benign / df_malicious; drop it and the scaled array.
del X_train_scaled, train_df
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Number of rows each class is resampled to.
M = 2_000_000
print("Taille cible par classe :", M)

Taille cible par classe : 2000000


In [None]:
# Majority class: draw M distinct rows (sampling without replacement).
df_malicious_downsampled = resample(
    df_malicious,
    n_samples=M,
    replace=False,
    random_state=42,
)

In [None]:
# Minority class: draw M rows with replacement, i.e. existing rows are duplicated.
df_benign_upsampled = resample(
    df_benign,
    n_samples=M,
    replace=True,
    random_state=42,
)

In [None]:
# Stack both resampled classes, then shuffle the rows and renumber the index.
train_balanced_df = pd.concat([df_benign_upsampled, df_malicious_downsampled])
train_balanced_df = train_balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
import gc

# The per-class frames have been merged into train_balanced_df; release them.
del df_benign, df_malicious, df_benign_upsampled, df_malicious_downsampled
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Back to NumPy: every column except the last is a feature, the last one is the label.
X_train_balanced = train_balanced_df.iloc[:, :-1].values
y_train_balanced = train_balanced_df.iloc[:, -1].values


In [None]:
import gc

# The balanced frame now lives on as X_train_balanced / y_train_balanced.
del train_balanced_df
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Check that both classes have the same number of rows after resampling.
pd.Series(y_train_balanced).value_counts()


Unnamed: 0,count
0,2000000
1,2000000


### **Implémentation FWM**

In [None]:
import numpy as np
# Compute the FWM feature weights (training data only)
def compute_fwm_weights(X, y, eps=1e-8):
    """Score each feature by how well it separates class 0 from class 1.

    For every feature the score is ``|mu_1 - mu_0| / (std_0 + std_1 + eps)``,
    where ``mu_c`` / ``std_c`` are the mean / standard deviation of that
    feature over the samples of class ``c``. Scores are then divided by
    ``sum(scores) + eps`` so the weights sum to (almost) one.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    y : array-like, shape (n_samples,) or (n_samples, 1)
        Binary labels (0 / 1); flattened before use.
    eps : float
        Small constant guarding both divisions against a zero denominator.

    Returns
    -------
    np.ndarray, shape (n_features,), dtype float32
        Normalised feature weights.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` disagree on the number of samples, or if either
        class has no sample (the statistics would be NaN).
    """
    X = np.asarray(X)
    # A (n, 1) label column would not be usable as a row mask; flatten it.
    y = np.asarray(y).ravel()
    if y.shape[0] != X.shape[0]:
        raise ValueError(
            f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
        )

    # Per-class subsets
    X_0 = X[y == 0]
    X_1 = X[y == 1]
    if X_0.shape[0] == 0 or X_1.shape[0] == 0:
        raise ValueError(
            "compute_fwm_weights needs at least one sample of class 0 and one of class 1"
        )

    # Per-feature means; accumulate in float64 so long float32 columns do not
    # lose precision.
    mu_0 = np.mean(X_0, axis=0, dtype=np.float64)
    mu_1 = np.mean(X_1, axis=0, dtype=np.float64)

    # Per-feature standard deviations
    std_0 = np.std(X_0, axis=0, dtype=np.float64)
    std_1 = np.std(X_1, axis=0, dtype=np.float64)

    # Separability score
    scores = np.abs(mu_1 - mu_0) / (std_0 + std_1 + eps)

    # Normalisation
    weights = scores / (np.sum(scores) + eps)

    return weights.astype(np.float32)


In [None]:
# Compute the FWM weights once, from the balanced training set only.
fwm_weights = compute_fwm_weights(
    X_train_balanced,
    y_train_balanced
)

print("Shape des poids :", fwm_weights.shape)


Shape des poids : (77,)


In [None]:
# Apply the FWM weights chunk by chunk (memory friendly)
def apply_fwm_in_batches(X, weights, batch_size=50_000):
    """Multiply every row of ``X`` element-wise by ``weights``.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Feature matrix.
    weights : np.ndarray, shape (n_features,)
        One multiplicative weight per feature.
    batch_size : int
        Number of rows multiplied at a time.

    Returns
    -------
    np.ndarray
        float32 array shaped like ``X`` holding the weighted features.
    """
    weighted = np.empty_like(X, dtype=np.float32)

    # Chunked so the product temporary never exceeds batch_size rows.
    for lo in range(0, X.shape[0], batch_size):
        rows = slice(lo, lo + batch_size)
        weighted[rows] = X[rows] * weights

    return weighted


In [None]:
# Apply the same FWM weights to the train / val / test features.
X_train_fwm = apply_fwm_in_batches(X_train_balanced, fwm_weights)
X_val_fwm   = apply_fwm_in_batches(X_val_scaled,   fwm_weights)
X_test_fwm  = apply_fwm_in_batches(X_test_scaled,  fwm_weights)


In [None]:
# Quick sanity check: first 10 features of the first training row, before and after weighting.
print("Avant FWM :", X_train_balanced[0, :10])
print("Après FWM :", X_train_fwm[0, :10])


Avant FWM : [ 4.5545931e+00  1.1845577e+00  3.4474282e+00 -7.8125382e-03
  5.2828209e+01 -5.4075713e-03  2.5962763e+00  2.6907651e+00
  2.6754749e+00 -1.6975038e-01]
Après FWM : [ 9.3613759e-02  4.5933731e-02  8.8477530e-02 -1.6546519e-04
  1.1535512e+00 -8.0005266e-05  9.2550516e-03  9.0150461e-03
  8.3973967e-03 -8.9236075e-04]


### **Implémentation du SWCC**

In [None]:
import numpy as np
from tqdm import tqdm

def swcc_single_batch(X_batch, window_size=5):
    """Subtract from every value the mean of a sliding window of neighbouring rows.

    For row ``j`` and each feature, the window covers rows
    ``max(0, j - window_size // 2)`` to ``min(n_samples, j + window_size // 2 + 1)``
    (exclusive), so it is truncated at both ends of the batch. The result is
    ``X_batch[j] - mean(window)``.

    The window means are built by adding ``2 * (window_size // 2) + 1`` shifted
    copies of the batch instead of looping over every (row, feature) pair in
    Python; sums are accumulated in float64.

    Parameters
    ----------
    X_batch : np.ndarray, shape (n_samples, n_features)
        Batch to correct. It is not modified.
    window_size : int
        Size of the sliding window (non-negative).

    Returns
    -------
    np.ndarray
        Corrected batch, same shape and dtype as ``X_batch``.

    Raises
    ------
    ValueError
        If ``window_size`` is negative (the window would be empty).
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    X_batch = np.asarray(X_batch)
    n_samples, n_features = X_batch.shape

    # Offsets beyond the batch length never hit a valid row, so cap the reach.
    half = min(window_size // 2, max(n_samples - 1, 0))

    window_sum = np.zeros((n_samples, n_features), dtype=np.float64)
    window_len = np.zeros(n_samples, dtype=np.float64)

    for offset in range(-half, half + 1):
        # Rows j for which the neighbour j + offset exists inside the batch.
        first = max(0, -offset)
        last = min(n_samples, n_samples - offset)
        if first >= last:
            continue
        window_sum[first:last] += X_batch[first + offset:last + offset]
        window_len[first:last] += 1.0

    # offset == 0 always contributes, so window_len is >= 1 for every row.
    local_mean = window_sum / window_len[:, None]

    return (X_batch - local_mean).astype(X_batch.dtype)

def apply_swcc_in_batches(X, batch_size=1024, window_size=5):
    """Run ``swcc_single_batch`` over ``X`` one chunk of rows at a time.

    Each chunk of ``batch_size`` rows is corrected independently, so a sliding
    window never reaches across a chunk boundary. Progress is reported through
    a ``tqdm`` bar labelled "SWCC".

    Returns an array shaped (and typed) like ``X`` holding the corrected rows.
    """
    total_rows = X.shape[0]
    corrected = np.zeros_like(X)

    chunk_starts = range(0, total_rows, batch_size)
    for lo in tqdm(chunk_starts, desc="SWCC"):
        hi = min(lo + batch_size, total_rows)
        corrected[lo:hi] = swcc_single_batch(X[lo:hi], window_size=window_size)
    return corrected

In [None]:
# Apply SWCC to the train / val / test features (same batch and window size for all three).
X_train_swcc = apply_swcc_in_batches(X_train_balanced, batch_size=1024, window_size=5)
X_val_swcc   = apply_swcc_in_batches(X_val_scaled,   batch_size=1024, window_size=5)
X_test_swcc  = apply_swcc_in_batches(X_test_scaled,  batch_size=1024, window_size=5)

SWCC: 100%|██████████| 3907/3907 [50:34<00:00,  1.29it/s]
SWCC: 100%|██████████| 1063/1063 [13:40<00:00,  1.30it/s]
SWCC: 100%|██████████| 1063/1063 [13:35<00:00,  1.30it/s]


### **Modelisation CNN**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- 1. Generic 1-D CNN ---
class SimpleCNN(nn.Module):
    """Small 1-D CNN: two Conv1d + BatchNorm1d + ReLU stages, global average
    pooling over the length axis, then a linear layer producing class logits.

    Input is ``(batch, input_channels, length)``; output is ``(batch, num_classes)``.
    """

    def __init__(self, input_channels=1, num_classes=2):
        super().__init__()
        # Attribute names double as state_dict keys; keep them stable so that
        # previously saved weights can still be loaded.
        self.conv1 = nn.Conv1d(
            in_channels=input_channels, out_channels=16, kernel_size=3, padding=1
        )
        self.bn1 = nn.BatchNorm1d(num_features=16)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(
            in_channels=16, out_channels=32, kernel_size=3, padding=1
        )
        self.bn2 = nn.BatchNorm1d(num_features=32)
        self.pool = nn.AdaptiveAvgPool1d(output_size=1)
        self.fc = nn.Linear(in_features=32, out_features=num_classes)

    def forward(self, x):
        # Both stages share the same conv -> batch-norm -> ReLU pattern.
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            x = self.relu(norm(conv(x)))
        # (batch, 32, 1) after pooling -> (batch, 32)
        pooled = self.pool(x).squeeze(-1)
        return self.fc(pooled)
def train_model_clean(X_train, y_train, X_val, y_val, epochs=10, batch_size=64, lr=0.001):
    """Train a ``SimpleCNN`` and return it with its best-validation weights loaded.

    Parameters
    ----------
    X_train, X_val : array-like
        Features. 2-D inputs ``(n_samples, n_features)`` get a singleton channel
        axis inserted, giving ``(n_samples, 1, n_features)``.
    y_train, y_val : array-like
        Integer class ids. They are flattened to 1-D, so a ``(n_samples, 1)``
        column is accepted.
    epochs : int
        Number of passes over the training set.
    batch_size : int
        Mini-batch size for both loaders.
    lr : float
        Adam learning rate.

    Returns
    -------
    SimpleCNN
        The model (on CUDA when available, otherwise CPU) carrying the weights
        of the epoch with the highest validation accuracy.
    """
    import copy

    import numpy as np
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset

    # --- Labels as 1-D integer vectors ---
    # A (n, 1) label column would broadcast against the (batch,) predictions in
    # `predicted == y_batch`, producing a (batch, batch) mask and an "accuracy"
    # far above 1.
    y_train = np.asarray(y_train).astype(int).ravel()
    y_val = np.asarray(y_val).astype(int).ravel()

    # --- Features as float32 tensors ---
    def prepare_tensor(X):
        # as_tensor reuses the NumPy buffer when it is already float32, which
        # avoids duplicating a multi-GB training array.
        X_tensor = torch.as_tensor(np.asarray(X), dtype=torch.float32)
        if X_tensor.ndim == 2:
            X_tensor = X_tensor.unsqueeze(1)  # (batch, 1, features)
        return X_tensor

    X_train_tensor = prepare_tensor(X_train)
    X_val_tensor = prepare_tensor(X_val)
    y_train_tensor = torch.as_tensor(y_train, dtype=torch.long)
    y_val_tensor = torch.as_tensor(y_val, dtype=torch.long)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # --- Device / model / optimiser ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SimpleCNN(input_channels=X_train_tensor.shape[1], num_classes=len(np.unique(y_train)))
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # state_dict() hands out references to the live parameters, so the snapshot
    # must be a deep copy; otherwise it silently follows every later update and
    # "best" always means "last epoch".
    best_val_acc = -1.0  # below any real accuracy: the first epoch is always kept
    best_model = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        # --- Training ---
        model.train()
        running_loss = 0.0
        correct_train, total_train = 0, 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            # loss is a batch mean; weight it by the batch size for a per-sample average
            running_loss += loss.item() * X_batch.size(0)
            _, predicted = torch.max(outputs, 1)
            total_train += y_batch.size(0)
            correct_train += (predicted == y_batch).sum().item()

        train_acc = float(correct_train) / float(total_train)

        # --- Validation ---
        model.eval()
        correct_val, total_val = 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                _, predicted = torch.max(outputs, 1)
                total_val += y_batch.size(0)
                correct_val += (predicted == y_batch).sum().item()

        val_acc = float(correct_val) / float(total_val)

        # --- Keep the best weights seen so far ---
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = copy.deepcopy(model.state_dict())

        # --- Accuracies are fractions in [0, 1] ---
        print(f"Epoch {epoch+1}/{epochs} - Loss: {running_loss/total_train:.4f} "
              f"- Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f}")

    model.load_state_dict(best_model)
    return model




### **CNN-1 : sur données normales**

In [None]:
# Make sure every label set is a 1-D integer vector. y_val / y_test come from
# single-column tables, i.e. shape (n, 1); comparing such a column with a
# (batch,) prediction vector broadcasts and corrupts the accuracy count.
y_train_balanced = np.asarray(y_train_balanced).astype(int).ravel()
y_val = np.asarray(y_val).astype(int).ravel()
y_test = np.asarray(y_test).astype(int).ravel()

# Check the dtype
print(y_train_balanced.dtype)  # expected: int64
print(y_val.dtype)

int64
int64


In [None]:
# --- 3. Train the three CNNs ---
# CNN-1: baseline on the scaled features (no FWM weighting, no SWCC correction).
# `train_model_v1` is not defined anywhere in this notebook; use the training
# function that is, with the 20 epochs shown in the recorded output.
cnn1_model = train_model_clean(X_train_balanced, y_train_balanced, X_val_scaled, y_val, epochs=20)

Epoch 1/20 - Loss: 0.1373 - Train Acc: 0.9518 - Val Acc: 63.9779
Epoch 2/20 - Loss: 0.0832 - Train Acc: 0.9708 - Val Acc: 63.9564
Epoch 3/20 - Loss: 0.0658 - Train Acc: 0.9765 - Val Acc: 63.9823
Epoch 4/20 - Loss: 0.0562 - Train Acc: 0.9802 - Val Acc: 63.9532
Epoch 5/20 - Loss: 0.0502 - Train Acc: 0.9826 - Val Acc: 63.9681
Epoch 6/20 - Loss: 0.0461 - Train Acc: 0.9843 - Val Acc: 52.5729
Epoch 7/20 - Loss: 0.0431 - Train Acc: 0.9854 - Val Acc: 63.9748
Epoch 8/20 - Loss: 0.0413 - Train Acc: 0.9862 - Val Acc: 63.9672
Epoch 9/20 - Loss: 0.0392 - Train Acc: 0.9871 - Val Acc: 63.9497
Epoch 10/20 - Loss: 0.0380 - Train Acc: 0.9876 - Val Acc: 63.8980
Epoch 11/20 - Loss: 0.0369 - Train Acc: 0.9880 - Val Acc: 63.9542
Epoch 12/20 - Loss: 0.0357 - Train Acc: 0.9885 - Val Acc: 63.2816
Epoch 13/20 - Loss: 0.0349 - Train Acc: 0.9889 - Val Acc: 63.3978
Epoch 14/20 - Loss: 0.0340 - Train Acc: 0.9893 - Val Acc: 63.9715
Epoch 15/20 - Loss: 0.0332 - Train Acc: 0.9896 - Val Acc: 60.4481
Epoch 16/20 - Loss:

In [None]:
def evaluate_model_full(model, X_test, y_test, batch_size=64):
    """Evaluate a binary classifier and print / return its test metrics.

    Parameters
    ----------
    model : torch.nn.Module
        Model returning class logits of shape ``(batch, n_classes)``. It is
        moved to CUDA when available (otherwise CPU) and put in eval mode.
    X_test : array-like
        Features; 2-D input ``(n_samples, n_features)`` gets a singleton
        channel axis inserted.
    y_test : array-like
        Labels 0 / 1; flattened to 1-D.
    batch_size : int
        Inference batch size.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, f1, TPR, FPR)``. The confusion matrix is
        always 2x2 with rows/columns ordered ``[0, 1]``; class 1 is the
        positive class for F1, TPR and FPR.
    """
    import torch
    import numpy as np
    from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Batches are sent to `device` below; a model still sitting on another
    # device (e.g. freshly loaded on CPU) would raise a device-mismatch error.
    model.to(device)
    model.eval()

    # --- X ---
    X_tensor = torch.as_tensor(np.asarray(X_test), dtype=torch.float32)
    if X_tensor.ndim == 2:
        X_tensor = X_tensor.unsqueeze(1)

    # --- y ---
    # reshape(-1) rather than squeeze(): squeeze turns a single sample into a
    # 0-d array that cannot be paired with X in a dataset.
    y_test = np.asarray(y_test).astype(int).reshape(-1)
    y_tensor = torch.as_tensor(y_test, dtype=torch.long)

    dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Collect whole prediction arrays per batch instead of extending a Python
    # list one scalar at a time.
    batch_preds = []
    with torch.no_grad():
        for X_batch, _ in loader:
            outputs = model(X_batch.to(device))
            _, predicted = torch.max(outputs, 1)
            batch_preds.append(predicted.cpu().numpy())

    all_preds = np.concatenate(batch_preds) if batch_preds else np.empty(0, dtype=int)
    all_labels = y_test  # loader is not shuffled, so order matches the predictions

    acc = accuracy_score(all_labels, all_preds)
    # labels=[0, 1] forces a 2x2 matrix even when one class is absent from both
    # the labels and the predictions, so the unpacking below cannot fail.
    cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    f1 = f1_score(all_labels, all_preds)
    TPR = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    FPR = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    print(f"Accuracy: {acc:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"TPR (Recall): {TPR:.4f}")
    print(f"FPR: {FPR:.4f}")
    print("Confusion Matrix:")
    print(cm)

    return acc, cm, f1, TPR, FPR


In [None]:
# Full evaluation of the trained CNN-1 model on the scaled test set.
acc, cm, f1, TPR, FPR = evaluate_model_full(cnn1_model, X_test_scaled, y_test)


Accuracy: 0.9993
F1-score: 0.9996
TPR (Recall): 0.9993
FPR: 0.1813
Confusion Matrix:
[[    158      35]
 [    754 1086877]]


### **SWCC-CNN : sur données SWCC**

In [None]:
# SWCC-CNN: train on the SWCC-corrected features (default epochs / batch size / lr).
swcc_model = train_model_clean(X_train_swcc, y_train_balanced, X_val_swcc, y_val)

Epoch 1/10 - Loss: 0.3055 - Train Acc: 0.8727 - Val Acc: 61.2491
Epoch 2/10 - Loss: 0.2581 - Train Acc: 0.8933 - Val Acc: 60.8000
Epoch 3/10 - Loss: 0.2405 - Train Acc: 0.9021 - Val Acc: 59.6104
Epoch 4/10 - Loss: 0.2291 - Train Acc: 0.9081 - Val Acc: 61.3665
Epoch 5/10 - Loss: 0.2196 - Train Acc: 0.9132 - Val Acc: 60.1289
Epoch 6/10 - Loss: 0.2128 - Train Acc: 0.9172 - Val Acc: 60.9723
Epoch 7/10 - Loss: 0.2071 - Train Acc: 0.9202 - Val Acc: 60.3881
Epoch 8/10 - Loss: 0.2028 - Train Acc: 0.9225 - Val Acc: 61.3153
Epoch 9/10 - Loss: 0.1990 - Train Acc: 0.9246 - Val Acc: 60.0872
Epoch 10/10 - Loss: 0.1952 - Train Acc: 0.9266 - Val Acc: 58.8576


In [None]:
# Evaluate the SWCC model on the SWCC-corrected test features.
acc, cm, f1, TPR, FPR = evaluate_model_full(swcc_model, X_test_swcc, y_test)

Accuracy: 0.9196
F1-score: 0.9581
TPR (Recall): 0.9196
FPR: 0.0363
Confusion Matrix:
[[    186       7]
 [  87406 1000225]]


In [None]:
# Full destination path inside Google Drive
save_path = "/content/drive/MyDrive/master SDIA/S3/Advanced Topics  Data Science/swcc_cnn_model.pth"

# Save the model weights only (state_dict is the recommended format)
torch.save(swcc_model.state_dict(), save_path)

print(f"Modèle sauvegardé dans : {save_path}")


Modèle sauvegardé dans : /content/drive/MyDrive/master SDIA/S3/Advanced Topics  Data Science/swcc_cnn_model.pth


### **FWM-CNN : sur données FWM**

In [None]:
import gc

# Release the baseline model before training the next one. The name may be
# missing (e.g. the CNN-1 cells were skipped in this session); that must not
# abort the notebook.
try:
    del cnn1_model
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()


NameError: name 'cnn1_model' is not defined

In [None]:
# FWM-CNN: train on the FWM-weighted features (default epochs / batch size / lr).
fwm_model = train_model_clean(X_train_fwm, y_train_balanced, X_val_fwm, y_val)

Epoch 1/10 - Loss: 0.1463 - Train Acc: 0.9475 - Val Acc: 48.1689
Epoch 2/10 - Loss: 0.0963 - Train Acc: 0.9645 - Val Acc: 63.9243
Epoch 3/10 - Loss: 0.0779 - Train Acc: 0.9716 - Val Acc: 63.9805
Epoch 4/10 - Loss: 0.0680 - Train Acc: 0.9755 - Val Acc: 63.9788
Epoch 5/10 - Loss: 0.0616 - Train Acc: 0.9781 - Val Acc: 63.9781
Epoch 6/10 - Loss: 0.0568 - Train Acc: 0.9798 - Val Acc: 63.9782
Epoch 7/10 - Loss: 0.0538 - Train Acc: 0.9810 - Val Acc: 63.7691
Epoch 8/10 - Loss: 0.0507 - Train Acc: 0.9822 - Val Acc: 63.9854
Epoch 9/10 - Loss: 0.0479 - Train Acc: 0.9832 - Val Acc: 63.9748
Epoch 10/10 - Loss: 0.0457 - Train Acc: 0.9842 - Val Acc: 63.5748


In [None]:
# Evaluate the FWM model on the FWM-weighted test features.
acc, cm, f1, TPR, FPR = evaluate_model_full(fwm_model, X_test_fwm, y_test)

Accuracy: 0.9935
F1-score: 0.9967
TPR (Recall): 0.9935
FPR: 0.0311
Confusion Matrix:
[[    187       6]
 [   7065 1080566]]


In [None]:
# Full destination path inside Google Drive
save_path = "/content/drive/MyDrive/master SDIA/S3/Advanced Topics  Data Science/fwm_cnn_model.pth"

# Save the model weights only (state_dict is the recommended format)
torch.save(fwm_model.state_dict(), save_path)

print(f"Modèle sauvegardé dans : {save_path}")


Modèle sauvegardé dans : /content/drive/MyDrive/master SDIA/S3/Advanced Topics  Data Science/fwm_cnn_model.pth


In [None]:
import gc

# The FWM model and the FWM-weighted arrays are no longer used below.
del fwm_model, X_test_fwm, X_val_fwm, X_train_fwm
gc.collect()
torch.cuda.empty_cache()