In [None]:
!unzip -q /content/drive/MyDrive/projet_IPS_IDS/dataset_type.zip  -d /content/dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the pre-split feature matrices (X_*) and label columns (y_*) from the
# directory the archive was extracted into.
X_train=pd.read_csv('/content/dataset/X_train.csv')
X_test=pd.read_csv('/content/dataset/X_test.csv')
X_val=pd.read_csv('/content/dataset/X_val.csv')
y_train=pd.read_csv('/content/dataset/y_train.csv')
y_test=pd.read_csv('/content/dataset/y_test.csv')
y_val=pd.read_csv('/content/dataset/y_val.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/dataset/X_train.csv'

### **Encodage du label (obligatoire)**

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Learn the class-name -> integer mapping on the training labels only, then
# reuse that mapping for validation and test.
# NOTE: y_train / y_val / y_test are overwritten with NumPy arrays here, so this
# cell cannot be re-run without reloading the label frames first.
y_train = label_encoder.fit_transform(y_train.values.ravel())
y_val   = label_encoder.transform(y_val.values.ravel())
y_test  = label_encoder.transform(y_test.values.ravel())

# Sanity check: class names in encoded order (index == integer label)
print("Classes :", label_encoder.classes_)


Classes : ['Audio' 'Background' 'Bruteforce' 'DoS' 'Information Gathering' 'Mirai'
 'Text' 'Video']


### **Normalisation des features (TRÈS IMPORTANT)**

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Step 1: fit on the training split only (one partial_fit call over the whole frame)
scaler.partial_fit(X_train)

# Step 2: batch-wise scaling helper
def scale_in_batches(X, scaler, batch_size=50_000):
    """Transform ``X`` with an already-fitted scaler, ``batch_size`` rows at a time.

    Parameters
    ----------
    X : array-like
        2-D data. Objects exposing a ``.values`` attribute (e.g. a pandas
        DataFrame) are unwrapped to that attribute first.
    scaler : object
        Anything with a ``transform(rows)`` method returning the scaled rows.
    batch_size : int
        Number of rows passed to ``scaler.transform`` per call.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as ``X``.
    """
    data = getattr(X, "values", X)
    scaled = np.empty(data.shape, dtype=np.float32)

    # Writing chunk by chunk into a preallocated float32 buffer avoids holding
    # one full-size transformed copy returned by the scaler.
    for lo in range(0, data.shape[0], batch_size):
        rows = slice(lo, lo + batch_size)
        scaled[rows] = scaler.transform(data[rows])

    return scaled

# Step 3: scale every split with the train-fitted scaler (float32 output)
X_train_scaled = scale_in_batches(X_train, scaler)
X_val_scaled   = scale_in_batches(X_val, scaler)
X_test_scaled  = scale_in_batches(X_test, scaler)




In [None]:
import torch
# The unscaled frames are no longer needed once the float32 scaled copies exist.
del X_train, X_val, X_test
import gc
gc.collect()
# NOTE(review): the objects deleted above come from pd.read_csv (host memory);
# empty_cache() only releases cached CUDA allocations.
torch.cuda.empty_cache()


In [None]:
# Check that features and labels have matching row counts in every split,
# and list the distinct encoded labels present in the training set.
print("Train :", X_train_scaled.shape, y_train.shape)
print("Val   :", X_val_scaled.shape, y_val.shape)
print("Test  :", X_test_scaled.shape, y_test.shape)
print("Labels uniques :", np.unique(y_train))


Train : (5076507, 77) (5076507,)
Val   : (1087824, 77) (1087824,)
Test  : (1087824, 77) (1087824,)
Labels uniques : [0 1 2 3 4 5 6 7]


### **L’équilibrage se fait UNIQUEMENT sur le TRAIN**

In [None]:
import pandas as pd

# IMPORTANT: build the frame from X_train_scaled (the standardized features).
# Feature columns get default integer names; the encoded label is appended as
# the last column so it can be split off again by position later.
train_df = pd.concat([
    pd.DataFrame(X_train_scaled),
    pd.Series(y_train, name="label").reset_index(drop=True)
], axis=1)

label_col = "label"

# Class distribution before balancing
print(train_df[label_col].value_counts())


label
3    4470615
4     519985
5      60935
2      24069
7        600
6        147
0        134
1         22
Name: count, dtype: int64


In [None]:
# Deleting X_train_scaled is intentionally left disabled:
#del X_train_scaled
import gc, torch
# Reclaim unreferenced Python objects and release cached CUDA allocations.
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Number of rows every class is resampled to in the balancing step below.
M = 500000
print("Taille cible par classe :", M)


Taille cible par classe : 500000


In [None]:
from sklearn.utils import resample

# One resampled frame (exactly M rows) per class is collected here.
balanced_dfs = []

classes = train_df[label_col].unique()
print("Classes :", classes)

for cls in classes:
    df_cls = train_df.loc[train_df[label_col].eq(cls)]
    n_cls = len(df_cls)

    print(f"Classe {cls} : {n_cls} √©chantillons")

    # Classes with more than M rows are down-sampled without replacement;
    # every other class is up-sampled with replacement. Both cases draw
    # exactly M rows with the same fixed seed.
    df_resampled = resample(
        df_cls,
        replace=n_cls <= M,
        n_samples=M,
        random_state=42,
    )

    balanced_dfs.append(df_resampled)


Classes : [3 4 2 5 0 6 1 7]
Classe 3 : 4470615 √©chantillons
Classe 4 : 519985 √©chantillons
Classe 2 : 24069 √©chantillons
Classe 5 : 60935 √©chantillons
Classe 0 : 134 √©chantillons
Classe 6 : 147 √©chantillons
Classe 1 : 22 √©chantillons
Classe 7 : 600 √©chantillons


In [None]:
# Stack the per-class frames, shuffle all rows with a fixed seed so classes are
# interleaved, and rebuild a clean 0..N-1 index.
train_balanced_df = pd.concat(balanced_dfs) \
    .sample(frac=1, random_state=42) \
    .reset_index(drop=True)


In [None]:
# The unbalanced frame and the per-class pieces are now contained in
# train_balanced_df; drop them to free memory.
del train_df, balanced_dfs
gc.collect()
torch.cuda.empty_cache()


In [None]:
# The label was appended as the last column: everything before it is features.
X_train_balanced = train_balanced_df.iloc[:, :-1].values
y_train_balanced = train_balanced_df.iloc[:, -1].values


In [None]:
# Only the extracted NumPy arrays are used from here on.
del train_balanced_df
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Class distribution after balancing (each class should have M rows).
pd.Series(y_train_balanced).value_counts()


Unnamed: 0,count
2,500000
3,500000
0,500000
6,500000
5,500000
4,500000
1,500000
7,500000


### **One hot encoding**

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Reshape to a single column: y_train_balanced is 1-D and the encoder below
# is fitted on a 2-D (n_samples, 1) array.
y_train_balanced = y_train_balanced.reshape(-1, 1)


In [None]:
from sklearn.preprocessing import OneHotEncoder

# `sparse_output` is the keyword used by recent scikit-learn versions;
# it makes the encoder return a dense array.
ohe = OneHotEncoder(sparse_output=False)

# Fit on the balanced training labels and encode them in one step
y_train_oh = ohe.fit_transform(y_train_balanced)

# Sanity check: one column per class, categories in encoded order
print("Shape One-Hot :", y_train_oh.shape)
print("Classes :", ohe.categories_)


Shape One-Hot : (4000000, 8)
Classes : [array([0, 1, 2, 3, 4, 5, 6, 7])]


In [None]:
# Encode validation and test labels with the encoder fitted on the training labels.
y_val_oh = ohe.transform(y_val.reshape(-1, 1))
y_test_oh = ohe.transform(y_test.reshape(-1, 1))


### **Implémentation FWM**

In [None]:
def compute_fwm_weights_multiclass(X, y, eps=1e-8):
    """Score each feature by how strongly its per-class mean departs from the global mean.

    For every class the contribution of a feature is
    ``|mean_class - mean_all| / (std_class + eps)``; contributions are summed
    over classes and the result is divided by its maximum (plus ``eps``).

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per sample.
    y : numpy.ndarray
        1-D label array aligned with the rows of ``X``.
    eps : float
        Small constant guarding both divisions against a zero denominator.

    Returns
    -------
    numpy.ndarray
        float32 vector with one non-negative weight per feature.
    """
    overall_mean = np.mean(X, axis=0)
    feature_scores = np.zeros(X.shape[1], dtype=np.float32)

    for label in np.unique(y):
        members = X[y == label]
        deviation = np.abs(np.mean(members, axis=0) - overall_mean)
        spread = np.std(members, axis=0) + eps
        # Accumulated in float32, as the score buffer is float32.
        feature_scores += deviation / spread

    # Soft normalisation: divide by the largest score.
    peak = np.max(feature_scores) + eps
    normalised = feature_scores / peak

    return normalised.astype(np.float32)


In [None]:
def apply_fwm_in_batches(X, fwm_weights, alpha=0.3, batch_size=500000):
    """Rescale every feature column of ``X`` by ``1 + alpha * fwm_weights``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; rows are processed ``batch_size`` at a time.
    fwm_weights : numpy.ndarray
        One weight per feature, broadcast across the rows of each batch.
    alpha : float
        Strength of the re-weighting (0 leaves ``X`` unchanged).
    batch_size : int
        Number of rows handled per iteration.

    Returns
    -------
    numpy.ndarray
        float32 array shaped like ``X``.
    """
    # The per-feature gain does not depend on the batch, so build it once.
    gain = 1.0 + alpha * fwm_weights

    out = np.empty_like(X, dtype=np.float32)
    for lo in range(0, X.shape[0], batch_size):
        rows = slice(lo, lo + batch_size)  # slicing clamps at the last row
        out[rows] = X[rows] * gain

    return out


In [None]:
# 1. Compute the feature weights (training data ONLY).
#    y_train_balanced was reshaped to a column for the one-hot encoder, so it is
#    flattened back to 1-D for the per-class boolean masks.
y_train_balanced_1d = y_train_balanced.reshape(-1)
fwm_weights = compute_fwm_weights_multiclass(
    X_train_balanced,
    y_train_balanced_1d
)

# 2. Apply the same train-derived weights to the scaled features of every split
X_train_fwm = apply_fwm_in_batches(X_train_balanced, fwm_weights)
X_val_fwm   = apply_fwm_in_batches(X_val_scaled,   fwm_weights)
X_test_fwm  = apply_fwm_in_batches(X_test_scaled,  fwm_weights)


In [None]:
# Insert a singleton axis at position 1: (n_samples, n_features) ->
# (n_samples, 1, n_features), i.e. one input channel per sample.
X_train_fwm = X_train_fwm[:, None, :]
X_val_fwm   = X_val_fwm[:, None, :]
X_test_fwm  = X_test_fwm[:, None, :]


### **Implémentation du SWCC**

In [None]:
import numpy as np
from tqdm import tqdm

def swcc_single_batch(X_batch, window_size=5):
    """Apply SWCC to one batch: subtract a centred moving average from every value.

    For row ``j`` and each feature column, the mean is taken over rows
    ``j - window_size // 2`` to ``j + window_size // 2`` inclusive, truncated at
    the first and last row of the batch, and subtracted from the value at ``j``.

    The computation is vectorised: the window sum is accumulated from
    ``2 * (window_size // 2) + 1`` shifted slices of the batch instead of a
    Python loop over every (row, feature) pair, which gives the same result up
    to floating-point rounding.

    Parameters
    ----------
    X_batch : numpy.ndarray
        Array of shape (n_samples, n_features).
    window_size : int
        Sliding-window size; must be non-negative.

    Returns
    -------
    numpy.ndarray
        float32 array with the same shape as ``X_batch``.

    Raises
    ------
    ValueError
        If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    n_samples, n_features = X_batch.shape
    # Shifts beyond the batch length never overlap any row, so clamp the reach.
    half = min(window_size // 2, max(n_samples - 1, 0))

    # Accumulate in float64 so the running sums do not lose precision.
    values = np.asarray(X_batch, dtype=np.float64)
    window_sum = np.zeros((n_samples, n_features), dtype=np.float64)
    window_len = np.zeros((n_samples, 1), dtype=np.float64)

    for shift in range(-half, half + 1):
        # Rows j for which the neighbour j + shift lies inside the batch.
        lo = max(0, -shift)
        hi = min(n_samples, n_samples - shift)
        if lo >= hi:
            continue
        window_sum[lo:hi] += values[lo + shift:hi + shift]
        window_len[lo:hi] += 1.0

    X_corrected = np.empty_like(X_batch, dtype=np.float32)
    # shift == 0 always contributes, so window_len is at least 1 for every row.
    X_corrected[...] = values - window_sum / window_len

    return X_corrected


def apply_swcc_in_batches(X, batch_size=500000, window_size=5):
    """Run SWCC over the whole dataset, one batch of rows at a time.

    Each batch is corrected independently by ``swcc_single_batch``, so the
    sliding window never spans two batches. Progress is reported through a
    ``tqdm`` bar labelled "SWCC".

    Parameters
    ----------
    X : numpy.ndarray
        Data to correct; rows are split into consecutive batches.
    batch_size : int
        Number of rows per batch.
    window_size : int
        Window size forwarded to ``swcc_single_batch``.

    Returns
    -------
    numpy.ndarray
        float32 array shaped like ``X``.
    """
    total_rows = X.shape[0]
    corrected = np.empty_like(X, dtype=np.float32)

    batch_starts = range(0, total_rows, batch_size)
    for lo in tqdm(batch_starts, desc="SWCC"):
        hi = min(lo + batch_size, total_rows)
        corrected[lo:hi] = swcc_single_batch(X[lo:hi], window_size=window_size)

    return corrected


In [None]:
# Balanced training set
X_train_swcc = apply_swcc_in_batches(X_train_balanced, batch_size=500000, window_size=5)
# Validation and test sets (not balanced, only scaled)
X_val_swcc   = apply_swcc_in_batches(X_val_scaled, batch_size=200000, window_size=5)
X_test_swcc  = apply_swcc_in_batches(X_test_scaled, batch_size=200000, window_size=5)
# SWCC keeps the input shape; print the shapes as a check.
print("Shapes SWCC :")
print("Train :", X_train_swcc.shape)
print("Val   :", X_val_swcc.shape)
print("Test  :", X_test_swcc.shape)

SWCC: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [48:17<00:00, 362.17s/it]
SWCC: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [13:01<00:00, 130.31s/it]
SWCC: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [13:05<00:00, 131.00s/it]

Shapes SWCC :
Train : (4000000, 77)
Val   : (1087824, 77)
Test  : (1087824, 77)





### **Modélisation CNN**

In [None]:
import torch
import torch.nn as nn

class SimpleCNN_OHE(nn.Module):
    """Small 1-D CNN producing one raw logit per class.

    Architecture: Conv1d(in, 16, k=3) -> BatchNorm -> ReLU ->
    Conv1d(16, 32, k=3) -> BatchNorm -> ReLU -> global average pooling ->
    Linear(32, num_classes). Both convolutions use ``padding=1`` so the
    sequence length is preserved until the pooling layer.

    Parameters
    ----------
    input_channels : int
        Number of channels of the input tensor (axis 1).
    num_classes : int
        Size of the output logit vector.
    """

    def __init__(self, input_channels=1, num_classes=8):
        super().__init__()

        # First convolutional stage
        self.conv1 = nn.Conv1d(input_channels, 16, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(16)

        # Second convolutional stage
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(32)

        # Parameter-free layers shared by both stages / the head
        self.relu  = nn.ReLU()
        self.pool  = nn.AdaptiveAvgPool1d(1)

        # Classification head
        self.fc    = nn.Linear(32, num_classes)

    def forward(self, x):
        """Map a (batch, input_channels, length) tensor to (batch, num_classes) logits."""
        stage1 = self.relu(self.bn1(self.conv1(x)))
        stage2 = self.relu(self.bn2(self.conv2(stage1)))
        # Global average over the length axis, then drop that singleton axis.
        pooled = self.pool(stage2).squeeze(-1)
        logits = self.fc(pooled)
        return logits


In [None]:
def train_cnn_multiclass_ohe(
    model,
    X_train, y_train,
    X_val, y_val,
    epochs=20,
    batch_size=256,
    lr=0.001
):
    """Train ``model`` on one-hot targets and restore the best-validation weights.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning raw logits of shape (batch, n_classes).
    X_train, X_val : array-like
        Features. 2-D inputs (n_samples, n_features) get a channel axis
        inserted at position 1; other ranks are passed through unchanged.
    y_train, y_val : array-like
        One-hot encoded targets of shape (n_samples, n_classes).
    epochs : int
        Number of passes over the training data.
    batch_size : int
        Mini-batch size for both loaders.
    lr : float
        Adam learning rate.

    Returns
    -------
    torch.nn.Module
        The same ``model`` object, loaded with the weights of the epoch that
        reached the highest validation accuracy (or its initial weights if no
        epoch scored above 0).
    """
    import copy

    import torch
    from torch import nn
    from torch.utils.data import DataLoader, TensorDataset

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # ---------- Tensors ----------
    def prepare_X(X):
        # as_tensor avoids a second full copy when X is already a float32 array.
        X = torch.as_tensor(X, dtype=torch.float32)
        if X.ndim == 2:
            X = X.unsqueeze(1)
        return X

    X_train_t = prepare_X(X_train)
    X_val_t   = prepare_X(X_val)

    y_train_t = torch.as_tensor(y_train, dtype=torch.float32)
    y_val_t   = torch.as_tensor(y_val, dtype=torch.float32)

    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t),
        batch_size=batch_size,
        shuffle=True
    )

    val_loader = DataLoader(
        TensorDataset(X_val_t, y_val_t),
        batch_size=batch_size,
        shuffle=False
    )

    # ---------- Loss & Optim ----------
    # NOTE(review): BCEWithLogitsLoss scores each class as an independent binary
    # problem; predictions below are still taken with argmax over the logits.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_acc = 0.0
    # state_dict() returns references to the live parameters, which keep
    # changing as training proceeds; a deep copy is required for a snapshot.
    best_state = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        # ===== TRAIN =====
        model.train()
        correct, total = 0, 0

        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)

            optimizer.zero_grad()
            logits = model(Xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()

            # Accuracy is measured on class indices: argmax of logits vs one-hot.
            preds = torch.argmax(logits, dim=1)
            targets = torch.argmax(yb, dim=1)

            correct += (preds == targets).sum().item()
            total += yb.size(0)

        train_acc = correct / total

        # ===== VAL =====
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                logits = model(Xb)

                preds = torch.argmax(logits, dim=1)
                targets = torch.argmax(yb, dim=1)

                correct += (preds == targets).sum().item()
                total += yb.size(0)

        # An empty validation set yields 0.0 instead of a ZeroDivisionError.
        val_acc = correct / total if total else 0.0

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_state = copy.deepcopy(model.state_dict())

        print(f"Epoch {epoch+1}/{epochs} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    model.load_state_dict(best_state)
    return model


### **CNN-1 : sur données normales**

In [None]:
# Baseline CNN: trained on the balanced, standardized features (no FWM / SWCC)
# and validated on the standardized validation split.
model = SimpleCNN_OHE(input_channels=1, num_classes=8)
cnn1_model = train_cnn_multiclass_ohe(
    model,
    X_train=X_train_balanced,
    y_train=y_train_oh,   # (N, 8) float
    X_val=X_val_scaled,
    y_val=y_val_oh,       # (N, 8) float
    epochs=20,
    batch_size=256,
    lr=0.001
)

In [None]:
def evaluate_model_multiclass_ohe(model, X_test, y_test, batch_size=256):
    """Evaluate a logit-producing model against one-hot encoded labels.

    Prints accuracy, macro / weighted F1, the confusion matrix and the
    scikit-learn classification report, and returns the metrics in a dict.

    Parameters
    ----------
    model : torch.nn.Module
        Network returning logits of shape (batch, n_classes).
    X_test : array-like
        Features; objects with ``.values`` are unwrapped, and 2-D inputs get a
        channel axis inserted at position 1.
    y_test : array-like
        One-hot labels of shape (n_samples, n_classes).
    batch_size : int
        Number of samples per forward pass.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"f1_macro"``, ``"f1_weighted"`` and
        ``"confusion_matrix"``. The confusion matrix always has shape
        (n_classes, n_classes), with row/column ``i`` corresponding to class
        index ``i``, even when some class is absent from both the labels and
        the predictions.
    """
    import torch
    import numpy as np
    from sklearn.metrics import (
        confusion_matrix,
        accuracy_score,
        f1_score,
        classification_report
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(device)

    # ---------- X ----------
    if hasattr(X_test, "values"):
        X_test = X_test.values

    X_tensor = torch.as_tensor(X_test, dtype=torch.float32)
    if X_tensor.ndim == 2:
        X_tensor = X_tensor.unsqueeze(1)

    # ---------- y (one-hot -> class index) ----------
    if hasattr(y_test, "values"):
        y_test = y_test.values

    y_test = np.asarray(y_test, dtype=np.float32)
    y_true = np.argmax(y_test, axis=1)
    n_classes = y_test.shape[1]

    # Only the features are needed for inference, so the labels are not
    # copied into the dataset.
    dataset = torch.utils.data.TensorDataset(X_tensor)
    loader  = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # One prediction array per batch; joined once at the end.
    batch_preds = []

    with torch.no_grad():
        for (Xb,) in loader:
            logits = model(Xb.to(device))
            batch_preds.append(torch.argmax(logits, dim=1).cpu().numpy())

    if batch_preds:
        all_preds = np.concatenate(batch_preds)
    else:
        all_preds = np.empty(0, dtype=np.int64)

    # ---------- Metrics ----------
    acc = accuracy_score(y_true, all_preds)
    f1_macro = f1_score(y_true, all_preds, average="macro")
    f1_weighted = f1_score(y_true, all_preds, average="weighted")
    # Fixing the label set keeps the matrix aligned with the class indices.
    cm = confusion_matrix(y_true, all_preds, labels=np.arange(n_classes))

    print("Accuracy :", round(acc, 4))
    print("F1 macro :", round(f1_macro, 4))
    print("F1 weighted :", round(f1_weighted, 4))
    print("\nConfusion Matrix:")
    print(cm)

    print("\nClassification Report:")
    print(classification_report(y_true, all_preds, digits=4))

    return {
        "accuracy": acc,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "confusion_matrix": cm
    }


In [None]:
# Evaluate the baseline CNN on the standardized test split.
results = evaluate_model_multiclass_ohe(
    model=cnn1_model,
    X_test=X_test_scaled,
    y_test=y_test_oh,   # One-Hot
    batch_size=512
)


NameError: name 'y_test_oh' is not defined

In [None]:
# Full destination path inside Google Drive
save_path = "/content/drive/MyDrive/projet_IPS_IDS/cnn1_model.pth"

# Save only the weights (state_dict) rather than the pickled model object
torch.save(cnn1_model.state_dict(), save_path)

print(f"Mod√®le sauvegard√© dans : {save_path}")


### **FWM-CNN**

In [None]:
# FWM-CNN: same architecture and hyper-parameters, trained and validated on the
# FWM-weighted features (already shaped with a channel axis).
model = SimpleCNN_OHE(input_channels=1, num_classes=8)
fwm_cnn_model = train_cnn_multiclass_ohe(
    model,
    X_train=X_train_fwm,
    y_train=y_train_oh,   # (N, 8) float
    X_val=X_val_fwm,
    y_val=y_val_oh,       # (N, 8) float
    epochs=20,
    batch_size=256,
    lr=0.001
)

In [None]:
# Evaluate the trained FWM-CNN on the FWM-weighted test split.
# No new network is instantiated here: evaluation uses fwm_cnn_model only.
results = evaluate_model_multiclass_ohe(
    model=fwm_cnn_model,
    X_test=X_test_fwm,
    y_test=y_test_oh,   # One-Hot
    batch_size=512
)

NameError: name 'SimpleCNN_OHE' is not defined

In [None]:
# Full destination path inside Google Drive
save_path = "/content/drive/MyDrive/projet_IPS_IDS/fwm_cnn_model.pth"

# Save only the weights (state_dict) rather than the pickled model object
torch.save(fwm_cnn_model.state_dict(), save_path)

print(f"Mod√®le sauvegard√© dans : {save_path}")


### **SWCC-CNN**

In [None]:
# SWCC-CNN: same architecture and hyper-parameters, trained and validated on the
# SWCC-corrected features (2-D arrays; the channel axis is added during training).
model = SimpleCNN_OHE(input_channels=1, num_classes=8)
swcc_cnn_model = train_cnn_multiclass_ohe(
    model,
    X_train=X_train_swcc,
    y_train=y_train_oh,   # (N, 8) float
    X_val=X_val_swcc,
    y_val=y_val_oh,       # (N, 8) float
    epochs=20,
    batch_size=256,
    lr=0.001
)

Epoch 1/20 | Train Acc: 0.5884 | Val Acc: 0.4030
Epoch 2/20 | Train Acc: 0.6735 | Val Acc: 0.5240
Epoch 3/20 | Train Acc: 0.7015 | Val Acc: 0.4633
Epoch 4/20 | Train Acc: 0.7165 | Val Acc: 0.5223
Epoch 5/20 | Train Acc: 0.7267 | Val Acc: 0.5308
Epoch 6/20 | Train Acc: 0.7341 | Val Acc: 0.5779
Epoch 7/20 | Train Acc: 0.7404 | Val Acc: 0.5179
Epoch 8/20 | Train Acc: 0.7444 | Val Acc: 0.5419
Epoch 9/20 | Train Acc: 0.7476 | Val Acc: 0.5483
Epoch 10/20 | Train Acc: 0.7508 | Val Acc: 0.5065
Epoch 11/20 | Train Acc: 0.7531 | Val Acc: 0.5222
Epoch 12/20 | Train Acc: 0.7557 | Val Acc: 0.5134
Epoch 13/20 | Train Acc: 0.7576 | Val Acc: 0.5250
Epoch 14/20 | Train Acc: 0.7596 | Val Acc: 0.5507
Epoch 15/20 | Train Acc: 0.7611 | Val Acc: 0.4852
Epoch 16/20 | Train Acc: 0.7623 | Val Acc: 0.5672
Epoch 17/20 | Train Acc: 0.7638 | Val Acc: 0.5328
Epoch 18/20 | Train Acc: 0.7646 | Val Acc: 0.4559
Epoch 19/20 | Train Acc: 0.7658 | Val Acc: 0.5800
Epoch 20/20 | Train Acc: 0.7669 | Val Acc: 0.5239


In [None]:
# Evaluate the SWCC-CNN on the SWCC-corrected test split.
results = evaluate_model_multiclass_ohe(
    model=swcc_cnn_model,
    X_test=X_test_swcc,
    y_test=y_test_oh,   # One-Hot
    batch_size=512
)

Accuracy : 0.5235
F1 macro : 0.2258
F1 weighted : 0.6016

Confusion Matrix:
[[    24      1      0      0      3      0      0      0]
 [     0      5      0      0      0      0      0      0]
 [     5      1   4483     51    455     33     18    112]
 [ 11148   9792   7068 448029 399193  41958  10285  30517]
 [     1     47     93   1840 107504   1713      8    219]
 [    49     54    172    586   2228   9287    127    555]
 [     0      7      0      0      1      1     22      0]
 [     1     14      2      0     12     23      0     77]]

Classification Report:
              precision    recall  f1-score   support

           0     0.0021    0.8571    0.0043        28
           1     0.0005    1.0000    0.0010         5
           2     0.3793    0.8691    0.5282      5158
           3     0.9945    0.4677    0.6362    957990
           4     0.2110    0.9648    0.3463    111425
           5     0.1752    0.7112    0.2811     13058
           6     0.0021    0.7097    0.0042     

In [None]:
# Full destination path inside Google Drive
save_path = "/content/drive/MyDrive/projet_IPS_IDS/swcc_cnn_model.pth"

# Save only the weights (state_dict) rather than the pickled model object
torch.save(swcc_cnn_model.state_dict(), save_path)

print(f"Mod√®le sauvegard√© dans : {save_path}")

Mod√®le sauvegard√© dans : /content/drive/MyDrive/projet_IPS_IDS/swcc_cnn_model.pth
