### detection with TRANSFORMERS

In [1]:
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np

In [2]:
# Charger les données
df = pd.read_csv("../data.csv", index_col=0, parse_dates=True)
features = [col for col in df.columns]

# Normaliser les données
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df[features])

# Diviser les données en train/test
train_data, test_data = train_test_split(data_scaled, test_size=0.2)

In [3]:
# Paramètres du modèle
seq_length = 10  # Taille de la séquence temporelle
d_model = 64  # Dimension des embeddings
nhead = 4  # Nombre de têtes d'attention
num_layers = 2  # Nombre de couches de l'encodeur
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Fonction pour créer des séquences temporelles
def create_sequences(data, seq_length):
    xs = []
    for i in range(len(data) - seq_length):
        xs.append(data[i:i+seq_length])
    return np.array(xs)

In [5]:
# Préparer les données d'entraînement
train_sequences = create_sequences(train_data, seq_length)
train_sequences = torch.tensor(train_sequences, dtype=torch.float32).to(device)

In [6]:
# Construire le modèle Transformer Encoder
class TransformerAnomalyDetector(nn.Module):
    def __init__(self, d_model, nhead, num_layers):
        super(TransformerAnomalyDetector, self).__init__()
        self.embedding = nn.Linear(train_sequences.shape[2], d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(d_model, train_sequences.shape[2])

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x)
        return x

In [7]:
# Initialiser le modèle
model = TransformerAnomalyDetector(d_model=d_model, nhead=nhead, num_layers=num_layers).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [8]:
def get_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [9]:
# Entraînement
batch_size = 512  # Ajustez selon votre capacité GPU
epochs = 5
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in get_batches(train_sequences, batch_size):
        optimizer.zero_grad()
        batch = batch.to(device)  # Assurez-vous que le batch est envoyé au GPU si disponible

        output = model(batch)
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / (len(train_sequences) / batch_size)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

Epoch [1/5], Loss: 0.0317
Epoch [2/5], Loss: 0.0013
Epoch [3/5], Loss: 0.0006
Epoch [4/5], Loss: 0.0004
Epoch [5/5], Loss: 0.0003


In [10]:
# Prédiction sur les données de test
model.eval()
test_sequences = create_sequences(test_data, seq_length)
test_sequences = torch.tensor(test_sequences, dtype=torch.float32)

test_loader = DataLoader(test_sequences, batch_size=512, shuffle=False)
predictions = []

with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        output = model(batch)
        predictions.append(output.cpu())

predictions = torch.cat(predictions, dim=0)

In [11]:
# Calcul des erreurs de reconstruction
errors = torch.mean((predictions - test_sequences) ** 2, dim=(1, 2)).cpu().numpy()

In [14]:
# Détection d'anomalies (en utilisant un seuil arbitraire)
anomaly_threshold = np.percentile(errors, 95)  # Par exemple, on prend le 95ème percentile
anomalies = np.where(errors > anomaly_threshold)[0]

anomaly_indices = anomalies.nonzero()[0]
anomalous_data = test_data[anomaly_indices]
print(f"Nombre d'anomalies détectées : {len(anomalies)}")
print(f"Indices des anomalies détectées : {anomaly_indices}")

Nombre d'anomalies détectées : 11462
Indices des anomalies détectées : [    0     1     2 ... 11459 11460 11461]


In [15]:
print(f"Lignes du dataset correspondant aux anomalies : \n{anomalous_data}")

Lignes du dataset correspondant aux anomalies : 
[[ 0.          0.66733935  0.52670696 ...  0.24623948 -1.53516092
  -0.15570735]
 [ 0.         -1.31228543  0.58735628 ...  0.39204772  0.78371849
   0.20798116]
 [ 0.          0.75023134  0.60088994 ...  0.20573691 -0.26691177
  -0.05553508]
 ...
 [ 0.         -0.46194841 -0.86320546 ...  0.2408388  -0.30762382
   0.20617625]
 [ 0.         -0.5318844   0.47407767 ...  0.19763639  0.31478361
   0.15293153]
 [ 0.         -0.63235955 -1.42801562 ...  0.33804463 -0.33648399
   0.20617625]]
