In [4]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
import scipy.io as sio  # Per salvare in .mat (MATLAB format)

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8

In [5]:
df = pd.read_csv("data_1/BATADAL_dataset04.csv")

df.columns = [
    'DATETIME','L_T1','L_T2','L_T3','L_T4','L_T5','L_T6','L_T7','F_PU1','S_PU1','F_PU2','S_PU2','F_PU3',
    'S_PU3','F_PU4','S_PU4','F_PU5','S_PU5','F_PU6','S_PU6','F_PU7','S_PU7','F_PU8','S_PU8','F_PU9','S_PU9',
    'F_PU10','S_PU10','F_PU11','S_PU11','F_V2','S_V2','P_J280','P_J269','P_J300','P_J256','P_J289','P_J415',
    'P_J302','P_J306','P_J307','P_J317','P_J14','P_J422','OUTCOME'
]

# 'DATETIME' is irrelevant for the thesis
# The other dropped columns contain either only 0s, only 1s or only 2s and are therefore irrelevant

df = df.drop(['DATETIME', 'S_PU1', 'F_PU3', 'S_PU3', 'F_PU5', 'S_PU5', 'F_PU9', 'S_PU9'], axis = 1)

# The dataset labels attacks by '-999' and labels normal data as 1
# To keep the same structure in all datasets, the '-999' values are changed to '-1' and normal values to '1'

df['OUTCOME'].replace(to_replace = [-999], value = '-1', inplace = True)
df['OUTCOME'].replace(to_replace = [1], value = '1', inplace = True)

# data types need to be numeric to be encoded to z-scores --> convert column object data types to numerics

cols = df.columns[df.columns != 'OUTCOME']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Encoding the feature vectors to [0,1]
scaler = MinMaxScaler()
df[cols] = scaler.fit_transform(df[cols])
display(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['OUTCOME'].replace(to_replace = [-999], value = '-1', inplace = True)


Unnamed: 0,L_T1,L_T2,L_T3,L_T4,L_T5,L_T6,L_T7,F_PU1,F_PU2,S_PU2,...,P_J256,P_J289,P_J415,P_J302,P_J306,P_J307,P_J317,P_J14,P_J422,OUTCOME
0,0.391867,0.919776,0.121569,0.784173,0.548611,1.000000,0.866521,0.654035,0.937250,1.0,...,0.136758,0.591892,0.591187,0.136014,0.662634,0.138159,0.486295,0.232103,0.579387,-1
1,0.432532,0.787313,0.125490,0.812950,0.697917,0.917808,0.897155,0.603586,0.895016,1.0,...,0.752169,0.337162,0.572672,0.277972,0.767473,0.281364,0.197306,0.289373,0.336351,-1
2,0.515712,0.625000,0.305882,0.823741,0.899306,0.520548,0.610503,0.609205,0.899620,1.0,...,0.806317,0.300000,0.614146,0.278671,0.794892,0.282060,0.167533,0.285945,0.295265,-1
3,0.609982,0.509328,0.505882,0.762590,0.791667,0.287671,0.431072,0.587926,0.881906,1.0,...,0.899688,0.516216,0.274208,0.309266,0.166129,0.310249,0.493620,0.310950,0.491643,-1
4,0.695009,0.442164,0.725490,0.460432,0.635417,0.849315,0.663020,0.574895,0.870997,1.0,...,0.904200,0.493243,0.274023,0.321503,0.165591,0.322429,0.499055,0.326275,0.467967,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4172,0.430684,0.384328,0.380392,0.402878,0.881944,0.232877,0.385120,0.970233,0.000000,0.0,...,0.137799,0.535135,0.559156,0.079371,0.634409,0.081782,0.345699,0.041541,0.533426,-1
4173,0.354898,0.419776,0.211765,0.359712,0.836806,0.342466,0.336980,0.958757,0.000000,0.0,...,0.088164,0.554054,0.546565,0.200874,0.074731,0.202540,0.365784,0.047792,0.555014,-1
4174,0.293900,0.457090,0.027451,0.205036,0.493056,0.506849,0.315098,0.977764,0.000000,0.0,...,0.679278,0.498649,0.534716,0.184441,0.040054,0.186184,0.346645,0.037508,0.519499,-1
4175,0.221811,0.412313,0.176471,0.039568,0.142361,0.452055,0.210066,0.969516,0.000000,0.0,...,0.697327,0.427027,0.920015,0.189860,0.025269,0.191578,0.354679,0.023392,0.470752,-1


In [6]:
normal_mask = df['OUTCOME'] == '1'
attack_mask = df['OUTCOME'] == '-1'

df_normal = df[normal_mask]
df_attack = df[attack_mask].sample(50, random_state=42)

print(f"Normal count: {len(df_normal)}")
print(f"Attack count: {len(df_attack)}")

Normal count: 219
Attack count: 50


In [7]:
# Concatenazione dei dati e shuffle
df1 = pd.concat([df_normal, df_attack]).sample(frac=1, random_state=42).reset_index(drop=True)

# Separazione tra features e target
X = df1.drop(columns=['OUTCOME'])  # Corretto l'errore su df1
Y = df1["OUTCOME"]

# 🔹 Calcola la Mutual Information
mi_scores = mutual_info_classif(X, Y)

# 🔹 Crea un DataFrame con i risultati
mi_results = pd.DataFrame({"Feature": X.columns, "MI_Score": mi_scores})

# 🔹 Filtra solo le feature con MI > 0 (cioè, che portano informazione)
threshold=0.086
selected_features = mi_results[mi_results["MI_Score"] > threshold]

# 🔹 Ordina in base all'importanza
selected_features = selected_features.sort_values(by="MI_Score", ascending=False)

# 🔹 Stampa le feature selezionate
print("Feature più informative rispetto alle anomalie:")
print(selected_features)


Feature più informative rispetto alle anomalie:
   Feature  MI_Score
30  P_J302  0.088068
32  P_J307  0.086845


In [8]:
# Supponiamo che `df` sia il tuo DataFrame originale
selected_features = ['P_J307', 'P_J302']

# Creiamo un nuovo DataFrame con solo queste feature + il target
df_selected = df[selected_features + ["OUTCOME"]]

# Salviamo il file CSV
df_selected.to_csv("./data_1/relevant_features_BATADAL.csv", index=False)

In [9]:
# 🔹 Carica il dataset WADI
df = pd.read_csv("./data_1/relevant_features_BATADAL.csv")
df["OUTCOME"] = df["OUTCOME"].map({-1: 1, 1: 0})
X = df.drop(columns=['OUTCOME']).values
y = df['OUTCOME'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔹 Filtra solo dati normali nel training set
X_train = X_train[y_train == 0]
y_train = y_train[y_train == 0]  # Mantieni solo le etichette 0

# 🔹 Bilanciamento del test set: riduzione delle anomalie
normal_indices = np.where(y_test == 0)[0]
anomaly_indices = np.where(y_test == 1)[0]

X_val = X_train[100:]
y_val = y_train[100:]
# Imposta un rapporto equilibrato (ad esempio, 2:1 tra normali e anomalie)
n_normals = len(normal_indices)
n_anomalies = min(len(anomaly_indices), n_normals // 3)  # Mantieni meno anomalie rispetto ai normali

# Campionamento casuale delle anomalie
selected_anomaly_indices = np.random.choice(anomaly_indices, n_anomalies, replace=False)

# Creazione del nuovo test set bilanciato
selected_indices = np.concatenate([normal_indices, selected_anomaly_indices])
X_test, y_test = X_test[selected_indices], y_test[selected_indices]

y_train = y_train.astype(np.float64)
y_val = y_val.astype(np.float64)
y_test = y_test.astype(np.float64)

# 🔹 Salva i dataset in formato MATLAB (.mat)
sio.savemat("./data_1/fuzzy_data_BATADAL.mat", {
    "X_train": X_train, 
    "y_train": y_train.reshape(-1, 1),  # Mantieni etichette in colonna
    "X_val": X_val, 
    "y_val": y_val.reshape(-1, 1),
    "X_test_5perc": X_test, 
    "y_test_5perc": y_test.reshape(-1, 1)
})

print("Dataset BATADAL preparato e salvato in formato MATLAB!")

Dataset BATADAL preparato e salvato in formato MATLAB!
