In [None]:
# import libraries
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
import scipy.io as sio  # Per salvare in .mat (MATLAB format)

%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 14, 8

In [1]:
# Load the raw WADI attack dataset (labelled version) from disk.
df = pd.read_csv("data_1/WADI_attackdataLABLE.csv", low_memory=False)

# Replace the header parsed from the file with explicit names. The assignment is
# positional, so this list must contain exactly one entry per CSV column, in file
# order; the last column ('OUTCOME') is the normal/attack label.
df.columns = [
    'ROW','DATE','TIME','1_AIT_001_PV','1_AIT_002_PV','1_AIT_003_PV','1_AIT_004_PV','1_AIT_005_PV','1_FIT_001_PV','1_LS_001_AL',
    '1_LS_002_AL','1_LT_001_PV','1_MV_001_STATUS','1_MV_002_STATUS','1_MV_003_STATUS','1_MV_004_STATUS','1_P_001_STATUS',
    '1_P_002_STATUS','1_P_003_STATUS','1_P_004_STATUS','1_P_005_STATUS','1_P_006_STATUS','2_DPIT_001_PV','2_FIC_101_CO',
    '2_FIC_101_PV','2_FIC_101_SP','2_FIC_201_CO','2_FIC_201_PV','2_FIC_201_SP','2_FIC_301_CO','2_FIC_301_PV','2_FIC_301_SP',
    '2_FIC_401_CO','2_FIC_401_PV','2_FIC_401_SP','2_FIC_501_CO','2_FIC_501_PV','2_FIC_501_SP','2_FIC_601_CO','2_FIC_601_PV',
    '2_FIC_601_SP','2_FIT_001_PV','2_FIT_002_PV','2_FIT_003_PV','2_FQ_101_PV','2_FQ_201_PV','2_FQ_301_PV','2_FQ_401_PV',
    '2_FQ_501_PV','2_FQ_601_PV','2_LS_001_AL','2_LS_002_AL','2_LS_101_AH','2_LS_101_AL','2_LS_201_AH','2_LS_201_AL',
    '2_LS_301_AH','2_LS_301_AL','2_LS_401_AH','2_LS_401_AL','2_LS_501_AH','2_LS_501_AL','2_LS_601_AH','2_LS_601_AL',
    '2_LT_001_PV','2_LT_002_PV','2_MCV_007_CO','2_MCV_101_CO','2_MCV_201_CO','2_MCV_301_CO','2_MCV_401_CO','2_MCV_501_CO',
    '2_MCV_601_CO','2_MV_001_STATUS','2_MV_002_STATUS','2_MV_003_STATUS','2_MV_004_STATUS','2_MV_005_STATUS','2_MV_006_STATUS',
    '2_MV_009_STATUS','2_MV_101_STATUS','2_MV_201_STATUS','2_MV_301_STATUS','2_MV_401_STATUS','2_MV_501_STATUS',
    '2_MV_601_STATUS','2_P_001_STATUS','2_P_002_STATUS','2_P_003_SPEED','2_P_003_STATUS','2_P_004_SPEED','2_P_004_STATUS',
    '2_PIC_003_CO','2_PIC_003_PV','2_PIC_003_SP','2_PIT_001_PV','2_PIT_002_PV','2_PIT_003_PV','2_SV_101_STATUS',
    '2_SV_201_STATUS','2_SV_301_STATUS','2_SV_401_STATUS','2_SV_501_STATUS','2_SV_601_STATUS','2A_AIT_001_PV','2A_AIT_002_PV',
    '2A_AIT_003_PV','2A_AIT_004_PV','2B_AIT_001_PV','2B_AIT_002_PV','2B_AIT_003_PV','2B_AIT_004_PV','3_AIT_001_PV',
    '3_AIT_002_PV','3_AIT_003_PV','3_AIT_004_PV','3_AIT_005_PV','3_FIT_001_PV','3_LS_001_AL','3_LT_001_PV','3_MV_001_STATUS',
    '3_MV_002_STATUS','3_MV_003_STATUS','3_P_001_STATUS','3_P_002_STATUS','3_P_003_STATUS','3_P_004_STATUS','LEAK_DIFF_PRESSURE',
    'PLANT_START_STOP_LOG','TOTAL_CONS_REQUIRED_FLOW','OUTCOME'
]

# The first row of the parsed frame only repeats the column labels, so skip it.
df = df.iloc[1:]
# Index labels 172802 and 172803 are empty rows (NaN only) and carry no information.
df = df.drop(index=[172802, 172803])

# Columns removed before modelling:
#   * 'ROW', 'DATE', 'TIME': irrelevant for the thesis.
#   * '2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS': NaN only.
#   * every other name below: constant columns (only 0s, only 1s or only 2s).
df = df.drop(
    columns=[
        'ROW', 'DATE', 'TIME',
        '1_LS_001_AL', '1_LS_002_AL', '1_P_002_STATUS', '1_P_004_STATUS',
        '2_LS_001_AL', '2_LS_002_AL',
        '2_MV_001_STATUS', '2_MV_002_STATUS', '2_MV_004_STATUS', '2_MV_005_STATUS', '2_MV_009_STATUS',
        '2_P_001_STATUS', '2_P_002_STATUS', '2_P_004_STATUS',
        '2_SV_101_STATUS', '2_SV_201_STATUS', '2_SV_301_STATUS',
        '2_SV_401_STATUS', '2_SV_501_STATUS', '2_SV_601_STATUS',
        '3_AIT_001_PV', '3_LS_001_AL',
        '3_MV_001_STATUS', '3_MV_002_STATUS', '3_MV_003_STATUS',
        '3_P_001_STATUS', '3_P_002_STATUS', '3_P_003_STATUS', '3_P_004_STATUS',
        'PLANT_START_STOP_LOG',
    ]
)

# Cast OUTCOME to int and recode it into a binary target:
#   normal rows (1) -> 0, attack rows (-1) -> 1.
# The recoded Series is assigned back to the column on purpose. The previous
# `df['OUTCOME'].replace(..., inplace=True)` form is a chained in-place call on a
# temporary Series: it triggers a FutureWarning on pandas 2.x and does not modify
# `df` at all once Copy-on-Write is enabled, which would leave the labels as 1/-1.
df['OUTCOME'] = df['OUTCOME'].astype(int).replace({1: 0, -1: 1})

# Every column except the label is a feature. They are still stored as objects,
# so convert them to numbers first; values that cannot be parsed become NaN.
cols = df.columns.drop('OUTCOME')
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

# Min-max scale each feature column (MinMaxScaler's default output range is [0, 1]).
scaler = MinMaxScaler()
scaler.fit(df[cols])
df[cols] = scaler.transform(df[cols])

NameError: name 'pd' is not defined

In [None]:
# Preview the preprocessed frame: show every column but at most 10 rows.
with pd.option_context('display.max_columns', None, 'display.max_rows', 10):
    display(df)

Unnamed: 0,1_AIT_001_PV,1_AIT_002_PV,1_AIT_003_PV,1_AIT_004_PV,1_AIT_005_PV,1_FIT_001_PV,1_LT_001_PV,1_MV_001_STATUS,1_MV_002_STATUS,1_MV_003_STATUS,1_MV_004_STATUS,1_P_001_STATUS,1_P_003_STATUS,1_P_005_STATUS,1_P_006_STATUS,2_DPIT_001_PV,2_FIC_101_CO,2_FIC_101_PV,2_FIC_101_SP,2_FIC_201_CO,2_FIC_201_PV,2_FIC_201_SP,2_FIC_301_CO,2_FIC_301_PV,2_FIC_301_SP,2_FIC_401_CO,2_FIC_401_PV,2_FIC_401_SP,2_FIC_501_CO,2_FIC_501_PV,2_FIC_501_SP,2_FIC_601_CO,2_FIC_601_PV,2_FIC_601_SP,2_FIT_001_PV,2_FIT_002_PV,2_FIT_003_PV,2_FQ_101_PV,2_FQ_201_PV,2_FQ_301_PV,2_FQ_401_PV,2_FQ_501_PV,2_FQ_601_PV,2_LS_101_AH,2_LS_101_AL,2_LS_201_AH,2_LS_201_AL,2_LS_301_AH,2_LS_301_AL,2_LS_401_AH,2_LS_401_AL,2_LS_501_AH,2_LS_501_AL,2_LS_601_AH,2_LS_601_AL,2_LT_001_PV,2_LT_002_PV,2_MCV_007_CO,2_MCV_101_CO,2_MCV_201_CO,2_MCV_301_CO,2_MCV_401_CO,2_MCV_501_CO,2_MCV_601_CO,2_MV_003_STATUS,2_MV_006_STATUS,2_MV_101_STATUS,2_MV_201_STATUS,2_MV_301_STATUS,2_MV_401_STATUS,2_MV_501_STATUS,2_MV_601_STATUS,2_P_003_SPEED,2_P_003_STATUS,2_P_004_SPEED,2_PIC_003_CO,2_PIC_003_PV,2_PIC_003_SP,2_PIT_001_PV,2_PIT_002_PV,2_PIT_003_PV,2A_AIT_001_PV,2A_AIT_002_PV,2A_AIT_003_PV,2A_AIT_004_PV,2B_AIT_001_PV,2B_AIT_002_PV,2B_AIT_003_PV,2B_AIT_004_PV,3_AIT_002_PV,3_AIT_003_PV,3_AIT_004_PV,3_AIT_005_PV,3_FIT_001_PV,3_LT_001_PV,LEAK_DIFF_PRESSURE,TOTAL_CONS_REQUIRED_FLOW,OUTCOME
1,0.258805,0.088248,0.990702,0.995069,0.865286,0.000268,0.300407,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.946337,0.301893,0.088006,0.268293,0.656061,0.102784,0.028571,1.000000,0.009629,0.024390,0.147284,0.052862,0.024390,1.000000,0.009624,0.047619,1.000000,0.012631,0.150,0.0,0.36862,0.0,0.087956,0.103055,0.010014,0.051243,0.010339,0.012631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680894,0.789851,0.0,0.0,0.0,0.200685,0.0,0.686983,0.356947,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,0.001846,0.0,0.879294,1.0,0.005680,1.0,0.771271,0.918687,0.005291,0.000000,4.478127e-41,0.75,0.439491,0.366693,0.000038,0.781739,0.398462,0.0,0.295319,0.380856,0.755538,0.577042,0.067104,0.071170,0.167382,0
2,0.258805,0.088248,0.990702,0.995069,0.865286,0.000268,0.300407,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.946337,0.301893,0.088006,0.268293,0.656061,0.102784,0.028571,1.000000,0.009629,0.024390,0.147284,0.052862,0.024390,1.000000,0.009624,0.047619,1.000000,0.012631,0.150,0.0,0.36862,0.0,0.087956,0.103055,0.010014,0.051243,0.010339,0.012631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680894,0.789851,0.0,0.0,0.0,0.200685,0.0,0.686983,0.356947,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,0.001846,0.0,0.879294,1.0,0.005680,1.0,0.771271,0.918687,0.005291,0.000000,4.478127e-41,0.75,0.439491,0.366693,0.000038,0.781739,0.398462,0.0,0.295319,0.380856,0.755538,0.577042,0.067104,0.071170,0.167382,0
3,0.258805,0.088248,0.990702,0.995069,0.865286,0.000268,0.300407,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.946337,0.301893,0.088006,0.268293,0.656061,0.102784,0.028571,1.000000,0.009629,0.024390,0.147284,0.052862,0.024390,1.000000,0.009624,0.047619,1.000000,0.012631,0.150,0.0,0.36862,0.0,0.087956,0.103055,0.010014,0.051243,0.010339,0.012631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680894,0.789851,0.0,0.0,0.0,0.200685,0.0,0.686983,0.356947,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,0.001846,0.0,0.879294,1.0,0.005680,1.0,0.771271,0.918687,0.005291,0.000000,4.478127e-41,0.75,0.439491,0.366693,0.000038,0.781739,0.398462,0.0,0.295319,0.380856,0.755538,0.577042,0.067104,0.071170,0.167382,0
4,0.258805,0.088248,0.990702,0.995069,0.865286,0.000268,0.300407,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.946337,0.301893,0.088006,0.268293,0.656061,0.102784,0.028571,1.000000,0.009629,0.024390,0.147284,0.052862,0.024390,1.000000,0.009624,0.047619,1.000000,0.012631,0.150,0.0,0.36862,0.0,0.087956,0.103055,0.010014,0.051243,0.010339,0.012631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680894,0.789851,0.0,0.0,0.0,0.200685,0.0,0.686983,0.356947,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,0.001846,0.0,0.879294,1.0,0.005680,1.0,0.771271,0.918687,0.005291,0.000000,4.179957e-41,0.75,0.437227,0.366693,0.000046,0.941424,0.394514,0.0,0.295319,0.380856,0.755538,0.577042,0.067104,0.071170,0.167382,0
5,0.258805,0.088248,0.990702,0.995069,0.865286,0.000268,0.300407,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.946337,0.301893,0.088006,0.268293,0.656061,0.102784,0.028571,1.000000,0.009629,0.024390,0.147284,0.052862,0.024390,1.000000,0.009624,0.047619,1.000000,0.012631,0.150,0.0,0.36862,0.0,0.087956,0.103055,0.010014,0.051243,0.010339,0.012631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680894,0.789851,0.0,0.0,0.0,0.200685,0.0,0.686983,0.356947,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,0.001846,0.0,0.879294,1.0,0.005680,1.0,0.771271,0.918687,0.005291,0.000000,4.179957e-41,0.75,0.437227,0.366693,0.000046,0.941424,0.394514,0.0,0.295319,0.380856,0.755538,0.577042,0.067104,0.071170,0.167382,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172797,0.272594,0.091247,0.984195,0.961150,0.778557,0.000247,0.485594,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.950105,0.991850,0.016115,0.073171,1.000000,0.010465,0.028571,0.996266,0.012390,0.073171,1.000000,0.014155,0.097561,0.990467,0.009580,0.023810,0.996301,0.000085,0.025,0.0,0.00000,0.0,0.016193,0.010350,0.012884,0.014155,0.010291,0.000085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.710784,0.719694,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,0.001913,0.0,0.892718,1.0,0.010048,1.0,0.718006,0.934869,0.009359,0.687552,4.684553e-41,0.50,0.447080,0.666689,0.996417,0.268794,0.479961,0.0,0.463496,0.354938,0.720265,0.639805,0.615225,0.032528,0.000000,0
172798,0.272594,0.091247,0.984195,0.961150,0.778557,0.000247,0.485594,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.950105,0.991850,0.016115,0.073171,1.000000,0.010465,0.028571,0.996266,0.012390,0.073171,1.000000,0.014155,0.097561,0.990467,0.009580,0.023810,0.996301,0.000085,0.025,0.0,0.00000,0.0,0.016193,0.010350,0.012884,0.014155,0.010291,0.000085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.710784,0.719694,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,0.001913,0.0,0.892718,1.0,0.010048,1.0,0.718006,0.934869,0.009359,0.687552,4.684553e-41,0.50,0.447080,0.666689,0.996417,0.268794,0.479961,0.0,0.463496,0.354938,0.720265,0.639805,0.615225,0.032528,0.000000,0
172799,0.272525,0.097246,0.984418,0.961186,0.779227,0.000263,0.489972,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.950281,0.997897,0.016115,0.073171,1.000000,0.010465,0.028571,1.000000,0.012390,0.073171,1.000000,0.014155,0.097561,0.995311,0.009580,0.023810,1.000000,0.000085,0.025,0.0,0.00000,0.0,0.016193,0.010350,0.012884,0.014155,0.010291,0.000085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.713235,0.719806,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,0.001882,0.0,0.873786,1.0,0.010049,1.0,0.722570,0.937564,0.009386,0.687552,4.684553e-41,0.50,0.447080,0.666689,0.996417,0.268794,0.479961,0.0,0.000000,0.946459,0.720299,0.639859,0.615667,0.026090,0.000000,0
172800,0.272525,0.097246,0.984418,0.961186,0.779227,0.000263,0.489972,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.950281,0.997897,0.016115,0.073171,1.000000,0.010465,0.028571,1.000000,0.012390,0.073171,1.000000,0.014155,0.097561,0.995311,0.009580,0.023810,1.000000,0.000085,0.025,0.0,0.00000,0.0,0.016193,0.010350,0.012884,0.014155,0.010291,0.000085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.713235,0.719806,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,0.001882,0.0,0.873786,1.0,0.010049,1.0,0.722570,0.937564,0.009386,0.687552,4.535468e-41,0.50,0.448113,0.666689,0.996474,0.268794,0.484036,0.0,0.000000,0.946459,0.720299,0.639859,0.615667,0.026090,0.000000,0


In [None]:
# Fixed seed shared by the sub-sampling, the shuffle and the MI estimator so that
# the feature ranking is reproducible on "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Working subset: 9000 randomly drawn normal rows plus every attack row.
normal = df[df['OUTCOME'] == 0].sample(9000, random_state=RANDOM_STATE)
attack = df[df['OUTCOME'] == 1]

# Shuffle the combined rows and rebuild a clean 0..n-1 index.
df1 = (
    pd.concat([normal, attack])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

X = df1.drop(['OUTCOME'], axis = 1)
Y = df1["OUTCOME"]

# Mutual information between each feature and the normal/attack label.
# The estimator is randomized, hence the explicit random_state.
mi_scores = mutual_info_classif(X, Y, random_state=RANDOM_STATE)

# One row per feature with its MI score.
mi_results = pd.DataFrame({"Feature": X.columns, "MI_Score": mi_scores})

# Keep only the features whose MI score exceeds the threshold.
threshold = 0.49
selected_features = mi_results[mi_results["MI_Score"] > threshold]

# Most informative features first.
selected_features = selected_features.sort_values(by="MI_Score", ascending=False)

# Report the selected features together with their scores.
print("Feature più informative rispetto alle anomalie:")
print(selected_features)

# Plain list of feature names (e.g. for use on the MATLAB side).
# NOTE(review): the feature list hard-coded in the following cell was obtained from
# an unseeded run; confirm it still matches this seeded output.
important_feature_names = selected_features["Feature"].tolist()
print("\nFeature selezionate:", important_feature_names)

Feature più informative rispetto alle anomalie:
          Feature  MI_Score
84  2A_AIT_004_PV  0.551780
6     1_LT_001_PV  0.494932

Feature selezionate: ['2A_AIT_004_PV', '1_LT_001_PV']


In [None]:
# Features retained for the reduced dataset (`df` is the preprocessed frame).
selected_features = ['2A_AIT_004_PV', '1_LT_001_PV']

# New frame holding only those features followed by the target column.
df_selected = df[[*selected_features, "OUTCOME"]]

# Write the reduced dataset to CSV without the index column.
df_selected.to_csv("./data_1/relevant_features_WADI.csv", index=False)

In [None]:
def _features_and_labels(frame):
    """Split `frame` into (feature matrix, label vector) NumPy arrays.

    Columns are selected by name, so the result does not depend on 'OUTCOME'
    being the last column of the frame.
    """
    return frame.drop(columns=["OUTCOME"]).values, frame["OUTCOME"].values


# Load the reduced WADI dataset (selected features + OUTCOME).
# NOTE(review): this rebinds `df`, replacing the full preprocessed frame built by the
# earlier cells; re-running those cells afterwards will operate on the reduced frame.
df = pd.read_csv("./data_1/relevant_features_WADI.csv")

# Features / target of the whole reduced dataset.
X = df.drop(columns=["OUTCOME"])
y = df["OUTCOME"]

# Separate normal rows (0) from anomalous rows (1).
df_normal = df[df["OUTCOME"] == 0]
df_anomaly = df[df["OUTCOME"] == 1]

# Split sizes.
num_train = 9000  # training set (normal rows only)
num_val = 9000  # validation set (normal rows only)
num_test_normal = 3325  # normal rows in the test set (95%)
# Anomalous rows in the test set (5%). round() rather than int() so that a float
# result such as 174.99999999999997 is not truncated to one row too few.
num_test_anomaly = round(num_test_normal * 0.05 / 0.95)

# iloc slices silently truncate when the frame is too short, which would produce
# smaller splits than requested without any error, so check the sizes up front.
val_end = num_train + num_val
test_end = val_end + num_test_normal
if len(df_normal) < test_end:
    raise ValueError(
        f"Need {test_end} normal rows for train/val/test, found {len(df_normal)}"
    )
if len(df_anomaly) < num_test_anomaly:
    raise ValueError(
        f"Need {num_test_anomaly} anomalous rows for the test set, found {len(df_anomaly)}"
    )

# Training set: first `num_train` normal rows.
X_train, y_train = _features_and_labels(df_normal.iloc[:num_train])

# Validation set: next `num_val` normal rows.
X_val, y_val = _features_and_labels(df_normal.iloc[num_train:val_end])

# Test set: the following `num_test_normal` normal rows, then the first
# `num_test_anomaly` anomalous rows (5% anomalies overall).
test_frame = pd.concat([
    df_normal.iloc[val_end:test_end],
    df_anomaly.iloc[:num_test_anomaly],
])
X_test_5perc, y_test_5perc = _features_and_labels(test_frame)

# Save all splits in MATLAB (.mat) format; labels are stored as column vectors.
sio.savemat("./data_1/fuzzy_data_WADI.mat", {
    "X_train": X_train,
    "y_train": y_train.reshape(-1, 1),
    "X_val": X_val,
    "y_val": y_val.reshape(-1, 1),
    "X_test_5perc": X_test_5perc,
    "y_test_5perc": y_test_5perc.reshape(-1, 1)
})

print("Dataset WADI preparato e salvato in formato MATLAB!")

Dataset WADI preparato e salvato in formato MATLAB!
