In [6]:
import pandas as pd
import numpy as np
import os
import sys

# Sklearn imports for preprocessing and splitting
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# --- Make the project's src/ directory importable ---
# The project root is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_path = os.path.join(project_root, 'src')

# Register src/ only once so re-running the cell does not grow sys.path.
already_registered = src_path in sys.path
if not already_registered:
    sys.path.append(src_path)

# Import your custom functions from preprocess.py
from preprocess import clean_wadi_data, create_windows_vectorized

print("✅ All libraries and custom functions imported.")

✅ All libraries and custom functions imported.


In [7]:
# --- Load and Clean Raw Data ---
def find_header_row(file_path, header_prefix='Row,Date,Time'):
    """Return the zero-based index of the first line starting with ``header_prefix``.

    The result is meant to be passed as ``skiprows`` to ``pd.read_csv`` so that
    any preamble lines above the real column header are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Text file to scan.
    header_prefix : str, optional
        Text the header line must start with. Defaults to ``'Row,Date,Time'``.

    Returns
    -------
    int
        Index of the first matching line, or ``0`` when no line matches. In the
        no-match case a ``RuntimeWarning`` is emitted so the fallback is not
        silent.
    """
    import warnings  # local import: only needed for the fallback path

    # errors='ignore' drops undecodable bytes so stray characters in the
    # preamble cannot abort the scan.
    with open(file_path, 'r', errors='ignore') as f:
        for i, line in enumerate(f):
            if line.startswith(header_prefix):
                return i

    # Keep the original best-effort fallback, but make it visible: with no
    # header found, the caller will treat the first line as the header.
    warnings.warn(
        f"No line starting with {header_prefix!r} found in {file_path!r}; "
        "falling back to row 0 as the header.",
        RuntimeWarning,
        stacklevel=2,
    )
    return 0

# Both recordings live in the same directory, relative to the notebook's cwd.
wadi_dir = os.path.join('..', 'data', 'WADI')
normal_path = os.path.join(wadi_dir, 'WADI_14days.csv')
attack_path = os.path.join(wadi_dir, 'WADI_attackdata.csv')

# Locate the real header in each file, then read from that line onwards.
lines_to_skip_normal = find_header_row(normal_path)
normal_df_raw = pd.read_csv(normal_path, skiprows=lines_to_skip_normal)

lines_to_skip_attack = find_header_row(attack_path)
attack_df_raw = pd.read_csv(attack_path, skiprows=lines_to_skip_attack)

# Clean copies so the raw frames stay available for inspection.
normal_df, attack_df = (
    clean_wadi_data(raw_frame.copy())
    for raw_frame in (normal_df_raw, attack_df_raw)
)

print("✅ Raw data loaded and cleaned successfully.")
for frame_name, frame in (("Normal", normal_df), ("Attack", attack_df)):
    print(f"{frame_name} df shape: {frame.shape}")

✅ Raw data loaded and cleaned successfully.
Normal df shape: (1209601, 127)
Attack df shape: (172801, 127)


In [8]:
# --- Combine Data, Apply Attack Labels, and Select Features ---
# Stack the normal and attack recordings and order the rows by index.
# The trailing .copy() yields a fresh frame before the Label column is added.
combined_df = pd.concat([normal_df, attack_df]).sort_index().copy()

# (start, end) pairs; both endpoints are treated as inclusive below.
# NOTE(review): these boundaries are hard-coded -- confirm them against the
# attack log distributed with the dataset.
attack_intervals = [
    ('2017-10-09 19:25:00', '2017-10-09 19:50:00'),
    ('2017-10-10 10:25:00', '2017-10-10 10:35:00'),
    ('2017-10-10 10:50:00', '2017-10-10 11:00:00'),
    ('2017-10-10 11:20:00', '2017-10-10 11:30:00'),
    ('2017-10-10 11:40:00', '2017-10-10 11:50:00'),
    ('2017-10-10 14:30:00', '2017-10-10 14:40:00'),
    ('2017-10-10 14:50:00', '2017-10-10 15:00:00'),
    ('2017-10-10 15:20:00', '2017-10-10 15:30:00'),
    ('2017-10-11 10:25:00', '2017-10-11 10:35:00'),
    ('2017-10-11 10:55:00', '2017-10-11 11:05:00'),
    ('2017-10-11 11:20:00', '2017-10-11 11:25:00'),
    ('2017-10-11 11:40:00', '2017-10-11 11:45:00'),
    ('2017-10-11 15:35:00', '2017-10-11 15:45:00'),
    ('2017-10-11 15:55:00', '2017-10-11 16:00:00')
]

# Accumulate one boolean mask covering every interval, then derive the
# 0/1 label from it (0 everywhere outside the listed intervals).
row_times = combined_df.index
in_attack = np.zeros(len(combined_df), dtype=bool)
for interval_start, interval_end in attack_intervals:
    lower = pd.to_datetime(interval_start)
    upper = pd.to_datetime(interval_end)
    in_attack |= (row_times >= lower) & (row_times <= upper)
combined_df['Label'] = in_attack.astype('int64')

# --- Feature Selection ---
selected_features = [
    '1_AIT_001_PV', '1_AIT_002_PV', '1_AIT_003_PV', '1_AIT_004_PV', '1_AIT_005_PV',
    '1_FIT_001_PV', '1_LT_001_PV', '2_DPIT_001_PV', '2_FIC_101_PV', '2_FIC_201_PV',
    '2_FIC_301_PV', '2_FIC_401_PV', '2_FIC_501_PV', '2_FIC_601_PV',
    '2_LT_001_PV', '2_LT_002_PV', '2_PIT_001_PV', '2_PIT_002_PV', '2_PIT_003_PV'
]
# Keep only the chosen sensor columns plus the label, as an independent frame.
columns_to_keep = [*selected_features, 'Label']
subset_df = combined_df.loc[:, columns_to_keep].copy()

print("✅ Data combined, labeled, and features selected.")
print(f"Final shape of subset_df: {subset_df.shape}")

✅ Data combined, labeled, and features selected.
Final shape of subset_df: (1382402, 20)


In [9]:
# --- Scale Features and Create Windows ---
# Separate the target column from the sensor readings.
labels = subset_df['Label']
features = subset_df.drop(columns='Label')

# Discard columns that contain no observations at all.
features = features.dropna(axis=1, how='all')
print(f"Features to be scaled: {features.shape[1]}")

# NOTE(review): the scaler is fitted on every row of subset_df, i.e. before
# the train/validation/test split performed later in this notebook -- confirm
# that this is intended, since it lets held-out data influence the scaling.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)
print("✅ Features scaled successfully.")

# Re-wrap the scaled array with the original column names and index, and
# re-attach the labels so the windowing function receives a single frame.
scaled_df = pd.DataFrame(
    features_scaled,
    index=features.index,
    columns=features.columns,
)
scaled_df['Label'] = labels

# Windowing parameters: a stride equal to the window size means consecutive
# windows do not overlap.
WINDOW_SIZE = 50
STEP_SIZE = 50

# Build the windowed arrays with the project's vectorized helper.
X, y = create_windows_vectorized(
    scaled_df,
    window_size=WINDOW_SIZE,
    stride=STEP_SIZE,
)

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Features to be scaled: 19
✅ Features scaled successfully.
✅ Vectorized windows created: 27648 sequences of shape (50, 19)
Shape of X: (27648, 50, 19)
Shape of y: (27648,)


In [10]:
# --- Manual Stratified Split ---
# Each class is split 80/10/10 on its own so that every subset contains
# attack windows, then the per-class pieces are concatenated.
# NOTE(review): windows are assigned to subsets at random rather than by
# time -- confirm this is acceptable for the downstream evaluation.
is_normal = (y == 0)
is_attack = (y == 1)
X_normal, y_normal = X[is_normal], y[is_normal]
X_attack, y_attack = X[is_attack], y[is_attack]

# First cut holds out 20%; second cut halves that hold-out into val/test.
holdout_kwargs = dict(test_size=0.2, random_state=42)
halve_kwargs = dict(test_size=0.5, random_state=42)

# Normal windows (80/10/10)
X_normal_train, X_normal_temp, y_normal_train, y_normal_temp = train_test_split(
    X_normal, y_normal, **holdout_kwargs
)
X_normal_val, X_normal_test, y_normal_val, y_normal_test = train_test_split(
    X_normal_temp, y_normal_temp, **halve_kwargs
)

# Attack windows (80/10/10)
X_attack_train, X_attack_temp, y_attack_train, y_attack_temp = train_test_split(
    X_attack, y_attack, **holdout_kwargs
)
X_attack_val, X_attack_test, y_attack_val, y_attack_test = train_test_split(
    X_attack_temp, y_attack_temp, **halve_kwargs
)

# Stitch the per-class pieces together, normal windows first.
X_train = np.concatenate((X_normal_train, X_attack_train))
X_val = np.concatenate((X_normal_val, X_attack_val))
X_test = np.concatenate((X_normal_test, X_attack_test))
y_train = np.concatenate((y_normal_train, y_attack_train))
y_val = np.concatenate((y_normal_val, y_attack_val))
y_test = np.concatenate((y_normal_test, y_attack_test))

# Only the training subset is shuffled; validation and test keep the
# normal-then-attack order produced by the concatenation above.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Report how many windows of each label ended up in each subset.
train_counts = dict(zip(*np.unique(y_train, return_counts=True)))
val_counts = dict(zip(*np.unique(y_val, return_counts=True)))
test_counts = dict(zip(*np.unique(y_test, return_counts=True)))

print("✅ New data split complete:")
print("Training set label distribution:", train_counts)
print("Validation set label distribution:", val_counts)
print("Test set label distribution:", test_counts)

✅ New data split complete:
Training set label distribution: {np.int64(0): np.int64(21972), np.int64(1): np.int64(145)}
Validation set label distribution: {np.int64(0): np.int64(2747), np.int64(1): np.int64(18)}
Test set label distribution: {np.int64(0): np.int64(2747), np.int64(1): np.int64(19)}


In [11]:
# --- Save Processed Data ---
# Output directory, relative to the notebook's working directory.
save_path = '../data/processed'
os.makedirs(save_path, exist_ok=True)

# File name -> array; written in the same order as listed.
arrays_to_save = {
    'X_train.npy': X_train,
    'y_train.npy': y_train,
    'X_val.npy': X_val,
    'y_val.npy': y_val,
    'X_test.npy': X_test,
    'y_test.npy': y_test,
}
for file_name, array in arrays_to_save.items():
    np.save(os.path.join(save_path, file_name), array)

print(f"✅ All data arrays saved successfully to: {os.path.abspath(save_path)}")

✅ All data arrays saved successfully to: /Users/anshreyas/PycharmProjects/ics-anomaly-detection-transformer/data/processed
