In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# --- Combine Data and Fix PerformanceWarning ---
# Stack the two frames, order the rows by index, and take a fresh copy
# before the label column is added.
combined_df = pd.concat([normal_df, attack_df]).sort_index().copy()
# NOTE(review): every row is labelled 0 here and nothing in this cell sets a
# row to 1 -- confirm that attack labelling is meant to happen elsewhere.
combined_df['Label'] = 0


# --- 1. Scale the Features ---
# NOTE(review): subset_df is read here but is assigned in a later cell of the
# visible notebook -- confirm the intended execution order.
# Feature matrix: every column except 'Label', minus columns that are all NaN.
features = subset_df.drop(columns='Label').dropna(axis='columns', how='all')
print(f"✅ Dropped all-NaN columns. Remaining features: {features.shape[1]}")

# Labels come from the same frame as the features.
labels = subset_df['Label']

# Fit the min-max scaler on the full feature frame and transform it in one go.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)
print("✅ Features scaled successfully.")

# --- 2. Create Sliding Windows ---
def create_windows(data, labels, window_size, step_size):
    """Cut row-ordered data into fixed-length windows with one label per window.

    Args:
        data: Array-like whose first axis is the row (sample) axis.
        labels: Array-like of per-row labels, indexed like ``data``.
        window_size: Number of consecutive rows in each window.
        step_size: Number of rows between the starts of successive windows.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` stacks the windows along a
        new leading axis; ``y[k]`` is 1 if any row label inside window ``k``
        equals 1, otherwise 0. Trailing rows that do not fill a complete
        window are dropped.
    """
    # Coerce to arrays so that `== 1` below is an element-wise comparison even
    # when plain Python lists are passed in.
    data = np.asarray(data)
    labels = np.asarray(labels)

    X, y = [], []
    # The "+ 1" keeps the last start index at which a full window still fits;
    # without it a window ending exactly on the final row is silently skipped.
    for start in range(0, len(data) - window_size + 1, step_size):
        end = start + window_size
        X.append(data[start:end])
        # A single attack row is enough to mark the whole window as an attack.
        y.append(1 if np.any(labels[start:end] == 1) else 0)
    return np.array(X), np.array(y)

# Window length and stride, both counted in rows. With equal values the
# windows sit back to back without overlapping.
WINDOW_SIZE = 50
STEP_SIZE = 50

# Build the windowed dataset from the scaled features and the raw label array.
X, y = create_windows(
    data=features_scaled,
    labels=labels.values,
    window_size=WINDOW_SIZE,
    step_size=STEP_SIZE,
)
print(f"✅ Windowing complete. Created {len(X)} windows of size {WINDOW_SIZE}.")

# --- 3. Split into Train, Validation, and Test Sets ---
# Hold out 20% of the windows, then cut that hold-out in half to obtain the
# validation and test sets. Both cuts are stratified on the window label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the size of each split.
print(
    "\n✅ Data split complete:",
    f"Training set:   {len(X_train)} samples",
    f"Validation set: {len(X_val)} samples",
    f"Test set:       {len(X_test)} samples",
    sep="\n",
)

In [None]:
# Columns kept as model inputs, in the order they will appear in subset_df.
selected_features = [
    '1_AIT_001_PV',
    '1_AIT_002_PV',
    '1_AIT_003_PV',
    '1_AIT_004_PV',
    '1_AIT_005_PV',
    '1_FIT_001_PV',
    '1_LT_001_PV',
    '2_DPIT_001_PV',
    '2_FIC_101_PV',
    '2_FIC_201_PV',
    '2_FIC_301_PV',
    '2_FIC_401_PV',
    '2_FIC_501_PV',
    '2_FIC_601_PV',
    '2_LT_001_PV',
    '2_LT_002_PV',
    '2_PIT_001_PV',
    '2_PIT_002_PV',
    '2_PIT_003_PV',
]

# Take the selected columns plus 'Label' as an independent copy, so later
# edits to subset_df do not touch combined_df.
subset_columns = [*selected_features, 'Label']
subset_df = combined_df.loc[:, subset_columns].copy()

print(f"Created a new dataset with {len(selected_features)} selected features.")

In [None]:
# Stack both frames and order the rows by index.
combined_df = pd.concat([normal_df, attack_df]).sort_index()

# Start with every row labelled 0; rows inside an attack interval are set to 1 below.
combined_df['Label'] = 0

# (start, end) timestamp strings; both ends are treated as inclusive below.
attack_intervals = [
    ('2017-10-09 19:25:00', '2017-10-09 19:50:00'),
    ('2017-10-10 10:25:00', '2017-10-10 10:35:00'),
    ('2017-10-10 10:50:00', '2017-10-10 11:00:00'),
    ('2017-10-10 11:20:00', '2017-10-10 11:30:00'),
    ('2017-10-10 11:40:00', '2017-10-10 11:50:00'),
    ('2017-10-10 14:30:00', '2017-10-10 14:40:00'),
    ('2017-10-10 14:50:00', '2017-10-10 15:00:00'),
    ('2017-10-10 15:20:00', '2017-10-10 15:30:00'),
    ('2017-10-11 10:25:00', '2017-10-11 10:35:00'),
    ('2017-10-11 10:55:00', '2017-10-11 11:05:00'),
    ('2017-10-11 11:20:00', '2017-10-11 11:25:00'),
    ('2017-10-11 11:40:00', '2017-10-11 11:45:00'),
    ('2017-10-11 15:35:00', '2017-10-11 15:45:00'),
    ('2017-10-11 15:55:00', '2017-10-11 16:00:00'),
]

# The index is compared against parsed timestamps.
# NOTE(review): this presumes combined_df has a datetime-like index -- confirm
# against the cell that loads normal_df / attack_df.
row_index = combined_df.index
for start, end in attack_intervals:
    start_ts, end_ts = pd.to_datetime(start), pd.to_datetime(end)
    mask = (row_index >= start_ts) & (row_index <= end_ts)
    combined_df.loc[mask, 'Label'] = 1


# --- Sanity check ---
print("✅ Labeling complete.", "Label distribution:", sep="\n")
print(combined_df['Label'].value_counts())

print("\nData with labels:")
print(combined_df.head())

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# --- 1. Scale the Features ---
# Features and labels are both read from subset_df so that row i of the
# feature matrix and label i always come from the same row. Reading the labels
# from combined_df instead would silently misalign or zero them if that frame
# is rebuilt or re-labelled after subset_df was created.
features = subset_df.drop('Label', axis=1)
labels = subset_df['Label']

# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split further down, so its min/max include rows that later land in the
# validation and test sets -- confirm this is acceptable.
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features)
print("✅ Features scaled successfully.")

# --- 2. Create Sliding Windows ---
def create_windows(data, labels, window_size, step_size):
    """Cut row-ordered data into fixed-length windows with one label per window.

    Args:
        data: Array-like whose first axis is the row (sample) axis.
        labels: Array-like of per-row labels, indexed like ``data``.
        window_size: Number of consecutive rows in each window.
        step_size: Number of rows between the starts of successive windows.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` stacks the windows along a
        new leading axis; ``y[k]`` is 1 if any row label inside window ``k``
        equals 1, otherwise 0. Trailing rows that do not fill a complete
        window are dropped.
    """
    # Coerce to arrays so that `== 1` below is an element-wise comparison even
    # when plain Python lists are passed in.
    data = np.asarray(data)
    labels = np.asarray(labels)

    X, y = [], []
    # The "+ 1" keeps the last start index at which a full window still fits;
    # without it a window ending exactly on the final row is silently skipped.
    for start in range(0, len(data) - window_size + 1, step_size):
        end = start + window_size
        X.append(data[start:end])
        # A single attack row is enough to mark the whole window as an attack.
        y.append(1 if np.any(labels[start:end] == 1) else 0)
    return np.array(X), np.array(y)

# Window length and stride, both counted in rows. With equal values the
# windows sit back to back without overlapping.
WINDOW_SIZE = 50
STEP_SIZE = 50

# Build the windowed dataset from the scaled features and the raw label array.
X, y = create_windows(
    data=features_scaled,
    labels=labels.values,
    window_size=WINDOW_SIZE,
    step_size=STEP_SIZE,
)
print(f"✅ Windowing complete. Created {len(X)} windows of size {WINDOW_SIZE}.")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# --- 3. Split into Train, Validation, and Test Sets ---
# Hold out 20% of the windows, then cut that hold-out in half to obtain the
# validation and test sets. Both cuts are stratified on the window label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the size of each split.
print(
    "\n✅ Data split complete:",
    f"Training set:   {len(X_train)} samples",
    f"Validation set: {len(X_val)} samples",
    f"Test set:       {len(X_test)} samples",
    sep="\n",
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import numpy as np

# --- 1. Separate Normal and Attack Windows ---
# Boolean masks over the window labels pick out each class.
normal_mask = (y == 0)
attack_mask = (y == 1)
X_normal, y_normal = X[normal_mask], y[normal_mask]
X_attack, y_attack = X[attack_mask], y[attack_mask]

# --- 2. Split Normal Data (80/10/10 split) ---
# 20% is held out first, then that hold-out is halved into validation and test.
X_normal_train, X_normal_temp, y_normal_train, y_normal_temp = train_test_split(
    X_normal, y_normal, random_state=42, test_size=0.2
)
X_normal_val, X_normal_test, y_normal_val, y_normal_test = train_test_split(
    X_normal_temp, y_normal_temp, random_state=42, test_size=0.5
)

# --- 3. Split Attack Data (80/10/10 split) ---
# Same two-stage cut, applied to the attack windows on their own.
X_attack_train, X_attack_temp, y_attack_train, y_attack_temp = train_test_split(
    X_attack, y_attack, random_state=42, test_size=0.2
)
X_attack_val, X_attack_test, y_attack_val, y_attack_test = train_test_split(
    X_attack_temp, y_attack_temp, random_state=42, test_size=0.5
)

# --- 4. Combine the splits ---
# Normal windows come first, attack windows second, in every split.
X_train = np.concatenate((X_normal_train, X_attack_train), axis=0)
y_train = np.concatenate((y_normal_train, y_attack_train), axis=0)

X_val = np.concatenate((X_normal_val, X_attack_val), axis=0)
y_val = np.concatenate((y_normal_val, y_attack_val), axis=0)

X_test = np.concatenate((X_normal_test, X_attack_test), axis=0)
y_test = np.concatenate((y_normal_test, y_attack_test), axis=0)

# --- 5. Shuffle the training data ---
# Only the training split is shuffled; validation and test keep the
# normal-then-attack order produced above.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# --- 6. Verify the New Distribution ---
print("✅ New data split complete:")
for caption, split_labels in (
    ("Training set label distribution:", y_train),
    ("Validation set label distribution:", y_val),
    ("Test set label distribution:", y_test),
):
    classes, counts = np.unique(split_labels, return_counts=True)
    print(caption, dict(zip(classes, counts)))

In [None]:
import os
import numpy as np

# Output directory, relative to the notebook's working directory; it is
# created if it does not exist yet.
save_path = '../data/processed'
os.makedirs(save_path, exist_ok=True)

# File name -> array pairs, written in this order.
arrays_to_save = [
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
]
for filename, array in arrays_to_save:
    np.save(os.path.join(save_path, filename), array)

print(f"✅ All data arrays saved successfully to: {os.path.abspath(save_path)}")