In [6]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Root folder of the dataset, given as a relative path (so it is resolved
# against the notebook's working directory). It is passed to load_data, which
# reads the "anomaly-free", "other", "valve1" and "valve2" subfolders under it.
base_dir = "SKAB"

In [1]:
# Load data from directories
def load_data(base_dir):
    """Read every CSV in the dataset subfolders into one labelled DataFrame.

    The subfolders "anomaly-free", "other", "valve1" and "valve2" of
    ``base_dir`` are visited in that order. Within each folder the CSV files
    are read in sorted filename order so that the resulting row order is the
    same on every machine and every run.

    Parameters
    ----------
    base_dir : str
        Directory containing the four dataset subfolders.

    Returns
    -------
    tuple
        ``(frame, labels)`` where ``frame`` is the concatenation of all files
        with a fresh 0..n-1 index and ``labels`` is a list with one entry per
        row of ``frame``: 0 for rows from "anomaly-free", 1 for all others.

    Raises
    ------
    FileNotFoundError
        If one of the expected subfolders does not exist.
    ValueError
        If no CSV file was found in any of the subfolders.
    """
    frames = []
    labels = []
    for folder in ["anomaly-free", "other", "valve1", "valve2"]:
        folder_path = os.path.join(base_dir, folder)
        # NOTE(review): the label is decided per folder, so every row of every
        # file outside "anomaly-free" is marked 1. If the CSVs carry their own
        # per-row anomaly column, confirm that folder-level labels are intended.
        label = 0 if folder == "anomaly-free" else 1
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and any seeded split performed later) reproducible.
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            # Ignore anything that is not a CSV file, such as the
            # .ipynb_checkpoints directories Jupyter creates, or stray notes.
            if not (file.lower().endswith(".csv") and os.path.isfile(file_path)):
                continue
            # Files are semicolon-separated; "datetime" is parsed to timestamps.
            df = pd.read_csv(file_path, sep=";", parse_dates=["datetime"])
            frames.append(df)
            labels.extend([label] * len(df))  # Assign label to each row
    if not frames:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No CSV files found under {base_dir!r}")
    return pd.concat(frames, ignore_index=True), labels

In [2]:
# Normalize and split data into training (60%), validation (20%), test (20%)
def preprocess_data(df, labels):
    """Split the data 60/20/20 (stratified) and min-max scale the features.

    The scaler is fitted on the training rows only and then applied to the
    validation and test rows, so no statistics from held-out data influence
    the training features. As a consequence, validation/test values may fall
    slightly outside [0, 1] when they exceed the range seen in training.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; only float64 and int64 columns are used as features.
    labels : sequence
        One label per row of ``df``; also used to stratify both splits.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the ``X_*``
        values are the scaled feature arrays and the ``y_*`` values are the
        matching labels.
    """
    # Only want to use the MinMaxScaler on numerical data, not the datetime
    # NOTE(review): every float64/int64 column becomes a feature. If the input
    # contains a ground-truth column (e.g. a per-row anomaly flag) it would
    # leak into the features; verify the column list against the data.
    numeric_data = df.select_dtypes(include=["float64", "int64"])

    # First split: train vs (validation + test)
    train_raw, temp_raw, y_train, y_temp = train_test_split(
        numeric_data, labels, test_size=0.4, stratify=labels, random_state=42
    )

    # Second split: validation vs test
    val_raw, test_raw, y_val, y_test = train_test_split(
        temp_raw, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )

    # Fit on the training portion only, then reuse those min/max values for
    # the held-out portions to avoid leaking their statistics into training.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train_raw)
    X_val = scaler.transform(val_raw)
    X_test = scaler.transform(test_raw)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [7]:
# Build the full dataset, then derive the train/validation/test splits
all_data_df, labels = load_data(base_dir)
X_train, X_val, X_test, y_train, y_val, y_test = preprocess_data(
    all_data_df, labels
)

# Report how many rows landed in each split
split_sizes = {
    "Train set size:": len(X_train),
    "Validation set size:": len(X_val),
    "Test set size:": len(X_test),
}
for caption, n_rows in split_sizes.items():
    print(caption, n_rows)

Train set size: 28116
Validation set size: 9372
Test set size: 9372
