In [29]:
import os
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

In [2]:
# Root folder of the dataset; load_data() joins this path with each sub-folder name it reads.
base_dir = "SKAB"

In [24]:
# Load data from directories
def load_data(base_dir):
    """Read every CSV file of the dataset into one frame with per-row labels.

    The sub-folders "anomaly-free", "other", "valve1" and "valve2" of
    ``base_dir`` are visited in that order, and the files inside each folder
    are read in sorted name order so the resulting row order is reproducible
    across machines and runs.

    Labelling rules:
      * rows from "anomaly-free" are always labelled 0;
      * rows from any other folder take the value of the file's "anomaly"
        column, cast to int;
      * if such a file has no "anomaly" column, a warning is printed and all
        of its rows are labelled 1.

    Args:
        base_dir: Path of the directory that contains the four sub-folders.

    Returns:
        A tuple ``(frame, labels)`` where ``frame`` is the row-wise
        concatenation of all files (index reset) and ``labels`` is a list of
        Python ints with one entry per row of ``frame``.

    Raises:
        ValueError: If an "anomaly" column contains missing values, or if no
            CSV file was found at all (raised by ``pd.concat``).
    """
    frames = []
    labels = []
    for folder in ["anomaly-free", "other", "valve1", "valve2"]:
        folder_path = os.path.join(base_dir, folder)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # row order (and any seeded split built on top of it) deterministic.
        for file in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file)
            # Skip sub-directories and stray non-CSV entries instead of
            # letting read_csv fail on them.
            if not (os.path.isfile(file_path) and file.lower().endswith(".csv")):
                continue

            # Load the CSV file and parse the 'datetime' column
            df = pd.read_csv(file_path, sep=";", parse_dates=["datetime"])
            frames.append(df)

            if folder == "anomaly-free":
                # Everything recorded in this folder is normal operation
                labels.extend([0] * len(df))
            elif "anomaly" in df.columns:
                anomaly = df["anomaly"]
                if anomaly.isna().any():
                    raise ValueError(f"Missing values in 'anomaly' column of {file_path}.")
                # Cast to plain ints so labels are usable as class indices
                labels.extend(anomaly.astype(int).tolist())
            else:
                # No ground truth available for this file
                print(f"Warning: 'anomaly' column not found in {file_path}. Assigning label 1 by default.")
                labels.extend([1] * len(df))

    # Concatenate the data from all files and return
    return pd.concat(frames, ignore_index=True), labels

In [25]:
# Load all files under base_dir into one DataFrame plus a per-row label list
# (no preprocessing happens here; that is done later by preprocess_data)
all_data_df, labels = load_data(base_dir)

In [26]:
# Inspect column dtypes and non-null counts of the combined frame
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46860 entries, 0 to 46859
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   datetime             46860 non-null  datetime64[ns]
 1   Accelerometer1RMS    46860 non-null  float64       
 2   Accelerometer2RMS    46860 non-null  float64       
 3   Current              46860 non-null  float64       
 4   Pressure             46860 non-null  float64       
 5   Temperature          46860 non-null  float64       
 6   Thermocouple         46860 non-null  float64       
 7   Voltage              46860 non-null  float64       
 8   Volume Flow RateRMS  46860 non-null  float64       
 9   anomaly              37459 non-null  float64       
 10  changepoint          37459 non-null  float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 3.9 MB


In [None]:
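# The summary above shows that "datetime" and the eight sensor columns have no missing values, while "anomaly"
# and "changepoint" are populated for only 37459 of the 46860 rows. Both of those columns must be removed from
# the features before training, because keeping them would leak the target (data leakage).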

In [18]:
# Understand class imbalance: tally how many rows carry each label value
# NOTE(review): the saved output of this cell comes from execution In [18], which predates the
# current load_data cell (In [24]) -- re-run after loading to confirm the counts.
label_counts = Counter(labels)

# Display the counts for each class
for label, count in label_counts.items():
    print(f"Class {label}: {count} samples")

Class 0: 9401 samples
Class 1: 37459 samples


In [27]:
def preprocess_data(df, labels):
    """Build scaled train / validation / test feature matrices from the raw frame.

    Steps:
      1. Drop the "anomaly" and "changepoint" columns if present, since they
         encode the target and would leak it into the features.
      2. Keep only the float64 / int64 columns (this leaves out "datetime").
      3. Split the rows 60% / 20% / 20% into train / validation / test,
         stratified on ``labels`` with a fixed random seed.
      4. Fit a ``MinMaxScaler`` on the training rows only and apply it to all
         three splits. Fitting after the split keeps validation/test
         statistics out of the training features; as a consequence,
         validation and test values may fall slightly outside [0, 1].

    NOTE(review): the split is a shuffled, row-level split, so rows from the
    same recording can land in different splits -- confirm this is acceptable
    for time-series data.

    Args:
        df: DataFrame holding the sensor columns (and possibly "datetime",
            "anomaly", "changepoint"). It is not modified.
        labels: Sequence with one class label per row of ``df``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the ``X_*``
        values are scaled NumPy arrays and the ``y_*`` values are the
        corresponding label subsets.
    """
    # Exclude columns that could cause data leakage
    features_df = df.drop(columns=["anomaly", "changepoint"], errors="ignore")

    # Only numerical columns are scaled and used as features, not the datetime
    numeric_data = features_df.select_dtypes(include=["float64", "int64"])
    features = numeric_data.to_numpy()

    # Stratified split into train (60%) and a temporary remainder (40%)
    X_train, X_temp, y_train, y_temp = train_test_split(
        features, labels, test_size=0.4, stratify=labels, random_state=42
    )

    # Split the remainder evenly into validation (20%) and test (20%)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )

    # Learn the min/max from the training rows only, then reuse them everywhere
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, X_val, X_test, y_train, y_val, y_test


In [28]:
# Build the train / validation / test splits from the loaded frame and labels
X_train, X_val, X_test, y_train, y_val, y_test = preprocess_data(all_data_df, labels)

# Outputs for inspection: number of rows in each split
print("Train set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

Train set size: 28116
Validation set size: 9372
Test set size: 9372


In [30]:
def create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test, batch_size=64):
    """Wrap the three splits in DataLoaders and compute balanced class weights.

    Features are converted to float32 tensors and labels to int64 tensors, so
    labels given as floats (e.g. 1.0) become valid class indices.

    The class weights follow the "balanced" heuristic
    ``n_samples / (n_classes * count_per_class)`` for the two classes 0 and 1,
    computed on the training labels only. The same weights drive a
    ``WeightedRandomSampler`` (sampling with replacement) for the training
    loader, so each class is drawn with roughly equal probability.

    NOTE(review): the returned ``class_weights`` are documented as being for
    CrossEntropyLoss; using them in the loss *and* the weighted sampler
    compensates for the imbalance twice -- confirm that this is intended.

    Args:
        X_train, X_val, X_test: Array-likes of feature rows for each split.
        y_train, y_val, y_test: Sequences of binary labels (0 / 1), one per row.
        batch_size: Batch size used by all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader, class_weights)`` where
        ``class_weights`` is a float32 tensor of shape (2,) indexed by class.

    Raises:
        ValueError: If ``y_train`` is empty, contains a label other than 0/1,
            or is missing one of the two classes.
    """
    n_classes = 2

    def _as_dataset(features, targets):
        # Pair float32 features with int64 labels in a TensorDataset.
        return TensorDataset(
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(targets, dtype=torch.long),
        )

    train_data = _as_dataset(X_train, y_train)
    val_data = _as_dataset(X_val, y_val)
    test_data = _as_dataset(X_test, y_test)

    train_targets = train_data.tensors[1]
    if train_targets.numel() == 0 or train_targets.min() < 0 or train_targets.max() >= n_classes:
        raise ValueError("y_train must be non-empty and contain only the labels 0 and 1.")

    class_counts = torch.bincount(train_targets, minlength=n_classes)
    if (class_counts == 0).any():
        raise ValueError("y_train must contain at least one sample of each class (0 and 1).")

    # Balanced weights, computed in float64 and stored as float32
    class_weights = (
        train_targets.numel() / (n_classes * class_counts.to(torch.float64))
    ).to(torch.float32)

    # One weight per training sample: the weight of that sample's class
    sample_weights = class_weights[train_targets]
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)

    # The sampler already randomises the training order; evaluation loaders keep row order
    train_loader = DataLoader(train_data, batch_size=batch_size, sampler=sampler)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader, class_weights

In [15]:
# Define the GRU model
class GRUModel(nn.Module):
    """GRU encoder followed by batch norm, dropout and a linear output layer.

    The GRU is built with ``batch_first=True``, so sequence input is expected
    as ``(batch, seq_len, input_size)``. A 2-D input of shape
    ``(batch, input_size)`` is also accepted and treated as a batch of
    length-1 sequences.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state (per direction).
        output_size: Number of values produced per sample by the final layer.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout probability used before the output layer and, when
            ``num_layers > 1``, between GRU layers.
        bidirectional: Whether the GRU processes the sequence in both directions.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0.2, bidirectional=True):
        super().__init__()
        self.model_type = "GRU"
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Width of the per-sample summary vector: both directions are concatenated
        summary_size = hidden_size * 2 if bidirectional else hidden_size

        # GRU layer with bidirectional option
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,  # Dropout only applied if num_layers > 1
            bidirectional=bidirectional
        )

        # Batch normalization for the GRU summary vector
        self.batch_norm = nn.BatchNorm1d(summary_size)

        # Fully connected layer mapping the summary vector to the target output size
        self.fc = nn.Linear(summary_size, output_size)

        # Dropout for regularization before fully connected layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute the model output for a batch.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)`` or
                ``(batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Tabular batches arrive as (batch, features); view them as length-1 sequences
        if x.dim() == 2:
            x = x.unsqueeze(1)

        # With no initial state given, the GRU starts from zeros
        _, h_n = self.gru(x)

        # Summarise each sequence by the final hidden state of the last layer.
        # In the bidirectional case the reverse direction finishes at the FIRST
        # time step, so out[:, -1, :] would only contain its state after one
        # step; h_n holds the true final state of both directions.
        if self.bidirectional:
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            summary = h_n[-1]

        summary = self.batch_norm(summary)

        # Apply dropout and fully connected layer
        summary = self.dropout(summary)
        return self.fc(summary)