In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from collections import Counter
from torch.utils.data import DataLoader, WeightedRandomSampler, TensorDataset
from sklearn.utils.class_weight import compute_class_weight
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
base_dir = "SKAB"
SEQ_LENGTH = 8

In [33]:
# Load data from directories
def load_data(base_dir):
    data = []
    labels = []
    for folder in ["anomaly-free", "other", "valve1", "valve2"]:
        folder_path = os.path.join(base_dir, folder)
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            # Load the CSV file and parse the 'datetime' column
            df = pd.read_csv(file_path, sep=";", parse_dates=["datetime"])
            
            # Assign labels based on the folder and check if 'anomaly' column exists
            if folder == "anomaly-free":
                # If the folder is 'anomaly-free', assign a 0 label for all rows
                data.append(df)
                labels.extend([0] * len(df))  # Label all rows as 0 (no anomaly)
            else:
                data.append(df)  # Add to data list for other folders as well
                
                # For other folders, check for the 'anomaly' column
                if "anomaly" in df.columns:
                    labels.extend(df["anomaly"].tolist())  # Use 'anomaly' column for labeling
                else:
                    # Handle case if there's no 'anomaly' column in the other folders
                    print(f"Warning: 'anomaly' column not found in {file_path}. Assigning label 1 by default.")
                    labels.extend([1] * len(df))  # Default to 1 for anomaly detection
    
    # Concatenate the data from all files
    df_combined = pd.concat(data, ignore_index=True)
    
    # Sort the concatenated dataframe by 'datetime' to maintain the temporal order
    df_combined = df_combined.sort_values(by="datetime")
    
    return df_combined, labels

In [38]:
# Load and preprocess data
all_data_df, data_labels = load_data(base_dir)

In [39]:
print(len(all_data_df), len(data_labels))

46860 46860


In [40]:
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46860 entries, 0 to 44667
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   datetime             46860 non-null  datetime64[ns]
 1   Accelerometer1RMS    46860 non-null  float64       
 2   Accelerometer2RMS    46860 non-null  float64       
 3   Current              46860 non-null  float64       
 4   Pressure             46860 non-null  float64       
 5   Temperature          46860 non-null  float64       
 6   Thermocouple         46860 non-null  float64       
 7   Voltage              46860 non-null  float64       
 8   Volume Flow RateRMS  46860 non-null  float64       
 9   anomaly              37459 non-null  float64       
 10  changepoint          37459 non-null  float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 4.3 MB


In [7]:
# Above shows that there are no missing entries in the rows. The "anomaly" and "changepoint" columns need to be
# removed due to data leakage.

In [41]:
# Understand class imbalance
label_counts = Counter(data_labels)

# Display the counts for each class
for label, count in label_counts.items():
    print(f"Class {label}: {count} samples")

Class 0: 33619 samples
Class 1.0: 13241 samples


In [42]:
def create_sequences(data, labels, sequence_length):    
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i + sequence_length])
        y.append(labels[i + sequence_length])  # Target is the label at the next time step
    return np.array(X), np.array(y)

In [43]:
def preprocess_data(df, labels, sequence_length=8):
    # Exclude columns that could cause data leakage (e.g., "anomaly" or "changepoint")
    columns_to_exclude = ["anomaly", "changepoint"]
    for col in columns_to_exclude:
        if col in df.columns:
            df = df.drop(columns=[col])
    
    # Only want to use the MinMaxScaler on numerical data, not the datetime
    numeric_data = df.select_dtypes(include=["float64", "int64"])
    
    # Scale the numerical data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    
    # Perform stratified split into train (60%), validation (20%), and test (20%)
    X_train, X_temp, y_train, y_temp = train_test_split(
        scaled_data, labels, test_size=0.4, stratify=labels, random_state=42
    )
    
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
    )
    
    # Create sequences for each split
    X_train, y_train = create_sequences(X_train, y_train, sequence_length)
    X_val, y_val = create_sequences(X_val, y_val, sequence_length)
    X_test, y_test = create_sequences(X_test, y_test, sequence_length)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [44]:
X_train, X_val, X_test, y_train, y_val, y_test = preprocess_data(all_data_df, data_labels, SEQ_LENGTH)

# Outputs for inspection
print("Train set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

Train set size: 28108
Validation set size: 9364
Test set size: 9364


In [45]:
def create_dataloaders(X_train, X_val, X_test, y_train, y_val, y_test, batch_size=32):
    # Convert the data and labels into PyTorch tensors
    train_data = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long))
    val_data = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long))
    test_data = TensorDataset(torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long))
    
    # Compute class weights for CrossEntropyLoss
    class_weights = compute_class_weight(
        class_weight='balanced', 
        classes=np.array([0, 1]), 
        y=y_train
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float32)
    
    # Create the WeightedRandomSampler for the training data
    # The sampler will use the weights calculated from the class distribution
    class_sample_count = [len(torch.nonzero(torch.tensor(y_train) == i)) for i in [0, 1]]
    weights = [class_weights[int(label)] for label in y_train]
    sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

    # Create DataLoader for training, validation, and test sets
    train_loader = DataLoader(train_data, batch_size=batch_size, sampler=sampler)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader, class_weights

In [46]:
# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0.2, bidirectional=True):
        super(GRUModel, self).__init__()
        self.model_type = "GRU"
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        # GRU layer with bidirectional option
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,  # Dropout only applied if num_layers > 1
            bidirectional=bidirectional
        )
        
        # Batch normalization for the GRU output
        self.batch_norm = nn.BatchNorm1d(hidden_size * 2 if bidirectional else hidden_size)
        
        # Fully connected layer mapping GRU output to target output size
        self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)
        
        # Dropout for regularization before fully connected layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Determine batch size dynamically
        batch_size = x.size(0)
        
        # Initialize hidden state
        h0 = torch.zeros(
            self.num_layers * (2 if self.bidirectional else 1),  # Double layers for bidirectional
            batch_size,
            self.hidden_size
        ).to(x.device)  # Ensure the hidden state is on the same device as input

        # GRU forward pass
        out, _ = self.gru(x, h0)
        
        # Apply batch normalization to the last time step output
        out_last = out[:, -1, :]  # Get the output of the last time step
        out_last = self.batch_norm(out_last)
    
        # Apply dropout and fully connected layer
        out_last = self.dropout(out_last)
        out = self.fc(out_last)
        
        return out

In [51]:
# Define the model
input_size = X_train.shape[1]  # Number of features
hidden_size = 64  # Hidden state dimension
output_size = 2  # Number of classes (normal, anomaly)
num_layers = 2
num_epochs = 20
batch_size = 32

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min',       # Reduce the LR when the validation loss stops improving
    factor=0.5,       # Reduce the LR by half
    patience=5,       # Wait for 5 epochs before reducing LR
    verbose=True,     # Print a message when the LR is updated
    min_lr=1e-6       # Minimum learning rate (avoid going too low)
)

# Create the dataloaders and get class weights for loss function
train_loader, val_loader, test_loader, class_weights = create_dataloaders(X_train, X_val, X_test, 
                                                                          y_train, y_val, y_test, batch_size=batch_size)



In [52]:
# Training loop
model = GRUModel(input_size, hidden_size, output_size, num_layers, dropout=0.2).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights.clone().detach().to(device))
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

best_val_loss = float('inf')
best_model_path = "best_model.pth"

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct, total = 0, 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track metrics
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    train_acc = correct / total
    print(f"\nEpoch {epoch+1}/{num_epochs}, Train Accuracy: {train_acc:.4f}")

    # Validation loop
    model.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_correct += (predicted == targets).sum().item()
            val_total += targets.size(0)

    val_accuracy = 100 * val_correct / val_total
    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_accuracy:.2f}")

    # Save the model if validation loss has improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"Model saved with validation loss: {avg_val_loss:.4f}")

    # Step the scheduler after validation
    scheduler.step(avg_val_loss)  # This will adjust the learning rate based on validation loss



Epoch 1/20, Train Accuracy: 0.4958
Validation Loss: 0.8031, Accuracy: 28.26
Model saved with validation loss: 0.8031

Epoch 2/20, Train Accuracy: 0.4955
Validation Loss: 0.7997, Accuracy: 28.26
Model saved with validation loss: 0.7997

Epoch 3/20, Train Accuracy: 0.5012
Validation Loss: 0.8082, Accuracy: 28.26

Epoch 4/20, Train Accuracy: 0.5000
Validation Loss: 0.7788, Accuracy: 28.26
Model saved with validation loss: 0.7788

Epoch 5/20, Train Accuracy: 0.4942
Validation Loss: 0.7923, Accuracy: 28.26

Epoch 6/20, Train Accuracy: 0.4984
Validation Loss: 0.7937, Accuracy: 28.26

Epoch 7/20, Train Accuracy: 0.5050
Validation Loss: 0.8075, Accuracy: 28.26

Epoch 8/20, Train Accuracy: 0.5010
Validation Loss: 0.7961, Accuracy: 28.26

Epoch 9/20, Train Accuracy: 0.4999
Validation Loss: 0.8026, Accuracy: 28.26

Epoch 10/20, Train Accuracy: 0.4981
Validation Loss: 0.7950, Accuracy: 28.26

Epoch 11/20, Train Accuracy: 0.4955
Validation Loss: 0.7910, Accuracy: 28.26

Epoch 12/20, Train Accuracy

KeyboardInterrupt: 