In [82]:
import numpy as np
import h5py
import random
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping



# Data Preprocessing

In [83]:
file_path = './data/ROAD_dataset.h5'

In [84]:
def load_h5_data(h5_file_path, anomalies):
    """
    Load data from H5 file, including normal and anomaly data.
    """
    with h5py.File(h5_file_path, 'r') as hf:
        train_data = hf['train_data/data'][:]
        train_labels = hf['train_data/labels'][:].astype(str)

        # Replace '' and '1' with 'Normal'
        train_labels = np.where((train_labels == ''), 'Normal', train_labels)

        # Drop '1' labels from training data
        train_mask = train_labels != '1'
        train_data = train_data[train_mask]
        train_labels = train_labels[train_mask]

        # Load anomaly data
        anomaly_data, anomaly_labels = [], []
        for anomaly in anomalies:
            if anomaly != 'Normal':  # Skip 'Normal'
                group = hf[f'anomaly_data/{anomaly}']
                anomaly_data.append(group['data'][:])
                anomaly_labels.extend([anomaly] * len(group['data']))

        anomaly_data = np.concatenate(anomaly_data, axis=0) if anomaly_data else np.array([])
        anomaly_labels = np.array(anomaly_labels)

    return train_data, train_labels, anomaly_data, anomaly_labels

In [85]:
def subsample_normal_data(train_data, train_labels, num_normal_samples):
    """
    Subsample "Normal" data from the training dataset.
    """
    normal_indices = np.where(train_labels == 'Normal')[0]
    if len(normal_indices) > num_normal_samples:
        normal_indices = np.random.choice(normal_indices, num_normal_samples, replace=False)

    other_indices = np.where(train_labels != 'Normal')[0]
    all_indices = np.concatenate([normal_indices, other_indices])
    return train_data[all_indices], train_labels[all_indices]

In [86]:
def contaminate_with_anomalies(train_data, train_labels, anomaly_data, anomaly_labels, percentage_contamination, remaining_samples):
    """
    Add anomalies to the training dataset based on percentage contamination.
    """
    contaminated_data, contaminated_labels = [], []
    for anomaly, percentage in percentage_contamination.items():
        num_samples_to_add = int(remaining_samples * percentage)
        anomaly_indices = np.random.choice(
            np.where(anomaly_labels == anomaly)[0], num_samples_to_add, replace=False
        )
        contaminated_data.append(anomaly_data[anomaly_indices])
        contaminated_labels.extend([anomaly] * num_samples_to_add)

    if contaminated_data:
        contaminated_data = np.concatenate(contaminated_data, axis=0)
        contaminated_labels = np.array(contaminated_labels)

        # Append contaminated data to training data
        train_data = np.concatenate([train_data, contaminated_data], axis=0)
        train_labels = np.concatenate([train_labels, contaminated_labels], axis=0)

    return train_data, train_labels

In [87]:
def normalize(data):
    """
    Normalize the data to the range [0, 1].
    """
    normalized_data = np.zeros_like(data)
    for i, sample in enumerate(data):
        for channel in range(sample.shape[-1]):
            min_val, max_val = np.percentile(sample[..., channel], [1, 99])
            normalized = (sample[..., channel] - min_val) / (max_val - min_val + 1e-8)
            normalized_data[i, ..., channel] = np.clip(normalized, 0, 1)
    return normalized_data

In [88]:
def shuffle_and_normalize_data(train_data, train_labels):
    """
    Shuffle and normalize the training data.
    """
    shuffle_indices = np.random.permutation(len(train_data))
    train_data = train_data[shuffle_indices]
    train_labels = train_labels[shuffle_indices]

    train_data = normalize(train_data)
    return train_data, train_labels

In [89]:
def calculate_class_percentages(train_labels):
    """
    Calculate the percentages of each class in the training dataset.
    """
    unique_labels, counts = np.unique(train_labels, return_counts=True)
    return {label: count / len(train_labels) for label, count in zip(unique_labels, counts)}


In [90]:
def preprocess_lofar_data(
    h5_file_path,
    anomalies,
    percentage_contamination,
    seed=42,
    normal_percentage=None,
    total_samples=None
):
    """
    Preprocess LOFAR dataset for training and testing.
    """
    np.random.seed(seed)

    # Load data
    train_data, train_labels, anomaly_data, anomaly_labels = load_h5_data(h5_file_path, anomalies)

    # Calculate number of "Normal" samples dynamically based on percentage
    num_normal_samples = int(total_samples * normal_percentage) if normal_percentage and total_samples else len(
        np.where(train_labels == 'Normal')[0])

    # Subsample "Normal" data
    train_data, train_labels = subsample_normal_data(train_data, train_labels, num_normal_samples)

    # Add anomalies if total_samples is specified
    if total_samples:
        remaining_samples = total_samples - num_normal_samples
        train_data, train_labels = contaminate_with_anomalies(
            train_data, train_labels, anomaly_data, anomaly_labels, percentage_contamination, remaining_samples
        )

    # Shuffle and normalize data
    train_data, train_labels = shuffle_and_normalize_data(train_data, train_labels)

    # Calculate class percentages
    percentages = calculate_class_percentages(train_labels)
    print("Class Percentages in Training Data:", percentages)

    # Encode labels
    label_mapping = {label: idx for idx, label in enumerate(anomalies)}
    train_labels_encoded = np.array([label_mapping[label] for label in train_labels])

    return train_data, train_labels_encoded, percentages


In [109]:
import yaml

In [113]:
with open('./data/sampling_params.yaml', 'r') as file:
    sampling_config = yaml.safe_load(file)

In [114]:
labels = sampling_config['labels']

In [115]:
#labels = [
#    'Normal',  # Represent non-anomalous data
#    'oscillating_tile',
#    'first_order_high_noise',
#    'first_order_data_loss',
#    'third_order_data_loss',
#    'lightning',
#    'rfi_ionosphere_reflect',
#    'galactic_plane',
#    'source_in_sidelobes',
#    'solar_storm'
#]
percentage_contamination = {
    'oscillating_tile': 0.1,
    'first_order_high_noise': 0.1,
    'first_order_data_loss': 0.1,
    'third_order_data_loss': 0.1,
    'rfi_ionosphere_reflect': 0.1,
    'lightning': 0.1,
    'galactic_plane': 0.1,
    'source_in_sidelobes': 0.1,
    'solar_storm': 0.1
}

try:
    train_data, train_labels_encoded, percentages = preprocess_lofar_data(
        file_path,
        labels,
        percentage_contamination,
        seed=42,
        normal_percentage=.1,
        total_samples=1000
    )
except ValueError as e:
    print(e)


'a' cannot be empty unless no samples are taken


In [93]:
print("Train Data Shape:", train_data.shape)
print("Train Labels Shape:", train_labels_encoded.shape)


Train Data Shape: (546, 256, 256, 4)
Train Labels Shape: (546,)


In [94]:
train_data_flat = train_data.reshape(train_data.shape[0], -1) 

In [95]:
# Step 1: Split into Training+Validation and Test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(train_data_flat, train_labels_encoded, test_size=0.2, random_state=42)

# Step 2: Split Training+Validation into Training and Validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


In [96]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)



In [97]:
param_grid = {
    'n_estimators': [5, 10, 15],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy')


In [98]:
grid_search.fit(X_train, y_train)

In [99]:
# Best Parameters and Score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Test the best model
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print("Test Accuracy:", test_score)

Best Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 15}
Best Score: 0.6483180428134556
Test Accuracy: 0.6636363636363637


In [100]:
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nEvaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

# Detailed Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Evaluation Metrics:
Accuracy: 0.66
Precision: 0.68
Recall: 0.66
F1-Score: 0.67

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.54      0.52        13
           1       0.69      0.64      0.67        14
           2       0.43      0.75      0.55         4
           3       1.00      0.89      0.94         9
           4       0.62      0.57      0.59        14
           5       0.67      0.67      0.67         9
           6       1.00      1.00      1.00         9
           7       0.62      0.44      0.52        18
           8       0.36      0.50      0.42        10
           9       1.00      1.00      1.00        10

    accuracy                           0.66       110
   macro avg       0.69      0.70      0.69       110
weighted avg       0.68      0.66      0.67       110



In [101]:
import tensorflow as tf
from tensorflow.keras import layers, models

def build_cnn_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')  # Softmax for multiclass classification
    ])
    return model


In [102]:
# Split train data into training and validation
val_data, test_data, val_labels, test_labels = train_test_split(
    train_data, train_labels_encoded, test_size=0.2, random_state=42
)

# Ensure labels are one-hot encoded
num_classes = len(labels)  # Number of classes
train_labels_onehot = to_categorical(train_labels_encoded, num_classes=num_classes)
val_labels_onehot = to_categorical(val_labels, num_classes=num_classes)
test_labels_onehot = to_categorical(test_labels, num_classes=num_classes)


In [103]:
input_shape = (256, 256, 4)  # Height, Width, Channels
num_classes = len(labels)  # Number of classes

model = build_cnn_model(input_shape, num_classes)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # Use categorical crossentropy for multiclass classification
              metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [104]:
early_stopping = EarlyStopping(
    monitor='val_loss',       # Metric to monitor (e.g., 'val_loss', 'val_accuracy')
    patience=3,               # Number of epochs with no improvement after which training will be stopped
    verbose=1,                # Verbosity level
    restore_best_weights=True # Restore the model weights from the epoch with the best monitored value
)

In [105]:
history = model.fit(
    train_data, train_labels_onehot,
    validation_data=(val_data, val_labels_onehot),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)


Epoch 1/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 1s/step - accuracy: 0.2681 - loss: 2.5056 - val_accuracy: 0.6078 - val_loss: 1.3326
Epoch 2/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 1s/step - accuracy: 0.5597 - loss: 1.3726 - val_accuracy: 0.7317 - val_loss: 0.8567
Epoch 3/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 1s/step - accuracy: 0.6710 - loss: 1.0128 - val_accuracy: 0.8555 - val_loss: 0.4627
Epoch 4/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 1s/step - accuracy: 0.7678 - loss: 0.6863 - val_accuracy: 0.9106 - val_loss: 0.3073
Epoch 5/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 1s/step - accuracy: 0.8489 - loss: 0.4901 - val_accuracy: 0.9335 - val_loss: 0.2247
Epoch 6/10
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 1s/step - accuracy: 0.8402 - loss: 0.5011 - val_accuracy: 0.9495 - val_loss: 0.1781
Epoch 7/10
[1m18/18[0m [32m━━━━━━━━━━

In [106]:
test_loss, test_accuracy = model.evaluate(test_data, test_labels_onehot, batch_size=32)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 177ms/step - accuracy: 0.9943 - loss: 0.0342
Test Loss: 0.0461, Test Accuracy: 0.9909


In [117]:
test_labels.shape

(110,)

In [118]:

# Get predictions
y_pred_probs = model.predict(test_data)  # Probabilities
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class labels
y_true = np.argmax(test_labels_onehot, axis=1)  # Ground truth labels (if one-hot encoded)

# Calculate Precision, Recall, and F1-Score
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 183ms/step
Precision: 0.9917
Recall: 0.9909
F1-Score: 0.9909
