## Using PCA as the baseline model

In [None]:
# Libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
import pickle

In [5]:
# Load data
# Read the normal and abnormal sample arrays from the processed-data folder.
normal_npy_path = "../data/processed/X_normal.npy"
abnormal_npy_path = "../data/processed/X_abnormal.npy"

X_normal = np.load(normal_npy_path)
X_abnormal = np.load(abnormal_npy_path)

In [6]:
# Flatten
# Turn each spectrogram into a single feature vector: the first axis is kept
# as-is (assumed to index samples) and all remaining axes are collapsed into one.
X_normal_flat = np.reshape(X_normal, (X_normal.shape[0], -1))
X_abnormal_flat = np.reshape(X_abnormal, (X_abnormal.shape[0], -1))

In [None]:
# Split and scale the training data (the code for this step is in the next cell)


In [None]:

# PCA reconstruction-error baseline.
#
# Pipeline: split the normal samples, standardise, fit PCA on the training
# split, score every test sample by its mean squared reconstruction error,
# pickle the scores, and print the ROC AUC.

# --- Settings ----------------------------------------------------------------
# Tunables live here as named constants instead of inline literals so the
# experiment can be re-run with different settings by editing one place.
TEST_SIZE = 0.2                 # fraction of normal samples held out for testing
RANDOM_STATE = 42               # seed for the train/test shuffle (reproducible split)
PCA_EXPLAINED_VARIANCE = 0.95   # a float in (0, 1) makes PCA keep just enough
                                # components to explain this share of the variance
RESULTS_PATH = "../data/processed/results_pca.pkl"

# --- 1. Split & scale --------------------------------------------------------
# Only normal samples are used for training; part of them is held out so the
# test set contains normal examples as well.
X_train, X_test_healthy = train_test_split(
    X_normal_flat, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# The scaler is fitted on the training split only and then re-used for both
# test groups, so no test-set statistics enter the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_healthy_scaled = scaler.transform(X_test_healthy)
X_test_abnormal_scaled = scaler.transform(X_abnormal_flat)

# --- 2. Build the combined test set -----------------------------------------
# Held-out normal rows come first, abnormal rows second; the labels follow the
# same order (0 = normal, 1 = abnormal).
X_test_all = np.concatenate([X_test_healthy_scaled, X_test_abnormal_scaled])
y_test = np.concatenate([np.zeros(len(X_test_healthy)), np.ones(len(X_abnormal_flat))])

# --- 3. Train the PCA baseline ----------------------------------------------
print("Training PCA Baseline")
pca = PCA(n_components=PCA_EXPLAINED_VARIANCE)
pca.fit(X_train_scaled)

# --- 4. Anomaly score = reconstruction error --------------------------------
# Project each test sample onto the retained components and back again; the
# per-sample mean squared difference is used as the anomaly score.
X_recon = pca.inverse_transform(pca.transform(X_test_all))
reconstruction_error = np.mean(np.square(X_test_all - X_recon), axis=1)

# --- 5. Save results for the comparison notebook ----------------------------
results = {
    "y_true": y_test,
    "y_score": reconstruction_error,
    "model_name": "PCA Baseline"
}
with open(RESULTS_PATH, "wb") as f:
    pickle.dump(results, f)

# --- 6. Report ---------------------------------------------------------------
# roc_curve returns (fpr, tpr, thresholds); only the first two are needed to
# integrate the area under the curve.
fpr, tpr = roc_curve(y_test, reconstruction_error)[:2]
print(f"Baseline AUC: {auc(fpr, tpr):.4f}")

Loading pre-processed data...


Training PCA Baseline
Baseline AUC: 0.6996
