# In [None]:
import numpy as np
import os
import pickle
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input

class NoveltyDetectionSystem:
    """Detect novel samples in image batches using pre-trained PCA + KMeans.

    A batch of feature vectors is projected with PCA, the distance from each
    sample to its nearest KMeans centroid is computed, and samples farther
    away than a threshold are treated as novel. Novel samples are grouped
    with DBSCAN to decide how many clusters to add, after which PCA and
    KMeans are refit.
    """

    # Target dimensionality when PCA is refit; clamped to the data available.
    PCA_COMPONENTS = 50
    # Number of units in VGG16's ``fc1`` layer (shape of one feature vector).
    FC1_DIM = 4096

    def __init__(self, pca_path, kmeans_path, encoder_path):
        """Load the pickled PCA, KMeans and label-encoder objects.

        Args:
            pca_path: Path to the pickled, fitted PCA object.
            kmeans_path: Path to the pickled, fitted KMeans object.
            encoder_path: Path to the pickled label encoder.
        """
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # these paths at model files you produced yourself or fully trust.
        with open(pca_path, 'rb') as f:
            self.pca = pickle.load(f)
        with open(kmeans_path, 'rb') as f:
            self.kmeans = pickle.load(f)
        with open(encoder_path, 'rb') as f:
            self.label_encoder = pickle.load(f)

        self.n_clusters = self.kmeans.n_clusters
        # Built lazily on first use so constructing the system stays cheap.
        self._feature_extractor = None

    def _get_feature_extractor(self):
        """Return the cached VGG16 ``fc1`` feature extractor, building it once."""
        extractor = getattr(self, "_feature_extractor", None)
        if extractor is None:
            # Imported here so the class can be used for clustering only
            # without paying for the model imports up front.
            from tensorflow.keras.applications.vgg16 import VGG16
            from tensorflow.keras.models import Model

            base = VGG16(weights='imagenet', include_top=True)
            extractor = Model(inputs=base.input,
                              outputs=base.get_layer('fc1').output)
            self._feature_extractor = extractor
        return extractor

    def extract_features(self, image_paths, batch_size=32):
        """Extract VGG16 ``fc1`` features for the given image files.

        Args:
            image_paths: Iterable of image file paths.
            batch_size: Number of images loaded and sent through the network
                at a time; bounds peak memory use.

        Returns:
            Array of shape ``(n_images, 4096)``; ``(0, 4096)`` when
            ``image_paths`` is empty.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        image_paths = list(image_paths)
        if not image_paths:
            # Avoid loading the network just to return nothing.
            return np.empty((0, self.FC1_DIM), dtype=np.float32)

        extractor = self._get_feature_extractor()
        chunks = []
        for start in range(0, len(image_paths), batch_size):
            images = np.stack([
                preprocess_input(
                    img_to_array(load_img(path, target_size=(224, 224)))
                )
                for path in image_paths[start:start + batch_size]
            ])
            chunks.append(extractor.predict(images))

        return np.concatenate(chunks).reshape(len(image_paths), -1)

    def process_batch(self, batch_features, threshold=None, quantile=0.95):
        """Flag novel samples in a batch and update the models if any exist.

        Args:
            batch_features: 2-D array of raw (pre-PCA) feature vectors.
            threshold: Absolute nearest-centroid distance above which a
                sample is novel. When ``None`` the threshold is the
                ``quantile`` of this batch's own distances.
            quantile: Quantile used when ``threshold`` is ``None``.

        Returns:
            Boolean array marking which samples were flagged as novel.
        """
        batch_features = np.asarray(batch_features)
        if batch_features.size == 0:
            print("No novel samples detected.")
            return np.zeros(0, dtype=bool)

        # Step 1: Transform features using PCA
        transformed_features = self.pca.transform(batch_features)

        # Step 2: Distance from every sample to its nearest centroid
        min_distances = self.kmeans.transform(transformed_features).min(axis=1)

        # Step 3: Detect novelty using a threshold
        if threshold is None:
            # A batch-relative quantile flags the farthest samples of *every*
            # batch, even one with nothing new in it. Pass an absolute
            # ``threshold`` calibrated on known data to avoid that.
            threshold = np.quantile(min_distances, quantile)
        novelty_mask = min_distances > threshold
        novel_features = batch_features[novelty_mask]

        if len(novel_features) > 0:
            print(f"Detected {len(novel_features)} novel samples.")

            # Step 4: Handle novel samples
            self._create_new_cluster(novel_features)
            self._update_system(batch_features)
        else:
            print("No novel samples detected.")

        return novelty_mask

    def _create_new_cluster(self, novel_features, eps=0.5, min_samples=2):
        """Group novel samples with DBSCAN and grow the cluster count.

        Args:
            novel_features: Raw (pre-PCA) feature vectors flagged as novel.
            eps: DBSCAN neighbourhood radius, measured in PCA space.
            min_samples: DBSCAN minimum neighbourhood size.

        Returns:
            Number of new clusters found (DBSCAN noise is not counted).
        """
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        transformed = self.pca.transform(novel_features)
        labels = dbscan.fit_predict(transformed)

        # Label -1 marks noise points, which do not form a cluster.
        n_new_clusters = int(np.unique(labels[labels >= 0]).size)
        self.n_clusters += n_new_clusters
        print(f"Created {n_new_clusters} new cluster(s). Total clusters: {self.n_clusters}")
        return n_new_clusters

    def _update_system(self, batch_features):
        """Refit PCA and KMeans on the old centroids plus the new batch.

        The original training data is not kept, so the existing clusters are
        represented only by their centroids mapped back to feature space.

        Args:
            batch_features: 2-D array of raw (pre-PCA) feature vectors.
        """
        # Combine existing and batch features
        combined_features = np.vstack([
            self.pca.inverse_transform(self.kmeans.cluster_centers_),
            batch_features,
        ])
        n_samples, n_features = combined_features.shape

        # Retrain PCA. n_components may not exceed min(n_samples, n_features),
        # which a small batch plus a few centroids easily falls below.
        n_components = min(self.PCA_COMPONENTS, n_samples, n_features)
        self.pca = PCA(n_components=n_components)
        combined_transformed = self.pca.fit_transform(combined_features)

        # Retrain KMeans; it cannot fit more clusters than there are samples.
        self.n_clusters = min(self.n_clusters, n_samples)
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10)
        self.kmeans.fit(combined_transformed)

    def visualize_clusters(self):
        """Plot the cluster centroids in two dimensions using t-SNE.

        Raises:
            ValueError: If fewer than two centroids are available.
        """
        all_features = self.pca.inverse_transform(self.kmeans.cluster_centers_)
        n_centers = len(all_features)
        if n_centers < 2:
            raise ValueError("At least two cluster centers are required for t-SNE.")

        # t-SNE requires perplexity < n_samples, and only the handful of
        # centroids is embedded here, so the default of 30 is usually too big.
        perplexity = min(30.0, n_centers - 1)
        tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
        reduced = tsne.fit_transform(all_features)

        # One colour per plotted point; sized from the points themselves.
        plt.scatter(reduced[:, 0], reduced[:, 1], c=np.arange(n_centers))
        plt.title(f"Cluster Visualization ({self.n_clusters} Clusters)")
        plt.xlabel("t-SNE Dim 1")
        plt.ylabel("t-SNE Dim 2")
        plt.colorbar(label="Cluster ID")
        plt.show()

# Usage: load the pre-trained models, extract features for one batch of
# images, run novelty detection on it, and plot the resulting clusters.

# Paths to pre-trained models (relative to the current working directory)
pca_path = "../models/pca.pickle"
kmeans_path = "../models/kmeans.pickle"
encoder_path = "../models/label_encoder.pickle"

# Initialize the system
system = NoveltyDetectionSystem(pca_path, kmeans_path, encoder_path)

# Load the batch of images; sorted so the feature order is reproducible
batch_dir = Path("NEU_mixed_batch_40")
batch_images = sorted(batch_dir.glob("*.bmp"))  # Adjust extension if needed
if not batch_images:
    # Fail here with a clear message instead of loading the network and then
    # hitting an unrelated shape error inside the PCA transform.
    raise FileNotFoundError(f"No .bmp images found in {batch_dir.resolve()}")
batch_features = system.extract_features(batch_images)

# Process the batch
system.process_batch(batch_features)

# Visualize the clusters
system.visualize_clusters()

