# Model Training and Comparison for News Classification
This notebook handles:
- Loading preprocessed data
- Implementing multiple unsupervised learning algorithms
- Model training and parameter tuning
- Performance comparison and evaluation
- Results visualization and interpretation

## 1. Import Required Libraries

In [7]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import pickle
from collections import Counter, defaultdict

# PyTorch for deep learning and tensor operations
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

# Machine Learning algorithms (keeping some for comparison)
from sklearn.cluster import DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA, TruncatedSVD
from sklearn.manifold import TSNE

# Evaluation metrics
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score,
    adjusted_rand_score, normalized_mutual_info_score, homogeneity_score
)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Utilities
import time
import warnings
# Silence library warnings so cell output stays readable.
# NOTE(review): this hides *every* warning, including convergence and deprecation notices.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('default')
sns.set_palette("husl")

# Fix the random seeds: k-means initialisation, network weights, dropout and the
# reparameterisation noise all draw from these generators, so without a seed the
# reported scores change on every Restart & Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Libraries imported successfully!")

Using device: cuda
Libraries imported successfully!


## 2. Load Preprocessed Data

In [5]:
# Read back every artifact written by the preprocessing notebook.
print("Loading preprocessed data...")

try:
    # Article table
    news_df = pd.read_csv('processed_news_data.csv')
    print(f"✓ Loaded {len(news_df)} processed articles")

    # TF-IDF bundle (one entry per vectorizer configuration).
    # NOTE(review): pickle.load can execute arbitrary code - only open files you produced yourself.
    with open('tfidf_features.pkl', 'rb') as f:
        tfidf_data = pickle.load(f)

    tfidf_matrices, tfidf_vectorizers, feature_names, tfidf_configs = (
        tfidf_data['matrices'],
        tfidf_data['vectorizers'],
        tfidf_data['feature_names'],
        tfidf_data['configurations'],
    )
    print(f"✓ Loaded TF-IDF features with {len(tfidf_matrices)} configurations")

    # Summary object saved alongside the features
    with open('preprocessing_summary.pkl', 'rb') as f:
        preprocessing_summary = pickle.load(f)
    print("✓ Loaded preprocessing summary")

except FileNotFoundError as e:
    # A missing artifact means the upstream notebook has not been run; report and re-raise.
    print(f"❌ Error loading preprocessed data: {e}")
    print("Please run the dataset_preprocessing.ipynb notebook first!")
    raise

# Short overview of what was loaded
print("\nDataset Info:")
print(f"• Articles: {len(news_df)}")
print(f"• TF-IDF configurations: {list(tfidf_matrices.keys())}")
if 'label_text' in news_df.columns:
    # Ground-truth categories are optional; report them only when the column exists
    print(f"• Categories: {news_df['label_text'].nunique()}")

Loading preprocessed data...
✓ Loaded 1225 processed articles
✓ Loaded TF-IDF features with 3 configurations
✓ Loaded preprocessing summary

Dataset Info:
• Articles: 1225
• TF-IDF configurations: ['basic', 'bigrams', 'trigrams']
• Categories: 5


## 4. PyTorch-based Model Trainer Class

## 3. PyTorch K-Means Implementation from Scratch

In [6]:
class PyTorchKMeans(nn.Module):
    """
    K-Means clustering (Lloyd's algorithm) implemented with PyTorch tensor ops.

    Each round assigns every sample to its nearest centroid and then recomputes
    each centroid as the mean of its assigned samples. No gradients are used;
    ``nn.Module`` only serves as the base class.

    Args:
        n_clusters: Number of centroids to fit.
        max_iter: Maximum number of assignment/update rounds per ``fit`` call.
        tol: Convergence threshold on the absolute change in inertia between rounds.
        init: ``'k-means++'`` for distance-weighted seeding; any other value picks
            distinct random samples as the initial centroids.
        device: Device the data is moved to in ``fit``/``predict``. ``None``
            (default) selects CUDA when available, otherwise CPU.

    Attributes set by ``fit``:
        centroids: Tensor of shape (n_clusters, n_features).
        labels: Tensor with one cluster index per sample.
        inertia: 0-dim tensor, sum of squared distances to the assigned centroids.
        n_iter: Number of rounds actually executed.
    """

    def __init__(self, n_clusters, max_iter=300, tol=1e-4, init='k-means++', device=None):
        super(PyTorchKMeans, self).__init__()
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.init = init
        # Resolve the device here instead of reading a notebook-level global
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.centroids = None
        self.labels = None
        self.inertia = None
        self.n_iter = 0

    def _to_tensor(self, X):
        """Return X as a floating-point tensor on ``self.device``."""
        if not isinstance(X, torch.Tensor):
            return torch.tensor(X, dtype=torch.float32, device=self.device)
        X = X.to(self.device)
        # Means and distances are undefined for integer/bool tensors
        return X if X.is_floating_point() else X.float()

    def _init_centroids(self, X):
        """
        Pick the starting centroids.

        With ``init == 'k-means++'`` the first centroid is a uniformly random
        sample and each further one is drawn with probability proportional to
        its squared distance from the nearest centroid chosen so far. Any other
        ``init`` value takes the first ``n_clusters`` entries of a random
        permutation of the samples.
        """
        n_samples, n_features = X.shape

        if self.init == 'k-means++':
            centroids = torch.zeros(self.n_clusters, n_features, dtype=X.dtype, device=X.device)

            # First centroid: uniform draw
            first = torch.randint(0, n_samples, (1,), device=X.device)
            centroids[0] = X[first[0]]

            # Running squared distance to the closest centroid chosen so far
            min_sq_dist = torch.sum((X - centroids[0]) ** 2, dim=1)

            for i in range(1, self.n_clusters):
                if torch.sum(min_sq_dist) > 0:
                    # multinomial normalises the weights itself and always returns a valid
                    # index (the manual cumsum/searchsorted draw could run past the end)
                    chosen_idx = torch.multinomial(min_sq_dist, 1)
                else:
                    # Every sample coincides with a chosen centroid: weights are all
                    # zero, so fall back to a uniform draw instead of dividing by zero
                    chosen_idx = torch.randint(0, n_samples, (1,), device=X.device)
                centroids[i] = X[chosen_idx[0]]
                min_sq_dist = torch.minimum(
                    min_sq_dist, torch.sum((X - centroids[i]) ** 2, dim=1)
                )

        else:  # random initialization
            indices = torch.randperm(n_samples, device=X.device)[:self.n_clusters]
            centroids = X[indices].clone()

        return centroids

    def _assign_clusters(self, X, centroids):
        """Return, for every row of X, the index of the closest centroid (Euclidean)."""
        distances = torch.cdist(X, centroids)  # Shape: (n_samples, n_clusters)
        return torch.argmin(distances, dim=1)

    def _update_centroids(self, X, labels):
        """
        Recompute each centroid as the mean of the samples labelled with it.

        A cluster that received no samples is re-seeded with a random sample.
        """
        centroids = torch.zeros(self.n_clusters, X.shape[1], dtype=X.dtype, device=X.device)

        for k in range(self.n_clusters):
            mask = labels == k
            if mask.any():
                centroids[k] = X[mask].mean(dim=0)
            else:
                reseed = torch.randint(0, X.shape[0], (1,), device=X.device)
                centroids[k] = X[reseed[0]]

        return centroids

    def _calculate_inertia(self, X, labels, centroids):
        """Return the sum of squared distances from each sample to its assigned centroid."""
        return torch.sum((X - centroids[labels]) ** 2)

    def fit(self, X):
        """
        Fit K-means clustering to data X.

        Args:
            X: Tensor or array-like of shape (n_samples, n_features).

        Returns:
            self, with ``centroids``, ``labels``, ``inertia`` and ``n_iter`` set.

        Raises:
            ValueError: If X is not a non-empty 2-D array or ``max_iter`` < 1.
        """
        X = self._to_tensor(X)
        if X.dim() != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")
        if self.max_iter < 1:
            raise ValueError("max_iter must be at least 1")

        centroids = self._init_centroids(X)
        prev_inertia = float('inf')
        self.n_iter = 0

        for iteration in range(self.max_iter):
            labels = self._assign_clusters(X, centroids)

            # Adopt the updated centroids before testing convergence so the stored
            # centroids, labels and inertia always describe the same round
            centroids = self._update_centroids(X, labels)
            inertia = self._calculate_inertia(X, labels, centroids)
            self.n_iter = iteration + 1

            if abs(prev_inertia - inertia) < self.tol:
                print(f"Converged after {iteration + 1} iterations")
                break

            prev_inertia = inertia

        self.centroids = centroids
        self.labels = labels
        self.inertia = inertia

        return self

    def predict(self, X):
        """
        Predict cluster labels for new data.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if self.centroids is None:
            raise ValueError("Model must be fitted before prediction")

        X = self._to_tensor(X)
        # cdist needs both operands on one device with one dtype
        X = X.to(device=self.centroids.device, dtype=self.centroids.dtype)
        return self._assign_clusters(X, self.centroids)

    def fit_predict(self, X):
        """Fit the model and return the cluster labels as a NumPy array."""
        self.fit(X)
        return self.labels.cpu().numpy()

    def get_centroids(self):
        """Return the centroids as a NumPy array, or None before fitting."""
        if self.centroids is None:
            return None
        return self.centroids.cpu().numpy()


class StochasticEmbeddingNetwork(nn.Module):
    """
    Non-deterministic stochastic embedding network for clustering.

    Implements z = f(x) + ε with ε ~ N(0, σ²(x)):
      * ``encoder`` computes the deterministic part f(x);
      * ``variance_net`` ends in Softplus and therefore outputs a non-negative
        variance, which is scaled by ``noise_scale``² to give σ²(x);
      * cluster centres are learnable Gaussians (mean and log-variance);
      * soft assignments are a temperature-scaled softmax over negative
        embedding-to-centre distances.

    Args:
        input_dim: Number of input features.
        n_clusters: Number of cluster centres.
        hidden_dim: Width of the first encoder layer (second is ``hidden_dim // 2``).
        embedding_dim: Dimensionality of the embedding z.
        noise_scale: Multiplier applied to the predicted noise standard deviation.
    """

    def __init__(self, input_dim, n_clusters, hidden_dim=128, embedding_dim=64, noise_scale=0.1):
        super(StochasticEmbeddingNetwork, self).__init__()
        self.n_clusters = n_clusters
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.noise_scale = noise_scale

        # Encoder network for deterministic component f(x)
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, embedding_dim),
        )

        # Variance network: Softplus keeps the predicted variance non-negative
        self.variance_net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, embedding_dim),
            nn.Softplus()
        )

        # Learnable cluster centers with uncertainty
        self.cluster_centers_mu = nn.Parameter(torch.randn(n_clusters, embedding_dim) * 0.1)
        self.cluster_centers_logvar = nn.Parameter(torch.ones(n_clusters, embedding_dim) * -2)

        # Temperature parameter for soft assignments
        self.temperature = nn.Parameter(torch.tensor(1.0))

    def reparameterize(self, mu, logvar):
        """Reparameterization trick: draw a sample from N(mu, exp(logvar))."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x, sample=True):
        """
        Forward pass.

        Args:
            x: Input batch of shape (batch, input_dim).
            sample: When True, noise is added to the embedding (and, in training
                mode, to the cluster centres). When False the embedding is the
                deterministic f(x).

        Returns:
            dict with keys ``embeddings``, ``deterministic_embeddings``,
            ``embedding_logvar``, ``probabilities``, ``distances``,
            ``cluster_centers`` and ``uncertainty``. ``embedding_logvar`` and
            ``uncertainty`` are computed from the variance network whether or
            not ``sample`` is set, so they can be inspected without sampling.
        """
        # Deterministic embedding component f(x)
        deterministic_embedding = self.encoder(x)

        # The variance head outputs σ² (not log σ²): scale it by noise_scale² and move
        # to log-space for reparameterize(); the epsilon guards log(0).
        variance = self.variance_net(x) * (self.noise_scale ** 2)
        embedding_logvar = torch.log(variance + 1e-8)

        if sample:
            # Stochastic embedding: z = f(x) + ε where ε ~ N(0, σ²)
            stochastic_embedding = self.reparameterize(deterministic_embedding, embedding_logvar)
        else:
            stochastic_embedding = deterministic_embedding

        # Sample cluster centers from their distributions (training only)
        if sample and self.training:
            cluster_centers = self.reparameterize(
                self.cluster_centers_mu,
                self.cluster_centers_logvar
            )
        else:
            cluster_centers = self.cluster_centers_mu

        # Calculate distances to cluster centers
        distances = torch.cdist(stochastic_embedding, cluster_centers)

        # The temperature is an unconstrained parameter; use its magnitude with a small
        # floor so the softmax can never be inverted or divided by zero.
        temperature = self.temperature.abs().clamp(min=1e-3)
        probabilities = F.softmax(-distances / temperature, dim=1)

        return {
            'embeddings': stochastic_embedding,
            'deterministic_embeddings': deterministic_embedding,
            'embedding_logvar': embedding_logvar,
            'probabilities': probabilities,
            'distances': distances,
            'cluster_centers': cluster_centers,
            'uncertainty': torch.exp(0.5 * embedding_logvar).mean(dim=1)  # Per-sample mean noise std
        }





# Announce the model classes defined in this cell, one message per output line.
print(
    "Neural Network Models Defined!",
    "Classes available (following assignment requirements):",
    "1. PyTorchKMeans - Our custom K-means implementation (primary model)",
    "2. StochasticEmbeddingNetwork - Non-deterministic: z = f(x) + ε where ε ~ N(0, σ²)",
    sep="\n",
)

Neural Network Models Defined!
Classes available (following assignment requirements):
1. PyTorchKMeans - Our custom K-means implementation (primary model)
2. StochasticEmbeddingNetwork - Non-deterministic: z = f(x) + ε where ε ~ N(0, σ²)


In [None]:
class PyTorchModelTrainer:
    """
    Trains, scores and compares the from-scratch PyTorch clustering models.

    For each model family the trainer sweeps a range of cluster counts, scores
    every run with the silhouette coefficient on the original feature matrix and
    keeps the best run under a fixed key (``'pytorch_kmeans'`` or
    ``'stochastic_embedding'``) in ``models``, ``predictions`` and
    ``training_times``.

    Args:
        data_matrix: Feature matrix (dense array-like, or any object exposing ``toarray()``).
        true_labels: Optional ground-truth labels used for the supervised metrics.
    """

    def __init__(self, data_matrix, true_labels=None):
        self.data_matrix = data_matrix
        self.true_labels = true_labels
        self.models = {}
        self.predictions = {}
        self.training_times = {}
        self.evaluation_metrics = {}

        # Convert data to a dense float32 tensor on the notebook-level `device`
        dense = data_matrix.toarray() if hasattr(data_matrix, 'toarray') else data_matrix
        self.torch_data = torch.tensor(dense, dtype=torch.float32, device=device)

        print(f"Data converted to PyTorch tensor: {self.torch_data.shape}")

    def _silhouette_or_none(self, predictions):
        """Return the silhouette score on ``data_matrix``, or None when fewer than two clusters were produced."""
        if len(np.unique(predictions)) < 2:
            return None
        return silhouette_score(self.data_matrix, predictions)

    def _store_model(self, name, model, predictions, training_time):
        """Record ``model`` and its outputs as the current best run for ``name``."""
        self.models[name] = model
        self.predictions[name] = predictions
        self.training_times[name] = training_time

    def train_pytorch_kmeans(self, n_clusters_range=None, init_method='k-means++'):
        """
        Sweep K for the from-scratch PyTorch K-Means and keep the best run.

        Args:
            n_clusters_range: Iterable of K values; defaults to ``range(2, 11)``.
            init_method: Passed to ``PyTorchKMeans`` as ``init``.

        Returns:
            (best_k, best_score). If no run produced at least two clusters the
            defaults ``(2, -1)`` are returned and nothing is stored.
        """
        if n_clusters_range is None:
            n_clusters_range = range(2, 11)

        print(f"Training PyTorch K-Means from scratch (init: {init_method})...")

        best_score = -1
        best_k = 2

        for k in n_clusters_range:
            print(f"  Training K-Means with K={k}...")
            start_time = time.time()

            model = PyTorchKMeans(n_clusters=k, max_iter=300, init=init_method)
            model.fit(self.torch_data)
            predictions = model.labels.cpu().numpy()

            training_time = time.time() - start_time

            sil_score = self._silhouette_or_none(predictions)
            if sil_score is None:
                print(f"    K={k}: Failed to create multiple clusters")
                continue

            if sil_score > best_score:
                best_score = sil_score
                best_k = k
                self._store_model('pytorch_kmeans', model, predictions, training_time)

            print(f"    K={k}: Silhouette Score = {sil_score:.3f}, Inertia = {float(model.inertia):.2f}, Time = {training_time:.2f}s, Iterations = {model.n_iter}")

        print(f"✓ Best PyTorch K-Means: K={best_k}, Silhouette Score = {best_score:.3f}")
        return best_k, best_score

    def train_stochastic_embedding(self, n_clusters_range=None, hidden_dim=128, embedding_dim=64, epochs=150, noise_scale=0.1):
        """
        Sweep K for the Stochastic Embedding Network and keep the best run.

        Args:
            n_clusters_range: Iterable of K values; defaults to ``range(2, 8)``.
            hidden_dim, embedding_dim, noise_scale: Forwarded to ``StochasticEmbeddingNetwork``.
            epochs: Training epochs per K.

        Returns:
            (best_k, best_score). If no run produced at least two clusters the
            defaults ``(2, -1)`` are returned and nothing is stored.
        """
        if n_clusters_range is None:
            n_clusters_range = range(2, 8)

        print(f"Training Stochastic Embedding Network (Non-Deterministic)...")

        best_score = -1
        best_k = 2
        input_dim = self.torch_data.shape[1]

        for k in n_clusters_range:
            print(f"  Training Stochastic Embedding with K={k}...")
            start_time = time.time()

            model = StochasticEmbeddingNetwork(
                input_dim=input_dim,
                n_clusters=k,
                hidden_dim=hidden_dim,
                embedding_dim=embedding_dim,
                noise_scale=noise_scale
            ).to(device)

            self._train_stochastic_model(model, epochs=epochs, model_name='stochastic_embedding')

            # Majority vote over several stochastic passes for a stable labelling
            predictions = self._predict_stochastic(model, n_samples=10)

            training_time = time.time() - start_time

            sil_score = self._silhouette_or_none(predictions)
            if sil_score is None:
                print(f"    K={k}: Failed to create multiple clusters")
                continue

            if sil_score > best_score:
                best_score = sil_score
                best_k = k
                self._store_model('stochastic_embedding', model, predictions, training_time)

            print(f"    K={k}: Silhouette Score = {sil_score:.3f}, Time = {training_time:.2f}s")

        print(f"✓ Best Stochastic Embedding: K={best_k}, Silhouette Score = {best_score:.3f}")
        return best_k, best_score

    def evaluate_all_models(self):
        """
        Compute evaluation metrics for every stored prediction set.

        Unsupervised metrics (silhouette, Calinski-Harabasz, Davies-Bouldin) need
        at least two clusters; supervised ones (ARI, NMI, homogeneity) need
        ``true_labels``. A failing metric group prints a warning and is skipped.

        Returns:
            dict mapping model name to its metrics dict (also kept in
            ``self.evaluation_metrics``).
        """
        print("\nEvaluating all models...")

        for model_name, predictions in self.predictions.items():
            metrics = {}
            n_found = len(np.unique(predictions))

            # Unsupervised metrics
            if n_found > 1:
                try:
                    metrics['silhouette_score'] = silhouette_score(self.data_matrix, predictions)
                    # These two scorers are given a dense matrix
                    data_for_metrics = self.data_matrix.toarray() if hasattr(self.data_matrix, 'toarray') else self.data_matrix
                    metrics['calinski_harabasz_score'] = calinski_harabasz_score(data_for_metrics, predictions)
                    metrics['davies_bouldin_score'] = davies_bouldin_score(data_for_metrics, predictions)
                except Exception as e:
                    print(f"    Warning: Could not calculate some metrics for {model_name}: {e}")

            # Supervised metrics (if true labels available)
            if self.true_labels is not None:
                try:
                    metrics['adjusted_rand_score'] = adjusted_rand_score(self.true_labels, predictions)
                    metrics['normalized_mutual_info_score'] = normalized_mutual_info_score(self.true_labels, predictions)
                    metrics['homogeneity_score'] = homogeneity_score(self.true_labels, predictions)
                except Exception as e:
                    print(f"    Warning: Could not calculate supervised metrics for {model_name}: {e}")

            metrics['n_clusters'] = n_found
            metrics['training_time'] = self.training_times.get(model_name, 0)

            # Add PyTorch-specific metrics
            if model_name == 'pytorch_kmeans':
                model = self.models[model_name]
                if getattr(model, 'inertia', None) is not None:
                    metrics['inertia'] = float(model.inertia)
                if hasattr(model, 'n_iter'):
                    metrics['iterations'] = model.n_iter

            self.evaluation_metrics[model_name] = metrics

        return self.evaluation_metrics

    def get_cluster_analysis(self):
        """
        Summarise the stored PyTorch K-Means run.

        Returns:
            dict with a ``'pytorch_kmeans'`` entry (cluster count, cluster sizes,
            centroid shape and mean pairwise centroid distance), or an empty
            dict when no K-Means run is stored.
        """
        analysis = {}

        if 'pytorch_kmeans' in self.models and 'pytorch_kmeans' in self.predictions:
            model = self.models['pytorch_kmeans']
            predictions = self.predictions['pytorch_kmeans']

            _, sizes = np.unique(predictions, return_counts=True)
            cluster_info = {
                'n_clusters': len(sizes),
                'cluster_sizes': sizes.tolist(),
            }

            if hasattr(model, 'get_centroids'):
                centroids = model.get_centroids()
                if centroids is not None:
                    cluster_info['centroids_shape'] = centroids.shape
                    # Mean Euclidean distance over every unordered pair of centroids
                    pair_i, pair_j = np.triu_indices(len(centroids), k=1)
                    if len(pair_i) > 0:
                        pair_dists = np.linalg.norm(centroids[pair_i] - centroids[pair_j], axis=1)
                        cluster_info['avg_centroid_distance'] = float(np.mean(pair_dists))
                    else:
                        cluster_info['avg_centroid_distance'] = 0

            analysis['pytorch_kmeans'] = cluster_info

        return analysis

    def _train_stochastic_model(self, model, epochs=150, batch_size=256, lr=0.001, model_name='stochastic'):
        """
        Optimise a Stochastic Embedding Network in place with Adam.

        Loss = within-cluster squared error on hard assignments
               - 0.1 * entropy of the batch-average assignment (rewards using all clusters)
               + 0.01 * mean predicted uncertainty.

        Args:
            model: Network whose forward returns the dict produced by ``StochasticEmbeddingNetwork``.
            epochs: Number of passes over the data.
            batch_size: Mini-batch size.
            lr: Adam learning rate.
            model_name: Unused; kept for interface compatibility.
        """
        dataset = TensorDataset(self.torch_data)
        n_rows = len(dataset)
        # BatchNorm1d cannot compute batch statistics from a single row in training
        # mode, so drop a trailing one-row batch instead of crashing on it.
        drop_last = n_rows > batch_size and n_rows % batch_size == 1
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        model.train()

        for epoch in range(epochs):
            total_loss = 0.0

            for (batch_X,) in dataloader:
                optimizer.zero_grad()

                # Forward pass with stochastic sampling
                outputs = model(batch_X, sample=True)

                # Clustering loss: pull each embedding towards its assigned centre
                cluster_assignments = torch.argmax(outputs['probabilities'], dim=1)
                cluster_loss = 0.0

                for k in range(model.n_clusters):
                    mask = cluster_assignments == k
                    if mask.sum() > 0:
                        cluster_points = outputs['embeddings'][mask]
                        centroid = outputs['cluster_centers'][k]
                        cluster_loss += torch.mean((cluster_points - centroid) ** 2)

                # Regularization: encourage diversity in cluster assignments
                avg_prob = torch.mean(outputs['probabilities'], dim=0)
                entropy_reg = -torch.sum(avg_prob * torch.log(avg_prob + 1e-8))

                # Uncertainty regularization: penalize excessive uncertainty
                uncertainty_reg = torch.mean(outputs['uncertainty'])

                loss = cluster_loss - 0.1 * entropy_reg + 0.01 * uncertainty_reg

                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            if (epoch + 1) % 30 == 0:
                print(f"    Epoch [{epoch+1}/{epochs}], Loss: {total_loss/max(len(dataloader), 1):.4f}")

    def _predict_stochastic(self, model, n_samples=10):
        """
        Label every sample by majority vote over ``n_samples`` stochastic passes.

        Ties go to the lowest cluster index.

        Raises:
            ValueError: If ``n_samples`` < 1.
        """
        if n_samples < 1:
            raise ValueError("n_samples must be at least 1")

        model.eval()
        sampled = []

        with torch.no_grad():
            for _ in range(n_samples):
                outputs = model(self.torch_data, sample=True)
                sampled.append(torch.argmax(outputs['probabilities'], dim=1).cpu().numpy())

        votes = np.stack(sampled)  # (n_samples, n_points)
        if votes.size == 0:
            return np.array([], dtype=votes.dtype)

        # Count the votes per (point, label) in one shot instead of a per-point bincount
        n_labels = int(votes.max()) + 1
        counts = (votes[:, :, None] == np.arange(n_labels)).sum(axis=0)  # (n_points, n_labels)
        return counts.argmax(axis=1)

    def get_uncertainty_analysis(self):
        """
        Summarise the per-sample uncertainty of the stored stochastic model.

        Returns:
            dict with a ``'stochastic_embedding'`` entry holding the mean, std,
            max and min of the per-sample uncertainty, or an empty dict when no
            such model is stored.
        """
        uncertainty_analysis = {}

        if 'stochastic_embedding' in self.models:
            model = self.models['stochastic_embedding']
            model.eval()

            with torch.no_grad():
                # sample=True so the variance head is evaluated: the network as defined
                # in this notebook reports a constant placeholder when sample=False,
                # which made every statistic below trivially 1.0 / 0.0.
                outputs = model(self.torch_data, sample=True)
                uncertainty = outputs['uncertainty'].cpu().numpy()

            uncertainty_analysis['stochastic_embedding'] = {
                'mean_uncertainty': float(np.mean(uncertainty)),
                'std_uncertainty': float(np.std(uncertainty)),
                'max_uncertainty': float(np.max(uncertainty)),
                'min_uncertainty': float(np.min(uncertainty))
            }

        return uncertainty_analysis


# Announce the trainer class and the capabilities it bundles, one message per output line.
print(
    "PyTorchModelTrainer class defined!",
    "Features (Assignment Compliance):",
    "- ✅ Our custom PyTorch K-means implementation",
    "- ✅ Stochastic Embedding Network with reparameterization trick",
    "- ✅ Uncertainty quantification",
    "- ✅ Multiple evaluation metrics from assignment",
    "- ✅ Comparative analysis between deterministic and stochastic approaches",
    sep="\n",
)

## 5. PyTorch Model Training from Scratch

In [None]:
# TF-IDF variant to cluster on; switch to 'basic' or 'trigrams' to try another one
config_choice = 'bigrams'

print(f"Using '{config_choice}' TF-IDF configuration for PyTorch model training")
data_matrix = tfidf_matrices[config_choice]
print(f"Data matrix shape: {data_matrix.shape}")

# Integer-encode the ground-truth categories when the dataframe carries them;
# without that column the supervised metrics are skipped downstream.
if 'label_text' not in news_df.columns:
    true_labels = None
else:
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    true_labels = label_encoder.fit_transform(news_df['label_text'])
    # One encoded id exists per fitted class, so both counts agree
    print(f"True labels available: {len(label_encoder.classes_)} categories")
    print(f"Categories: {list(label_encoder.classes_)}")

# Build the trainer around the chosen feature matrix
pytorch_trainer = PyTorchModelTrainer(data_matrix, true_labels)
print("\nPyTorchModelTrainer initialized!")
print(f"Using device: {device}")
print(f"Data tensor shape: {pytorch_trainer.torch_data.shape}")

In [None]:
# Train PyTorch K-Means from scratch
print("="*60)
print("TRAINING PYTORCH K-MEANS FROM SCRATCH")
print("="*60)

# Both initialisations search the same K values; otherwise one of them could
# pick a K the other was never allowed to try and the comparison would be biased.
kmeans_k_range = range(2, 12)

# Train standard PyTorch K-Means with k-means++ initialization
best_k_standard, best_score_standard = pytorch_trainer.train_pytorch_kmeans(
    n_clusters_range=kmeans_k_range,
    init_method='k-means++'
)

print(f"\nBest PyTorch K-Means Configuration:")
print(f"  - K: {best_k_standard}")
print(f"  - Silhouette Score: {best_score_standard:.4f}")
print(f"  - Initialization: k-means++")

# Also try random initialization for comparison (separate trainer so the
# k-means++ result is not overwritten)
print(f"\nTraining with random initialization for comparison...")
pytorch_trainer_random = PyTorchModelTrainer(data_matrix, true_labels)
best_k_random, best_score_random = pytorch_trainer_random.train_pytorch_kmeans(
    n_clusters_range=kmeans_k_range,
    init_method='random'
)

print(f"\nComparison of initialization methods:")
print(f"  K-means++: K={best_k_standard}, Score={best_score_standard:.4f}")
print(f"  Random:    K={best_k_random}, Score={best_score_random:.4f}")

# Use the better performing model
if best_score_random > best_score_standard:
    print("Using random initialization model (better performance)")
    pytorch_trainer = pytorch_trainer_random
    # Later cells report best_k_standard / best_score_standard as "our K-Means";
    # keep them in sync with the model that pytorch_trainer now actually holds.
    best_k_standard, best_score_standard = best_k_random, best_score_random
else:
    print("Using k-means++ initialization model (better performance)")

In [None]:
# Train Stochastic Embedding Network (Assignment Requirement)
print("="*70)
print("TRAINING STOCHASTIC EMBEDDING NETWORK")
print("="*70)

# Train Stochastic Embedding Network
print("\n🎲 STOCHASTIC EMBEDDING NETWORK (z = f(x) + ε)")
print("-" * 50)

best_k_stochastic, best_score_stochastic = pytorch_trainer.train_stochastic_embedding(
    n_clusters_range=range(2, 8),
    hidden_dim=256,
    embedding_dim=128,
    epochs=200,
    noise_scale=0.1
)

print(f"\nBest Stochastic Embedding Configuration:")
print(f"  - K: {best_k_stochastic}")
print(f"  - Silhouette Score: {best_score_stochastic:.4f}")
print(f"  - Architecture: Non-deterministic with input-dependent noise")
print(f"  - Stochastic formula: z = f(x) + ε where ε ~ N(0, σ²(x))")

# Compare with our PyTorch K-means
print(f"\n" + "="*50)
print("STOCHASTIC vs OUR DETERMINISTIC K-MEANS COMPARISON")
print("="*50)

print(f"🎲 Stochastic Embedding:     Score = {best_score_stochastic:.4f}")
print(f"🔧 Our PyTorch K-means:      Score = {best_score_standard:.4f}")

# Silhouette scores lie in [-1, 1] (and the trainers return -1 when no run was
# valid), so the lower score may be zero or negative. A percentage relative to
# such a baseline is undefined or has the wrong sign; report the absolute gap then.
if best_score_stochastic > best_score_standard:
    winner_label = "✅ Stochastic method"
else:
    winner_label = "🔧 Our deterministic K-means"

score_gap = abs(best_score_stochastic - best_score_standard)
baseline_score = min(best_score_stochastic, best_score_standard)

if baseline_score > 0:
    improvement = (score_gap / baseline_score) * 100
    print(f"\n{winner_label} performs {improvement:.1f}% better!")
else:
    improvement = float('nan')
    print(f"\n{winner_label} scores {score_gap:.4f} higher (silhouette); "
          f"a relative gain is undefined for a non-positive baseline.")

In [None]:
# Uncertainty Analysis for Non-Deterministic Models
print("="*70)
print("UNCERTAINTY QUANTIFICATION ANALYSIS (Assignment Requirement)")
print("="*70)

# Per-model summary statistics of the predicted uncertainty
uncertainty_analysis = pytorch_trainer.get_uncertainty_analysis()

if uncertainty_analysis:
    print("\n🎲 UNCERTAINTY METRICS:")

    for model_name, metrics in uncertainty_analysis.items():
        print(f"\n{model_name.replace('_', ' ').title()}:")
        print(f"  Mean Uncertainty: {metrics['mean_uncertainty']:.4f}")
        print(f"  Std Uncertainty:  {metrics['std_uncertainty']:.4f}")
        print(f"  Max Uncertainty:  {metrics['max_uncertainty']:.4f}")
        print(f"  Min Uncertainty:  {metrics['min_uncertainty']:.4f}")

    # Rank the models by mean uncertainty once there is more than one to compare
    if len(uncertainty_analysis) >= 2:
        models = list(uncertainty_analysis.keys())
        mean_uncertainties = [uncertainty_analysis[name]['mean_uncertainty'] for name in models]

        print(f"\nUncertainty Comparison:")
        for model, mean_value in zip(models, mean_uncertainties):
            print(f"  {model}: {mean_value:.4f}")

        most_uncertain = models[np.argmax(mean_uncertainties)]
        least_uncertain = models[np.argmin(mean_uncertainties)]
        print(f"\n  Most uncertain: {most_uncertain}")
        print(f"  Least uncertain: {least_uncertain}")

# Stability Analysis: Multiple Runs
print(f"\n🔄 STABILITY ANALYSIS:")
print("Testing model consistency across multiple runs...")

stability_results = {}

# Re-label the data several times with the stored stochastic model and measure
# how much the labelings agree with each other
if 'stochastic_embedding' in pytorch_trainer.models:
    model = pytorch_trainer.models['stochastic_embedding']

    # Five independent ensemble predictions, five stochastic passes each
    predictions_list = [
        pytorch_trainer._predict_stochastic(model, n_samples=5)
        for _ in range(5)
    ]

    # Adjusted Rand index for every unordered pair of runs
    ari_scores = [
        adjusted_rand_score(predictions_list[first], predictions_list[second])
        for first in range(len(predictions_list))
        for second in range(first + 1, len(predictions_list))
    ]

    stability_results['stochastic_embedding'] = {
        'mean_ari': np.mean(ari_scores),
        'std_ari': np.std(ari_scores),
        'min_ari': np.min(ari_scores),
        'max_ari': np.max(ari_scores)
    }
    run_stats = stability_results['stochastic_embedding']

    print(f"\nStochastic Embedding Stability:")
    print(f"  Mean ARI between runs: {run_stats['mean_ari']:.4f}")
    print(f"  Std ARI: {run_stats['std_ari']:.4f}")
    print(f"  Stability range: [{run_stats['min_ari']:.4f}, {run_stats['max_ari']:.4f}]")

# Detailed Model Analysis
print("="*60)
print("DETAILED MODEL ARCHITECTURE ANALYSIS")
print("="*60)

# Cluster-level summary of the stored K-Means run
cluster_analysis = pytorch_trainer.get_cluster_analysis()

for model_name, analysis in cluster_analysis.items():
    print(f"\n{model_name.upper().replace('_', ' ')} ANALYSIS:")
    print(f"  Number of clusters: {analysis['n_clusters']}")
    print(f"  Cluster sizes: {analysis['cluster_sizes']}")

    if 'avg_centroid_distance' in analysis:
        print(f"  Average centroid distance: {analysis['avg_centroid_distance']:.4f}")

    if 'centroids_shape' in analysis:
        print(f"  Centroids shape: {analysis['centroids_shape']}")

# Model Agreement Analysis
print(f"\n📊 MODEL AGREEMENT ANALYSIS:")

# Pairwise ARI between the label sets of all stored models (no pairs, no output)
model_names = list(pytorch_trainer.predictions.keys())
for position, model1 in enumerate(model_names):
    for model2 in model_names[position + 1:]:
        preds1 = pytorch_trainer.predictions[model1]
        preds2 = pytorch_trainer.predictions[model2]

        agreement = adjusted_rand_score(preds1, preds2)
        print(f"  {model1} ↔ {model2}: ARI = {agreement:.4f}")

# Stochastic vs Our K-means Comparison
stored_predictions = pytorch_trainer.predictions
if 'stochastic_embedding' in stored_predictions and 'pytorch_kmeans' in stored_predictions:
    print(f"\n🎲 STOCHASTIC vs OUR K-MEANS:")
    preds_stochastic = stored_predictions['stochastic_embedding']
    preds_kmeans = stored_predictions['pytorch_kmeans']
    agreement = adjusted_rand_score(preds_stochastic, preds_kmeans)
    print(f"  Stochastic Embedding vs Our PyTorch K-means: ARI = {agreement:.4f}")

## 6. PyTorch Model Evaluation and Comparison

In [None]:
# Evaluate all PyTorch models
# Builds `eval_df` (one row per model, one column per metric) from the
# trainer's evaluation results and prints a textual performance summary.
evaluation_results = pytorch_trainer.evaluate_all_models()

# Create evaluation DataFrame (transpose so that models become the rows)
eval_df = pd.DataFrame(evaluation_results).T
eval_df = eval_df.round(4)

print("="*90)
print("NON-DETERMINISTIC UNSUPERVISED NEURAL NETWORK EVALUATION")
print("(Following Assignment Requirements - Due Sept 14, 2025)")
print("="*90)
display(eval_df)

# Print detailed summary
print("\n" + "="*60)
print("PERFORMANCE SUMMARY")
print("="*60)

if 'silhouette_score' in eval_df.columns:
    best_silhouette = eval_df['silhouette_score'].idxmax()
    print(f"🏆 Best Silhouette Score: {best_silhouette.upper()} ({eval_df.loc[best_silhouette, 'silhouette_score']:.4f})")

# ARI is only reported when ground-truth labels are available
if true_labels is not None and 'adjusted_rand_score' in eval_df.columns:
    best_ari = eval_df['adjusted_rand_score'].idxmax()
    print(f"🎯 Best Adjusted Rand Index: {best_ari.upper()} ({eval_df.loc[best_ari, 'adjusted_rand_score']:.4f})")

if 'training_time' in eval_df.columns:
    fastest = eval_df['training_time'].idxmin()
    print(f"⚡ Fastest Training: {fastest.upper()} ({eval_df.loc[fastest, 'training_time']:.2f}s)")

# Model Performance Summary
available_models = ['pytorch_kmeans', 'stochastic_embedding']
available_models = [m for m in available_models if m in eval_df.index]

if available_models:
    print(f"\nModel Performance Summary:")
    for model in available_models:
        sil_score = eval_df.loc[model, 'silhouette_score'] if 'silhouette_score' in eval_df.columns else 'N/A'
        n_clusters = eval_df.loc[model, 'n_clusters'] if 'n_clusters' in eval_df.columns else 'N/A'
        # Deliberately NOT named `time`: that would rebind the notebook-global
        # `time` module imported in the first cell and break any cell that
        # calls it afterwards.
        train_time = eval_df.loc[model, 'training_time'] if 'training_time' in eval_df.columns else 'N/A'

        print(f"  {model.replace('_', ' ').title()}:")
        print(f"    - Clusters: {n_clusters}")
        print(f"    - Silhouette: {sil_score}")
        print(f"    - Time: {train_time}s")

        if model == 'pytorch_kmeans':
            iterations = eval_df.loc[model, 'iterations'] if 'iterations' in eval_df.columns else 'N/A'
            if iterations != 'N/A':
                print(f"    - Iterations: {iterations}")

# Compare our two models directly
if 'pytorch_kmeans' in eval_df.index and 'stochastic_embedding' in eval_df.index:
    print(f"\nDirect Comparison: Our K-means vs Stochastic Embedding")

    # Fall back to 0 when the silhouette column is missing altogether
    kmeans_sil = eval_df.loc['pytorch_kmeans', 'silhouette_score'] if 'silhouette_score' in eval_df.columns else 0
    stoch_sil = eval_df.loc['stochastic_embedding', 'silhouette_score'] if 'silhouette_score' in eval_df.columns else 0

    print(f"  Silhouette Score:")
    print(f"    - Our PyTorch K-means: {kmeans_sil:.4f}")
    print(f"    - Stochastic Embedding: {stoch_sil:.4f}")

    # Declare a winner only on a strict inequality; equal or NaN scores
    # satisfy neither comparison and must not be credited to either model.
    if kmeans_sil > stoch_sil:
        print(f"  ✅ Our K-means implementation performs better!")
    elif stoch_sil > kmeans_sil:
        print(f"  🎲 Stochastic embedding performs better!")
    else:
        print(f"  ➖ No clear winner (scores are equal or unavailable)")

## 7. PyTorch Models Visualization and Analysis

In [None]:
# Create comparison visualizations for our two models
def _plot_metric_bars(ax, scores, title, ylabel, label_format=None, label_offset=0.0):
    """Draw one bar per model for a single metric on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        scores: pandas Series indexed by model name; NaN entries are dropped.
        title: Axes title.
        ylabel: Y-axis label.
        label_format: Optional callable turning a bar value into the text
            written above the bar. No labels are drawn when None.
        label_offset: Vertical gap between the top of a bar and its label.
    """
    scores = scores.dropna()
    # 'pytorch_kmeans' is drawn in blue; every other model in red
    bar_colors = ['blue' if idx == 'pytorch_kmeans' else 'red' for idx in scores.index]
    bars = ax.bar(scores.index, scores.values, color=bar_colors)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

    if label_format is not None:
        # Add value labels on bars
        for bar, value in zip(bars, scores.values):
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                    label_format(value), ha='center', va='bottom', fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# One entry per panel:
# (axes, eval_df column, extra precondition, title, y-label, label formatter, label offset)
panel_specs = [
    # 1. Silhouette scores comparison
    (axes[0, 0], 'silhouette_score', True,
     'Silhouette Score Comparison\n(Blue: Our K-means, Red: Stochastic)',
     'Silhouette Score', lambda value: f'{value:.3f}', 0.005),
    # 2. Training time comparison
    (axes[0, 1], 'training_time', True,
     'Training Time Comparison\n(Blue: Our K-means, Red: Stochastic)',
     'Time (seconds)', None, 0.0),
    # 3. Supervised metrics (only when ground-truth labels are available)
    (axes[1, 0], 'adjusted_rand_score', true_labels is not None,
     'Adjusted Rand Index\n(Blue: Our K-means, Red: Stochastic)',
     'ARI Score', None, 0.0),
    # 4. Number of clusters found
    (axes[1, 1], 'n_clusters', True,
     'Number of Clusters Found\n(Blue: Our K-means, Red: Stochastic)',
     'Number of Clusters', lambda value: f'{int(value)}', 0.1),
]

for ax, column, enabled, title, ylabel, label_format, label_offset in panel_specs:
    if enabled and column in eval_df.columns:
        _plot_metric_bars(ax, eval_df[column], title, ylabel, label_format, label_offset)
    else:
        # Metric unavailable: hide the panel instead of saving an empty frame
        ax.set_visible(False)

plt.tight_layout()
plt.savefig('pytorch_model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("✅ Model comparison visualizations saved!")

In [None]:
# Dimensionality reduction for visualization
# Produces `pca_features` (2-D PCA projection) and `tsne_features` (2-D t-SNE
# embedding computed on a PCA-compressed copy of the data).
print("Creating 2D visualization using PCA and t-SNE...")

# Use the same data matrix that pytorch_trainer uses (densified when it
# exposes `.toarray()`, e.g. a scipy sparse matrix)
viz_data = data_matrix.toarray() if hasattr(data_matrix, 'toarray') else data_matrix
n_samples, n_features = viz_data.shape

# PCA reduction
pca = PCA(n_components=2, random_state=42)
pca_features = pca.fit_transform(viz_data)

# t-SNE reduction (on PCA-reduced data for speed).
# PCA cannot extract more components than min(n_samples, n_features), so the
# target of 50 is clamped; for larger matrices this is still exactly 50.
n_pca_components = min(50, n_samples, n_features)
pca_50 = PCA(n_components=n_pca_components, random_state=42)
pca_50_features = pca_50.fit_transform(viz_data)

# t-SNE requires perplexity < n_samples; keep 30 whenever the data allows it
tsne_perplexity = min(30, max(1, n_samples - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity, max_iter=1000)
tsne_features = tsne.fit_transform(pca_50_features)

print(f"PCA explained variance ratio: {pca.explained_variance_ratio_.sum():.3f}")
print("✓ Dimensionality reduction completed")



In [None]:
# Visualize our two models side by side
# Both panels show the same 2-D PCA projection, coloured by each model's
# predicted cluster labels.
required_models = ('pytorch_kmeans', 'stochastic_embedding')

if not all(name in pytorch_trainer.predictions for name in required_models):
    print("❌ Could not create visualization - missing model predictions")
else:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # (prediction key, panel title) in left-to-right order
    panels = (
        ('pytorch_kmeans', 'Our PyTorch K-Means Implementation'),
        ('stochastic_embedding', 'Stochastic Embedding Network'),
    )

    for ax, (model_key, panel_title) in zip(axes, panels):
        cluster_labels = pytorch_trainer.predictions[model_key]

        points = ax.scatter(pca_features[:, 0], pca_features[:, 1],
                            c=cluster_labels, cmap='tab10', alpha=0.7, s=30,
                            edgecolors='black', linewidth=0.5)
        ax.set_title(panel_title, fontweight='bold', fontsize=14)
        ax.set_xlabel('First Principal Component')
        ax.set_ylabel('Second Principal Component')
        ax.grid(True, alpha=0.3)
        plt.colorbar(points, ax=ax)

        # Add cluster count (number of distinct predicted labels)
        cluster_count = len(np.unique(cluster_labels))
        ax.text(0.02, 0.98, f'Clusters: {cluster_count}',
                transform=ax.transAxes,
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
                verticalalignment='top')

    plt.tight_layout()
    plt.savefig('pytorch_clustering_visualization.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("✅ Clustering comparison visualization saved!")

In [None]:
# Analyze cluster quality with true labels (if available)
# For each model: tally which true labels fall into each predicted cluster,
# print that breakdown, then report purity (sum of each cluster's largest
# true-label count divided by the number of predictions).
if true_labels is not None:
    print("=== CLUSTER ANALYSIS AGAINST TRUE LABELS ===")

    # Human-readable class names are used only when a label encoder exists
    label_names = label_encoder.classes_ if 'label_encoder' in locals() else None

    for model_name, predictions in pytorch_trainer.predictions.items():
        print(f"\n--- {model_name.upper()} ---")

        # predicted cluster -> Counter of true labels observed in that cluster
        cluster_composition = defaultdict(Counter)
        for true_label, pred_label in zip(true_labels, predictions):
            cluster_composition[pred_label][true_label] += 1

        # Print cluster composition
        for cluster_id in sorted(cluster_composition.keys()):
            label_counts = cluster_composition[cluster_id]
            total_in_cluster = sum(label_counts.values())
            print(f"\nCluster {cluster_id} ({total_in_cluster} articles):")

            for true_label, count in label_counts.items():
                percentage = (count / total_in_cluster) * 100
                if label_names is not None:
                    label_name = label_names[true_label]
                else:
                    label_name = f"Label_{true_label}"
                print(f"  {label_name}: {count} ({percentage:.1f}%)")

        # Calculate cluster purity: each cluster contributes its majority count
        correct_assignments = sum(
            max(label_counts.values())
            for label_counts in cluster_composition.values()
            if label_counts
        )
        total_articles = len(predictions)

        purity = correct_assignments / total_articles
        print(f"\nCluster Purity: {purity:.3f}")

### 7.1 Final Model Summary

In [None]:
# Final Model Performance Summary
# Lists the models that produced predictions and, when at least two did and
# silhouette scores exist, ranks them from best to worst.
print("=== FINAL MODEL COMPARISON SUMMARY ===")

available_models = ['pytorch_kmeans', 'stochastic_embedding']
models_trained = [name for name in available_models if name in pytorch_trainer.predictions]

if not models_trained:
    print("❌ No models were successfully trained")
else:
    print(f"\n✅ Successfully trained {len(models_trained)} models:")
    for model in models_trained:
        print(f"   • {model.replace('_', ' ').title()}")

    # Performance comparison
    can_rank = len(models_trained) >= 2 and 'silhouette_score' in eval_df.columns
    if can_rank:
        print(f"\n📊 Performance Rankings (by Silhouette Score):")
        sil_scores = eval_df.loc[models_trained, 'silhouette_score'].sort_values(ascending=False)

        for rank, (model, score) in enumerate(sil_scores.items(), start=1):
            print(f"   {rank}. {model.replace('_', ' ').title()}: {score:.4f}")

        # Highest score sits first after the descending sort
        winner = sil_scores.index[0]
        print(f"\n🏆 Best performing model: {winner.replace('_', ' ').title()}")

## 8. Save Results and Final Summary

In [None]:
# Save all results
# Bundles the trainer's state and the evaluation table into one pickle and
# also writes the evaluation table on its own as CSV.
results_data = dict(
    models=pytorch_trainer.models,
    predictions=pytorch_trainer.predictions,
    training_times=pytorch_trainer.training_times,
    evaluation_metrics=pytorch_trainer.evaluation_metrics,
    evaluation_dataframe=eval_df,
    config_used=config_choice,
    data_matrix_shape=data_matrix.shape,
)

with open('model_results.pkl', 'wb') as results_file:
    pickle.dump(results_data, results_file)

# Save evaluation results as CSV
eval_df.to_csv('model_evaluation_results.csv')

# Confirmation messages are printed only after both writes have succeeded
for confirmation in (
    "✓ Saved model results to 'model_results.pkl'",
    "✓ Saved evaluation results to 'model_evaluation_results.csv'",
):
    print(confirmation)

In [None]:
# Final Project Summary
# Prints the closing report: dataset facts, implemented models, best model per
# metric, the direct silhouette comparison, uncertainty/stability figures and
# the list of deliverables. Reads results produced by earlier cells
# (news_df, data_matrix, eval_df, uncertainty_analysis, stability_results).
print("="*80)
print("   COMPARATIVE ANALYSIS: PYTORCH K-MEANS vs STOCHASTIC EMBEDDING")
print("              Neural Networks Course - Due Sept 14, 2025")
print("="*80)

print("\nDATASET INFORMATION:")
print("   - Application: News Article Clustering (BBC Dataset)")
print(f"   - Total articles: {len(news_df)}")
print(f"   - Feature configuration: {config_choice} TF-IDF")
print(f"   - Feature matrix shape: {data_matrix.shape}")
if true_labels is not None:
    print(f"   - Ground truth categories: {len(np.unique(true_labels))}")

print("\nASSIGNMENT REQUIREMENTS FULFILLED:")
print("   - Non-deterministic model: Stochastic Embedding Network")
print("   - Stochastic formula: z = f(x) + ε where ε ~ N(0, σ²)")
print("   - Reparameterization trick implementation")
print("   - Uncertainty quantification methods")
print("   - Multiple evaluation metrics (Silhouette, ARI, NMI)")
print("   - Comparative analysis with deterministic baseline")

print("\nMODELS IMPLEMENTED:")
model_descriptions = {
    'pytorch_kmeans': 'Custom PyTorch K-means implementation (primary model)',
    'stochastic_embedding': 'Assignment-specified stochastic embedding network'
}

# Filter first, then number, so the list never shows gaps such as "2." alone
implemented_descriptions = [
    description
    for model_name, description in model_descriptions.items()
    if model_name in pytorch_trainer.models
]
for i, description in enumerate(implemented_descriptions, 1):
    print(f"   {i}. {description}")

print("\nPERFORMANCE EVALUATION:")
# (eval_df column, printed metric name, whether ground-truth labels are required)
best_metric_specs = [
    ('silhouette_score', 'Silhouette Score', False),
    ('adjusted_rand_score', 'Adjusted Rand Index', True),
    ('normalized_mutual_info_score', 'Normalized Mutual Info', False),
]
for column, metric_label, needs_true_labels in best_metric_specs:
    if needs_true_labels and true_labels is None:
        continue
    if column not in eval_df.columns:
        continue
    best_model = eval_df[column].idxmax()
    best_value = eval_df.loc[best_model, column]
    print(f"   - Best {metric_label}: {best_model.replace('_', ' ').title()} ({best_value:.4f})")

# Compare our two models
available_models = ['pytorch_kmeans', 'stochastic_embedding']
trained_models = [m for m in available_models if m in eval_df.index]

if len(trained_models) == 2 and 'silhouette_score' in eval_df.columns:
    kmeans_score = eval_df.loc['pytorch_kmeans', 'silhouette_score']
    stoch_score = eval_df.loc['stochastic_embedding', 'silhouette_score']
    print("\nDIRECT MODEL COMPARISON:")
    print(f"   - PyTorch K-means: {kmeans_score:.4f}")
    print(f"   - Stochastic Embedding: {stoch_score:.4f}")

    if pd.isna(kmeans_score) or pd.isna(stoch_score):
        print("   - Comparison unavailable (missing silhouette score)")
    elif kmeans_score == stoch_score:
        print("   - Both models achieve the same silhouette score")
    else:
        if kmeans_score > stoch_score:
            winner_label, better, baseline = "K-means", kmeans_score, stoch_score
        else:
            winner_label, better, baseline = "Stochastic method", stoch_score, kmeans_score

        if baseline != 0:
            # Silhouette scores lie in [-1, 1]; dividing by the absolute value
            # keeps the reported gain positive when the baseline is negative.
            improvement = ((better - baseline) / abs(baseline)) * 100
            print(f"   - {winner_label} performs {improvement:.1f}% better")
        else:
            # A relative gain over a zero baseline is undefined
            print(f"   - {winner_label} performs better (baseline score is 0)")

print("\nUNCERTAINTY ANALYSIS:")
if uncertainty_analysis:
    for model_name, metrics in uncertainty_analysis.items():
        readable_name = model_name.replace('_', ' ').title()
        print(f"   - {readable_name}: Mean uncertainty = {metrics['mean_uncertainty']:.4f}")

if stability_results:
    print("\nSTABILITY ANALYSIS:")
    for model_name, metrics in stability_results.items():
        readable_name = model_name.replace('_', ' ').title()
        print(f"   - {readable_name}: Consistency = {metrics['mean_ari']:.4f} ARI")

print("\nKEY INSIGHTS:")
print("   - PyTorch K-means provides efficient deterministic clustering")
print("   - Stochastic embedding enables uncertainty quantification")
print("   - Reparameterization trick allows gradient-based stochastic training")
print("   - Input-dependent noise modeling adapts to local data structure")
print("   - Trade-off exists between clustering performance and uncertainty estimation")

print("\nPROJECT DELIVERABLES:")
print("   - model_results.pkl - Complete trained models and results")
print("   - model_evaluation_results.csv - Quantitative evaluation metrics")
print("   - pytorch_model_comparison.png - Performance comparison visualizations")
print("   - pytorch_clustering_visualization.png - Side-by-side cluster comparisons")

print("\nPROJECT STATUS:")
# Report each model as trained only if it actually produced predictions,
# instead of asserting success unconditionally.
status_labels = {
    'pytorch_kmeans': 'PyTorch K-means',
    'stochastic_embedding': 'Stochastic Embedding',
}
for model_name, status_label in status_labels.items():
    if model_name in pytorch_trainer.predictions:
        model_status = "IMPLEMENTED AND TRAINED"
    else:
        model_status = "NOT TRAINED IN THIS RUN"
    print(f"   - {status_label}: {model_status}")
print("   - Comparative Analysis: COMPLETE")
print("   - Evaluation & Visualization: COMPLETE")
print("   - Assignment Requirements: FULFILLED")

print("="*60)
print("COMPARATIVE CLUSTERING ANALYSIS PROJECT COMPLETE")
print("="*60)