# Get The Data

In [8]:

import sys
import os
proj_root = os.path.abspath("..")  
if proj_root not in sys.path:
    sys.path.insert(0, proj_root)

from ml.preprocessing import Preprocessor, Initial_Transformation  # noqa: E402


In [9]:
mobile , df = Initial_Transformation(file_path="../ml/dataset")
preprocessing = Preprocessor()

preprocessing.fit(mobile)
transformed = preprocessing.transform(mobile)
print("=" * 60)
print("FEATURE TRANSFORMATION RESULTS")
print("=" * 60)


print(transformed[50:60])
print(transformed.shape)

FEATURE TRANSFORMATION RESULTS
[[ 1.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000

-----

# MlFlow Setup

In [10]:
from mlflow import MlflowClient
from pprint import pprint


In [11]:
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")
all_experiments = client.search_experiments()

print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/918627768166686221', creation_time=1761695007629, experiment_id='918627768166686221', last_update_time=1761740089160, lifecycle_stage='active', name='Link', tags={'mlflow.experimentKind': 'custom_model_development',
 'mlflow.note.content': 'This is Linkage experiment',
 'project_name': 'stapler_clustering_traditional_ML'}>, <Experiment: artifact_location='mlflow-artifacts:/694982361173019963', creation_time=1761690946251, experiment_id='694982361173019963', last_update_time=1761739926270, lifecycle_stage='active', name='K-means', tags={'mlflow.experimentKind': 'custom_model_development',
 'mlflow.note.content': 'This is K-means experiment',
 'project_name': 'stapler_clustering_traditional_ML'}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1761057273861, experiment_id='0', last_update_time=1761057273861, lifecycle_stage='active', name='Default', tags={'mlflow.experimentKind': 'custom_model_development'}>]


In [12]:
default_experiment = [
    {"name": experiment.name, "lifecycle_stage": experiment.lifecycle_stage}
    for experiment in all_experiments
    if experiment.name == "Default"
][0]

pprint(default_experiment)

{'lifecycle_stage': 'active', 'name': 'Default'}


In [13]:
import mlflow

# Use the fluent API to set the tracking uri and the active experiment
mlflow.set_tracking_uri("http://127.0.0.1:8080")

# FlexibleNestedClusteringSystem

In [19]:
import numpy as np
from typing import Dict, Any
from sklearn.cluster import (AgglomerativeClustering, KMeans, DBSCAN, 
                           SpectralClustering, MiniBatchKMeans)
from sklearn.base import BaseEstimator, ClusterMixin

class FlexibleNestedClusteringSystem(BaseEstimator, ClusterMixin):
    def __init__(self, 
                 level1_config: Dict[str, Any],
                 level2_config: Dict[str, Any], 
                 level3_config: Dict[str, Any],
                 min_samples_per_final_cluster: int = 3,
                 original_data=None):
        """
        Initialize flexible nested clustering system.

        Args:
            level1_config: Configuration for Level 1 clustering
            level2_config: Configuration for Level 2 clustering  
            level3_config: Configuration for Level 3 clustering
            min_samples_per_final_cluster: Minimum samples in final clusters
            original_data: The original (untransformed) data (optional, used for reporting only)
            
        Example configs:
        level1_config = {
            'algorithm': 'AgglomerativeClustering',
            'n_clusters': 5,
            'linkage': 'complete'
        }
        level2_config = {
            'algorithm': 'KMeans', 
            'n_clusters': 3,
            'random_state': 42
        }
        level3_config = {
            'algorithm': 'GaussianMixture',
            'n_components': 2,
            'random_state': 42
        }
        """
        self.level1_config = level1_config
        self.level2_config = level2_config
        self.level3_config = level3_config
        self.min_samples_per_final_cluster = min_samples_per_final_cluster
        
        # Store the original data, used for summary/visualization only
        self.original_data = original_data
        
        # Storage for models and results
        self.level1_model = None
        self.level2_models = {}
        self.level3_models = {}
        self.level1_labels = None
        self.level2_labels = {}
        self.level3_labels = {}
        self.sample_assignments = None
        self.cluster_hierarchy = {}
    
    def _create_algorithm(self, config: Dict[str, Any]):
        """Create clustering algorithm from configuration."""
        algorithm_name = config['algorithm']
        params = {k: v for k, v in config.items() if k != 'algorithm'}
        
        algorithm_map = {
            'AgglomerativeClustering': AgglomerativeClustering,
            'KMeans': KMeans,
            'DBSCAN': DBSCAN,
            'SpectralClustering': SpectralClustering,
            'MiniBatchKMeans': MiniBatchKMeans,
        }
        
        if algorithm_name not in algorithm_map:
            raise ValueError(f"Unsupported algorithm: {algorithm_name}")
        
        return algorithm_map[algorithm_name](**params)
    
    def fit(self, X, y=None, sample_names=None):
        if sample_names is None:
            sample_names = [f"Sample_{i}" for i in range(len(X))]
        
        self.sample_names = sample_names
        self.n_samples = len(X)
        
        print("üîß Starting Flexible Nested Clustering")
        print(f"üìä Data shape: {X.shape}")
        print(f"üéØ Level 1: {self.level1_config['algorithm']} ‚Üí {self.level2_config['algorithm']} ‚Üí {self.level3_config['algorithm']}")
        print("=" * 60)
        
        # Level 1: Top-level clustering
        print(f"üìç Level 1: {self.level1_config['algorithm']} clustering...")
        self._fit_level1(X)
        
        # Level 2: Sub-clustering
        print(f"üìç Level 2: {self.level2_config['algorithm']} sub-clustering...")
        self._fit_level2(X)
        
        # Level 3: Sub-sub-clustering
        print(f"üìç Level 3: {self.level3_config['algorithm']} sub-sub-clustering...")
        self._fit_level3(X)
        
        # # Build hierarchy
        self._build_hierarchy()
        
        print("‚úÖ Flexible nested clustering completed!")

        self._print_hierarchy_summary(original_data=self.original_data)
        
        return self

    def _fit_level1(self, X):
        """Fit Level 1 clustering (top level)."""
        self.level1_model = self._create_algorithm(self.level1_config)
        
        # Handle different algorithm types
        if hasattr(self.level1_model, 'fit_predict'):
            self.level1_labels = self.level1_model.fit_predict(X)
        elif hasattr(self.level1_model, 'fit'):
            self.level1_model.fit(X)
            if hasattr(self.level1_model, 'labels_'):
                self.level1_labels = self.level1_model.labels_
            elif hasattr(self.level1_model, 'predict'):
                self.level1_labels = self.level1_model.predict(X)
            else:
                raise ValueError(f"Algorithm {self.level1_config['algorithm']} doesn't support clustering")

    
    def _fit_level2(self, X):
        """Fit Level 2 clustering (sub-clusters within each Level 1 cluster)."""
        n_level1_clusters = len(np.unique(self.level1_labels))
        
        for level1_id in range(n_level1_clusters):
            # Get samples belonging to this Level 1 cluster
            mask = self.level1_labels == level1_id
            X_subset = X[mask]
            
            if len(X_subset) < self.min_samples_per_final_cluster:
                print(f"   Level 1 cluster {level1_id}: {len(X_subset)} samples (too few for sub-clustering)")
                self.level2_labels[level1_id] = np.zeros(len(X_subset), dtype=int)
                continue
            
            # Create algorithm for this level
            level2_model = self._create_algorithm(self.level2_config)
            
            # Handle different algorithm types
            try:
                if hasattr(level2_model, 'fit_predict'):
                    level2_labels = level2_model.fit_predict(X_subset)
                elif hasattr(level2_model, 'fit'):
                    level2_model.fit(X_subset)
                    if hasattr(level2_model, 'labels_'):
                        level2_labels = level2_model.labels_
                    elif hasattr(level2_model, 'predict'):
                        level2_labels = level2_model.predict(X_subset)
                    else:
                        raise ValueError(f"Algorithm {self.level2_config['algorithm']} doesn't support clustering")
                
                self.level2_models[level1_id] = level2_model
                self.level2_labels[level1_id] = level2_labels

                    
            except Exception as e:
                print(f"   Level 1 cluster {level1_id}: Error in sub-clustering - {str(e)}")
                self.level2_labels[level1_id] = np.zeros(len(X_subset), dtype=int)
    
    def _fit_level3(self, X):
        """Fit Level 3 clustering (sub-sub-clusters within each Level 2 cluster)."""
        for level1_id in self.level2_labels.keys():
            if level1_id not in self.level2_labels:
                continue
                
            level2_labels = self.level2_labels[level1_id]
            unique_level2_labels = np.unique(level2_labels)
            
            for level2_id in unique_level2_labels:
                # Get samples belonging to this Level 2 cluster
                level1_mask = self.level1_labels == level1_id
                level2_mask = level2_labels == level2_id
                combined_mask = level1_mask & np.isin(np.arange(len(X)), 
                                                   np.where(level1_mask)[0][level2_mask])
                
                X_subset = X[combined_mask]
                
                if len(X_subset) < self.min_samples_per_final_cluster:
                    print(f"   Level 2 cluster ({level1_id}, {level2_id}): {len(X_subset)} samples (too few for sub-sub-clustering)")
                    self.level3_labels[(level1_id, level2_id)] = np.zeros(len(X_subset), dtype=int)
                    continue
                
                # Create algorithm for this level
                level3_model = self._create_algorithm(self.level3_config)
                
                # Handle different algorithm types
                try:
                    if hasattr(level3_model, 'fit_predict'):
                        level3_labels = level3_model.fit_predict(X_subset)
                    elif hasattr(level3_model, 'fit'):
                        level3_model.fit(X_subset)
                        if hasattr(level3_model, 'labels_'):
                            level3_labels = level3_model.labels_
                        elif hasattr(level3_model, 'predict'):
                            level3_labels = level3_model.predict(X_subset)
                        else:
                            raise ValueError(f"Algorithm {self.level3_config['algorithm']} doesn't support clustering")
                    
                    self.level3_models[(level1_id, level2_id)] = level3_model
                    self.level3_labels[(level1_id, level2_id)] = level3_labels
                    

                        
                except Exception as e:
                    print(f"   Level 2 cluster ({level1_id}, {level2_id}): Error in sub-sub-clustering - {str(e)}")
                    self.level3_labels[(level1_id, level2_id)] = np.zeros(len(X_subset), dtype=int)

    def _build_hierarchy(self):
        """Build the complete hierarchy structure."""
        self.sample_assignments = []
        
        for sample_idx in range(self.n_samples):
            # Get Level 1 assignment
            level1_id = self.level1_labels[sample_idx]
            
            # Get Level 2 assignment
            level1_mask = self.level1_labels == level1_id
            level1_indices = np.where(level1_mask)[0]
            sample_position_in_level1 = np.where(level1_indices == sample_idx)[0][0]
            level2_id = self.level2_labels[level1_id][sample_position_in_level1]
            
            # Get Level 3 assignment
            level2_mask = self.level2_labels[level1_id] == level2_id
            level2_indices = np.where(level1_mask)[0][level2_mask]
            sample_position_in_level2 = np.where(level2_indices == sample_idx)[0][0]
            level3_id = self.level3_labels[(level1_id, level2_id)][sample_position_in_level2]
            
            assignment = {
                'sample_idx': sample_idx,
                'sample_name': self.sample_names[sample_idx],
                'level1_id': level1_id,
                'level2_id': level2_id,
                'level3_id': level3_id,
                'full_path': f"{level1_id}_{level2_id}_{level3_id}"
            }
            
            self.sample_assignments.append(assignment)
        
        # Build cluster hierarchy
        self._build_cluster_hierarchy()
    
    def _build_cluster_hierarchy(self):
        """Build the cluster hierarchy structure."""
        self.cluster_hierarchy = {}
        
        # Level 1 clusters
        for level1_id in range(len(np.unique(self.level1_labels))):
            level1_samples = [a for a in self.sample_assignments if a['level1_id'] == level1_id]
            
            self.cluster_hierarchy[level1_id] = {
                'level': 1,
                'cluster_id': level1_id,
                'sample_count': len(level1_samples),
                'samples': level1_samples,
                'children': {}
            }
            
            # Level 2 clusters within this Level 1 cluster
            unique_level2 = set(a['level2_id'] for a in level1_samples)
            for level2_id in unique_level2:
                level2_samples = [a for a in level1_samples if a['level2_id'] == level2_id]
                
                self.cluster_hierarchy[level1_id]['children'][level2_id] = {
                    'level': 2,
                    'cluster_id': level2_id,
                    'sample_count': len(level2_samples),
                    'samples': level2_samples,
                    'children': {}
                }
                
                # Level 3 clusters within this Level 2 cluster
                unique_level3 = set(a['level3_id'] for a in level2_samples)
                for level3_id in unique_level3:
                    level3_samples = [a for a in level2_samples if a['level3_id'] == level3_id]
                    
                    self.cluster_hierarchy[level1_id]['children'][level2_id]['children'][level3_id] = {
                        'level': 3,
                        'cluster_id': level3_id,
                        'sample_count': len(level3_samples),
                        'samples': level3_samples,
                        'children': {}
                    }

    def _print_hierarchy_summary(self, show_samples=5, original_data=None, feature_names=None, floatfmt=".2f"):
        """
        Print a summary of the clustering hierarchy.
        For each Level 3 cluster, shows up to `show_samples` samples, each line showing brand, price, and category.
        `original_data` should be the original (untouched/untransformed) dataset as a numpy array or pandas DataFrame.
        `feature_names` may be optionally provided (for pandas or arrays).
        """
        print("\nüìä HIERARCHY SUMMARY")
        print("=" * 50)
        
        total_final_clusters = 0

        # Use the class's original_data if original_data not provided
        if original_data is None:
            if self.original_data is None:
                raise ValueError("original_data (the original features dataset) must be provided to show sample vectors.")
            original_data = self.original_data

        # Setup: get indices (column numbers) for 'brand', 'price', 'category'
        if hasattr(original_data, "iloc"):
            original_data_arr = original_data.values
            column_names = original_data.columns.tolist() if feature_names is None else feature_names
        else:
            original_data_arr = np.array(original_data)
            column_names = [f"feat_{i}" for i in range(original_data.shape[1])] if feature_names is None else feature_names

        # Find the indices of 'brand', 'price', and 'category'
        def get_feature_idx(name):
            try:
                return column_names.index(name)
            except ValueError:
                raise ValueError(f"Required feature '{name}' not found in column names: {column_names}")
        brand_idx = get_feature_idx('brand')
        price_idx = get_feature_idx('price')
        category_idx = get_feature_idx('category')
        show_indices = [brand_idx, price_idx, category_idx]

        for level1_id, level1_data in self.cluster_hierarchy.items():
            print(f"Level 1 Cluster {level1_id}: {level1_data['sample_count']} samples")
            
            for level2_id, level2_data in level1_data['children'].items():
                print(f"  Level 2 Cluster {level2_id}: {level2_data['sample_count']} samples")
                
                for level3_id, level3_data in level2_data['children'].items():
                    sample_indices = [a['sample_idx'] for a in level3_data['samples']]
                    sample_count = level3_data['sample_count']
                    total_final_clusters += 1
                    shown_indices = sample_indices[:show_samples]
                    shown_data = original_data_arr[shown_indices]
                    print(f"    Level 3 Cluster {level3_id}: {sample_count} samples")
                    print(f"      Showing up to {show_samples} samples (brand | price | category):")
                    for row in shown_data:
                        to_show = []
                        for idx in show_indices:
                            v = row[idx]
                            # Format price/number if appropriate
                            if idx == price_idx:
                                try:
                                    to_show.append(f"{float(v):{floatfmt}}")
                                except Exception:
                                    to_show.append(str(v))
                            else:
                                to_show.append(str(v))
                        print(f"        {to_show[0]} | {to_show[1]} | {to_show[2]}")
        
        print(f"\nüéØ Total final clusters: {total_final_clusters}")
        print(f"üìà Average samples per final cluster: {self.n_samples / total_final_clusters:.1f}")
    
    def get_cluster_path(self, sample_idx):
        """Get the complete cluster path for a sample."""
        if sample_idx >= len(self.sample_assignments):
            return None
        return self.sample_assignments[sample_idx]
    
    def get_samples_in_cluster(self, level1_id, level2_id=None, level3_id=None):
        """Get all samples in a specific cluster at any level."""
        if level2_id is None:
            # Level 1 cluster
            return [a for a in self.sample_assignments if a['level1_id'] == level1_id]
        elif level3_id is None:
            # Level 2 cluster
            return [a for a in self.sample_assignments 
                   if a['level1_id'] == level1_id and a['level2_id'] == level2_id]
        else:
            # Level 3 cluster
            return [a for a in self.sample_assignments 
                   if a['level1_id'] == level1_id and a['level2_id'] == level2_id and a['level3_id'] == level3_id]


# Metrics

In [20]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def calculate_silhouette_score_nested(clustering_system, X):
    """
    Calculate Silhouette Score for nested clustering system.
    Returns the average Silhouette Score across all levels.
    
    Args:
        clustering_system: FlexibleNestedClusteringSystem instance
        X: Feature matrix used for clustering
        
    Returns:
        dict: Dictionary containing scores for each level and average score
    """
    scores = {}
    
    # Level 1 Silhouette Score
    if clustering_system.level1_labels is not None:
        try:
            level1_score = silhouette_score(X, clustering_system.level1_labels)
            scores['level1'] = level1_score
            print(f"Level 1 Silhouette Score: {level1_score:.4f}")
        except Exception as e:
            print(f"Error calculating Level 1 Silhouette Score: {e}")
            scores['level1'] = None
    
    # Level 2 Silhouette Scores
    level2_scores = []
    for level1_id in clustering_system.level2_labels.keys():
        if clustering_system.level2_labels[level1_id] is not None:
            # Get samples belonging to this Level 1 cluster
            mask = clustering_system.level1_labels == level1_id
            X_subset = X[mask]
            level2_labels = clustering_system.level2_labels[level1_id]
            
            if len(np.unique(level2_labels)) > 1 and len(X_subset) > 1:
                try:
                    level2_score = silhouette_score(X_subset, level2_labels)
                    level2_scores.append(level2_score)
                except Exception as e:
                    print(f"Error calculating Level 2 Silhouette Score for cluster {level1_id}: {e}")
    
    if level2_scores:
        scores['level2'] = np.mean(level2_scores)
    else:
        scores['level2'] = None
    
    # Level 3 Silhouette Scores
    level3_scores = []
    for (level1_id, level2_id) in clustering_system.level3_labels.keys():
        if clustering_system.level3_labels[(level1_id, level2_id)] is not None:
            # Get samples belonging to this Level 2 cluster
            level1_mask = clustering_system.level1_labels == level1_id
            level2_mask = clustering_system.level2_labels[level1_id] == level2_id
            combined_mask = level1_mask & np.isin(np.arange(len(X)), 
                                               np.where(level1_mask)[0][level2_mask])
            
            X_subset = X[combined_mask]
            level3_labels = clustering_system.level3_labels[(level1_id, level2_id)]
            
            if len(np.unique(level3_labels)) > 1 and len(X_subset) > 1:
                try:
                    level3_score = silhouette_score(X_subset, level3_labels)
                    level3_scores.append(level3_score)
                except Exception as e:
                    print(f"Error calculating Level 3 Silhouette Score for cluster ({level1_id},{level2_id}): {e}")
    
    if level3_scores:
        scores['level3'] = np.mean(level3_scores)
    else:
        scores['level3'] = None
    
    # Calculate overall average
    valid_scores = [score for score in [scores['level1'], scores['level2'], scores['level3']] if score is not None]
    if valid_scores:
        scores['average'] = np.mean(valid_scores)
        print(f"Overall Average Silhouette: {scores['average']:.4f}")

    else:
        scores['average'] = None
        print("No valid Silhouette Scores calculated")
    
    return scores


def calculate_davies_bouldin_score_nested(clustering_system, X):
    """
    Calculate Davies-Bouldin Index for nested clustering system.
    Returns the average Davies-Bouldin Index across all levels.
    Lower values indicate better clustering.
    
    Args:
        clustering_system: FlexibleNestedClusteringSystem instance
        X: Feature matrix used for clustering
        
    Returns:
        dict: Dictionary containing scores for each level and average score
    """
    scores = {}
    
    # Level 1 Davies-Bouldin Score
    if clustering_system.level1_labels is not None:
        try:
            level1_score = davies_bouldin_score(X, clustering_system.level1_labels)
            scores['level1'] = level1_score
        except Exception as e:
            print(f"Error calculating Level 1 Davies-Bouldin Index: {e}")
            scores['level1'] = None
    
    # Level 2 Davies-Bouldin Scores
    level2_scores = []
    for level1_id in clustering_system.level2_labels.keys():
        if clustering_system.level2_labels[level1_id] is not None:
            # Get samples belonging to this Level 1 cluster
            mask = clustering_system.level1_labels == level1_id
            X_subset = X[mask]
            level2_labels = clustering_system.level2_labels[level1_id]
            
            if len(np.unique(level2_labels)) > 1 and len(X_subset) > 1:
                try:
                    level2_score = davies_bouldin_score(X_subset, level2_labels)
                    level2_scores.append(level2_score)
                except Exception as e:
                    print(f"Error calculating Level 2 Davies-Bouldin Index for cluster {level1_id}: {e}")
    
    if level2_scores:
        scores['level2'] = np.mean(level2_scores)
    else:
        scores['level2'] = None
    
    # Level 3 Davies-Bouldin Scores
    level3_scores = []
    for (level1_id, level2_id) in clustering_system.level3_labels.keys():
        if clustering_system.level3_labels[(level1_id, level2_id)] is not None:
            # Get samples belonging to this Level 2 cluster
            level1_mask = clustering_system.level1_labels == level1_id
            level2_mask = clustering_system.level2_labels[level1_id] == level2_id
            combined_mask = level1_mask & np.isin(np.arange(len(X)), 
                                               np.where(level1_mask)[0][level2_mask])
            
            X_subset = X[combined_mask]
            level3_labels = clustering_system.level3_labels[(level1_id, level2_id)]
            
            if len(np.unique(level3_labels)) > 1 and len(X_subset) > 1:
                try:
                    level3_score = davies_bouldin_score(X_subset, level3_labels)
                    level3_scores.append(level3_score)
                except Exception as e:
                    print(f"Error calculating Level 3 Davies-Bouldin Index for cluster ({level1_id},{level2_id}): {e}")
    
    if level3_scores:
        scores['level3'] = np.mean(level3_scores)
    else:
        scores['level3'] = None
    
    # Calculate overall average
    valid_scores = [score for score in [scores['level1'], scores['level2'], scores['level3']] if score is not None]
    if valid_scores:
        scores['average'] = np.mean(valid_scores)
        print(f"Overall Average Davies-Bouldin Index: {scores['average']:.4f}")
    else:
        scores['average'] = None
        print("No valid Davies-Bouldin Index calculated")
    
    return scores


def calculate_calinski_harabasz_score_nested(clustering_system, X):
    """
    Calculate Calinski-Harabasz Index for nested clustering system.
    Returns the average Calinski-Harabasz Index across all levels.
    Higher values indicate better clustering.
    
    Args:
        clustering_system: FlexibleNestedClusteringSystem instance
        X: Feature matrix used for clustering
        
    Returns:
        dict: Dictionary containing scores for each level and average score
    """
    scores = {}
    
    # Level 1 Calinski-Harabasz Score
    if clustering_system.level1_labels is not None:
        try:
            level1_score = calinski_harabasz_score(X, clustering_system.level1_labels)
            scores['level1'] = level1_score
        except Exception as e:
            print(f"Error calculating Level 1 Calinski-Harabasz Index: {e}")
            scores['level1'] = None
    
    # Level 2 Calinski-Harabasz Scores
    level2_scores = []
    for level1_id in clustering_system.level2_labels.keys():
        if clustering_system.level2_labels[level1_id] is not None:
            # Get samples belonging to this Level 1 cluster
            mask = clustering_system.level1_labels == level1_id
            X_subset = X[mask]
            level2_labels = clustering_system.level2_labels[level1_id]
            
            if len(np.unique(level2_labels)) > 1 and len(X_subset) > 1:
                try:
                    level2_score = calinski_harabasz_score(X_subset, level2_labels)
                    level2_scores.append(level2_score)
                except Exception as e:
                    print(f"Error calculating Level 2 Calinski-Harabasz Index for cluster {level1_id}: {e}")
    
    if level2_scores:
        scores['level2'] = np.mean(level2_scores)
    else:
        scores['level2'] = None
    
    # Level 3 Calinski-Harabasz Scores
    level3_scores = []
    for (level1_id, level2_id) in clustering_system.level3_labels.keys():
        if clustering_system.level3_labels[(level1_id, level2_id)] is not None:
            # Get samples belonging to this Level 2 cluster
            level1_mask = clustering_system.level1_labels == level1_id
            level2_mask = clustering_system.level2_labels[level1_id] == level2_id
            combined_mask = level1_mask & np.isin(np.arange(len(X)), 
                                               np.where(level1_mask)[0][level2_mask])
            
            X_subset = X[combined_mask]
            level3_labels = clustering_system.level3_labels[(level1_id, level2_id)]
            
            if len(np.unique(level3_labels)) > 1 and len(X_subset) > 1:
                try:
                    level3_score = calinski_harabasz_score(X_subset, level3_labels)
                    level3_scores.append(level3_score)
                except Exception as e:
                    print(f"Error calculating Level 3 Calinski-Harabasz Index for cluster ({level1_id},{level2_id}): {e}")
    
    if level3_scores:
        scores['level3'] = np.mean(level3_scores)
    else:
        scores['level3'] = None
    
    # Calculate overall average
    valid_scores = [score for score in [scores['level1'], scores['level2'], scores['level3']] if score is not None]
    if valid_scores:
        scores['average'] = np.mean(valid_scores)
        print(f"Overall Average Calinski-Harabasz Index: {scores['average']:.4f}")
    else:
        scores['average'] = None
        print("No valid Calinski-Harabasz Index calculated")
    
    return scores


def calculate_all_clustering_metrics(clustering_system, X):
    """
    Calculate all three clustering metrics for the nested clustering system.
    
    Args:
        clustering_system: FlexibleNestedClusteringSystem instance
        X: Feature matrix used for clustering
        
    Returns:
        dict: Dictionary containing all metric results
    """
    
    results = {}

    results['silhouette'] = calculate_silhouette_score_nested(clustering_system, X)

    results['davies_bouldin'] = calculate_davies_bouldin_score_nested(clustering_system, X)
 
    results['calinski_harabasz'] = calculate_calinski_harabasz_score_nested(clustering_system, X)
    

    
    # Print summary table
    print(f"{'Metric':<25} {'Level 1':<12} {'Level 2':<12} {'Level 3':<12} {'Average':<12}")
    print("-" * 80)
    
    metrics = [
        ('Silhouette Score', results['silhouette']),
        ('Davies-Bouldin Index', results['davies_bouldin']),
        ('Calinski-Harabasz Index', results['calinski_harabasz'])
    ]
    
    for metric_name, metric_data in metrics:
        level1_val = f"{metric_data['level1']:.4f}" if metric_data['level1'] is not None else "N/A"
        level2_val = f"{metric_data['level2']:.4f}" if metric_data['level2'] is not None else "N/A"
        level3_val = f"{metric_data['level3']:.4f}" if metric_data['level3'] is not None else "N/A"
        avg_val = f"{metric_data['average']:.4f}" if metric_data['average'] is not None else "N/A"
        
        print(f"{metric_name:<25} {level1_val:<12} {level2_val:<12} {level3_val:<12} {avg_val:<12}")
    
    return results

# Experiments

## KMeans

In [65]:
km_description = (
    "This is K-means experiment"
)

km_tags = {
    "project_name": "stapler_clustering_traditional_ML",
    "mlflow.note.content": km_description,
}
try:
    # Get the experiment ID first
    experiment = client.get_experiment_by_name("K-means")
    if experiment and experiment.lifecycle_stage == 'deleted':
        client.restore_experiment(experiment.experiment_id)
        print("Restored experiment: AgglomerativeClustering")
except Exception as e:
    print(f"Error restoring experiment: {e}")


In [66]:
# Sets the current active experiment to the "Apple_Models" experiment and returns the Experiment metadata
km_experiment = mlflow.set_experiment("K-means")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name_km = "Km"

# Define an artifact path that the model will be saved to.
artifact_path = "km_results"

In [67]:


# Create a second configuration with different algorithms
level1_config_2 = {
    'algorithm': 'KMeans',
    'n_clusters': 3,
}

level2_config_2 = {
    'algorithm': 'KMeans',
    'n_clusters': 4,
}

level3_config_2 = {
    'algorithm': 'KMeans',
    'n_clusters': 5,

}
params = {
    "level1_config" : level1_config_2,
    "level2_config" : level2_config_2,
    "level3_config" : level3_config_2

}
# Create second flexible clustering system
flexible_clustering_2 = FlexibleNestedClusteringSystem(
    level1_config=level1_config_2,
    level2_config=level2_config_2,
    level3_config=level3_config_2,
    min_samples_per_final_cluster=3,
    original_data=mobile
)

# Fit on your transformed data
sample_names = [f"Mobile_{i}" for i in range(len(transformed))] #TODO is it necessary ?

tfd = transformed.copy()
flexible_clustering_2.fit(tfd, sample_names)



# Calculate all metrics
metrics_results = calculate_all_clustering_metrics(flexible_clustering_2, transformed)

# Or calculate individual metrics
silhouette_scores = calculate_silhouette_score_nested(flexible_clustering_2, transformed)
davies_bouldin_scores = calculate_davies_bouldin_score_nested(flexible_clustering_2, transformed)
calinski_harabasz_scores = calculate_calinski_harabasz_score_nested(flexible_clustering_2, transformed)

metrics = {
    "Silhouette Score": silhouette_scores['average'],
    "Davies-Bouldin Index": davies_bouldin_scores['average'],
    "Calinski-Harabasz Index": calinski_harabasz_scores['average']
}

with mlflow.start_run(run_name=run_name_km):

    mlflow.log_params(params)

    mlflow.log_metrics(metrics)

    mlflow.sklearn.log_model(sk_model=flexible_clustering_2,input_example=mobile[::50],name=artifact_path)


üîß Starting Flexible Nested Clustering
üìä Data shape: (508, 96)
üéØ Level 1: KMeans ‚Üí KMeans ‚Üí KMeans
üìç Level 1: KMeans clustering...
üìç Level 2: KMeans sub-clustering...
üìç Level 3: KMeans sub-sub-clustering...
   Level 2 cluster (1, 3): Error in sub-sub-clustering - n_samples=3 should be >= n_clusters=5.
   Level 2 cluster (2, 3): 2 samples (too few for sub-sub-clustering)
‚úÖ Flexible nested clustering completed!

üìä HIERARCHY SUMMARY
Level 1 Cluster 0: 319 samples
  Level 2 Cluster 0: 150 samples
    Level 3 Cluster 0: 49 samples
      Showing up to 5 samples (brand | price | category):
        realme | nan | mid
        realme | nan | mid
        realme | nan | mid
        realme | nan | mid
        realme | nan | low
    Level 3 Cluster 1: 23 samples
      Showing up to 5 samples (brand | price | category):
        tecno-ma | 150341000.00 | mid
        xiaomi | 246990000.00 | mid
        xiaomi | 259051000.00 | nan
        xiaomi | 248850000.00 | mid
        xia

  "dataframe_split": {
    "columns": [
      "b.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Model does not have the "python_function" flavor


üèÉ View run Km at: http://127.0.0.1:8080/#/experiments/694982361173019963/runs/21e0830d6fe1455396a7a0cf3da2b06b
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/694982361173019963


# Linkage

In [70]:
link_description = (
    "This is Linkage experiment"
)

link_tags = {
    "project_name": "stapler_clustering_traditional_ML",
    "mlflow.note.content": link_description,
}
try:
    # Get the experiment ID first
    experiment = client.get_experiment_by_name("Link")
    if experiment and experiment.lifecycle_stage == 'deleted':
        client.restore_experiment(experiment.experiment_id)
        print("Restored experiment: Link")
except Exception as e:
    print(f"Error restoring experiment: {e}")


In [71]:
# Sets the current active experiment to the "Apple_Models" experiment and returns the Experiment metadata
link_experiment = mlflow.set_experiment("Link")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name_link = "link_first"

# Define an artifact path that the model will be saved to.
artifact_path = "link_results"

In [72]:

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt


class FixedSingleModelHierarchicalClustering(BaseEstimator,ClusterMixin):
    """
    Fixed Single Model Hierarchical Clustering System that uses one hierarchical clustering model
    and saves the merging process to extract subclusters and sub-subclusters.
    """
    
    def __init__(self, 
                 n_level1_clusters: int = 5,
                 n_level2_clusters: int = 3, 
                 n_level3_clusters: int = 2,
                 linkage_method: str = 'ward',
                 min_samples_per_final_cluster: int = 3,
                 original_data=None):
        """
        Initialize single model hierarchical clustering system.

        Args:
            n_level1_clusters: Number of top-level clusters
            n_level2_clusters: Number of sub-clusters within each Level 1 cluster
            n_level3_clusters: Number of sub-sub-clusters within each Level 2 cluster
            linkage_method: Linkage method for hierarchical clustering ('ward', 'complete', 'average', 'single')
            min_samples_per_final_cluster: Minimum samples in final clusters
            original_data: The original (untransformed) data (optional, used for reporting only)
        """
        self.n_level1_clusters = n_level1_clusters
        self.n_level2_clusters = n_level2_clusters
        self.n_level3_clusters = n_level3_clusters
        self.linkage_method = linkage_method
        self.min_samples_per_final_cluster = min_samples_per_final_cluster
        
        # Store the original data, used for summary/visualization only
        self.original_data = original_data
        
        # Storage for models and results
        self.linkage_matrix = None
        self.level1_labels = None
        self.level2_labels = {}
        self.level3_labels = {}
        self.sample_assignments = None
        self.cluster_hierarchy = {}
        self.sample_names = None
        self.n_samples = 0
        self.X_data = None  # Store original data for sub-clustering
    
    def fit(self, X, sample_names=None):
        """
        Fit the hierarchical clustering model and extract multi-level clusters.
        
        Args:
            X: Feature matrix
            sample_names: Optional names for samples
        """
        if sample_names is None:
            sample_names = [f"Sample_{i}" for i in range(len(X))]
        
        self.sample_names = sample_names
        self.n_samples = len(X)
        self.X_data = X  # Store the original data for sub-clustering
        
        print("üå≥ Starting Fixed Single Model Hierarchical Clustering")
        print(f"üìä Data shape: {X.shape}")
        print(f"üéØ Target clusters: Level 1={self.n_level1_clusters}, Level 2={self.n_level2_clusters}, Level 3={self.n_level3_clusters}")
        print(f"üîó Linkage method: {self.linkage_method}")
        print("=" * 60)
        
        # Step 1: Create linkage matrix (this saves the merging process)
        print("üìç Step 1: Creating linkage matrix...")
        self._create_linkage_matrix(X)
        
        # Step 2: Extract Level 1 clusters
        print("üìç Step 2: Extracting Level 1 clusters...")
        self._extract_level1_clusters()
        
        # Step 3: Extract Level 2 clusters
        print("üìç Step 3: Extracting Level 2 clusters...")
        self._extract_level2_clusters()
        
        # Step 4: Extract Level 3 clusters
        print("üìç Step 4: Extracting Level 3 clusters...")
        self._extract_level3_clusters()
        
        # Step 5: Build hierarchy
        print("üìç Step 5: Building hierarchy...")
        self._build_hierarchy()
        
        print("‚úÖ Fixed single model hierarchical clustering completed!")
        self._print_hierarchy_summary(original_data=self.original_data)
        
        return self
    
    def _create_linkage_matrix(self, X):
        """Create linkage matrix using hierarchical clustering."""
        # Calculate pairwise distances
        distances = pdist(X, metric='euclidean')
        
        # Create linkage matrix
        self.linkage_matrix = linkage(distances, method=self.linkage_method)
        
        print(f"   Linkage matrix shape: {self.linkage_matrix.shape}")
    
    def _extract_level1_clusters(self):
        """Extract Level 1 clusters from linkage matrix."""
        # Use fcluster to get Level 1 clusters
        self.level1_labels = fcluster(self.linkage_matrix, 
                                     t=self.n_level1_clusters, 
                                     criterion='maxclust') - 1  # Convert to 0-based indexing
        
        # Print Level 1 results
        unique_labels, counts = np.unique(self.level1_labels, return_counts=True)
        print(f"   Level 1 clusters: {len(unique_labels)}")
        for label, count in zip(unique_labels, counts):
            print(f"     Cluster {label}: {count} samples")
    
    def _extract_level2_clusters(self):
        """Extract Level 2 clusters for each Level 1 cluster."""
        n_level1_clusters = len(np.unique(self.level1_labels))
        
        for level1_id in range(n_level1_clusters):
            # Get samples belonging to this Level 1 cluster
            mask = self.level1_labels == level1_id
            level1_indices = np.where(mask)[0]
            
            if len(level1_indices) < self.min_samples_per_final_cluster:
                print(f"   Level 1 cluster {level1_id}: {len(level1_indices)} samples (too few for sub-clustering)")
                self.level2_labels[level1_id] = np.zeros(len(level1_indices), dtype=int)
                continue
            
            # Create sub-linkage matrix for this Level 1 cluster
            sub_linkage = self._create_sub_linkage_matrix(level1_indices, self.X_data)
            
            if sub_linkage is not None:
                # Extract Level 2 clusters
                level2_labels = fcluster(sub_linkage, 
                                       t=self.n_level2_clusters, 
                                       criterion='maxclust') - 1
                
                self.level2_labels[level1_id] = level2_labels
                
                # Print Level 2 results
                unique_labels, counts = np.unique(level2_labels, return_counts=True)
                print(f"   Level 1 cluster {level1_id}: {len(level1_indices)} samples ‚Üí {len(unique_labels)} sub-clusters")
                for sub_label, count in zip(unique_labels, counts):
                    print(f"     Sub-cluster {sub_label}: {count} samples")
            else:
                print(f"   Level 1 cluster {level1_id}: Could not create sub-linkage matrix")
                self.level2_labels[level1_id] = np.zeros(len(level1_indices), dtype=int)
    
    def _extract_level3_clusters(self):
        """Extract Level 3 clusters for each Level 2 cluster."""
        for level1_id in self.level2_labels.keys():
            if level1_id not in self.level2_labels:
                continue
                
            level2_labels = self.level2_labels[level1_id]
            unique_level2_labels = np.unique(level2_labels)
            
            for level2_id in unique_level2_labels:
                # Get samples belonging to this Level 2 cluster
                level1_mask = self.level1_labels == level1_id
                level1_indices = np.where(level1_mask)[0]
                level2_mask = level2_labels == level2_id
                level2_indices = level1_indices[level2_mask]
                
                if len(level2_indices) < self.min_samples_per_final_cluster:
                    print(f"   Level 2 cluster ({level1_id}, {level2_id}): {len(level2_indices)} samples (too few for sub-sub-clustering)")
                    self.level3_labels[(level1_id, level2_id)] = np.zeros(len(level2_indices), dtype=int)
                    continue
                
                # Create sub-sub-linkage matrix for this Level 2 cluster
                sub_sub_linkage = self._create_sub_linkage_matrix(level2_indices, self.X_data)
                
                if sub_sub_linkage is not None:
                    # Extract Level 3 clusters
                    level3_labels = fcluster(sub_sub_linkage, 
                                           t=self.n_level3_clusters, 
                                           criterion='maxclust') - 1
                    
                    self.level3_labels[(level1_id, level2_id)] = level3_labels
                    
                    # Print Level 3 results
                    unique_labels, counts = np.unique(level3_labels, return_counts=True)
                    print(f"   Level 2 cluster ({level1_id}, {level2_id}): {len(level2_indices)} samples ‚Üí {len(unique_labels)} sub-sub-clusters")
                    for subsub_label, count in zip(unique_labels, counts):
                        print(f"     Sub-sub-cluster {subsub_label}: {count} samples")
                else:
                    print(f"   Level 2 cluster ({level1_id}, {level2_id}): Could not create sub-sub-linkage matrix")
                    self.level3_labels[(level1_id, level2_id)] = np.zeros(len(level2_indices), dtype=int)
    
    def _create_sub_linkage_matrix(self, sample_indices, X_data):
        """
        Create a sub-linkage matrix for a subset of samples.
        This creates a new linkage matrix for the subset using the original data.
        """
        if len(sample_indices) < 2:
            return None
        
        # For very small subsets, we can't create meaningful sub-clusters
        if len(sample_indices) <= 3:
            return None
        
        try:
            # Extract the subset of data
            X_subset = X_data[sample_indices]
            
            # Calculate pairwise distances for the subset
            distances = pdist(X_subset, metric='euclidean')
            
            # Create linkage matrix for the subset
            sub_linkage = linkage(distances, method=self.linkage_method)
            
            return sub_linkage
            
        except Exception as e:
            print(f"   Error creating sub-linkage matrix: {e}")
            return None
    
    def _build_hierarchy(self):
        """Build the complete hierarchy structure."""
        self.sample_assignments = []
        
        for sample_idx in range(self.n_samples):
            # Get Level 1 assignment
            level1_id = self.level1_labels[sample_idx]
            
            # Get Level 2 assignment
            level1_mask = self.level1_labels == level1_id
            level1_indices = np.where(level1_mask)[0]
            sample_position_in_level1 = np.where(level1_indices == sample_idx)[0][0]
            level2_id = self.level2_labels[level1_id][sample_position_in_level1]
            
            # Get Level 3 assignment
            level2_mask = self.level2_labels[level1_id] == level2_id
            level2_indices = np.where(level1_mask)[0][level2_mask]
            sample_position_in_level2 = np.where(level2_indices == sample_idx)[0][0]
            level3_id = self.level3_labels[(level1_id, level2_id)][sample_position_in_level2]
            
            assignment = {
                'sample_idx': sample_idx,
                'sample_name': self.sample_names[sample_idx],
                'level1_id': level1_id,
                'level2_id': level2_id,
                'level3_id': level3_id,
                'full_path': f"{level1_id}_{level2_id}_{level3_id}"
            }
            
            self.sample_assignments.append(assignment)
        
        # Build cluster hierarchy
        self._build_cluster_hierarchy()
    
    def _build_cluster_hierarchy(self):
        """Build the cluster hierarchy structure."""
        self.cluster_hierarchy = {}
        
        # Level 1 clusters
        for level1_id in range(len(np.unique(self.level1_labels))):
            level1_samples = [a for a in self.sample_assignments if a['level1_id'] == level1_id]
            
            self.cluster_hierarchy[level1_id] = {
                'level': 1,
                'cluster_id': level1_id,
                'sample_count': len(level1_samples),
                'samples': level1_samples,
                'children': {}
            }
            
            # Level 2 clusters within this Level 1 cluster
            unique_level2 = set(a['level2_id'] for a in level1_samples)
            for level2_id in unique_level2:
                level2_samples = [a for a in level1_samples if a['level2_id'] == level2_id]
                
                self.cluster_hierarchy[level1_id]['children'][level2_id] = {
                    'level': 2,
                    'cluster_id': level2_id,
                    'sample_count': len(level2_samples),
                    'samples': level2_samples,
                    'children': {}
                }
                
                # Level 3 clusters within this Level 2 cluster
                unique_level3 = set(a['level3_id'] for a in level2_samples)
                for level3_id in unique_level3:
                    level3_samples = [a for a in level2_samples if a['level3_id'] == level3_id]
                    
                    self.cluster_hierarchy[level1_id]['children'][level2_id]['children'][level3_id] = {
                        'level': 3,
                        'cluster_id': level3_id,
                        'sample_count': len(level3_samples),
                        'samples': level3_samples,
                        'children': {}
                    }
    
    def _print_hierarchy_summary(self, show_samples=5, original_data=None, feature_names=None, floatfmt=".2f"):
        """
        Print a summary of the clustering hierarchy.
        """
        print("\nüìä HIERARCHY SUMMARY")
        print("=" * 50)
        
        total_final_clusters = 0

        # Use the class's original_data if original_data not provided
        if original_data is None:
            if self.original_data is None:
                print("No original data provided for sample display")
                return
            original_data = self.original_data

        # Setup: get indices (column numbers) for 'brand', 'price', 'category'
        if hasattr(original_data, "iloc"):
            original_data_arr = original_data.values
            column_names = original_data.columns.tolist() if feature_names is None else feature_names
        else:
            original_data_arr = np.array(original_data)
            column_names = [f"feat_{i}" for i in range(original_data.shape[1])] if feature_names is None else feature_names

        # Find the indices of 'brand', 'price', and 'category'
        def get_feature_idx(name):
            try:
                return column_names.index(name)
            except ValueError:
                print(f"Warning: Required feature '{name}' not found in column names: {column_names}")
                return None
        
        brand_idx = get_feature_idx('brand')
        price_idx = get_feature_idx('price')
        category_idx = get_feature_idx('category')
        
        # Only show detailed info if we have the required features
        show_detailed = all(idx is not None for idx in [brand_idx, price_idx, category_idx])
        
        if not show_detailed:
            print("Note: Cannot show detailed sample information - missing required features")

        for level1_id, level1_data in self.cluster_hierarchy.items():
            print(f"Level 1 Cluster {level1_id}: {level1_data['sample_count']} samples")
            
            for level2_id, level2_data in level1_data['children'].items():
                print(f"  Level 2 Cluster {level2_id}: {level2_data['sample_count']} samples")
                
                for level3_id, level3_data in level2_data['children'].items():
                    sample_indices = [a['sample_idx'] for a in level3_data['samples']]
                    sample_count = level3_data['sample_count']
                    total_final_clusters += 1
                    print(f"    Level 3 Cluster {level3_id}: {sample_count} samples")
                    
                    if show_detailed and len(sample_indices) > 0:
                        shown_indices = sample_indices[:show_samples]
                        shown_data = original_data_arr[shown_indices]
                        print(f"      Showing up to {show_samples} samples (brand | price | category):")
                        for row in shown_data:
                            to_show = []
                            for idx in [brand_idx, price_idx, category_idx]:
                                v = row[idx]
                                # Format price/number if appropriate
                                if idx == price_idx:
                                    try:
                                        to_show.append(f"{float(v):{floatfmt}}")
                                    except Exception:
                                        to_show.append(str(v))
                                else:
                                    to_show.append(str(v))
                            print(f"        {to_show[0]} | {to_show[1]} | {to_show[2]}")
        
        print(f"\nüéØ Total final clusters: {total_final_clusters}")
        print(f"üìà Average samples per final cluster: {self.n_samples / total_final_clusters:.1f}")
    
    def get_cluster_path(self, sample_idx):
        """Get the complete cluster path for a sample."""
        if sample_idx >= len(self.sample_assignments):
            return None
        return self.sample_assignments[sample_idx]
    
    def predict(self, X):
        """
        Predict cluster assignments for new samples.

        Strategy:
          - Assign each new sample the same level1/level2/level3 path as its nearest
            neighbor in the training set (Euclidean distance on the features used
            during fit). This is a simple and robust approach when using a
            hierarchical model fitted on the original data.

        Returns:
          - list of dicts: each dict has keys:
              'nearest_train_idx', 'level1_id', 'level2_id', 'level3_id', 'full_path'
        """
        if self.sample_assignments is None or self.X_data is None:
            raise RuntimeError("Model has not been fit yet. Call fit() before predict().")
        
        X_arr = np.asarray(X)
        if X_arr.ndim == 1:
            X_arr = X_arr.reshape(1, -1)
        
        # Ensure dimensionality matches training data
        if X_arr.shape[1] != self.X_data.shape[1]:
            raise ValueError(f"Input features dimension ({X_arr.shape[1]}) does not match training data ({self.X_data.shape[1]}).")
        
        # Compute squared Euclidean distances between each query and training samples
        # shape: (n_queries, n_train)
        diffs = X_arr[:, None, :] - self.X_data[None, :, :]
        dists = np.sum(diffs * diffs, axis=2)
        nearest_idx = np.argmin(dists, axis=1)
        
        results = []
        for q_idx, train_idx in enumerate(nearest_idx):
            train_assignment = self.sample_assignments[int(train_idx)]
            res = {
                'nearest_train_idx': int(train_idx),
                'level1_id': int(train_assignment['level1_id']),
                'level2_id': int(train_assignment['level2_id']),
                'level3_id': int(train_assignment['level3_id']),
                'full_path': train_assignment['full_path']
            }
            results.append(res)
        
        return results
    
    def get_samples_in_cluster(self, level1_id, level2_id=None, level3_id=None):
        """Get all samples in a specific cluster at any level."""
        if level2_id is None:
            # Level 1 cluster
            return [a for a in self.sample_assignments if a['level1_id'] == level1_id]
        elif level3_id is None:
            # Level 2 cluster
            return [a for a in self.sample_assignments 
                   if a['level1_id'] == level1_id and a['level2_id'] == level2_id]
        else:
            # Level 3 cluster
            return [a for a in self.sample_assignments 
                   if a['level1_id'] == level1_id and a['level2_id'] == level2_id and a['level3_id'] == level3_id]
    
    def plot_dendrogram(self, max_display_levels=10, figsize=(12, 8)):
        """
        Plot the dendrogram of the hierarchical clustering.
        
        Args:
            max_display_levels: Maximum number of levels to display
            figsize: Figure size tuple
        """
        if self.linkage_matrix is None:
            print("No linkage matrix available. Run fit() first.")
            return
        
        plt.figure(figsize=figsize)
        dendrogram(self.linkage_matrix, 
                  truncate_mode='level', 
                  p=max_display_levels,
                  show_leaf_counts=True,
                  leaf_rotation=90.,
                  leaf_font_size=12.,
                  show_contracted=True)
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel('Sample Index or (cluster size)')
        plt.ylabel('Distance')
        plt.tight_layout()
        plt.show()


In [73]:
# Test the Fixed Implementation
print("Testing Fixed SingleModelHierarchicalClustering")
print("=" * 60)

# Create sample data with clear clusters

# Create sample data with 3 distinct clusters



# Create an instance with smaller cluster numbers for testing
fixed_clustering_link = FixedSingleModelHierarchicalClustering(
    n_level1_clusters=3,      # Top-level clusters
    n_level2_clusters=4,      # Sub-clusters within each Level 1 cluster
    n_level3_clusters=5,      # Sub-sub-clusters within each Level 2 cluster
    linkage_method='ward',     # Linkage method for hierarchical clustering
    min_samples_per_final_cluster=3,
    original_data = mobile
)

params = {
    "level1" : 3,
    "level2" : 4,
    "level3": 5,
    "method" : "ward"
}
# Fit on your transformed data
sample_names = [f"Mobile_{i}" for i in range(len(transformed))] #TODO is it necessary ?

tfd = transformed.copy()
fixed_clustering_link.fit(tfd, sample_names)



# Calculate all metrics
metrics_results = calculate_all_clustering_metrics(fixed_clustering_link, transformed)

# Or calculate individual metrics
silhouette_scores = calculate_silhouette_score_nested(fixed_clustering_link, transformed)
davies_bouldin_scores = calculate_davies_bouldin_score_nested(fixed_clustering_link, transformed)
calinski_harabasz_scores = calculate_calinski_harabasz_score_nested(fixed_clustering_link, transformed)

metrics = {
    "Silhouette Score": silhouette_scores['average'],
    "Davies-Bouldin Index": davies_bouldin_scores['average'],
    "Calinski-Harabasz Index": calinski_harabasz_scores['average']
}

with mlflow.start_run(run_name=run_name_link):
    mlflow.log_params(params)

    mlflow.log_metrics(metrics)

    mlflow.sklearn.log_model(sk_model=fixed_clustering_link,input_example=transformed[::50],name=artifact_path)

Testing Fixed SingleModelHierarchicalClustering
üå≥ Starting Fixed Single Model Hierarchical Clustering
üìä Data shape: (508, 96)
üéØ Target clusters: Level 1=3, Level 2=4, Level 3=5
üîó Linkage method: ward
üìç Step 1: Creating linkage matrix...
   Linkage matrix shape: (507, 4)
üìç Step 2: Extracting Level 1 clusters...
   Level 1 clusters: 3
     Cluster 0: 101 samples
     Cluster 1: 166 samples
     Cluster 2: 241 samples
üìç Step 3: Extracting Level 2 clusters...
   Level 1 cluster 0: 101 samples ‚Üí 4 sub-clusters
     Sub-cluster 0: 12 samples
     Sub-cluster 1: 33 samples
     Sub-cluster 2: 2 samples
     Sub-cluster 3: 54 samples
   Level 1 cluster 1: 166 samples ‚Üí 4 sub-clusters
     Sub-cluster 0: 61 samples
     Sub-cluster 1: 93 samples
     Sub-cluster 2: 9 samples
     Sub-cluster 3: 3 samples
   Level 1 cluster 2: 241 samples ‚Üí 4 sub-clusters
     Sub-cluster 0: 152 samples
     Sub-cluster 1: 17 samples
     Sub-cluster 2: 41 samples
     Sub-cluster 3: 3



üèÉ View run link_first at: http://127.0.0.1:8080/#/experiments/918627768166686221/runs/dafdf2eed4424619b36e0ef8ab75f797
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/918627768166686221


In [57]:

fixed_clustering_link.predict(transformed[131])

[{'nearest_train_idx': 131,
  'level1_id': 0,
  'level2_id': 1,
  'level3_id': 1,
  'full_path': '0_1_1'}]

In [74]:

model_name = "Linkage"
model_version = "1"

# Load the model from the Model Registry
model_uri = f"models:/{model_name}/{model_version}"

model = mlflow.sklearn.load_model(model_uri)
# Generate a new dataset for prediction and predict
y_pred_new = model.predict(transformed[131])

print(y_pred_new)

Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 18.10it/s]

[{'nearest_train_idx': 131, 'level1_id': 0, 'level2_id': 1, 'level3_id': 1, 'full_path': '0_1_1'}]



