## Step 05: Hyperparameter Tuning

### Import necessary libraries

In [17]:
import pandas as pd
import numpy as np
import os

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.manifold import TSNE

from sklearn.metrics import silhouette_score, davies_bouldin_score,calinski_harabasz_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid

import warnings
warnings.filterwarnings("ignore")

### 5.1 Load datasets from file paths

In [18]:
data_path = '..\Dataset\data_cleaned.csv'
genre_data_path = '..\Dataset\genre_data_cleaned.csv'

# Check if files exist and load them
if os.path.exists(data_path) and os.path.exists(genre_data_path):
    data = pd.read_csv(data_path)
    genre_data = pd.read_csv(genre_data_path)
    print("Info: Data and genre data successfully loaded.")
else:
    print("Attention: One or both files are not found in the specified directory.")

Info: Data and genre data successfully loaded.


In [19]:
X = genre_data.select_dtypes(include=np.number) 
Y = data.select_dtypes(include=np.number)

### 5.2  KMeans Hyperparameter tuning 
- Exhaustive grid search is perfomed over KMeans hyperparameters using a training-validation split of 80%-20%. 

#### 5.2.1 Genre_data_Cleaned

In [None]:
# Step 1: Convert dataset to NumPy array and split into training/validation sets
print("Step 1: Splitting the dataset \n")
X_np = X.to_numpy()

# Split the dataset
X_train, X_test = train_test_split(X_np, test_size=0.2, random_state=42)
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}\n")

# Step 2: Define parameter grid for KMeans
print("Step 2: Defining the parameter grid for KMeans...\n")
param_grid_kmeans = {
    'n_clusters': [13, 20],
    'init': ['k-means++', 'random'],
    'n_init': [10, 20],
    'max_iter': [300, 500],
    'tol': [1e-3, 1e-4],
}

parameter_combinations = list(ParameterGrid(param_grid_kmeans))
total_iterations = len(parameter_combinations)
print(f"Total parameter combinations to evaluate: {total_iterations}\n")

# Step 3: Initialize tracking for the best and worst parameters
print("Step 3: Setting up tracking for the best and worst parameters...\n")
best_score = -1
worst_score = np.inf
best_params = None
worst_params = None

# Step 4: Set up K-Fold cross-validation
print("Step 4: Setting up 3-Fold Cross-Validation...\n")
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Step 5: Start the grid search
print("Step 5: Starting the grid search...\n")
current_iteration = 0

for params in parameter_combinations:
    current_iteration += 1
    print(f"🌀 Iteration {current_iteration}/{total_iterations}: Testing parameters {params}...\n")
    silhouette_scores = []
    davies_bouldin_scores = []
    calinski_harabasz_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
        print(f"  ➡️ Fold {fold}: Splitting data...")
        X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]

        try:
            # Initialize and fit KMeans
            print(f"    🔄 Running KMeans with parameters: {params}...")
            kmeans = KMeans(
                n_clusters=params['n_clusters'],
                init=params['init'],
                n_init=params['n_init'],
                max_iter=params['max_iter'],
                tol=params['tol'],
                random_state=42
            )
            kmeans.fit(X_train_cv)

            # Predict on the validation fold
            labels_val = kmeans.predict(X_val_cv)

            # Compute metrics
            silhouette_val = silhouette_score(X_val_cv, labels_val)
            dbi_val = davies_bouldin_score(X_val_cv, labels_val)
            ch_score_val = calinski_harabasz_score(X_val_cv, labels_val)

            # Append scores
            silhouette_scores.append(silhouette_val)
            davies_bouldin_scores.append(dbi_val)
            calinski_harabasz_scores.append(ch_score_val)

            print(f"    ✅ Silhouette Score for Fold {fold}: {silhouette_val:.4f}")
            print(f"    ✅ Davies-Bouldin Index for Fold {fold}: {dbi_val:.4f}")
            print(f"    ✅ Calinski-Harabasz Score for Fold {fold}: {ch_score_val:.4f}\n")

        except Exception as e:
            print(f"    ❌ Error during Fold {fold}: {e}")
            silhouette_scores.append(np.nan)
            davies_bouldin_scores.append(np.nan)
            calinski_harabasz_scores.append(np.nan)

    # Compute average scores for this parameter set
    mean_silhouette = np.nanmean(silhouette_scores)
    mean_dbi = np.nanmean(davies_bouldin_scores)
    mean_ch_score = np.nanmean(calinski_harabasz_scores)

    print(f"  📊 Average Silhouette Score: {mean_silhouette:.4f}")
    print(f"  📊 Average Davies-Bouldin Index: {mean_dbi:.4f} (lower is better)")
    print(f"  📊 Average Calinski-Harabasz Score: {mean_ch_score:.4f} (higher is better)\n")

    # Update the best parameters if a higher Silhouette Score is found
    if mean_silhouette > best_score:
        print("  🎉 New best parameters found!")
        best_score = mean_silhouette
        best_params = params

    # Update the worst parameters if a lower Silhouette Score is found
    if mean_silhouette < worst_score:
        worst_score = mean_silhouette
        worst_params = params

    print("-" * 50)  

# Step 6: Print the summary of results
print("\n🎉 Grid Search Completed!")
print(f"🔍 Best Parameters for KMeans: {best_params}")
print(f"📈 Best Silhouette Score (Validation): {best_score:.4f}")
print(f"📊 Best Davies-Bouldin Index (Validation): {mean_dbi:.4f} (lower is better)")
print(f"📊 Best Calinski-Harabasz Score (Validation): {mean_ch_score:.4f} (higher is better)\n")
print(f"🔍 Worst Parameters for KMeans: {worst_params}")
print(f"📉 Worst Silhouette Score (Validation): {worst_score:.4f}\n")

# Step 7:Testing
print("\n Step 7: Evaluating on test set...")
final_kmeans = KMeans(
    n_clusters=best_params['n_clusters'],
    init=best_params['init'],
    n_init=best_params['n_init'],
    max_iter=best_params['max_iter'],
    tol=best_params['tol'],
    random_state=42
)
final_kmeans.fit(X_train)

# Predict on test data
labels_test = final_kmeans.predict(X_test)

# Evaluate metrics on test data
silhouette_test = silhouette_score(X_test, labels_test)
dbi_test = davies_bouldin_score(X_test, labels_test)
ch_score_test = calinski_harabasz_score(X_test, labels_test)

print(f"✅ Silhouette Score (Testing): {silhouette_test:.4f}")
print(f"✅ Davies-Bouldin Index  (Testing):  {dbi_test:.4f}")
print(f"✅ Calinski-Harabasz Score  (Testing):  {ch_score_test:.4f}\n")  

Step 1: Splitting the dataset 

Training set shape: (2378, 13)
Test set shape: (595, 13)

Step 2: Defining the parameter grid for KMeans...

Total parameter combinations to evaluate: 32

Step 3: Setting up tracking for the best and worst parameters...

Step 4: Setting up 3-Fold Cross-Validation...

Step 5: Starting the grid search...

🌀 Iteration 1/32: Testing parameters {'init': 'k-means++', 'max_iter': 300, 'n_clusters': 13, 'n_init': 10, 'tol': 0.001}...

  ➡️ Fold 1: Splitting data...
    🔄 Running KMeans with parameters: {'init': 'k-means++', 'max_iter': 300, 'n_clusters': 13, 'n_init': 10, 'tol': 0.001}...
    ✅ Silhouette Score for Fold 1: 0.2593
    ✅ Davies-Bouldin Index for Fold 1: 0.9706
    ✅ Calinski-Harabasz Score for Fold 1: 363.5352

  ➡️ Fold 2: Splitting data...
    🔄 Running KMeans with parameters: {'init': 'k-means++', 'max_iter': 300, 'n_clusters': 13, 'n_init': 10, 'tol': 0.001}...
    ✅ Silhouette Score for Fold 2: 0.2538
    ✅ Davies-Bouldin Index for Fold 2: 0.

#### Observations

1. **Best Parameters Achieved**:
   - **Initialization (`init`)**: `'k-means++'` performed better than `'random'` by producing well-spread initial cluster centers.
   - **Number of Clusters (`n_clusters`)**: Using 13 clusters provided the best clustering quality.
   - **Number of Initializations (`n_init`)**: 20 initializations improved the solution's quality and consistency.
   - **Maximum Iterations (`max_iter`)**: 300 iterations were sufficient for convergence.
   - **Tolerance (`tol`)**: A value of 0.001 ensured stable convergence.
   - **Best Silhouette Score**: 0.2714, indicating moderately well-separated clusters.

2. **Worst Parameters Observed**:
   - **Initialization (`init`)**: `'random'` produced poor initial cluster centers, leading to suboptimal results.
   - **Number of Clusters (`n_clusters`)**: Using 20 clusters resulted in over-segmentation and lower silhouette scores.
   - **Worst Silhouette Score**: 0.2365, reflecting poor clustering structure.

3. **Key Findings**:
   - **Impact of Initialization**: `'k-means++'` consistently outperformed `'random'` initialization by creating better-defined clusters.
   - **Optimal Number of Clusters**: Selecting too many clusters (e.g., 20) caused clusters to overlap or become poorly defined.
   - **Importance of Multiple Initializations (`n_init`)**: Increasing `n_init` to 20 significantly improved clustering results by avoiding poor local minima.
   - **Effect of Iterations and Tolerance**: Both `max_iter=300` and `tol=0.001` were sufficient to reach stable and reliable results.

4. **Conclusion**:
   - The best results were achieved with `'k-means++'` initialization, 13 clusters, and 20 initializations. Poor initialization and excessive clusters negatively impacted performance.


#### 5.2.2 Data_Cleaned

In [None]:
# Step 1: Convert dataset to NumPy array and split into training/validation sets
print("Step 1: Splitting the dataset \n")
Y_np = Y.to_numpy()

Y_train, Y_test = train_test_split(Y_np, test_size=0.2, random_state=42)

print(f"Training set shape: {Y_train.shape}")
print(f"Testing set shape: {Y_test.shape}\n")

# Step 2: Define parameter grid for KMeans
print("Step 2: Defining the parameter grid for KMeans...\n")
param_grid_kmeans = {
    'n_clusters': [11, 14],
    'init': ['k-means++', 'random'],
    'n_init': [10, 20],
    'max_iter': [300, 500],
    'tol': [1e-3, 1e-4],
}

parameter_combinations = list(ParameterGrid(param_grid_kmeans))
total_iterations = len(parameter_combinations)
print(f"Total parameter combinations to evaluate: {total_iterations}\n")

# Step 3: Initialize tracking for the best and worst parameters
print("Step 3: Setting up tracking for the best and worst parameters...\n")
best_score = -1
worst_score = np.inf
best_params = None
worst_params = None

# Step 4: Set up K-Fold cross-validation
print("Step 4: Setting up 3-Fold Cross-Validation...\n")
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Step 5: Start the grid search
print("Step 5: Starting the grid search...\n")
current_iteration = 0

for params in parameter_combinations:
    current_iteration += 1
    print(f"🌀 Iteration {current_iteration}/{total_iterations}: Testing parameters {params}...\n")
    silhouette_scores = []
    davies_bouldin_scores = []
    calinski_harabasz_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(Y_train), start=1):
        print(f"  ➡️ Fold {fold}: Splitting data...")
        Y_train_cv, Y_val_cv = Y_train[train_index], Y_train[val_index]

        try:
            # Initialize and fit KMeans
            print(f"    🔄 Running KMeans with parameters: {params}...")
            kmeans = KMeans(
                n_clusters=params['n_clusters'],
                init=params['init'],
                n_init=params['n_init'],
                max_iter=params['max_iter'],
                tol=params['tol'],
                random_state=42
            )
            kmeans.fit(Y_train_cv)

            # Predict on the validation fold
            labels_val = kmeans.predict(Y_val_cv)

            # Compute metrics
            silhouette_val = silhouette_score(Y_val_cv, labels_val)
            dbi_val = davies_bouldin_score(Y_val_cv, labels_val)
            ch_score_val = calinski_harabasz_score(Y_val_cv, labels_val)

            # Append scores
            silhouette_scores.append(silhouette_val)
            davies_bouldin_scores.append(dbi_val)
            calinski_harabasz_scores.append(ch_score_val)

            print(f"    ✅ Silhouette Score for Fold {fold}: {silhouette_val:.4f}")
            print(f"    ✅ Davies-Bouldin Index for Fold {fold}: {dbi_val:.4f}")
            print(f"    ✅ Calinski-Harabasz Score for Fold {fold}: {ch_score_val:.4f}\n")

        except Exception as e:
            print(f"    ❌ Error during Fold {fold}: {e}")
            silhouette_scores.append(np.nan)
            davies_bouldin_scores.append(np.nan)
            calinski_harabasz_scores.append(np.nan)

    # Compute average scores for this parameter set
    mean_silhouette = np.nanmean(silhouette_scores)
    mean_dbi = np.nanmean(davies_bouldin_scores)
    mean_ch_score = np.nanmean(calinski_harabasz_scores)

    print(f"  📊 Average Silhouette Score: {mean_silhouette:.4f}")
    print(f"  📊 Average Davies-Bouldin Index: {mean_dbi:.4f} (lower is better)")
    print(f"  📊 Average Calinski-Harabasz Score: {mean_ch_score:.4f} (higher is better)\n")

    # Update the best parameters if a higher Silhouette Score is found
    if mean_silhouette > best_score:
        print("  🎉 New best parameters found!")
        best_score = mean_silhouette
        best_params = params

    # Update the worst parameters if a lower Silhouette Score is found
    if mean_silhouette < worst_score:
        worst_score = mean_silhouette
        worst_params = params

    print("-" * 50) 

# Step 6: Print the summary of results
print("\n🎉 Grid Search Completed!")
print(f"🔍 Best Parameters for KMeans: {best_params}")
print(f"📈 Best Silhouette Score (Validation): {best_score:.4f}")
print(f"📊 Best Davies-Bouldin Index (Validation): {mean_dbi:.4f} (lower is better)")
print(f"📊 Best Calinski-Harabasz Score (Validation): {mean_ch_score:.4f} (higher is better)\n")
print(f"🔍 Worst Parameters for KMeans: {worst_params}")
print(f"📉 Worst Silhouette Score (Validation): {worst_score:.4f}\n")

# Step 7:Testing
print("\n Step 7: Evaluating on test set...")
final_kmeans = KMeans(
    n_clusters=best_params['n_clusters'],
    init=best_params['init'],
    n_init=best_params['n_init'],
    max_iter=best_params['max_iter'],
    tol=best_params['tol'],
    random_state=42
)
final_kmeans.fit(Y_train)

# Predict on test data
labels_test = final_kmeans.predict(Y_test)

# Evaluate metrics on test data
silhouette_test = silhouette_score(Y_test, labels_test)
dbi_test = davies_bouldin_score(Y_test, labels_test)
ch_score_test = calinski_harabasz_score(Y_test, labels_test)

print(f"✅ Silhouette Score (Testing): {silhouette_test:.4f}")
print(f"✅ Davies-Bouldin Index  (Testing):  {dbi_test:.4f}")
print(f"✅ Calinski-Harabasz Score  (Testing):  {ch_score_test:.4f}\n")  


Step 1: Splitting the dataset 

Training set shape: (136522, 17)
Test set shape: (34131, 17)

Step 2: Defining the parameter grid for KMeans...

Total parameter combinations to evaluate: 32

Step 3: Setting up tracking for the best and worst parameters...

Step 4: Setting up 3-Fold Cross-Validation...

Step 5: Starting the grid search...

🌀 Iteration 1/32: Testing parameters {'init': 'k-means++', 'max_iter': 300, 'n_clusters': 11, 'n_init': 10, 'tol': 0.001}...

  ➡️ Fold 1: Splitting data...
    🔄 Running KMeans with parameters: {'init': 'k-means++', 'max_iter': 300, 'n_clusters': 11, 'n_init': 10, 'tol': 0.001}...
    ✅ Silhouette Score for Fold 1: 0.2854
    ✅ Davies-Bouldin Index for Fold 1: 0.9934
    ✅ Calinski-Harabasz Score for Fold 1: 27813.1467

  ➡️ Fold 2: Splitting data...
    🔄 Running KMeans with parameters: {'init': 'k-means++', 'max_iter': 300, 'n_clusters': 11, 'n_init': 10, 'tol': 0.001}...
    ✅ Silhouette Score for Fold 2: 0.2907
    ✅ Davies-Bouldin Index for Fold

### Observation

#### Best Parameters:
- **Initialization (`init`)**: `'k-means++'` produced superior cluster centers compared to `'random'`.
- **Clusters (`n_clusters`)**: 11 clusters achieved the best balance and clustering quality.
- **Initializations (`n_init`)**: 10 ensured consistent and robust solutions.
- **Iterations (`max_iter`)**: 300 were sufficient for convergence.
- **Tolerance (`tol`)**: 0.0001 provided stable convergence.
- **Best Silhouette Score**: 0.2894 (moderately well-separated clusters).

#### Key Findings:
- `'k-means++'` initialization consistently outperformed `'random'`.
- 11 clusters balanced separation and coherence; 14 clusters led to poor structure.
- Increasing `n_init` to 10 improved robustness and avoided poor local minima.
- 300 iterations and a tolerance of 0.0001 were sufficient for stable convergence.

#### Conclusion:
- The best configuration used `'k-means++'` with 11 clusters, 10 initializations, 300 iterations, and a tolerance of 0.0001.
- These settings achieved well-defined, meaningful clusters, while poor initialization and excessive clusters degraded performance.


### 5.3 t-SNE Pipeline Hyperparameter tuning 

- TSNE: t-Distributed Stochastic Neighbor Embedding (t-SNE)
- Pupose : This is a technique for reducing the dimensionality of data to two dimensions (n_components=2) for the purpose of visualization.
- Metric : KL Diveregnce
- Grid search for the optimal t-SNE hyperparameters is performed. 


**t-SNE is used for `genre_data_cleaned` for below reasons:**
- t-SNE is great for visualizing data by reducing it to 2 or 3 dimensions.
- It's designed to capture complex, non-linear relationships, making it ideal for exploring clusters, like genres.
- However, t-SNE is computationally heavy, so it's best suited for small to medium datasets.

**Conclusion:**
- Use t-SNE for the `genre_data_cleaned` dataset to visualize clusters and patterns.

In [None]:
# Convert dataset to NumPy array (X is a DataFrame)
X_np = X.to_numpy()

# Step 1: Split dataset into training and validation sets
print("Step 1: Splitting the dataset \n")
X_train, X_test = train_test_split(X_np, test_size=0.2, random_state=42)
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}\n")

# Step 2: Define parameter grid for t-SNE
print("Step 2: Defining the parameter grid for t-SNE...\n")
param_grid_tsne = {
    'perplexity': [10, 30, 50],
    'learning_rate': [100, 200, 300],
    'max_iter': [500, 1000, 2000],
    'early_exaggeration': [6, 12, 24],
}
parameter_combinations = list(ParameterGrid(param_grid_tsne))
total_iterations = len(parameter_combinations)
print(f"Total parameter combinations to evaluate: {total_iterations}\n")

# Step 3: Initialize variables to track the best parameters
print("Step 3: Setting up tracking for the best parameters...\n")
best_kl_divergence = np.inf
best_tsne_params = None

# Step 4: Set up K-Fold cross-validation
print("Step 4: Setting up 3-Fold Cross-Validation...\n")
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Step 5: Start the grid search
print("Step 5: Starting the grid search...\n")
current_iteration = 0

for params in parameter_combinations:
    current_iteration += 1
    print(f"🌀 Iteration {current_iteration}/{total_iterations}: Testing parameters {params}...\n")
    kl_divergences = []

    for fold, (train_index, val_index) in enumerate(kf.split(X_train), start=1):
        print(f"  ➡️ Fold {fold}: Preparing training and validation data...")
        X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]

        try:
            print(f"    🔄 Running t-SNE with parameters: {params}...")
            # Initialize and fit t-SNE
            tsne = TSNE(
                n_components=2,
                perplexity=params['perplexity'],
                learning_rate=params['learning_rate'],
                max_iter=params['max_iter'],
                early_exaggeration=params['early_exaggeration'],
                random_state=42,
                verbose=0
            )
            tsne.fit(X_train_cv)

            # Retrieve KL divergence
            kl_divergence = tsne.kl_divergence_
            kl_divergences.append(kl_divergence)
            print(f"    ✅ KL Divergence for Fold {fold}: {kl_divergence:.4f}\n")

        except Exception as e:
            print(f"    ❌ Error during Fold {fold}: {e}\n")
            kl_divergences.append(np.nan)

    # Compute the mean KL divergence across folds
    mean_kl_divergence = np.nanmean(kl_divergences)
    print(f"  📊 Average KL Divergence for parameters: {mean_kl_divergence:.4f}\n")

    # Update best parameters if a lower KL divergence is found
    if mean_kl_divergence < best_kl_divergence:
        print("  🎉 New best parameters found!")
        best_kl_divergence = mean_kl_divergence
        best_tsne_params = params
    else:
        print("  ⚠️ Parameters did not improve the best score.\n")

    print("-" * 50) 

# Step 6: Results
print("\n🎉 Grid Search Completed!")
print(f"🔍 Best Parameters for t-SNE: {best_tsne_params}")
print(f"📉 Lowest KL Divergence: {best_kl_divergence:.4f}\n")

# Step 7:Testing
print("\n Step 7: Evaluating on test set...")
tsne_test = TSNE(
    n_components=2,
    perplexity=best_tsne_params['perplexity'],
    learning_rate=best_tsne_params['learning_rate'],
    max_iter=best_tsne_params['max_iter'],
    early_exaggeration=best_tsne_params['early_exaggeration'],
    random_state=42,
    verbose=0
)
tsne_test.fit(X_test)

# KL Divergence for the test set
test_kl_divergence = tsne_test.kl_divergence_
print(f"✅ KL Divergence on the test set: {test_kl_divergence:.4f}")






Step 1: Splitting the dataset 

Training set shape: (2378, 13)
Test set shape: (595, 13)

Step 2: Defining the parameter grid for t-SNE...

Total parameter combinations to evaluate: 81

Step 3: Setting up tracking for the best parameters...

Step 4: Setting up 3-Fold Cross-Validation...

Step 5: Starting the grid search...

🌀 Iteration 1/81: Testing parameters {'early_exaggeration': 6, 'learning_rate': 100, 'max_iter': 500, 'perplexity': 10}...

  ➡️ Fold 1: Preparing training and validation data...
    🔄 Running t-SNE with parameters: {'early_exaggeration': 6, 'learning_rate': 100, 'max_iter': 500, 'perplexity': 10}...
    ✅ KL Divergence for Fold 1: 0.8420

  ➡️ Fold 2: Preparing training and validation data...
    🔄 Running t-SNE with parameters: {'early_exaggeration': 6, 'learning_rate': 100, 'max_iter': 500, 'perplexity': 10}...
    ✅ KL Divergence for Fold 2: 0.8411

  ➡️ Fold 3: Preparing training and validation data...
    🔄 Running t-SNE with parameters: {'early_exaggeration':

#### Conclusions

**Best Parameters for t-SNE:**
- **KL Divergence (0.7602)** :The lowest value achieved, indicating an effective mapping of high-dimensional data into lower dimensions with minimal loss of structural information.
- **Perplexity (50)**:This value balances the local and global aspects of the data, suggesting that the chosen perplexity effectively captures the underlying data structure.
- **Learning Rate (300)**:This learning rate allows for stable convergence while avoiding oscillations or slow updates.
- **Early Exaggeration(6)**:Enhances separation during the initial iterations, improving visualization of clusters in the final embedding.
- **Max Iterations (2000)**:The model converges effectively, reflecting that the chosen number of iterations provides sufficient time for optimization.

#### Summary
Use this configuration for final t-SNE visualization to ensure optimal preservation of data structure and cluster separability in the low-dimensional space.

### 5.4 PCA Pipeline Hyperparameter tuning 

- PCA: Principal Component Analysis (PCA)
- Purpose: A technique for dimensionality reduction that transforms the data into a lower-dimensional space while retaining as much variance as possible.
- Metric: Reconstrction error (Mean squared error)
- Grid search for the optimal PCA is performed. 


**We use PCA for `data_cleaned` for below reasons:**
- PCA is efficient for reducing dimensions while keeping most of the original data's variance.
- It's better for preprocessing large datasets because it is faster and scales well.
- PCA assumes linear relationships, which works well for structured data intended for modeling.

**Conclusion:**
- Use PCA for the `data_cleaned` dataset to prepare it for further analysis or machine learning.


In [None]:
# Convert dataset to NumPy array (Y is a DataFrame)
Y_np = Y.to_numpy()

# Step 1: Split dataset into training and test sets
print("Step 1: Splitting the dataset \n")
Y_train, Y_test = train_test_split(Y_np, test_size=0.2, random_state=42)

print(f"Training set shape: {Y_train.shape}")
print(f"Test set shape: {Y_test.shape}\n")

# Step 2: Define parameter grid for PCA
print("Step 2: Defining the parameter grid for PCA...\n")
param_grid_pca = {
    'PCA__n_components': [2],                       # Range of components
    'PCA__whiten': [True, False],                   # Whitening options
    'PCA__svd_solver': ['auto', 'full', 'randomized'],  # Solver options
    'PCA__tol': [1e-4, 1e-3],                      # Tolerance for convergence
}

parameter_combinations = list(ParameterGrid(param_grid_pca))
total_combinations = len(parameter_combinations)
print(f"Total parameter combinations to evaluate: {total_combinations}\n")

# Step 3: Initialize variables for tracking the best parameters
print("Step 3: Setting up tracking for the best parameters...\n")
best_error = np.inf
best_params = None
current_iteration = 0

# Step 4: Set up K-Fold Cross-Validation
print("Step 4: Setting up 3-Fold Cross-Validation...\n")
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Step 5: Start the grid search
print("Step 5: Starting the grid search...\n")
for params in parameter_combinations:
    current_iteration += 1
    print(f"🌀 Evaluating combination {current_iteration}/{total_combinations}: {params}")

    fold_errors = []

    for fold, (train_index, val_index) in enumerate(kf.split(Y_train), start=1):
        print(f"  ➡️ Fold {fold}: Splitting data...")
        Y_train_cv, Y_val_cv = Y_train[train_index], Y_train[val_index]

        try:
            # Define the PCA pipeline with current parameters
            print(f"    🔄 Running PCA with parameters: {params}...")
            pca_pipeline = Pipeline([
                ('scaler', StandardScaler()),  # Standardize the features
                ('PCA', PCA(
                    n_components=params['PCA__n_components'],
                    whiten=params['PCA__whiten'],
                    svd_solver=params['PCA__svd_solver'],
                    tol=params.get('PCA__tol', None),
                    random_state=42
                ))
            ])

            # Fit the pipeline on training fold and transform validation fold
            pca_pipeline.fit(Y_train_cv)
            transformed_val = pca_pipeline.transform(Y_val_cv)

            # Inverse transform the validation data for reconstruction
            reconstructed_val = pca_pipeline.named_steps['PCA'].inverse_transform(transformed_val)

            # Calculate reconstruction error on validation data
            error = mean_squared_error(Y_val_cv, reconstructed_val)
            fold_errors.append(error)
            print(f"    ✅ Reconstruction Error for Fold {fold}: {error:.4f}\n")

        except Exception as e:
            print(f"    ❌ Error during Fold {fold}: {e}\n")
            fold_errors.append(np.nan)

    # Compute the mean reconstruction error across folds
    mean_error = np.nanmean(fold_errors)
    print(f"  📊 Average Reconstruction Error for parameters: {mean_error:.4f}\n")

    # Update the best parameters if the average reconstruction error improves
    if mean_error < best_error:
        print("  🎉 New best parameters found!")
        best_error = mean_error
        best_params = params

    print("-" * 50)  

# Step 6: Evaluate the best parameters on the test set
print("\nStep 6: Evaluating on test set...\n")
final_pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize the features
    ('PCA', PCA(
        n_components=best_params['PCA__n_components'],
        whiten=best_params['PCA__whiten'],
        svd_solver=best_params['PCA__svd_solver'],
        tol=best_params.get('PCA__tol', None),
        random_state=42
    ))
])
final_pca_pipeline.fit(Y_train)
transformed_test = final_pca_pipeline.transform(Y_test)
reconstructed_test = final_pca_pipeline.named_steps['PCA'].inverse_transform(transformed_test)
test_error = mean_squared_error(Y_test, reconstructed_test)

# Final Output
print("\n🎉 Grid Search Completed!")
print(f"🔍 Best Parameters for PCA: {best_params}")
print(f"📉 Lowest Average Reconstruction Error (Validation): {best_error:.4f}")
print(f"📈 Reconstruction Error on Test Set: {test_error:.4f}")


Step 1: Splitting the dataset into training,validation and test sets...

Training set shape: (136522, 17)
Test set shape: (34131, 17)

Step 2: Defining the parameter grid for PCA...

Total parameter combinations to evaluate: 12

Step 3: Setting up tracking for the best parameters...

Step 4: Setting up 3-Fold Cross-Validation...

Step 5: Starting the grid search...

🌀 Evaluating combination 1/12: {'PCA__n_components': 2, 'PCA__svd_solver': 'auto', 'PCA__tol': 0.0001, 'PCA__whiten': True}
  ➡️ Fold 1: Splitting data...
    🔄 Running PCA with parameters: {'PCA__n_components': 2, 'PCA__svd_solver': 'auto', 'PCA__tol': 0.0001, 'PCA__whiten': True}...
    ✅ Reconstruction Error for Fold 1: 459668.3938

  ➡️ Fold 2: Splitting data...
    🔄 Running PCA with parameters: {'PCA__n_components': 2, 'PCA__svd_solver': 'auto', 'PCA__tol': 0.0001, 'PCA__whiten': True}...
    ✅ Reconstruction Error for Fold 2: 459714.6201

  ➡️ Fold 3: Splitting data...
    🔄 Running PCA with parameters: {'PCA__n_comp

#### Conclusions

- **Reconstruction Error (459695.21)** PCA successfully reduced dimensionality without significant data loss.
- **Tolerance(0.0001)** Ensures PCA converges with high precision by preventing premature stopping.
- **PCA_svd_solver('auto')** The best solver is automatically choosen based on dataset size, balancing speed and accuracy.

#### Summary
The best parameters provide a good balance between efficiency and minimizing reconstruction error by ensuring optimal preservation of data structure and maintaining meaningful cluster separability in the reduced-dimensional space.
