<a href="https://colab.research.google.com/github/Yohnjparra/test_unsupervisedlearning/blob/main/unsupervised_learning_nrp_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Unsupervised Learning: Performance Comparison Notebook

**Course:** Machine Learning | **Instructor:** Dr. Yohn  
**Institution:** Florida A&M University - Computer and Information Sciences Department

## Objective
This notebook demonstrates unsupervised learning algorithms while comparing computational performance between **JupyterHub NRP** and **Google Colab** environments. Students will:

1. Learn key unsupervised learning algorithms (K-Means, DBSCAN, Hierarchical Clustering)
2. Understand computational resource considerations
3. Compare performance metrics across different environments
4. Evaluate clustering quality using multiple metrics

## Instructions
1. Run this notebook completely on **JupyterHub NRP**
2. Save the generated CSV and images
3. Run the same notebook on **Google Colab**
4. Compare the results to understand infrastructure differences

In [1]:
# Cell 1: Import Libraries and Setup
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_classification
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import psutil
import platform

# Set style for visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

print("=" * 80)
print("UNSUPERVISED LEARNING: Performance Comparison Notebook")
print("Course: Machine Learning | Instructor: Dr. Yohn")
print("Florida A&M University - CIS Department")
print("=" * 80)

UNSUPERVISED LEARNING: Performance Comparison Notebook
Course: Machine Learning | Instructor: Dr. Yohn
Florida A&M University - CIS Department


In [2]:
# Cell 2: System Information and Resource Detection
def get_system_info():
    """
    Display system information to compare computational resources
    between NRP JupyterHub and Google Colab
    """
    info = {
        'Platform': platform.system(),
        'Platform Release': platform.release(),
        'Platform Version': platform.version(),
        'Architecture': platform.machine(),
        'Processor': platform.processor(),
        'CPU Cores (Physical)': psutil.cpu_count(logical=False),
        'CPU Cores (Logical)': psutil.cpu_count(logical=True),
        'Total RAM (GB)': round(psutil.virtual_memory().total / (1024**3), 2),
        'Available RAM (GB)': round(psutil.virtual_memory().available / (1024**3), 2),
        'Python Version': platform.python_version()
    }

    # Check if running on Google Colab
    try:
        import google.colab
        info['Environment'] = 'Google Colab'
    except:
        info['Environment'] = 'JupyterHub NRP (or Local)'

    # Display as DataFrame
    df_info = pd.DataFrame(list(info.items()), columns=['Property', 'Value'])
    print("\nüìä SYSTEM INFORMATION")
    print("=" * 60)
    print(df_info.to_string(index=False))
    print("=" * 60)

    return info

system_info = get_system_info()


üìä SYSTEM INFORMATION
            Property                               Value
            Platform                               Linux
    Platform Release                            6.6.105+
    Platform Version #1 SMP Thu Oct  2 10:42:05 UTC 2025
        Architecture                              x86_64
           Processor                              x86_64
CPU Cores (Physical)                                   1
 CPU Cores (Logical)                                   2
      Total RAM (GB)                               12.67
  Available RAM (GB)                               11.15
      Python Version                             3.12.12
         Environment                        Google Colab


In [3]:
# Cell 3: Generate Synthetic Datasets of Various Sizes
def create_datasets():
    """
    Create datasets of different sizes to benchmark performance
    """
    datasets = {}

    print("\nüî¨ GENERATING DATASETS")
    print("=" * 60)

    # Small dataset - for quick testing
    X_small, y_small = make_blobs(n_samples=1000, n_features=10,
                                   centers=5, random_state=42)
    datasets['small'] = (X_small, y_small, "Small (1K samples, 10 features)")
    print(f"‚úì Small dataset: {X_small.shape}")

    # Medium dataset - typical classroom size
    X_medium, y_medium = make_blobs(n_samples=50000, n_features=20,
                                     centers=8, random_state=42)
    datasets['medium'] = (X_medium, y_medium, "Medium (50K samples, 20 features)")
    print(f"‚úì Medium dataset: {X_medium.shape}")

    # Large dataset - to stress test resources
    X_large, y_large = make_blobs(n_samples=200000, n_features=30,
                                   centers=10, random_state=42)
    datasets['large'] = (X_large, y_large, "Large (200K samples, 30 features)")
    print(f"‚úì Large dataset: {X_large.shape}")

    print("=" * 60)

    return datasets

datasets = create_datasets()


üî¨ GENERATING DATASETS
‚úì Small dataset: (1000, 10)
‚úì Medium dataset: (50000, 20)
‚úì Large dataset: (200000, 30)


In [4]:
# Cell 4: Performance Benchmarking Class
class PerformanceBenchmark:
    """
    Class to track and compare algorithm performance
    """
    def __init__(self):
        self.results = []

    def benchmark_algorithm(self, algorithm, X, dataset_name, algorithm_name):
        """
        Benchmark a clustering algorithm
        """
        # Record start metrics
        start_time = time.time()
        start_memory = psutil.Process().memory_info().rss / (1024**2)  # MB
        start_cpu = psutil.cpu_percent(interval=0.1)

        # Run algorithm
        try:
            labels = algorithm.fit_predict(X)

            # Record end metrics
            end_time = time.time()
            end_memory = psutil.Process().memory_info().rss / (1024**2)  # MB
            end_cpu = psutil.cpu_percent(interval=0.1)

            # Calculate metrics
            execution_time = end_time - start_time
            memory_used = end_memory - start_memory
            avg_cpu = (start_cpu + end_cpu) / 2

            # Clustering quality metrics
            silhouette = silhouette_score(X, labels) if len(np.unique(labels)) > 1 else 0
            davies_bouldin = davies_bouldin_score(X, labels) if len(np.unique(labels)) > 1 else 0
            calinski = calinski_harabasz_score(X, labels) if len(np.unique(labels)) > 1 else 0

            result = {
                'Dataset': dataset_name,
                'Algorithm': algorithm_name,
                'Samples': X.shape[0],
                'Features': X.shape[1],
                'Execution Time (s)': round(execution_time, 4),
                'Memory Used (MB)': round(memory_used, 2),
                'Avg CPU (%)': round(avg_cpu, 2),
                'Silhouette Score': round(silhouette, 4),
                'Davies-Bouldin Index': round(davies_bouldin, 4),
                'Calinski-Harabasz Score': round(calinski, 2),
                'N Clusters': len(np.unique(labels)),
                'Status': 'Success'
            }

        except Exception as e:
            result = {
                'Dataset': dataset_name,
                'Algorithm': algorithm_name,
                'Samples': X.shape[0],
                'Features': X.shape[1],
                'Execution Time (s)': 0,
                'Memory Used (MB)': 0,
                'Avg CPU (%)': 0,
                'Silhouette Score': 0,
                'Davies-Bouldin Index': 0,
                'Calinski-Harabasz Score': 0,
                'N Clusters': 0,
                'Status': f'Failed: {str(e)}'
            }

        self.results.append(result)
        return result

    def get_results_df(self):
        """Return results as a DataFrame"""
        return pd.DataFrame(self.results)

    def display_summary(self):
        """Display summary of results"""
        df = self.get_results_df()
        print("\nüìà PERFORMANCE SUMMARY")
        print("=" * 120)
        print(df.to_string(index=False))
        print("=" * 120)
        return df

benchmark = PerformanceBenchmark()

In [None]:
# Cell 5: Run Clustering Algorithms on All Datasets
print("\nüöÄ RUNNING UNSUPERVISED LEARNING ALGORITHMS")
print("=" * 80)

algorithms = {
    'K-Means (k=5)': KMeans(n_clusters=5, random_state=42, n_init=10),
    'K-Means (k=8)': KMeans(n_clusters=8, random_state=42, n_init=10),
    'DBSCAN': DBSCAN(eps=3, min_samples=5),
    'Hierarchical': AgglomerativeClustering(n_clusters=5)
}

for dataset_name, (X, y_true, description) in datasets.items():
    print(f"\nüìä Processing {description}...")

    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    for algo_name, algorithm in algorithms.items():
        print(f"  ‚è±Ô∏è  Running {algo_name}...", end=" ")
        result = benchmark.benchmark_algorithm(algorithm, X_scaled, dataset_name, algo_name)
        print(f"‚úì ({result['Execution Time (s)']}s)")

print("\n" + "=" * 80)
print("‚úÖ ALL BENCHMARKS COMPLETED")


üöÄ RUNNING UNSUPERVISED LEARNING ALGORITHMS

üìä Processing Small (1K samples, 10 features)...
  ‚è±Ô∏è  Running K-Means (k=5)... ‚úì (0.1596s)
  ‚è±Ô∏è  Running K-Means (k=8)... ‚úì (0.1316s)
  ‚è±Ô∏è  Running DBSCAN... ‚úì (0.1368s)
  ‚è±Ô∏è  Running Hierarchical... ‚úì (0.1259s)

üìä Processing Medium (50K samples, 20 features)...
  ‚è±Ô∏è  Running K-Means (k=5)... ‚úì (0.8981s)
  ‚è±Ô∏è  Running K-Means (k=8)... ‚úì (0.4719s)
  ‚è±Ô∏è  Running DBSCAN... ‚úì (19.7166s)
  ‚è±Ô∏è  Running Hierarchical... 

In [5]:
# Cell 6: Display Results and Save
results_df = benchmark.display_summary()

# Save results to CSV for later comparison
output_filename = f"clustering_benchmark_{system_info['Environment'].replace(' ', '_')}.csv"
results_df.to_csv(output_filename, index=False)
print(f"\nüíæ Results saved to: {output_filename}")
print("üìå You can compare this file with results from Google Colab!")


üìà PERFORMANCE SUMMARY
Empty DataFrame
Columns: []
Index: []


NameError: name 'system_info' is not defined

In [None]:
# Cell 7: Comprehensive Visualizations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Execution Time by Dataset and Algorithm
ax1 = axes[0, 0]
pivot_time = results_df.pivot(index='Algorithm', columns='Dataset', values='Execution Time (s)')
pivot_time.plot(kind='bar', ax=ax1, colormap='viridis')
ax1.set_title('Execution Time Comparison Across Datasets', fontsize=14, fontweight='bold')
ax1.set_ylabel('Time (seconds)', fontsize=12)
ax1.set_xlabel('Algorithm', fontsize=12)
ax1.legend(title='Dataset Size')
ax1.grid(axis='y', alpha=0.3)

# Plot 2: Memory Usage Comparison
ax2 = axes[0, 1]
pivot_memory = results_df.pivot(index='Algorithm', columns='Dataset', values='Memory Used (MB)')
pivot_memory.plot(kind='bar', ax=ax2, colormap='plasma')
ax2.set_title('Memory Usage Comparison', fontsize=14, fontweight='bold')
ax2.set_ylabel('Memory (MB)', fontsize=12)
ax2.set_xlabel('Algorithm', fontsize=12)
ax2.legend(title='Dataset Size')
ax2.grid(axis='y', alpha=0.3)

# Plot 3: Clustering Quality - Silhouette Score
ax3 = axes[1, 0]
pivot_silhouette = results_df.pivot(index='Algorithm', columns='Dataset', values='Silhouette Score')
pivot_silhouette.plot(kind='bar', ax=ax3, colormap='coolwarm')
ax3.set_title('Clustering Quality: Silhouette Score (Higher is Better)', fontsize=14, fontweight='bold')
ax3.set_ylabel('Silhouette Score', fontsize=12)
ax3.set_xlabel('Algorithm', fontsize=12)
ax3.legend(title='Dataset Size')
ax3.axhline(y=0.5, color='red', linestyle='--', label='Good clustering threshold')
ax3.grid(axis='y', alpha=0.3)

# Plot 4: Scalability Analysis
ax4 = axes[1, 1]
for algo in results_df['Algorithm'].unique():
    algo_data = results_df[results_df['Algorithm'] == algo].sort_values('Samples')
    ax4.plot(algo_data['Samples'], algo_data['Execution Time (s)'],
             marker='o', label=algo, linewidth=2, markersize=8)
ax4.set_title('Algorithm Scalability Analysis', fontsize=14, fontweight='bold')
ax4.set_xlabel('Number of Samples', fontsize=12)
ax4.set_ylabel('Execution Time (seconds)', fontsize=12)
ax4.legend(loc='best')
ax4.grid(True, alpha=0.3)
ax4.set_xscale('log')

plt.tight_layout()
plt.savefig(f'clustering_performance_{system_info["Environment"].replace(" ", "_")}.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nüìä Visualizations saved!")

In [None]:
# Cell 8: Dimensionality Reduction Performance Test
print("\nüîç TESTING DIMENSIONALITY REDUCTION ALGORITHMS")
print("=" * 80)

# Use medium dataset for dimensionality reduction
X_test, y_test, _ = datasets['medium']
X_test_scaled = StandardScaler().fit_transform(X_test)

# Test PCA
print("\n‚è±Ô∏è  Running PCA...")
start_pca = time.time()
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_test_scaled)
pca_time = time.time() - start_pca
print(f"   PCA completed in {pca_time:.4f} seconds")
print(f"   Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")

# Test t-SNE (on subset due to computational cost)
print("\n‚è±Ô∏è  Running t-SNE (on 10K sample subset)...")
X_tsne_subset = X_test_scaled[:10000]
y_tsne_subset = y_test[:10000]
start_tsne = time.time()
tsne = TSNE(n_components=2, random_state=42, n_jobs=-1)
X_tsne = tsne.fit_transform(X_tsne_subset)
tsne_time = time.time() - start_tsne
print(f"   t-SNE completed in {tsne_time:.4f} seconds")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# PCA visualization
ax1 = axes[0]
scatter1 = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=y_test, cmap='viridis',
                       alpha=0.5, s=10)
ax1.set_title(f'PCA Projection (Time: {pca_time:.2f}s)', fontsize=14, fontweight='bold')
ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
plt.colorbar(scatter1, ax=ax1)

# t-SNE visualization
ax2 = axes[1]
scatter2 = ax2.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_tsne_subset, cmap='viridis',
                       alpha=0.5, s=10)
ax2.set_title(f't-SNE Projection (Time: {tsne_time:.2f}s)', fontsize=14, fontweight='bold')
ax2.set_xlabel('t-SNE Component 1')
ax2.set_ylabel('t-SNE Component 2')
plt.colorbar(scatter2, ax=ax2)

plt.tight_layout()
plt.savefig(f'dimensionality_reduction_{system_info["Environment"].replace(" ", "_")}.png',
            dpi=300, bbox_inches='tight')
plt.show()

## Key Learning Points for Students

### 1Ô∏è‚É£ Computational Resources Matter
- Different environments provide different computational capabilities
- RAM and CPU cores directly impact algorithm performance
- NRP resources may provide advantages for large-scale ML tasks

### 2Ô∏è‚É£ Algorithm Complexity
- **K-Means**: O(n√ók√ói) - Generally fastest, scales well
- **DBSCAN**: O(n¬≤) in worst case - Good for noise, slower on large datasets
- **Hierarchical**: O(n¬≤√ólog(n)) - Most computationally expensive

### 3Ô∏è‚É£ Quality Metrics
- **Silhouette Score**: Measures cluster cohesion (range: -1 to 1, higher is better)
- **Davies-Bouldin Index**: Measures cluster separation (lower is better)
- **Calinski-Harabasz**: Variance ratio criterion (higher is better)

### 4Ô∏è‚É£ Practical Implications
- Choose algorithms based on dataset size and available resources
- Consider quality vs. speed trade-offs
- Infrastructure choice affects experimental capabilities

### 5Ô∏è‚É£ Next Steps for Exploration
- Try different clustering parameters (k values, eps, linkage methods)
- Experiment with feature engineering and preprocessing
- Compare results between NRP and Google Colab environments
- Explore other unsupervised methods (GMM, Spectral Clustering)

In [None]:
# Cell 9: Generate Summary Report
from datetime import datetime

report = f"""
UNSUPERVISED LEARNING PERFORMANCE BENCHMARK REPORT
Florida A&M University - Computer & Information Sciences Department
Instructor: Dr. Yohn
Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

{'=' * 80}
SYSTEM CONFIGURATION
{'=' * 80}
Environment: {system_info['Environment']}
CPU Cores (Logical): {system_info['CPU Cores (Logical)']}
Total RAM: {system_info['Total RAM (GB)']} GB
Available RAM: {system_info['Available RAM (GB)']} GB
Python Version: {system_info['Python Version']}

{'=' * 80}
PERFORMANCE SUMMARY
{'=' * 80}

{results_df.to_string(index=False)}

{'=' * 80}
FASTEST ALGORITHMS BY DATASET SIZE
{'=' * 80}
"""

for dataset_name in ['small', 'medium', 'large']:
    dataset_results = results_df[results_df['Dataset'] == dataset_name]
    fastest = dataset_results.loc[dataset_results['Execution Time (s)'].idxmin()]
    report += f"\n{dataset_name.upper()}: {fastest['Algorithm']} ({fastest['Execution Time (s)']}s)"

report += f"""

{'=' * 80}
RECOMMENDATIONS FOR STUDENTS
{'=' * 80}
1. For exploratory analysis: Use K-Means on smaller datasets
2. For production systems: Consider computational resources carefully
3. For best quality: Compare multiple algorithms and validate with metrics
4. When using NRP: Leverage increased resources for larger experiments

Report generated on {system_info['Environment']}
Compare this with results from Google Colab to see resource differences!
{'=' * 80}
"""

# Save report
report_filename = f"benchmark_report_{system_info['Environment'].replace(' ', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
with open(report_filename, 'w') as f:
    f.write(report)

print(report)
print(f"\nüíæ Full report saved to: {report_filename}")

## Interactive Comparison Tool

### To Compare NRP vs Google Colab Performance:

1. Run this entire notebook on **JupyterHub NRP** (you're doing this now!)
2. Download the generated CSV file: `clustering_benchmark_*.csv`
3. Open this same notebook in **Google Colab**
4. Run all cells in Google Colab
5. Download the Colab CSV file
6. Upload both CSV files and run the comparison cell below

This hands-on comparison will help you understand:
- How computational resources affect ML performance
- When to use cloud resources vs local/NRP resources
- Real-world considerations for deploying ML models

In [None]:
# Cell 10: Interactive Comparison Function (Optional)
def compare_environments(nrp_csv, colab_csv):
    """
    Compare performance between NRP and Google Colab

    Usage: compare_environments('nrp_results.csv', 'colab_results.csv')
    """
    nrp_df = pd.read_csv(nrp_csv)
    colab_df = pd.read_csv(colab_csv)

    nrp_df['Environment'] = 'NRP'
    colab_df['Environment'] = 'Colab'

    combined = pd.concat([nrp_df, colab_df])

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Execution time comparison
    ax1 = axes[0, 0]
    combined.groupby(['Dataset', 'Environment'])['Execution Time (s)'].mean().unstack().plot(
        kind='bar', ax=ax1, colormap='Set2')
    ax1.set_title('Average Execution Time: NRP vs Colab', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Time (seconds)')
    ax1.legend(title='Environment')

    # Memory comparison
    ax2 = axes[0, 1]
    combined.groupby(['Dataset', 'Environment'])['Memory Used (MB)'].mean().unstack().plot(
        kind='bar', ax=ax2, colormap='Set2')
    ax2.set_title('Average Memory Usage: NRP vs Colab', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Memory (MB)')
    ax2.legend(title='Environment')

    # Quality comparison
    ax3 = axes[1, 0]
    combined.groupby(['Dataset', 'Environment'])['Silhouette Score'].mean().unstack().plot(
        kind='bar', ax=ax3, colormap='Set2')
    ax3.set_title('Average Clustering Quality: NRP vs Colab', fontsize=14, fontweight='bold')
    ax3.set_ylabel('Silhouette Score')
    ax3.legend(title='Environment')

    # Speedup factor
    ax4 = axes[1, 1]
    speedup_data = []
    for dataset in combined['Dataset'].unique():
        for algo in combined['Algorithm'].unique():
            nrp_time = nrp_df[(nrp_df['Dataset']==dataset) & (nrp_df['Algorithm']==algo)]['Execution Time (s)'].values
            colab_time = colab_df[(colab_df['Dataset']==dataset) & (colab_df['Algorithm']==algo)]['Execution Time (s)'].values
            if len(nrp_time) > 0 and len(colab_time) > 0 and nrp_time[0] > 0:
                speedup = colab_time[0] / nrp_time[0]
                speedup_data.append({'Dataset': dataset, 'Algorithm': algo, 'Speedup': speedup})

    if speedup_data:
        speedup_df = pd.DataFrame(speedup_data)
        speedup_pivot = speedup_df.pivot(index='Algorithm', columns='Dataset', values='Speedup')
        speedup_pivot.plot(kind='bar', ax=ax4, colormap='RdYlGn')
        ax4.set_title('Speedup Factor (Colab Time / NRP Time)', fontsize=14, fontweight='bold')
        ax4.set_ylabel('Speedup (>1 means NRP faster)')
        ax4.axhline(y=1, color='black', linestyle='--', label='Equal performance')
        ax4.legend(title='Dataset')

    plt.tight_layout()
    plt.savefig('nrp_vs_colab_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nüìä ENVIRONMENT COMPARISON SUMMARY")
    print("=" * 80)
    print(combined.groupby('Environment')[['Execution Time (s)', 'Memory Used (MB)',
                                            'Silhouette Score']].mean())
    print("=" * 80)

# Uncomment and run when you have both CSV files:
# compare_environments('clustering_benchmark_JupyterHub_NRP_(or_Local).csv',
#                      'clustering_benchmark_Google_Colab.csv')

print("\n‚úÖ Notebook Complete!")
print("üìä Review your results and compare with Google Colab when ready.")
print("üí° Discussion: What differences did you observe? Why might they exist?")