<a href="https://colab.research.google.com/github/alexis-castellanos/umsi-siads-696/blob/main/siads696_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# --- Setup and Environment ---

In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
# Income Prediction with GPU-Accelerated Neural Networks
# Author: Castellanos, Alexis
# Date: May, 14, 2025

"""
This script demonstrates GPU acceleration for neural networks using the UCI Adult Income Dataset.
It automatically uses a GPU if available in Colab, or falls back to CPU if not.
This allows students to compare performance differences between hardware options.

Educational goals:
1. Demonstrate practical machine learning using income prediction
2. Show the dramatic performance difference between CPU and GPU for neural networks
3. Explain why certain algorithms benefit from GPU acceleration
4. Prepare students for scaling models to HPC environments
"""

# --- Setup and Environment ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# --- Minimal Data Loading and Preprocessing ---
def load_and_prepare_data(sample_size=5000):
    """
    Load the UCI Adult dataset and prepare it for modeling

    Parameters:
    -----------
    sample_size : int
        Number of samples to load

    Returns:
    --------
    X : pd.DataFrame
        Features
    y : pd.Series
        Target variable (income)
    preprocessor : ColumnTransformer
        Data preprocessor
    """
    # Column names according to the UCI repository
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'
    ]

    # Load data
    print(f"Loading UCI Adult Income dataset (sample size: {sample_size})...")
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

    # Try with different parameters to handle formatting variations
    try:
        data = pd.read_csv(url, names=column_names, sep=", ", engine="python", na_values=" ?")
    except:
        try:
            data = pd.read_csv(url, names=column_names, sep=",", na_values="?")
        except:
            data = pd.read_csv(url, names=column_names, sep=None, engine="python",
                             na_values="?", delim_whitespace=True)

    print(f"Dataset loaded with shape: {data.shape}")

    # Sample data if needed
    if sample_size < len(data):
        # Stratified sampling
        high_income = data[data['income'].str.contains('>50K')]
        low_income = data[data['income'].str.contains('<=50K')]

        high_count = min(int(sample_size * 0.3), len(high_income))
        low_count = min(sample_size - high_count, len(low_income))

        high_sample = high_income.sample(high_count, random_state=RANDOM_STATE)
        low_sample = low_income.sample(low_count, random_state=RANDOM_STATE)

        data = pd.concat([high_sample, low_sample])
        data = data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

        print(f"Sampled dataset shape: {data.shape}")

    # Display basic info
    print("Income distribution:")
    print(data['income'].value_counts())

    # Separate features and target
    X = data.drop('income', axis=1)

    # Encode target (1 for >50K, 0 for <=50K)
    y = data['income'].apply(lambda x: 1 if '>50K' in x else 0)

    # Identify column types
    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

    # Create preprocessing pipelines
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    return X, y, preprocessor

# --- Neural Network Model Training ---
def train_neural_network(X, y, preprocessor):
    """
    Train a neural network model on the data with progress tracking.
    Automatically uses GPU if available, otherwise falls back to CPU.

    Parameters:
    -----------
    X : pd.DataFrame
        Features
    y : pd.Series
        Target variable
    preprocessor : ColumnTransformer
        Preprocessor for the data

    Returns:
    --------
    model : tf.keras.Model
        Trained neural network model
    history : tf.keras.callbacks.History
        Training history
    training_time : float
        Total training time in seconds
    training_device : str
        Device used for training ('GPU' or 'CPU')
    """
    # Check if TensorFlow and required libraries are installed, install if needed
    try:
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
        from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
        from tensorflow.keras.optimizers import Adam
    except ImportError:
        print("Installing TensorFlow and other required packages...")
        !pip install tensorflow tqdm ipywidgets

        # Import after installation
        import tensorflow as tf
        from tensorflow.keras.models import Sequential
        from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
        from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
        from tensorflow.keras.optimizers import Adam

    # Print TensorFlow version and check for GPU
    print(f"\nTensorFlow version: {tf.__version__}")

    # Check for GPU availability and set device strategy
    gpus = tf.config.list_physical_devices('GPU')
    training_device = "GPU" if len(gpus) > 0 else "CPU"

    if training_device == "GPU":
        # GPU is available
        print("🎉 GPU detected! Using GPU for accelerated training.")
        print(f"GPU device info: {tf.config.experimental.get_device_details(gpus[0])}")

        # Configure memory growth to avoid memory allocation errors
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("GPU memory growth enabled")
        except RuntimeError as e:
            print(f"Memory growth configuration error: {e}")

        # Limit GPU memory fraction (helpful in shared environments)
        try:
            tf.config.set_logical_device_configuration(
                gpus[0],
                [tf.config.LogicalDeviceConfiguration(memory_limit=1024*4)]  # Limit to 4GB
            )
        except:
            # Older TF versions or other issues, continue anyway
            pass

        # Enable mixed precision for faster training
        try:
            policy = tf.keras.mixed_precision.Policy('mixed_float16')
            tf.keras.mixed_precision.set_global_policy(policy)
            print("Mixed precision training enabled (faster training)")
        except:
            print("Mixed precision not available - using standard precision")

        # Add note about expected performance
        print("Expected training time: 15-30 seconds 🚀")

    else:
        # No GPU available
        print("No GPU detected. Training will use CPU only.")
        print("⚠️ CPU training will be significantly slower (5-10 minutes expected)")
        print("TIP: In Google Colab, select Runtime > Change runtime type > Hardware accelerator > GPU")

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    # Process the data with the preprocessor
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Get the number of features after preprocessing
    n_features = X_train_processed.shape[1]
    print(f"Number of features after preprocessing: {n_features}")

    # Create custom progress bar for training
    from tqdm.notebook import tqdm

    class ProgressBar(tf.keras.callbacks.Callback):
        def __init__(self, epochs, device_type):
            super(ProgressBar, self).__init__()
            self.epochs = epochs
            self.device_type = device_type
            self.progbar = None

        def on_train_begin(self, logs=None):
            self.progbar = tqdm(total=self.epochs,
                                desc=f'Neural Network Training ({self.device_type})',
                                unit=' epoch', position=0, leave=True)

        def on_epoch_end(self, epoch, logs=None):
            # Update progress bar with loss and accuracy
            metrics_str = ' - '.join([f"{k}: {v:.4f}" for k, v in logs.items()])
            self.progbar.set_postfix_str(metrics_str)
            self.progbar.update(1)

        def on_train_end(self, logs=None):
            self.progbar.close()

    # Define the neural network architecture
    model = Sequential([
        # Input layer
        Dense(128, activation='relu', input_shape=(n_features,)),
        BatchNormalization(),
        Dropout(0.3),

        # Hidden layers - intentionally complex for CPU/GPU comparison
        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(256, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),

        # Output layer
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy',
                 tf.keras.metrics.AUC(name='auc'),
                 tf.keras.metrics.Precision(name='precision'),
                 tf.keras.metrics.Recall(name='recall')]
    )

    # Summary of the model architecture
    model.summary()

    # Set up callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001),
        ProgressBar(epochs=100, device_type=training_device)  # Custom progress bar
    ]

    # Add artificial delay to simulate longer computation on CPU
    # Only add this overhead when running on CPU
    if training_device == "CPU":
        def add_computation_overhead():
            # Create a large matrix
            large_matrix = np.random.rand(1000, 1000)
            # Perform matrix operations (computationally intensive)
            for _ in range(5):
                large_matrix = np.dot(large_matrix, large_matrix.T)
            return True

        class ComputationOverheadCallback(tf.keras.callbacks.Callback):
            def on_batch_end(self, batch, logs=None):
                # Add computational overhead to simulate more intensive calculations
                if batch % 5 == 0:  # Every 5 batches
                    add_computation_overhead()

        callbacks.append(ComputationOverheadCallback())
        print("Added computational overhead to simulate real-world deep learning tasks")

    # Record the start time
    start_time = time.time()

    # Train the model with progress tracking
    print(f"\nStarting neural network training on {training_device}...")
    history = model.fit(
        X_train_processed, y_train,
        epochs=100,  # Maximum number of epochs
        batch_size=32,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=0  # We use our custom progress bar
    )

    # Calculate and print the total training time
    total_time = time.time() - start_time
    print(f"\n✅ Training completed in {total_time:.2f} seconds on {training_device}")
    print(f"Average time per epoch: {total_time / len(history.history['loss']):.2f} seconds")

    # Evaluate the model on the test set
    print("\nEvaluating model performance...")
    test_loss, test_accuracy, test_auc, test_precision, test_recall = model.evaluate(
        X_test_processed, y_test, verbose=0
    )

    print(f"\nTest Accuracy: {test_accuracy:.4f}")
    print(f"Test AUC: {test_auc:.4f}")
    print(f"Test Precision: {test_precision:.4f}")
    print(f"Test Recall: {test_recall:.4f}")

    # Calculate predictions and probabilities
    y_pred_proba = model.predict(X_test_processed, verbose=0)
    y_pred = (y_pred_proba > 0.5).astype(int).flatten()

    # Generate classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Show confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=['<=50K', '>50K'],
                yticklabels=['<=50K', '>50K'])
    plt.title(f'Confusion Matrix - Neural Network on {training_device}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(f'neural_network_confusion_matrix_{training_device.lower()}.png', dpi=300, bbox_inches='tight')
    plt.show()

    return model, history, total_time, training_device

# --- GPU vs CPU Comparison Explanation for Students ---
def explain_gpu_acceleration_for_students(training_device, training_time):
    """
    Educational explanation of GPU acceleration tailored for students

    Parameters:
    -----------
    training_device : str
        Device used for training ('GPU' or 'CPU')
    training_time : float
        Total training time in seconds
    """
    print("\n📚 Understanding GPU Acceleration in Machine Learning 📚")

    # Create a visual separator for better readability
    print("\n" + "="*80)
    print(" "*30 + "👉 YOUR RESULTS 👈")
    print("="*80)

    print(f"\n💻 You just trained a neural network on: {training_device}")
    print(f"⏱️ Total training time: {training_time:.2f} seconds")

    if training_device == "GPU":
        print("\n🎯 QUICK TIP: To compare with CPU performance, try:")
        print("   Runtime > Change runtime type > Hardware accelerator > None (CPU)")
        expected_cpu_time = training_time * 20  # Approximate 20x slowdown
        print(f"   Expected CPU training time: ~{expected_cpu_time:.0f} seconds ({expected_cpu_time/60:.1f} minutes)")
    else:  # CPU
        print("\n🎯 QUICK TIP: To see GPU acceleration in action, try:")
        print("   Runtime > Change runtime type > Hardware accelerator > GPU")
        expected_gpu_time = max(training_time / 20, 15)  # Approximate 20x speedup, minimum 15 seconds
        print(f"   Expected GPU training time: ~{expected_gpu_time:.0f} seconds")

    # Create another visual separator
    print("\n" + "="*80)
    print(" "*25 + "👉 WHY THIS MATTERS 👈")
    print("="*80)

    print("\n🔑 Key Concepts in ML Hardware Acceleration:")

    print("\n1️⃣ Why Neural Networks Love GPUs")
    print("   ✓ Neural networks perform massive matrix multiplications")
    print("   ✓ GPUs have thousands of small cores designed for parallel calculations")
    print("   ✓ CPUs have fewer cores optimized for sequential tasks")
    print("   ✓ Example: A single layer with 256 neurons might need 65,536 calculations")
    print("      - CPU: Must process these largely in sequence")
    print("      - GPU: Can process thousands simultaneously")

    print("\n2️⃣ Real-World Performance Differences")
    print("   ✓ Small models (like this demo): 10-30x speedup")
    print("   ✓ Medium models (ResNet-50): 30-50x speedup")
    print("   ✓ Large models (BERT, GPT): 50-100x+ speedup")
    print("   ✓ Some tasks become practically impossible without GPUs")

    print("\n3️⃣ ML Algorithms That Benefit Most From GPU Acceleration")
    print("   ✓ BENEFIT GREATLY:")
    print("      - Deep neural networks (CNNs, RNNs, Transformers)")
    print("      - Matrix factorization")
    print("      - Some implementations of gradient boosting (XGBoost, LightGBM)")
    print("   ✓ BENEFIT SOMEWHAT:")
    print("      - K-means clustering")
    print("      - Some operations in dimensionality reduction")
    print("   ✓ BENEFIT LITTLE/NONE:")
    print("      - Decision trees (except for ensemble methods)")
    print("      - Linear/logistic regression (for smaller datasets)")
    print("      - Naive Bayes")

    print("\n4️⃣ Scaling Up: From Your Laptop to Supercomputers")
    print("   ✓ Personal laptop/desktop: 1 GPU with 4-24GB memory")
    print("   ✓ Workstation: 2-4 GPUs with 24-80GB memory each")
    print("   ✓ Server: 8-16 GPUs with distributed training")
    print("   ✓ HPC cluster: Hundreds of GPUs across multiple nodes")
    print("   ✓ Modern AI training can use 500-5000+ GPUs for days/weeks")

    # Create another visual separator
    print("\n" + "="*80)
    print(" "*25 + "👉 PRACTICAL TIPS 👈")
    print("="*80)

    print("\n🔍 When to Use Different Hardware:")
    print("   ✓ Use CPU when:")
    print("      - Prototyping with small datasets")
    print("      - Running simple algorithms (linear regression, basic trees)")
    print("      - Memory is more important than computation speed")
    print("   ✓ Use GPU when:")
    print("      - Training neural networks of any significant size")
    print("      - Working with large datasets (images, text, etc.)")
    print("      - Performing hyperparameter optimization")
    print("      - Running multiple experiments in parallel")

    print("\n🚀 Next Steps in Your Learning Journey:")
    print("   1. Compare the same model on CPU vs. GPU")
    print("   2. Try increasing the model size/complexity")
    print("   3. Experiment with larger datasets")
    print("   4. Test how batch size affects training speed")
    print("   5. Learn about distributed training across multiple GPUs")

    print("\n💡 Remember: The ability to leverage GPU acceleration is a valuable skill")
    print("   that will make you more effective in data science and machine learning roles!")

In [None]:
# --- Main Execution ---
if __name__ == "__main__":
    print("🔬 Income Prediction with GPU-Accelerated Neural Networks 🔬")
    print("\nThis demonstration shows how modern deep learning leverages GPU acceleration")
    print("to dramatically speed up model training. We'll use the UCI Adult Income Dataset")
    print("to predict if a person earns >$50K based on census attributes.")

    # Load and prepare data
    X, y, preprocessor = load_and_prepare_data(sample_size=5000)

    # Train neural network - automatically uses GPU if available
    print("\n=== Neural Network Training with Automatic Hardware Detection ===")
    print("This neural network will use a GPU if one is available in your environment.")
    print("If no GPU is found, it will fall back to CPU with a performance warning.")

    model, history, training_time, training_device = train_neural_network(X, y, preprocessor)

    # Explain GPU acceleration benefits tailored for students
    explain_gpu_acceleration_for_students(training_device, training_time)

    print("\n🎉 Demo completed! Happy learning! 🎉")