Lab 10: Logistic Regression
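This script demonstrates logistic regression for binary and multiclass classification, including ROC/AUC analysis, a regularization comparison, decision-boundary visualization, prediction-probability inspection, cross-validation, and feature-coefficient analysis.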

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer, load_iris, make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
import seaborn as sns


In [None]:
def binary_classification():
    """Train and evaluate a binary logistic regression on the breast cancer data.

    Loads the dataset via ``load_breast_cancer``, holds out 30% of the rows
    as a test set, standardizes the features with a scaler fitted on the
    training rows only, fits a ``LogisticRegression`` model, prints the
    accuracy and classification report, and saves a confusion-matrix
    heatmap to ``lab10_confusion_matrix.png``.

    Returns:
        tuple: ``(X_test_scaled, y_test, y_pred_proba)`` -- the standardized
        test features, the true test labels, and the ``predict_proba``
        output for the test rows.
    """
    rule = "=" * 50
    print(rule)
    print("Binary Logistic Regression")
    print(rule)

    # Pull the feature matrix, target vector and class names out of the bunch.
    dataset = load_breast_cancer()
    features, labels = dataset.data, dataset.target
    class_names = dataset.target_names

    print("\nDataset: Breast Cancer")
    print(f"Shape: {features.shape}")
    print(f"Classes: {class_names}")

    # 70/30 split; the fixed seed keeps the partition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    # Scaling statistics come from the training rows only and are then
    # applied unchanged to the held-out rows.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=10000, random_state=42)
    model.fit(X_train_scaled, y_train)

    # Hard class predictions plus per-class probabilities for the test set.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)

    print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Render the confusion matrix as an annotated heatmap and write it to disk.
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        confusion_matrix(y_test, y_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix - Logistic Regression')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig('lab10_confusion_matrix.png')
    plt.close()
    print("\nConfusion matrix saved as 'lab10_confusion_matrix.png'")

    return X_test_scaled, y_test, y_pred_proba


In [None]:
def roc_curve_analysis(X_test, y_test, y_pred_proba):
    """Compute the ROC curve and AUC, print the AUC, and save the plot.

    Args:
        X_test: Test features. Not read by this function; kept in the
            signature for existing callers.
        y_test: True labels for the test rows.
        y_pred_proba: Two-dimensional probability array; column 1 is used
            as the score passed to ``roc_curve``.

    The figure is written to ``lab10_roc_curve.png``.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("ROC Curve Analysis")
    print(rule)

    # Only the false/true positive rates are needed; thresholds are discarded.
    scores = y_pred_proba[:, 1]
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    print(f"\nArea Under Curve (AUC): {roc_auc:.4f}")

    plt.figure(figsize=(10, 6))
    plt.plot(
        fpr,
        tpr,
        color='darkorange',
        lw=2,
        label=f'ROC curve (AUC = {roc_auc:.2f})',
    )
    # Diagonal reference line labelled as the random baseline.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('lab10_roc_curve.png')
    plt.close()
    print("\nROC curve saved as 'lab10_roc_curve.png'")


In [None]:
def multiclass_classification():
    """Train and evaluate a logistic regression on the full Iris dataset.

    Splits the data 70/30, standardizes features with a scaler fitted on
    the training rows, fits ``LogisticRegression``, prints the accuracy and
    classification report, and saves a confusion-matrix heatmap to
    ``lab10_multiclass_confusion.png``. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Multiclass Logistic Regression")
    print(rule)

    dataset = load_iris()
    features, labels = dataset.data, dataset.target
    class_names = dataset.target_names

    print("\nDataset: Iris")
    print(f"Shape: {features.shape}")
    print(f"Classes: {class_names}")

    # Reproducible 70/30 hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    # Fit scaling statistics on the training rows, then reuse them for the test rows.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression(max_iter=10000, random_state=42)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Annotated heatmap of the confusion matrix, saved to disk.
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        confusion_matrix(y_test, y_pred),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix - Multiclass Logistic Regression')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig('lab10_multiclass_confusion.png')
    plt.close()
    print("\nMulticlass confusion matrix saved as 'lab10_multiclass_confusion.png'")


In [None]:
def regularization_comparison():
    """Compare L1, L2 and unpenalized logistic regression on breast cancer data.

    For each penalty setting a model is fitted on the standardized training
    split; its test accuracy and the number of non-zero coefficients are
    printed. A bar chart of the three accuracies is saved to
    ``lab10_regularization.png``. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Regularization Comparison (L1 vs L2)")
    print(rule)

    dataset = load_breast_cancer()

    # Reproducible 70/30 split followed by train-fitted standardization.
    X_train, X_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.3, random_state=42
    )
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Constructor arguments that differ per penalty. L1 is paired with the
    # 'liblinear' solver, L2 with 'lbfgs'; the unpenalized model leaves the
    # solver at its default.
    penalty_options = {
        'l1': {'penalty': 'l1', 'solver': 'liblinear'},
        'l2': {'penalty': 'l2', 'solver': 'lbfgs'},
        'none': {'penalty': None},
    }

    results = {}
    for name, options in penalty_options.items():
        model = LogisticRegression(max_iter=10000, random_state=42, **options)
        model.fit(X_train_scaled, y_train)

        accuracy = model.score(X_test_scaled, y_test)
        results[name] = accuracy

        heading = 'No' if name == 'none' else name.upper()
        print(f"\n{heading} Regularization:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Non-zero coefficients: {np.sum(model.coef_ != 0)}")

    # One bar per penalty setting, annotated with its accuracy.
    plt.figure(figsize=(10, 6))
    plt.bar(
        list(results),
        list(results.values()),
        color=['blue', 'green', 'orange'],
    )
    plt.ylabel('Accuracy')
    plt.title('Logistic Regression: Regularization Comparison')
    # NOTE(review): the y-range is fixed at 0.9-1.0, so a bar below 0.9
    # would not be visible -- confirm this is acceptable for the dataset.
    plt.ylim([0.9, 1.0])
    for position, acc in enumerate(results.values()):
        plt.text(position, acc + 0.005, f'{acc:.4f}', ha='center')
    plt.tight_layout()
    plt.savefig('lab10_regularization.png')
    plt.close()
    print("\nRegularization comparison saved as 'lab10_regularization.png'")


In [None]:
def decision_boundary_visualization():
    """Plot the decision regions of a logistic regression on synthetic 2D data.

    Generates 200 two-feature samples with ``make_classification``,
    standardizes them, fits ``LogisticRegression`` on all of them, and
    prints the accuracy measured on that same data (i.e. training
    accuracy). The predicted class over a dense grid is drawn as filled
    contours with the samples overlaid, and the figure is saved to
    ``lab10_decision_boundary.png``. Returns nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Decision Boundary Visualization")
    print(rule)

    # Synthetic two-feature problem with a fixed seed.
    raw_points, y = make_classification(
        n_samples=200,
        n_features=2,
        n_redundant=0,
        n_informative=2,
        n_clusters_per_class=1,
        random_state=42,
    )

    X_scaled = StandardScaler().fit_transform(raw_points)

    model = LogisticRegression(random_state=42)
    model.fit(X_scaled, y)

    print(f"\nAccuracy: {model.score(X_scaled, y):.4f}")

    # Build a grid that extends one unit beyond the data in every direction.
    step = 0.02
    first, second = X_scaled[:, 0], X_scaled[:, 1]
    x_min, x_max = first.min() - 1, first.max() + 1
    y_min, y_max = second.min() - 1, second.max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, step),
        np.arange(y_min, y_max, step),
    )

    # Classify every grid point, then fold the labels back into grid shape.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = model.predict(grid_points).reshape(xx.shape)

    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.4, cmap='RdYlBu')
    plt.scatter(
        first,
        second,
        c=y,
        cmap='RdYlBu',
        edgecolors='black',
        s=50,
    )
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Logistic Regression Decision Boundary')
    plt.colorbar(label='Class')
    plt.tight_layout()
    plt.savefig('lab10_decision_boundary.png')
    plt.close()
    print("\nDecision boundary saved as 'lab10_decision_boundary.png'")


In [None]:
def probability_analysis():
    """Print predicted class probabilities for a two-class Iris subset.

    Uses the first 100 Iris rows, splits them 70/30, standardizes the
    features with a scaler fitted on the training rows, fits
    ``LogisticRegression``, and prints a table with the actual label,
    predicted label and both class probabilities for up to the first 10
    test rows. Returns nothing.
    """
    print("\n" + "=" * 50)
    print("Prediction Probability Analysis")
    print("=" * 50)

    # Load dataset
    iris = load_iris()
    X = iris.data[:100]  # Use only first 2 classes
    y = iris.target[:100]

    # Split and scale
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model
    log_reg = LogisticRegression(max_iter=10000, random_state=42)
    log_reg.fit(X_train_scaled, y_train)

    # Get probabilities
    y_pred_proba = log_reg.predict_proba(X_test_scaled)
    y_pred = log_reg.predict(X_test_scaled)

    # Display sample predictions with probabilities
    print("\nSample Predictions with Probabilities:")
    print(f"{'Actual':<10} {'Predicted':<10} {'Prob Class 0':<15} {'Prob Class 1':<15}")
    print("-" * 50)
    # Each probability is padded to the same 15-character width as its
    # header so the columns line up; the previous row format padded the
    # first probability to 16 characters, shifting the last column by one.
    # Slicing to 10 also covers test sets with fewer than 10 rows.
    for actual, predicted, (prob_0, prob_1) in zip(
        y_test[:10], y_pred[:10], y_pred_proba[:10]
    ):
        print(f"{actual:<10} {predicted:<10} {prob_0:<15.4f} {prob_1:<15.4f}")


In [None]:
def cross_validation():
    """Run 5-fold cross-validation of logistic regression on breast cancer data.

    The scaler and the classifier are combined in a single pipeline that is
    handed to ``cross_val_score``, so the standardization statistics are
    re-fitted on the training portion of every fold. Fitting the scaler on
    the whole dataset beforehand would let each validation fold influence
    the preprocessing of its own training data (data leakage) and bias the
    reported scores.

    Prints the per-fold scores, their mean and their standard deviation.
    Returns nothing.
    """
    # Local import: the file's top-level imports do not include the
    # pipeline helper, and this keeps the function self-contained.
    from sklearn.pipeline import make_pipeline

    print("\n" + "=" * 50)
    print("Cross-Validation")
    print("=" * 50)

    # Load dataset
    cancer = load_breast_cancer()
    X = cancer.data
    y = cancer.target

    # Scaling + model as one estimator, so scaling is learned per fold
    # from that fold's training rows only.
    model = make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=10000, random_state=42),
    )

    # Perform cross-validation on the raw (unscaled) features
    scores = cross_val_score(model, X, y, cv=5)

    print(f"\nCross-validation scores: {scores}")
    print(f"Mean accuracy: {scores.mean():.4f}")
    print(f"Standard deviation: {scores.std():.4f}")


In [None]:
def feature_coefficients():
    """Rank breast cancer features by logistic regression coefficient size.

    Standardizes the full dataset, fits ``LogisticRegression`` on it, and
    builds a table of each feature's coefficient and absolute coefficient,
    sorted by the latter in descending order. The ten largest are printed
    and drawn as a horizontal bar chart (green for positive, red
    otherwise), saved to ``lab10_feature_coefficients.png``. Returns
    nothing.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Feature Coefficients Analysis")
    print(rule)

    dataset = load_breast_cancer()

    # The model is fitted on every row; there is no hold-out split here.
    X_scaled = StandardScaler().fit_transform(dataset.data)
    model = LogisticRegression(max_iter=10000, random_state=42)
    model.fit(X_scaled, dataset.target)

    # First row of the coefficient matrix: one weight per feature.
    weights = model.coef_[0]
    table = pd.DataFrame({
        'Feature': dataset.feature_names,
        'Coefficient': weights,
        'Abs_Coefficient': np.abs(weights),
    })
    feature_importance = table.sort_values('Abs_Coefficient', ascending=False)
    top_features = feature_importance.head(10)

    print("\nTop 10 Most Important Features:")
    print(top_features.to_string(index=False))

    # Horizontal bars, coloured by the sign of each coefficient.
    positions = range(len(top_features))
    bar_colors = [
        'green' if value > 0 else 'red'
        for value in top_features['Coefficient']
    ]
    plt.figure(figsize=(12, 6))
    plt.barh(positions, top_features['Coefficient'], color=bar_colors)
    plt.yticks(positions, top_features['Feature'])
    plt.xlabel('Coefficient Value')
    plt.title('Top 10 Feature Coefficients (Green: Positive, Red: Negative)')
    plt.tight_layout()
    plt.savefig('lab10_feature_coefficients.png')
    plt.close()
    print("\nFeature coefficients plot saved as 'lab10_feature_coefficients.png'")


In [None]:
def main():
    """Run every Lab 10 demonstration in sequence.

    The binary-classification step runs first because its return values
    (scaled test features, test labels, predicted probabilities) feed the
    ROC analysis; the remaining demonstrations take no arguments and run
    in a fixed order.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("Lab 10: Logistic Regression")
    print(rule)

    # Binary classification, then ROC/AUC on its outputs.
    roc_curve_analysis(*binary_classification())

    # Remaining, independent demonstrations.
    remaining_steps = (
        multiclass_classification,
        regularization_comparison,
        decision_boundary_visualization,
        probability_analysis,
        cross_validation,
        feature_coefficients,
    )
    for step in remaining_steps:
        step()

    print("\n" + rule)
    print("Lab 10 Complete!")
    print(rule)


In [None]:
# Run the full lab only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
