# rfClassifier

The following changes have been made to Calvin's original code: 
- Addition of `SMOTE`
- Switch to `GradientBoostingClassifier`
- Parameter tuning using `RandomizedSearchCV`
- Additional metrics summarized and/or plotted

In [1]:
# environment
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, learning_curve, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import time
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import joblib

## Functions
### Plotting

In [2]:
# plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, save_path=None):
    """
    Draw the confusion matrix as an annotated heatmap.

    The matrix is computed with ``metrics.confusion_matrix(y_true, y_pred)``.
    The figure is written to ``save_path`` when one is given and is always
    closed before returning, so nothing is displayed interactively.
    """
    matrix = metrics.confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    # fmt='d' renders the cell counts as integers
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

    if save_path:
        fig.savefig(save_path)
        print(f"Confusion matrix saved to {save_path}")
    plt.close(fig)

In [3]:
# plot roc curve
def plot_roc_curve(y_true, y_pred_proba, save_path=None):
    """
    Draw the ROC curve with its AUC in the legend.

    Column 1 of ``y_pred_proba`` is used as the score passed to
    ``metrics.roc_curve``. The figure is written to ``save_path`` when one
    is given and is always closed before returning.
    """
    scores = y_pred_proba[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_true, scores)
    area = metrics.auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (AUC = {area:.4f})',
    )
    # Diagonal reference line from (0, 0) to (1, 1)
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend(loc="lower right")

    if save_path:
        fig.savefig(save_path)
        print(f"ROC curve saved to {save_path}")
    plt.close(fig)


In [4]:
# plot precision recall curve
def plot_precision_recall_curve(y_true, y_pred_proba, save_path=None):
    """
    Draw the precision-recall curve with its area in the legend.

    Column 1 of ``y_pred_proba`` is used as the score passed to
    ``metrics.precision_recall_curve``. The figure is written to
    ``save_path`` when one is given and is always closed before returning.
    """
    scores = y_pred_proba[:, 1]
    prec, rec, _ = metrics.precision_recall_curve(y_true, scores)
    # Area under the curve with recall on the x-axis
    area = metrics.auc(rec, prec)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(
        rec,
        prec,
        color='darkorange',
        lw=2,
        label=f'PR curve (AUC = {area:.2f})',
    )
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision-Recall Curve')
    ax.legend(loc="lower left")

    if save_path:
        fig.savefig(save_path)
        print(f"Precision-Recall curve saved to {save_path}")
    plt.close(fig)


In [5]:
# plot learning curves
def plot_learning_curves(estimator, X, y, save_path=None):
    """
    Draw training and cross-validation F1 learning curves.

    ``learning_curve`` is run with ``cv=5``, ``scoring='f1'`` and ten
    training-set fractions evenly spaced from 0.1 to 1.0. For each curve
    the mean across folds is drawn as a line with a +/- one standard
    deviation band. The figure is written to ``save_path`` when one is
    given and is always closed before returning.
    """
    sizes, fit_scores, held_out_scores = learning_curve(
        estimator,
        X,
        y,
        cv=5,
        n_jobs=-1,
        scoring='f1',
        train_sizes=np.linspace(0.1, 1.0, 10),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    curves = (
        (fit_scores, 'Training score'),
        (held_out_scores, 'Cross-validation score'),
    )
    for fold_scores, label in curves:
        # Each row holds the per-fold scores for one training-set size
        center = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)
        ax.plot(sizes, center, label=label)
        ax.fill_between(sizes, center - spread, center + spread, alpha=0.1)

    ax.set_xlabel('Training Examples')
    ax.set_ylabel('F1 Score')
    ax.set_title('Learning Curves')
    ax.legend(loc='best')
    ax.grid(True)

    if save_path:
        fig.savefig(save_path)
        print(f"Learning curves saved to {save_path}")
    plt.close(fig)


In [6]:
# plot feature importance
def plot_feature_importance(feature_importance, feature_names, save_path=None):
    """
    Draw a horizontal bar chart of the 20 largest feature importances.

    ``feature_importance`` and ``feature_names`` are paired positionally,
    sorted by importance in descending order, and the first 20 rows are
    plotted. The figure is written to ``save_path`` when one is given and
    is always closed before returning.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    ranked = pd.DataFrame(
        {'feature': feature_names, 'importance': feature_importance}
    )
    ranked = ranked.sort_values('importance', ascending=False)
    top_rows = ranked.head(20)

    sns.barplot(x='importance', y='feature', data=top_rows, ax=ax)
    ax.set_title('Top 20 Most Important Features')
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature')

    if save_path:
        fig.savefig(save_path)
        print(f"Feature importance plot saved to {save_path}")
    plt.close(fig)


In [7]:
# plot cross-validation scores
def plot_cv_scores(cv_scores, save_path=None):
    """
    Draw a box plot of the cross-validation scores.

    The figure is written to ``save_path`` when one is given and is always
    closed before returning.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=cv_scores, ax=ax)
    ax.set_title('Cross-validation Scores Distribution')
    ax.set_xlabel('F1 Score')

    if save_path:
        fig.savefig(save_path)
        print(f"CV scores distribution saved to {save_path}")
    plt.close(fig)


## Data prep

In [8]:
# data loading
def load_data(data_file):
    '''
    load_data()

    Reads a CSV file whose first column is the target and whose remaining
    columns are the features, and prints a short summary of what was read.

    --------
    Args:

    data_file: (Type: String) Path of the CSV file containing the data
    --------
    Returns:

    X: (Type: numpy.ndarray) Values of every column except the first
    y: (Type: numpy.ndarray) Values of the first column
    feature_names: (Type: list) Header names of the feature columns
    '''
    print(f"Loading data from {data_file}...")
    frame = pd.read_csv(data_file)

    # The header row supplies the names; position 0 is the target column
    feature_names = frame.columns.tolist()[1:]
    print(f"\nFeature names: {feature_names[:5]}... (total: {len(feature_names)} features)")

    # Split by position: target first, features after
    X = frame.iloc[:, 1:].values
    y = frame.iloc[:, 0].values

    n_samples, n_features = X.shape[0], X.shape[1]
    print(f"Loaded {n_samples} samples with {n_features} features")

    print("Class distribution before SMOTE:")
    for label, count in zip(*np.unique(y, return_counts=True)):
        print(f"Class {label}: {count} samples")

    return X, y, feature_names


In [9]:
# saving model parameters
def save_results(results, filename=None):
    '''
    Write the results dictionary to disk as indented JSON.

    --------
    Args:
    results: (Type: dict) Dictionary containing model results; it must be
             serializable by json.dump
    filename: (Type: str) Optional output path. When omitted, a name of the
              form model_results_<YYYYmmdd_HHMMSS>.json is generated from
              the current local time
    '''
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"model_results_{timestamp}.json"

    with open(filename, 'w') as out_file:
        json.dump(results, out_file, indent=4)

    print(f"Results saved to {filename}")

## Analysis

In [10]:
def main(file):
    """
    Tune, evaluate and report a SMOTE + gradient-boosting pipeline.

    Steps:
      1. Load the CSV with ``load_data`` and make a stratified 80/20
         train/test split.
      2. Run ``RandomizedSearchCV`` (50 candidates, 3 stratified folds,
         F1 scoring) over a scaler -> SMOTE -> GradientBoostingClassifier
         pipeline.
      3. Score the refitted best pipeline on the held-out test set.
      4. Write the diagnostic plots (PNG) and a JSON results summary to the
         current working directory, all sharing one timestamp.

    Class label ``1`` is treated as the minority / positive class.

    --------
    Args:
    file: (Type: str) Path of the CSV file passed to ``load_data``

    --------
    Returns:
    best_model: the fitted best pipeline found by the search
    feature_names: (Type: list) Names of the feature columns

    --------
    Raises:
    ValueError: if the training split has too few class-1 samples for SMOTE
                to run inside every cross-validation fold
    """
    start_time = time.time()

    # Set random seed for reproducibility
    np.random.seed(89)

    # Load data
    X, y, feature_names = load_data(file)

    # Generate train and test splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=89, stratify=y
    )
    print(f"\nTraining set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    # Check the number of minority class samples in the training set
    minority_class_count = int(np.sum(y_train == 1))
    print(f"Number of minority class samples in training set: {minority_class_count}")

    # SMOTE needs k_neighbors < (minority samples it is fitted on). During the
    # search it is fitted on CV *training folds*, which hold fewer minority
    # samples than the full training set, so the bound must come from the
    # smallest such fold rather than from minority_class_count itself.
    n_splits = 3
    # Ceiling division: the most minority samples any single fold can hold out
    largest_held_out = -(-minority_class_count // n_splits)
    smallest_fold_minority = minority_class_count - largest_held_out
    safe_k = min(3, smallest_fold_minority - 1)
    if safe_k < 1:
        raise ValueError(
            f"Only {minority_class_count} minority class samples in the training "
            f"set; SMOTE cannot run inside {n_splits}-fold cross-validation"
        )
    print(f"Using k_neighbors={safe_k} for SMOTE to avoid errors")

    # StratifiedKFold with a fixed integer seed yields the same splits on
    # every use, so one splitter serves both the search and the re-scoring.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=89)

    # SMOTE lives inside the pipeline so that oversampling is applied only to
    # the data each fit sees, never to validation or test samples.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('smote', SMOTE(random_state=89, k_neighbors=safe_k, sampling_strategy='auto')),
        ('classifier', GradientBoostingClassifier(random_state=89))
    ])

    # Parameter distributions for RandomizedSearchCV
    param_distributions = {
        # Never exceed the bound computed above
        'smote__k_neighbors': list(range(1, safe_k + 1)),
        'classifier__n_estimators': [100, 200, 300, 400, 500],
        'classifier__learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
        # Shallow trees, larger leaves and subsampling act as regularization
        'classifier__max_depth': [2, 3, 4],
        'classifier__min_samples_split': [2, 5, 10, 15, 20],
        'classifier__min_samples_leaf': [2, 4, 6, 8],
        'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'classifier__max_features': ['sqrt', 'log2', None]
    }

    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=50,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=1,
        random_state=42,
        error_score='raise'  # fail loudly instead of scoring a broken candidate
    )

    # The bar only marks start and finish: fit() is a single blocking call,
    # so there is no intermediate progress to report.
    print("\nStarting RandomizedSearchCV...")
    with tqdm(total=100, desc="Training Progress") as pbar:
        random_search.fit(X_train, y_train)
        pbar.update(100)

    best_model = random_search.best_estimator_

    # Per-fold F1 of the best configuration on the training data
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=cv, scoring='f1')
    print(f"\nCross-validation scores: {cv_scores}")
    print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    # Resample the training data once more, only to report the post-SMOTE
    # class balance. The scaler was already fitted on X_train by the search's
    # refit, so transform() is enough and leaves the fitted model untouched.
    X_train_scaled = best_model.named_steps['scaler'].transform(X_train)
    _, y_train_resampled = best_model.named_steps['smote'].fit_resample(
        X_train_scaled, y_train
    )

    # Collect results
    results = {
        'best_parameters': random_search.best_params_,
        'best_cv_score': float(random_search.best_score_),
        'cv_scores': cv_scores.tolist(),
        'training_time': time.time() - start_time,
        'n_samples': X.shape[0],
        'n_features': X.shape[1],
        'class_distribution_before': {str(k): int(v) for k, v in zip(*np.unique(y, return_counts=True))},
        'class_distribution_after_smote': {str(k): int(v) for k, v in zip(*np.unique(y_train_resampled, return_counts=True))},
        'best_k_neighbors': random_search.best_params_['smote__k_neighbors']
    }

    # Calculate test metrics
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)

    results['test_accuracy'] = float(metrics.accuracy_score(y_test, y_pred))
    results['test_auc'] = float(metrics.roc_auc_score(y_test, y_pred_proba[:, 1]))
    results['test_f1'] = float(metrics.f1_score(y_test, y_pred))
    results['test_precision'] = float(metrics.precision_score(y_test, y_pred))
    results['test_recall'] = float(metrics.recall_score(y_test, y_pred))
    results['classification_report'] = metrics.classification_report(y_test, y_pred, output_dict=True)

    # Print results
    print("\n=== Model Performance ===")
    print(f"Best parameters: {results['best_parameters']}")
    print(f"Best CV score: {results['best_cv_score']:.4f}")
    print(f"Test accuracy: {results['test_accuracy']:.4f}")
    print(f"Test AUC: {results['test_auc']:.4f}")
    print(f"Test F1-score: {results['test_f1']:.4f}")
    print(f"Test Precision: {results['test_precision']:.4f}")
    print(f"Test Recall: {results['test_recall']:.4f}")
    print(f"\nTraining time: {results['training_time']:.2f} seconds")

    # Generate and save visualizations; one timestamp ties all artefacts of
    # this run together.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    plot_confusion_matrix(y_test, y_pred, f'confusion_matrix_{timestamp}.png')
    plot_roc_curve(y_test, y_pred_proba, f'roc_curve_{timestamp}.png')
    plot_precision_recall_curve(y_test, y_pred_proba, f'precision_recall_curve_{timestamp}.png')
    plot_learning_curves(best_model, X_train, y_train, f'learning_curves_{timestamp}.png')
    plot_cv_scores(cv_scores, f'cv_scores_{timestamp}.png')

    # Feature importance comes from the fitted classifier step
    feature_importance = best_model.named_steps['classifier'].feature_importances_
    plot_feature_importance(feature_importance, feature_names, f'feature_importance_{timestamp}.png')

    features_rank = pd.Series(feature_importance, index=feature_names).sort_values(ascending=False)
    print("\n=== Top 10 Most Important Features ===")
    print(features_rank.head(10))

    # Save results under the same timestamp as the plots
    save_results(results, f"model_results_{timestamp}.json")

    return best_model, feature_names

In [None]:
# Run the full pipeline on the synthetic dataset. main() returns the fitted
# best pipeline and the list of feature names; both are kept at notebook
# scope so later cells can use them.
best_model, feature_names = main("synthetic_data.csv")

Loading data from synthetic_data.csv...

Feature names: ['FP_r_Metric01', 'FP_l_Metric01', 'IC_r_Metric01', 'IC_l_Metric01', 'SFG_r_Metric01']... (total: 792 features)
Loaded 400 samples with 792 features
Class distribution before SMOTE:
Class 0: 372 samples
Class 1: 28 samples

Training set size: 320, Test set size: 80
Number of minority class samples in training set: 22
Using k_neighbors=3 for SMOTE to avoid errors

Starting RandomizedSearchCV...


Training Progress:   0%|          | 0/100 [00:00<?, ?it/s]

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
# Save the tuned, fitted pipeline returned by main() (scaler + SMOTE +
# classifier). The `pipeline` object built inside main() is local to that
# function and is only the unfitted template, so it is not what we persist.
joblib.dump(best_model, "model.pkl")

# Save the test set used for evaluation (optional).
# main() does not return its split, so it is rebuilt here with the same
# arguments (test_size=0.2, random_state=89, stratified on the first column),
# which selects the same rows. Splitting the DataFrame rather than raw arrays
# keeps the column headers in the exported files.
_eval_df = pd.read_csv("synthetic_data.csv")
_, X_test, _, y_test = train_test_split(
    _eval_df.iloc[:, 1:],   # features: every column except the first
    _eval_df.iloc[:, 0],    # target: first column
    test_size=0.2,
    random_state=89,
    stratify=_eval_df.iloc[:, 0],
)
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)