In [4]:
import pandas as pd

# Load dataset (expects vibrations_data.csv in ../../data, relative to the notebook working directory)
df = pd.read_csv("../../data/vibrations_data.csv")

# Sanity check of what was read: overall size, per-column dtypes, then a
# rich preview of the first rows as the cell's display value.
print(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns")
print(df.dtypes)
df.head()

Loaded dataframe with 390263 rows and 25 columns
Time                  float64
Demand 1              float64
Control 1             float64
Output Drive 1        float64
Channel 1             float64
Channel 2             float64
Channel 3             float64
Channel 4             float64
Channel 1 Kurtosis    float64
Channel 2 Kurtosis    float64
Channel 3 Kurtosis    float64
Channel 4 Kurtosis    float64
Rear Input 1            int64
Rear Input 2            int64
Rear Input 3            int64
Rear Input 4            int64
Rear Input 5            int64
Rear Input 6            int64
Rear Input 7            int64
Rear Input 8            int64
condition              object
rpm                     int64
humidity                int64
temperature             int64
source_file            object
dtype: object


Unnamed: 0,Time,Demand 1,Control 1,Output Drive 1,Channel 1,Channel 2,Channel 3,Channel 4,Channel 1 Kurtosis,Channel 2 Kurtosis,...,Rear Input 4,Rear Input 5,Rear Input 6,Rear Input 7,Rear Input 8,condition,rpm,humidity,temperature,source_file
0,0.00145,0.125011,0.176033,0.0,0.211458,0.209182,0.145823,1.6242e-15,2.52457,2.94874,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
1,0.00145,0.125011,0.176033,0.0,0.211458,0.209182,0.145823,1.6242e-15,2.52457,2.94874,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
2,0.006283,0.125011,0.176033,1.2e-05,0.206329,0.206513,0.150478,1.64332e-15,2.3229,2.46553,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
3,0.009633,0.125011,0.172626,1.3e-05,0.206351,0.194663,0.148313,1.52827e-15,2.26458,2.55488,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv
4,0.0132,0.125011,0.172626,1.4e-05,0.214463,0.214489,0.155652,1.79137e-15,2.5338,2.66379,...,0,0,0,0,0,faulty,1000,0,-10,1st at -10 2022Jun04-2239-0005.csv


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, 
    roc_auc_score, precision_score, recall_score,
    classification_report, confusion_matrix
)
import warnings
warnings.filterwarnings('ignore')

def _resolve_metric_key(metric, verbose=True):
    """Translate a user-facing metric name into the key used in per-model results.

    Accepts 'f1' as an alias of 'f1_score'. Any unknown name falls back to
    'balanced_accuracy' (with a warning when ``verbose`` is true).
    """
    aliases = {
        'balanced_accuracy': 'balanced_accuracy',
        'accuracy': 'accuracy',
        'roc_auc': 'roc_auc',
        'f1': 'f1_score',
        'f1_score': 'f1_score',
    }
    if isinstance(metric, str) and metric in aliases:
        return aliases[metric]
    if verbose:
        # Report the name the caller actually passed, not the fallback.
        print(f"\nWarning: Invalid metric '{metric}'. Using 'balanced_accuracy' instead.")
    return 'balanced_accuracy'


def _score_predictions(y_true, y_pred, y_proba=None):
    """Compute the evaluation metrics for one model's test-set predictions.

    ``y_proba`` is the positive-class score vector (or None). ROC-AUC is
    best-effort: if it cannot be computed it is reported as None.
    """
    roc_auc = None
    if y_proba is not None:
        try:
            roc_auc = roc_auc_score(y_true, y_proba)
        except Exception:
            # Best-effort metric (e.g. undefined for non-binary targets);
            # `Exception` rather than a bare except so Ctrl-C still interrupts.
            roc_auc = None

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'f1_score': f1_score(y_true, y_pred, average='weighted'),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'roc_auc': roc_auc,
    }


def analyze_and_train_best_model(
    df,
    target_col='condition',
    test_size=0.2,
    random_state=42,
    exclude_cols=None,
    metric='balanced_accuracy',
    verbose=True
):
    """
    Analyze dataset, split into train/test, train multiple ML models, and select the best one.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe with features and target column
    target_col : str, default='condition'
        Name of the target column
    test_size : float, default=0.2
        Proportion of dataset to use for testing
    random_state : int, default=42
        Random seed for reproducibility
    exclude_cols : iterable of str, default=None
        Column names to exclude from features. When None, defaults to
        ['source_file', 'Time'].
    metric : str, default='balanced_accuracy'
        Metric to use for selecting best model: 'balanced_accuracy', 'f1'
        (alias 'f1_score'), 'roc_auc' or 'accuracy'. Unknown names fall back
        to 'balanced_accuracy'. 'roc_auc' falls back to 'balanced_accuracy'
        when no model produced a ROC-AUC value.
    verbose : bool, default=True
        Whether to print detailed results

    Returns:
    --------
    dict : Dictionary containing:
        - 'best_model': The best trained model (fitted on the training split)
        - 'best_model_name': Name of the best model
        - 'scaler': Fitted StandardScaler
        - 'label_encoder': Fitted LabelEncoder
        - 'feature_cols': List of feature column names used
        - 'results': Dictionary with all model results (failed models map to
          {'error': <message>})
        - 'X_test': Test features (unscaled)
        - 'y_test': Test labels (encoded)
        - 'X_train': Train features (unscaled)
        - 'y_train': Train labels (encoded)
        - 'X_test_scaled': Test features after scaling
        - 'X_train_scaled': Train features after scaling

    Raises:
    -------
    ValueError
        If no numeric feature column remains after exclusions, or if every
        candidate model failed to train.
    """

    # Validate the cheap argument first, before any expensive training.
    metric_key = _resolve_metric_key(metric, verbose)

    # 1. Data analysis
    if verbose:
        missing_per_col = df.isnull().sum()
        print("=" * 60)
        print("DATASET ANALYSIS")
        print("=" * 60)
        print(f"Dataset shape: {df.shape[0]} rows, {df.shape[1]} columns")
        print(f"\nTarget variable '{target_col}' distribution:")
        print(df[target_col].value_counts())
        print(f"\nTarget variable '{target_col}' distribution (%):")
        print(df[target_col].value_counts(normalize=True) * 100)
        print(f"\nMissing values:")
        print(missing_per_col[missing_per_col > 0])
        if missing_per_col.sum() == 0:
            print("No missing values found.")

    # 2. Feature selection
    if exclude_cols is None:
        exclude_cols = ['source_file', 'Time']  # Default exclusions

    # A set accepts any iterable of names (list, tuple, set, ...).
    excluded = {target_col, *exclude_cols}
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    feature_cols = [c for c in numeric_cols if c not in excluded]

    if not feature_cols:
        raise ValueError(
            "No numeric feature columns left after excluding "
            f"{sorted(map(str, excluded))}."
        )

    if verbose:
        if len(feature_cols) > 5:
            print(f"\nSelected {len(feature_cols)} features: {feature_cols[:5]}...")
        else:
            print(f"\nSelected {len(feature_cols)} features: {feature_cols}")

    # 3. Prepare features and target
    X = df[feature_cols].values
    y_raw = df[target_col].values

    # Encode target labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_raw)

    if verbose:
        print(f"\nLabel encoding: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

    # 4. Train/test split (stratified to maintain class distribution)
    # NOTE(review): this is a row-level random split. If rows are consecutive
    # samples of the same recording, near-identical rows can land in both
    # splits and inflate test scores - consider a group-wise split (e.g. by
    # source file); confirm against how the data was collected.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    if verbose:
        print(f"\nTrain set: {X_train.shape[0]} samples")
        print(f"Test set: {X_test.shape[0]} samples")
        print(f"Train class distribution: {np.bincount(y_train)}")
        print(f"Test class distribution: {np.bincount(y_test)}")

    # 5. Feature scaling (fit on the training split only, then applied to test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 6. Define models to test
    models = {
        "Logistic Regression": LogisticRegression(
            max_iter=1000,
            random_state=random_state,
            n_jobs=-1
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=200,
            random_state=random_state,
            n_jobs=-1,
            max_depth=20
        ),
        "Gradient Boosting": GradientBoostingClassifier(
            n_estimators=100,
            random_state=random_state,
            max_depth=5
        ),
        "AdaBoost": AdaBoostClassifier(
            n_estimators=100,
            random_state=random_state
        ),
        # NOTE(review): kernel SVC fit time grows at least quadratically with
        # the number of samples and probability=True adds an internal
        # cross-validation; on very large training sets this step may
        # dominate the run time.
        "SVM (RBF)": SVC(
            probability=True,
            random_state=random_state,
            kernel='rbf'
        ),
        "K-Nearest Neighbors": KNeighborsClassifier(
            n_neighbors=5,
            n_jobs=-1
        ),
        "Decision Tree": DecisionTreeClassifier(
            random_state=random_state,
            max_depth=15
        )
    }

    # 7. Train and evaluate all models
    if verbose:
        print("\n" + "=" * 60)
        print("TRAINING MODELS")
        print("=" * 60)

    results = {}

    for name, model in models.items():
        if verbose:
            print(f"\nTraining {name}...")

        try:
            model.fit(X_train_scaled, y_train)

            y_pred = model.predict(X_test_scaled)
            y_proba = None
            if hasattr(model, "predict_proba"):
                # Column 1 is the score of the second encoded class.
                y_proba = model.predict_proba(X_test_scaled)[:, 1]

            scores = _score_predictions(y_test, y_pred, y_proba)
            results[name] = {
                'model': model,
                **scores,
                'predictions': y_pred,
                'probabilities': y_proba
            }

            if verbose:
                print(f"  Accuracy: {scores['accuracy']:.4f}")
                print(f"  Balanced Accuracy: {scores['balanced_accuracy']:.4f}")
                print(f"  F1 Score: {scores['f1_score']:.4f}")
                if scores['roc_auc'] is not None:
                    print(f"  ROC-AUC: {scores['roc_auc']:.4f}")

        except Exception as e:
            # One failing model must not abort the comparison; record and move on.
            if verbose:
                print(f"  Error training {name}: {str(e)}")
            results[name] = {'error': str(e)}

    # 8. Select best model based on specified metric
    valid_results = {k: v for k, v in results.items() if 'error' not in v}

    if not valid_results:
        failures = "; ".join(f"{k}: {v['error']}" for k, v in results.items())
        raise ValueError(f"All models failed to train; no best model can be selected. {failures}")

    # ROC-AUC may be None for some models: rank only those that have it, and
    # keep the full candidate set when falling back to balanced accuracy.
    if metric_key == 'roc_auc':
        with_roc_auc = {k: v for k, v in valid_results.items() if v.get('roc_auc') is not None}
        if with_roc_auc:
            valid_results = with_roc_auc
        else:
            metric_key = 'balanced_accuracy'
            if verbose:
                print("Warning: No models with ROC-AUC available. Using 'balanced_accuracy' instead.")

    best_model_name = max(valid_results.keys(), key=lambda k: valid_results[k][metric_key])
    best_model_info = valid_results[best_model_name]

    # 9. Print results summary
    if verbose:
        print("\n" + "=" * 60)
        print("MODEL COMPARISON RESULTS")
        print("=" * 60)
        print(f"\n{'Model':<25} {'Accuracy':<10} {'Bal Acc':<10} {'F1':<10} {'ROC-AUC':<10}")
        print("-" * 65)

        sorted_results = sorted(
            valid_results.items(),
            key=lambda kv: kv[1][metric_key],
            reverse=True
        )

        for name, res in sorted_results:
            roc_str = f"{res['roc_auc']:.4f}" if res['roc_auc'] is not None else "N/A"
            print(f"{name:<25} {res['accuracy']:<10.4f} {res['balanced_accuracy']:<10.4f} "
                  f"{res['f1_score']:<10.4f} {roc_str:<10}")

        print("\n" + "=" * 60)
        print(f"BEST MODEL: {best_model_name} (based on {metric_key})")
        print("=" * 60)
        print(f"  Accuracy: {best_model_info['accuracy']:.4f}")
        print(f"  Balanced Accuracy: {best_model_info['balanced_accuracy']:.4f}")
        print(f"  F1 Score: {best_model_info['f1_score']:.4f}")
        if best_model_info['roc_auc'] is not None:
            print(f"  ROC-AUC: {best_model_info['roc_auc']:.4f}")

        # Classification report for best model
        print(f"\nClassification Report for {best_model_name}:")
        print(classification_report(y_test, best_model_info['predictions'],
                                   target_names=label_encoder.classes_))

        # Confusion matrix
        print(f"\nConfusion Matrix for {best_model_name}:")
        cm = confusion_matrix(y_test, best_model_info['predictions'])
        print(cm)

    # 10. The winning estimator is already fitted on the training split.
    # Re-creating it with the same hyperparameters and seed and refitting on
    # the same data would only repeat the work, so it is returned directly.
    final_model = best_model_info['model']

    return {
        'best_model': final_model,
        'best_model_name': best_model_name,
        'scaler': scaler,
        'label_encoder': label_encoder,
        'feature_cols': feature_cols,
        'results': results,
        'X_test': X_test,
        'y_test': y_test,
        'X_train': X_train,
        'y_train': y_train,
        'X_test_scaled': X_test_scaled,
        'X_train_scaled': X_train_scaled
    }

# Example usage:
# results = analyze_and_train_best_model(df, target_col='condition', test_size=0.2, random_state=42)


In [6]:
# Run the whole analyze/split/train/select pipeline on the loaded dataframe.
# 'source_file' and 'Time' are kept out of the feature set, and the winner
# is chosen by balanced accuracy.
model_results = analyze_and_train_best_model(
    df,
    metric='balanced_accuracy',
    exclude_cols=['source_file', 'Time'],
    target_col='condition',
    test_size=0.2,
    random_state=42,
    verbose=True,
)

# Short recap built from the returned dictionary.
print("\n".join([
    "\n" + "=" * 60,
    "FUNCTION OUTPUT SUMMARY",
    "=" * 60,
    f"Best Model: {model_results['best_model_name']}",
    f"Number of features used: {len(model_results['feature_cols'])}",
    f"Feature columns: {model_results['feature_cols']}",
]))


DATASET ANALYSIS
Dataset shape: 390263 rows, 25 columns

Target variable 'condition' distribution:
condition
faulty     209662
healthy    180601
Name: count, dtype: int64

Target variable 'condition' distribution (%):
condition
faulty     53.723258
healthy    46.276742
Name: proportion, dtype: float64

Missing values:
Series([], dtype: int64)
No missing values found.

Selected 22 features: ['Demand 1', 'Control 1', 'Output Drive 1', 'Channel 1', 'Channel 2']...

Label encoding: {'faulty': 0, 'healthy': 1}

Train set: 312210 samples
Test set: 78053 samples
Train class distribution: [167729 144481]
Test class distribution: [41933 36120]

TRAINING MODELS

Training Logistic Regression...
  Accuracy: 0.8942
  Balanced Accuracy: 0.8933
  F1 Score: 0.8942
  ROC-AUC: 0.9432

Training Random Forest...
  Accuracy: 1.0000
  Balanced Accuracy: 1.0000
  F1 Score: 1.0000
  ROC-AUC: 1.0000

Training Gradient Boosting...
  Accuracy: 0.9952
  Balanced Accuracy: 0.9950
  F1 Score: 0.9952
  ROC-AUC: 0.99