In [1]:
import numpy as np  
import pandas as pd  
from sklearn.pipeline import Pipeline  
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score  
from sklearn.preprocessing import StandardScaler, MinMaxScaler  
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier  
from sklearn.svm import SVC  
from sklearn.linear_model import LogisticRegression  
from sklearn.neural_network import MLPClassifier  
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA  
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score  
from scipy.stats import uniform, randint  
from sklearn.decomposition import PCA  

def Grid_search(features, target):  
    # Split features and target variable  
    X = features  
    y = target  

    # Initialize Stratified K-Fold  
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  

    # Hyperparameter grids using lists for RandomizedSearchCV  
    param_grids = {  
        "KNN": {  
            'lda__n_components': [1, 2],  
            'classifier__n_neighbors': list(range(1, 30)),  # List of integers from 1 to 29  
            'classifier__weights': ['uniform', 'distance']  
        },  
        "Random Forest": {  
            'lda__n_components': [1, 2],  
            'classifier__n_estimators': list(range(50, 300)),  # List of integers from 50 to 299  
            'classifier__max_depth': [None] + list(range(5, 31, 5)),  # [None, 5, 10, 15, 20, 25, 30]  
            'classifier__min_samples_split': [2, 5, 10, 15]  
        },  
        "SVM": {  
            'lda__n_components': [1, 2],  
            'classifier__C': uniform(loc=0.001, scale=99.999),  # Uniform distribution from 0.001 to 100  
            'classifier__kernel': ['linear', 'rbf']  
        },  
        "Logistic Regression": {  
            'lda__n_components': [1, 2],  
            'classifier__C': uniform(loc=0.001, scale=99.999),  # To explore a range of regularization strengths  
            'classifier__solver': ['lbfgs', 'liblinear']  
        },  
        "ANN": {  
            'lda__n_components': [1, 2],  
            'classifier__hidden_layer_sizes': [(50,), (100,), (150,), (50, 50), (100, 100)],  
            'classifier__activation': ['relu', 'tanh', 'logistic'],  
            'classifier__alpha': uniform(loc=1e-4, scale=1e-1),  # Uniform distribution between 0.0001 and 0.1  
            'classifier__max_iter': [1000]  # Fixed max_iter for ANN  
        }  
    }  

    # Models to test  
    models = {  
        "KNN": KNeighborsClassifier(),  
        "Random Forest": RandomForestClassifier(),  
        "SVM": SVC(probability=True),  # Enable probability for ROC AUC  
        "Logistic Regression": LogisticRegression(),  
        "ANN": MLPClassifier()  
    }  

    # Define multiple metrics for evaluation  
    metrics = {  
        'accuracy': make_scorer(accuracy_score),  
        'precision': make_scorer(precision_score, average='weighted'),  
        'recall': make_scorer(recall_score, average='weighted'),  
        'f1': make_scorer(f1_score, average='weighted')  
    }  

    # Results dictionary  
    results = {}  
    for model_name, model in models.items():  
        # Create pipeline with scaling, LDA, and classifier  
        pipeline = Pipeline([  
            ('scaler', StandardScaler()),  
            ('lda', LDA()),  
            ('classifier', model)  
        ])  

        # Choose search strategy based on model  
        if model_name in ["Random Forest", "SVM", "Logistic Regression", "ANN"]:  
            search = RandomizedSearchCV(pipeline, param_grids[model_name], n_iter=50,  
                                        cv=skf, scoring='accuracy', n_jobs=-1, random_state=42)  
        else:  
            search = GridSearchCV(pipeline, param_grids[model_name], cv=skf, scoring='accuracy', n_jobs=-1)  

        # Fit the model to the data  
        search.fit(X, y)  

        best_params = search.best_params_  
        best_score = search.best_score_  # The best cross-validated score  
        results[model_name] = (best_params, best_score)  

        # Evaluate the model with other metrics  
        for metric_name, scorer in metrics.items():  
            score = cross_val_score(search.best_estimator_, X, y, cv=skf, scoring=scorer)  
            results[model_name] += (metric_name, score.mean())  # Append the mean score for each metric  

    # Print out the results  
    for model_name, metrics in results.items():  
        best_params, best_score = metrics[:2]  
        print(f"{model_name}: Best Parameters = {best_params}, Best CV Accuracy = {best_score:.4f}")  
        for i in range(2, len(metrics), 2):  
            print(f"    {metrics[i]}: {metrics[i + 1]:.4f}")  

# Example usage with defined 'features' and 'target'  

In [2]:
all_data = pd.read_csv('./datasets/augmented_dataset.csv')

In [3]:
target = all_data[['storage_1', 'storage_2', 'storage_3']].idxmax(axis=1).str[-1].astype(int)  
features_sc_pt = all_data.drop(columns=['Replica', 'storage_1', 'storage_2', 'storage_3'])  
Grid_search(features_sc_pt, target)

  _data = np.array(data, dtype=dtype, copy=copy,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refe

KNN: Best Parameters = {'classifier__n_neighbors': 1, 'classifier__weights': 'uniform'}, Best CV Accuracy = 0.9722
    accuracy: 0.9722
    precision: 0.9786
    recall: 0.9722
    f1: 0.9715
Random Forest: Best Parameters = {'classifier__n_estimators': 220, 'classifier__min_samples_split': 5, 'classifier__max_depth': 30}, Best CV Accuracy = 0.9667
    accuracy: 0.9611
    precision: 0.9655
    recall: 0.9500
    f1: 0.9545
SVM: Best Parameters = {'classifier__C': 70.80754970702675, 'classifier__kernel': 'rbf'}, Best CV Accuracy = 0.9611
    accuracy: 0.9611
    precision: 0.9690
    recall: 0.9611
    f1: 0.9604
Logistic Regression: Best Parameters = {'classifier__C': 61.165704395667596, 'classifier__solver': 'lbfgs'}, Best CV Accuracy = 0.9722
    accuracy: 0.9722
    precision: 0.9755
    recall: 0.9722
    f1: 0.9718
ANN: Best Parameters = {'classifier__activation': 'relu', 'classifier__alpha': 0.060211501174320885, 'classifier__hidden_layer_sizes': (150,), 'classifier__max_iter': 