In [1]:
import numpy as np
from pathlib import Path
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
import random
import json

In [2]:
def prepare_features(files, seed=None):
    '''
    Build a class-balanced feature matrix and label vector from a list of array files.

    Each file is assigned to a class by its base name (the last path component):
    names containing "non" form the negative class (label 0), and names containing
    "gas" but not "non" form the positive class (label 1). Files matching neither
    marker are ignored. The negative class is randomly undersampled so that it has
    at most as many files as the positive class.

    Every selected file is opened with ``np.load`` and its ``'arr_0'`` entry is
    flattened into one row of the feature matrix.

    Parameters
    ----------
    - files (list of str): A list of file paths representing the dataset. Each file is associated with a data sample.
    - seed (int, optional): Seed for the undersampling shuffle. When None (default), the global
      ``random`` module state is used, so a prior ``random.seed(...)`` call still applies.

    Returns
    -------
    - X (numpy.ndarray): One flattened array per selected file; positives (sorted by path) first, then negatives.
    - y (numpy.ndarray): Labels aligned with the rows of X (1 for positive, 0 for negative).
    '''
    positives, negatives = [], []
    for file in files:
        # Path(...).name extracts the base name without assuming '/' as the separator.
        name = Path(file).name
        # "non" is tested first: a name such as "non_gas_001" also contains "gas", and
        # must not end up in both classes with contradictory labels.
        if "non" in name:
            negatives.append(file)
        elif "gas" in name:
            positives.append(file)

    positives.sort()
    # Sorting before shuffling makes the draw independent of the order in which the
    # caller collected the paths, so a fixed seed always selects the same subset.
    negatives.sort()
    if seed is None:
        random.shuffle(negatives)
    else:
        random.Random(seed).shuffle(negatives)
    negatives = negatives[:len(positives)]
    selected = positives + negatives

    rows = []
    for feature in selected:
        # The context manager closes the archive's file handle as soon as the array is read;
        # flatten() returns a copy, so the row stays valid after the archive is closed.
        with np.load(feature, mmap_mode='r') as archive:
            rows.append(archive['arr_0'].flatten())

    X = np.asarray(rows)
    y = np.asarray([1] * len(positives) + [0] * len(negatives))

    return X, y

In [3]:
def gridsearch_train_sgdclassifier(X, y, param_grid, cv=5, random_state=None):
    '''
    Train an SGDClassifier using grid search to find the best hyperparameters.

    A linear classifier with hinge loss and an L2 penalty is tuned with GridSearchCV,
    scored by macro-averaged F1. The estimator returned is the one GridSearchCV
    exposes as ``best_estimator_``.

    Parameters
    ----------
    - X (numpy.ndarray): Features of the dataset.
    - y (numpy.ndarray): Labels corresponding to the features (1 for positive, 0 for negative).
    - param_grid (dict): Hyperparameter grid to search for the best configuration. The grid should
      normally supply ``eta0``, since the 'adaptive' learning-rate schedule uses it as the
      initial learning rate.
    - cv (int or cross-validation splitter, optional): Forwarded to GridSearchCV. An integer must be
      at least 2 (scikit-learn rejects 1); for a single hold-out split pass a splitter object
      instead, e.g. ``StratifiedShuffleSplit(n_splits=1, test_size=0.2)``. Defaults to 5.
    - random_state (int, optional): Seed forwarded to SGDClassifier so that its data shuffling is
      reproducible. Defaults to None (unseeded).

    Returns
    -------
    - best_estimator: The SGDClassifier with the best mean cross-validated score.
    - best_params (dict): The parameter combination that produced it.
    '''
    sgdclassifier = SGDClassifier(
        penalty='l2',              # alternative: 'l1'
        loss='hinge',              # linear SVM objective
        learning_rate='adaptive',
        shuffle=True,
        early_stopping=False,
        tol=1e-3,
        validation_fraction=0.2,   # only consulted by scikit-learn when early_stopping=True
        n_jobs=1,
        random_state=random_state,
    )

    # cv must describe at least two folds (or be an explicit splitter); the previous
    # hard-coded cv=1 made fit() raise. Train scores are not requested because only
    # the best estimator and its parameters are returned.
    grid_search = GridSearchCV(
        sgdclassifier,
        param_grid,
        scoring='f1_macro',
        cv=cv,
        n_jobs=2,
        verbose=0,
    )
    grid_search.fit(X, y)

    return grid_search.best_estimator_, grid_search.best_params_

In [4]:
def calculate_metrics(clf, X_valid, y_valid):
    '''
    Calculate and print various classification metrics for a given classifier's predictions.

    Accuracy, precision, recall and F1 are computed from the predicted labels. ROC AUC is
    computed from continuous scores when the classifier offers them (``decision_function``,
    otherwise the second column of ``predict_proba``), because ranking hard 0/1 predictions
    collapses the ROC curve to a single operating point. If neither is available the
    predicted labels are used as a fallback.

    Parameters
    ----------
    - clf: Classifier model for which metrics need to be calculated. Must provide ``predict``.
    - X_valid (numpy.ndarray): Features of the validation dataset.
    - y_valid (numpy.ndarray): True labels for the validation dataset.

    Returns
    -------
    - None. All results are written to standard output.
    '''

    print("Metrics Calculation")
    print("-------------------")
    y_pred = clf.predict(X_valid)

    # Continuous scores for the ranking metric.
    if hasattr(clf, "decision_function"):
        y_score = clf.decision_function(X_valid)
    elif hasattr(clf, "predict_proba"):
        y_score = clf.predict_proba(X_valid)[:, 1]
    else:
        y_score = y_pred

    accuracy = accuracy_score(y_valid, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    precision = precision_score(y_valid, y_pred)
    print(f"Precision: {precision:.2f}")

    recall = recall_score(y_valid, y_pred)
    print(f"Recall: {recall:.2f}")

    f1 = f1_score(y_valid, y_pred)
    print(f"F1 Score: {f1:.2f}")

    roc_auc = roc_auc_score(y_valid, y_score)
    print(f"ROC AUC Score: {roc_auc:.2f}")
    print()

    class_report = classification_report(y_valid, y_pred)
    print("Classification Report:")
    print(class_report)

In [5]:
# Dataset location. This is a machine-specific absolute path: edit DATASET_ROOT when
# running the notebook anywhere else.
DATASET_ROOT = "/home/antonkout/Documents/modules/flammable_gas_detection/release/data/dataset/propane/dataset_arrays_hof"
path_train = f"{DATASET_ROOT}/training"
path_test = f"{DATASET_ROOT}/test"

# prepare_features undersamples the negative class with the global `random` module,
# so seed it once here to draw the same balanced subsets on every Restart & Run All.
# (The SGD training itself is not seeded by this cell.)
SEED = 42
random.seed(SEED)

# Collect every regular file below each split directory. The lists are sorted because
# rglob does not guarantee an ordering, and the seeded shuffle depends on input order.
trainfeatures = sorted(str(file) for file in Path(path_train).rglob('*') if file.is_file())
testfeatures = sorted(str(file) for file in Path(path_test).rglob('*') if file.is_file())

X_train, y_train = prepare_features(trainfeatures)

# Hyperparameter grid explored by the grid search.
param_grid = {
    'alpha': [0.1],
    'eta0': [0.1],
    'max_iter': [3000, 3200],
}

sgd_classifier, best_param = gridsearch_train_sgdclassifier(X_train, y_train, param_grid)
print("Best parameters found from grid search:")
print(best_param)
print()

# Evaluate the tuned classifier on a balanced subset of the held-out test split.
X_test, y_test = prepare_features(testfeatures)
calculate_metrics(sgd_classifier, X_test, y_test)

In [15]:
# Snapshot the full hyperparameter set of the tuned classifier
classifier_params = sgd_classifier.get_params()

# Serialise the snapshot as indented JSON text
classifier_params_json = json.dumps(classifier_params, indent=4)

# Write the JSON text next to the notebook
params_path = Path('./classifier_params.json')
with params_path.open('w') as json_file:
    json_file.write(classifier_params_json)