In [1]:
# Record the installed AutoGluon version so the run can be reproduced.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API (importing pkg_resources emits a deprecation warning).
import importlib.metadata

version = importlib.metadata.version("autogluon")
print(version)


  import pkg_resources


0.8.2


In [5]:
# -*- coding: utf-8 -*-
"""
Created on Mon Sep  5 04:30:22 2022

@author: stuun
"""
import time
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve
from autogluon.tabular import TabularPredictor
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
# Folder that holds the imputed datasets and receives the results file.
# NOTE(review): this is still an absolute, Windows-style location; edit
# DATA_DIR (one place) to run the notebook on another machine.
DATA_DIR = "C:\\Users\\Ali\\Desktop"
RESULTS_PATH = f"{DATA_DIR}\\Model_Evaluation_Results.csv"

# Define dataset paths (imputation method name -> file to load)
datasets = {
    "missforest": f"{DATA_DIR}\\missforest.xlsx",
    "MICE": f"{DATA_DIR}\\MICE.csv",
    "EM": f"{DATA_DIR}\\EM.csv",
    "KNNI": f"{DATA_DIR}\\KNNI.csv",
    "Pchip": f"{DATA_DIR}\\Pchip.csv"
}

# These settings are identical for every dataset, so they are defined once
# here rather than re-assigned on each loop iteration.
label = 'Target'  # Name of the target variable
time_limit = 160  # Time limit for training (per dataset)
metric = 'roc_auc'  # Specify evaluation metric


def _save_results(results, path=RESULTS_PATH):
    """Write the collected metric rows to a CSV file.

    Parameters
    ----------
    results : list of dict
        One dict per (dataset, model) pair, as appended in the loop below.
    path : str
        Destination CSV file; overwritten on every call.

    Returns
    -------
    pandas.DataFrame
        The rows as a DataFrame (the same data that was written).
    """
    results_df = pd.DataFrame(results)
    results_df.to_csv(path, index=False)
    return results_df


# Initialize a list to store evaluation results for all datasets
all_results = []

# Loop through each dataset
for dataset_name, dataset_path in datasets.items():
    print(f"Processing dataset: {dataset_name}")

    # Load dataset (extension check is case-insensitive)
    if dataset_path.lower().endswith('.xlsx'):
        dataset = pd.read_excel(dataset_path)
    else:
        dataset = pd.read_csv(dataset_path)

    # Separate into input and output columns; the last column is the target.
    values = dataset.values
    X = values[:, :-1]
    # Relabel class 2 as 0 (negative class); label 1 stays the positive class.
    # np.where builds a new array, so the loaded data is never modified
    # through a view (the previous in-place assignment could write through).
    y = np.where(values[:, -1] == 2, 0, values[:, -1])

    # Split the data into training and testing sets
    # NOTE(review): the split is not stratified; with imbalanced classes
    # consider stratify=y. Left unchanged so reported numbers stay comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

    # Create DataFrames for AutoGluon
    train_data = pd.DataFrame(X_train)
    test_data = pd.DataFrame(X_test)
    train_data[label] = y_train
    test_data[label] = y_test

    # Summarize class distribution
    counter = Counter(y)
    print("Class distribution:", counter)

    # Train the AutoGluon model using best quality preset
    predictor = TabularPredictor(label, eval_metric=metric).fit(train_data, time_limit=time_limit, presets='best_quality')

    # Get the leaderboard
    leaderboard = predictor.leaderboard(test_data, silent=True)
    print(leaderboard)

    # The feature-only test frame is the same for every model: build it once.
    test_features = test_data.drop(columns=[label])

    # Evaluate models and collect results
    for model_name in leaderboard['model']:
        y_pred = predictor.predict(test_features, model=model_name)
        # NOTE(review): column 1 is taken to be the probability of label 1
        # (the positive class) - confirm against predictor.class_labels.
        y_scores = predictor.predict_proba(test_features, model=model_name).iloc[:, 1].values

        # Calculate evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_scores)
        precision_pr, recall_pr, _ = precision_recall_curve(y_test, y_scores)
        pr_auc = auc(recall_pr, precision_pr)

        # Append results to the list
        all_results.append({
            'Dataset': dataset_name,
            'Model': model_name,
            'Accuracy': accuracy,
            'F1-Score': f1,
            'Precision': precision,
            'Recall': recall,
            'ROC-AUC': roc_auc,
            'PR-AUC': pr_auc
        })

    # Checkpoint after every dataset: training is slow, and an interrupted run
    # would otherwise lose every metric computed so far.
    final_results_df = _save_results(all_results)

# Convert all results to a DataFrame and save them to a CSV file.
# Repeated after the loop so the file and `final_results_df` exist even when
# `datasets` is empty.
final_results_df = _save_results(all_results)


Processing dataset: missforest


No path specified. Models will be saved in: "AutogluonModels\ag-20250410_152540\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=5, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 160s
AutoGluon will save models to "AutogluonModels\ag-20250410_152540\"
AutoGluon Version:  0.8.2
Python Version:     3.10.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
Disk Space Avail:   240.83 GB / 323.46 GB (74.5%)
Train Data Rows:    442
Train Data Columns: 15
Label Column: Target
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [1.0, 0.0]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, cla

Class distribution: Counter({1.0: 472, 0.0: 81})


			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 15 | ['0', '1', '2', '3', '4', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 14 | ['1', '2', '3', '4', '5', ...]
		('int', ['bool']) :  1 | ['0']
	0.2s = Fit runtime
	15 features in original data used to generate 15 features in processed data.
	Train Data (Processed) Memory Usage: 0.05 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.27s ...
AutoGluon will gauge predictive performance using evaluation metric: 'roc_auc'
	This metric expects predicted probabilities rather than predicted class labels, so

KeyboardInterrupt: 