In [1]:
# python imports
import pandas as pd
import numpy as np
import os
import joblib
from tabulate import tabulate

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef)

# import models from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Load Dataset
data = pd.read_excel("Dry_Bean_Dataset.xlsx")

print(f"Dataset Name    :  Dry Bean")
print(f"Dataset Source  :  UCI")
print(f"No. of Samples  :  {data.shape[0]}")
print(f"No. of Features :  {data.shape[1]}")

print("\nFirst 5 rows of the raw dataset:")
print(data.head(5))

X = data.drop('Class', axis=1)
y = data['Class']

cwd = os.getcwd()

le = LabelEncoder()
y_le = le.fit_transform(y)
joblib.dump(le, os.path.join(cwd, 'label_encoder.pkl'))

X_train, X_test, y_train, y_test = train_test_split(
    X, y_le, test_size=0.2, random_state=42, stratify=y_le)

print(f"\nTraining set size : {X_train.shape[0]}")
print(f"\nTesting set size  : {X_test.shape[0]}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
joblib.dump(scaler, os.path.join(cwd, 'scaler.pkl'))
print()

Dataset Name    :  Dry Bean
Dataset Source  :  UCI
No. of Samples  :  13611
No. of Features :  17

First 5 rows of the raw dataset:
    Area  Perimeter  MajorAxisLength  MinorAxisLength  AspectRation  \
0  28395    610.291       208.178117       173.888747      1.197191   
1  28734    638.018       200.524796       182.734419      1.097356   
2  29380    624.110       212.826130       175.931143      1.209713   
3  30008    645.884       210.557999       182.516516      1.153638   
4  30140    620.134       201.847882       190.279279      1.060798   

   Eccentricity  ConvexArea  EquivDiameter    Extent  Solidity  roundness  \
0      0.549812       28715     190.141097  0.763923  0.988856   0.958027   
1      0.411785       29172     191.272750  0.783968  0.984986   0.887034   
2      0.562727       29690     193.410904  0.778113  0.989559   0.947849   
3      0.498616       30724     195.467062  0.782681  0.976696   0.903936   
4      0.333680       30417     195.896503  0.773098  0.

In [3]:
# Intializing the Models
classification_models = {
    'logistic_regression': LogisticRegression(max_iter=3000, solver='lbfgs', random_state=42),
    'decision_tree': DecisionTreeClassifier(random_state=42),
    'knn': KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    'naive_bayes': GaussianNB(),
    'random_forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    'xgboost': XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=42)
}

In [4]:
metrics = []

for name, model in classification_models.items():
    model.fit(X_train_scaled, y_train)

    joblib.dump(model, os.path.join(cwd, f'{name}.pkl'))

    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred)

    y_proba = model.predict_proba(X_test_scaled)
    auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')

    metrics.append({
        'Model': name, 'Accuracy': accuracy, 'AUC': auc, 'Precision': precision,
        'Recall': recall, 'F1 Score': f1, 'MCC': mcc
    })

In [5]:
# Print the result metrics
df_metrics = pd.DataFrame(metrics)

table = tabulate(df_metrics, headers='keys', tablefmt='github', floatfmt=".4f", showindex=False)

# Print to console
print("Results from all the models:\n")
print(table)

Results from all the models:

| Model               |   Accuracy |    AUC |   Precision |   Recall |   F1 Score |    MCC |
|---------------------|------------|--------|-------------|----------|------------|--------|
| logistic_regression |     0.9207 | 0.9934 |      0.9214 |   0.9207 |     0.9208 | 0.9041 |
| decision_tree       |     0.8898 | 0.9320 |      0.8896 |   0.8898 |     0.8896 | 0.8669 |
| knn                 |     0.9152 | 0.9811 |      0.9158 |   0.9152 |     0.9153 | 0.8974 |
| naive_bayes         |     0.8979 | 0.9902 |      0.9005 |   0.8979 |     0.8980 | 0.8772 |
| random_forest       |     0.9192 | 0.9910 |      0.9194 |   0.9192 |     0.9191 | 0.9023 |
| xgboost             |     0.9280 | 0.9939 |      0.9282 |   0.9280 |     0.9280 | 0.9129 |
