In [4]:
import pandas as pd

# Read the yeast dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='yeast.csv')
print("Dataset loaded successfully")

Dataset loaded successfully


In [7]:
# The 'name' column is used as the prediction target; every remaining
# column is kept as a feature.
y = df['name']
X = df.drop(columns='name')

print("\nFeatures (X) and Target (y) separated.")
for label, part in (("X shape:", X), ("y shape:", y)):
    print(label, part.shape)


Features (X) and Target (y) separated.
X shape: (1484, 8)
y shape: (1484,)


In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode the target straight from the source column instead of from `y`
# itself. Feeding `y` back into fit_transform makes the cell non-idempotent:
# on a second run the encoder would be fitted on the already-encoded
# integers, so `le.classes_` would hold ints rather than the original class
# names and `le.inverse_transform` could no longer recover them.
le = LabelEncoder()
y = le.fit_transform(df['name'])

print("Target variable (y) encoded successfully. First 5 encoded labels:")
print(y[:5])

Target variable (y) encoded successfully. First 5 encoded labels:
[6 6 6 7 6]


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Data split successfully into training and testing sets.")
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

Data split successfully into training and testing sets.
X_train shape: (1187, 8)
X_test shape: (297, 8)
y_train shape: (1187,)
y_test shape: (297,)


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Build a decision tree with default hyper-parameters and a fixed seed, and
# fit it on the training partition in one step (`fit` returns the estimator).
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Keep the fitted estimator as the cell's final expression so it is displayed.
dt_classifier

In [12]:
import numpy as np
from sklearn.metrics import f1_score, matthews_corrcoef, recall_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Stratified, shuffled 5-fold cross-validation over the full X / y.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# One list per metric; each collects a single value per fold.
auc_scores, f1_scores, mcc_scores, recall_scores = [], [], [], []

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Slice this fold's rows out of the feature frame and the label array.
    X_train_fold = X.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    y_train_fold = y[train_index]
    y_test_fold = y[test_index]

    # A fresh tree per fold, so nothing learned in one fold carries over.
    dt_classifier_fold = DecisionTreeClassifier(random_state=42).fit(
        X_train_fold, y_train_fold
    )

    # Hard labels feed F1 / MCC / recall; class probabilities feed the AUC.
    y_pred = dt_classifier_fold.predict(X_test_fold)
    y_proba = dt_classifier_fold.predict_proba(X_test_fold)

    # Multi-class AUC: one-vs-rest, weighted average across classes.
    auc = roc_auc_score(y_test_fold, y_proba, multi_class='ovr', average='weighted')
    auc_scores.append(auc)

    f1 = f1_score(y_test_fold, y_pred, average='weighted')
    f1_scores.append(f1)

    mcc = matthews_corrcoef(y_test_fold, y_pred)
    mcc_scores.append(mcc)

    recall = recall_score(y_test_fold, y_pred, average='weighted')
    recall_scores.append(recall)

    print(f"Fold {fold + 1}/{n_splits} - AUC: {auc:.4f}, F1-score: {f1:.4f}, MCC: {mcc:.4f}, Recall: {recall:.4f}")

# Report the mean of every metric over all folds.
print(f"\nAverage AUC across {n_splits} folds: {np.mean(auc_scores):.4f}")
print(f"Average F1-score across {n_splits} folds: {np.mean(f1_scores):.4f}")
print(f"Average MCC across {n_splits} folds: {np.mean(mcc_scores):.4f}")
print(f"Average Recall across {n_splits} folds: {np.mean(recall_scores):.4f}")


Fold 1/5 - AUC: 0.7171, F1-score: 0.5592, MCC: 0.4350, Recall: 0.5623
Fold 2/5 - AUC: 0.6883, F1-score: 0.5232, MCC: 0.3865, Recall: 0.5253
Fold 3/5 - AUC: 0.6183, F1-score: 0.4302, MCC: 0.2546, Recall: 0.4310
Fold 4/5 - AUC: 0.6505, F1-score: 0.4612, MCC: 0.3096, Recall: 0.4613
Fold 5/5 - AUC: 0.7064, F1-score: 0.5450, MCC: 0.4172, Recall: 0.5473

Average AUC across 5 folds: 0.6761
Average F1-score across 5 folds: 0.5038
Average MCC across 5 folds: 0.3606
Average Recall across 5 folds: 0.5054
