In [1]:
import pandas as pd

# Read the yeast dataset from 'yeast.csv' (resolved relative to the working directory).
df = pd.read_csv(filepath_or_buffer='yeast.csv')
print("Dataset loaded successfully")

Dataset loaded successfully


In [2]:
# The 'name' column is the prediction target; every remaining column is a feature.
y = df['name']
X = df.drop(columns='name')

# Report the resulting shapes as a quick sanity check.
print("\nFeatures (X) and Target (y) separated.")
print("X shape:", X.shape)
print("y shape:", y.shape)


Features (X) and Target (y) separated.
X shape: (1484, 8)
y shape: (1484,)


In [3]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels from the 'name' column as consecutive integers.
#
# The encoder is fitted on the original `df['name']` column rather than on `y`
# itself. With `y = le.fit_transform(y)`, re-running this cell would fit the
# encoder on the already-encoded integers, and `le.classes_` would then no longer
# hold the original label names needed to map predictions back. Reading from
# `df['name']` keeps the cell idempotent and produces the same `y` on a fresh run.
le = LabelEncoder()
y = le.fit_transform(df['name'])

print("Target variable (y) encoded successfully. First 5 encoded labels:")
print(y[:5])

Target variable (y) encoded successfully. First 5 encoded labels:
[6 6 6 7 6]


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set. Stratifying on y keeps the class
# proportions similar in both parts, and the fixed random_state makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shapes of the four resulting pieces.
print("Data split successfully into training and testing sets.")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Data split successfully into training and testing sets.
X_train shape: (1187, 8)
X_test shape: (297, 8)
y_train shape: (1187,)
y_test shape: (297,)


In [5]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters; random_state is fixed for repeatable fits.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Fit on the training split. This is the cell's last expression, so the value
# returned by fit() is what the notebook displays for this cell.
dt_classifier.fit(X_train, y_train)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef, recall_score
import numpy as np

# Cross-validated evaluation of a Random Forest on the full (X, y) data, followed
# by a side-by-side printout against previously computed Decision Tree scores.
n_splits = 5

# Stratified folds keep each fold's class proportions close to those of the whole
# dataset; shuffle + random_state make the fold assignment reproducible.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold metric values for the Random Forest (one entry appended per fold).
rf_auc_scores = []
rf_f1_scores = []
rf_mcc_scores = []
rf_recall_scores = []

for fold_idx, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Slice out this fold's data. X is indexed positionally with .iloc, while y
    # (the output of the label encoder) is indexed directly with the index arrays.
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # A fresh model per fold so no fitted state carries over between folds.
    rf_classifier_fold = RandomForestClassifier(random_state=42)
    rf_classifier_fold.fit(X_train_fold, y_train_fold)

    # Hard predictions feed F1 / MCC / recall; class probabilities feed the AUC.
    y_pred_rf = rf_classifier_fold.predict(X_test_fold)
    y_proba_rf = rf_classifier_fold.predict_proba(X_test_fold)

    # Multi-class metrics: one-vs-rest AUC and weighted averages over classes.
    auc_rf = roc_auc_score(y_test_fold, y_proba_rf, multi_class='ovr', average='weighted')
    f1_rf = f1_score(y_test_fold, y_pred_rf, average='weighted')
    mcc_rf = matthews_corrcoef(y_test_fold, y_pred_rf)
    recall_rf = recall_score(y_test_fold, y_pred_rf, average='weighted')

    # Record this fold's results.
    rf_auc_scores.append(auc_rf)
    rf_f1_scores.append(f1_rf)
    rf_mcc_scores.append(mcc_rf)
    rf_recall_scores.append(recall_rf)

print(f"\n--- Random Forest Classifier Performance (averaged over {n_splits} folds) ---")
print(f"AUC: {np.mean(rf_auc_scores):.4f}")
print(f"F1-score: {np.mean(rf_f1_scores):.4f}")
print(f"MCC: {np.mean(rf_mcc_scores):.4f}")
print(f"Recall: {np.mean(rf_recall_scores):.4f}")

# NOTE(review): auc_scores, f1_scores, mcc_scores and recall_scores are not defined
# in this cell; they are presumably the per-fold Decision Tree results from an
# earlier cross-validation cell. Confirm that cell exists and runs before this one,
# otherwise a fresh "Restart & Run All" fails here with a NameError.
# The header below must be an f-string, otherwise "{n_splits}" is printed literally.
print(f"\n--- For Comparison: Decision Tree Performance (averaged over {n_splits} folds) ---")
print(f"AUC: {np.mean(auc_scores):.4f}")
print(f"F1-score: {np.mean(f1_scores):.4f}")
print(f"MCC: {np.mean(mcc_scores):.4f}")
print(f"Recall: {np.mean(recall_scores):.4f}")


--- Random Forest Classifier Performance (averaged over 5 folds) ---
AUC: 0.8499
F1-score: 0.6071
MCC: 0.5022
Recall: 0.6180

--- For Comparison: Decision Tree Performance (averaged over {n_splits} folds) ---
AUC: 0.6761
F1-score: 0.5038
MCC: 0.3606
Recall: 0.5054
