In [10]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score  # Include cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

# Load the dataset
dataset = pd.read_csv('mushroom.csv')

# Independent variables (all features except "class")
X = dataset.iloc[:, 1:]

# Dependent variable (the class/target)
Y = dataset.iloc[:, 0].values

# Handle missing values with the most frequent values for categorical data
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

# Apply OneHotEncoder to handle categorical data
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), list(range(X.shape[1])))],
    remainder='passthrough'
)

# Apply the transformation
X = ct.fit_transform(X)

# Scale features for comparability
sc = StandardScaler(with_mean=False)  # Keep sparse data sparse
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Initialize Random Forest classifier with optimal hyperparameters
rf_classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=200,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=42
)

# Fit the Random Forest model on the training set
rf_classifier.fit(X_train, y_train)

# Cross-validation
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate confusion matrix, accuracy, and F1 score
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("Confusion Matrix:", cm)
print("Accuracy:", accuracy)
print("F1 Score (Weighted):", f1)
print("Classification Report:", classification_report(y_test, y_pred))


Cross-Validation Accuracy Scores: [0.99672515 0.99578947 0.99602339 0.99707568 0.99520412]
Mean Cross-Validation Accuracy: 0.996163562100557
Confusion Matrix: [[ 8087    21]
 [   40 10173]]
Accuracy: 0.996670487418809
F1 Score (Weighted): 0.9966708857250468
Classification Report:               precision    recall  f1-score   support

           e       1.00      1.00      1.00      8108
           p       1.00      1.00      1.00     10213

    accuracy                           1.00     18321
   macro avg       1.00      1.00      1.00     18321
weighted avg       1.00      1.00      1.00     18321

