In [17]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Train and evaluate a decision tree on the mushroom dataset.
#
# All preprocessing (imputation, one-hot encoding, scaling) lives inside a
# single Pipeline that is fitted on training data only. Fitting these steps on
# the full dataset before splitting would let statistics from the held-out
# rows (most frequent values, category sets, variances) leak into training
# and into every cross-validation fold.

# Load the dataset
dataset = pd.read_csv('mushroom.csv')

Y = dataset.iloc[:, 0].values  # Dependent variable (edibility)
X = dataset.iloc[:, 1:].values  # Independent variables (raw, not yet encoded)

# Split the dataset into training and testing sets (70/30) BEFORE any
# preprocessing is fitted, so the test rows stay completely unseen.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, shuffle=True
)

# Handle missing values by most frequent value for categorical data
imputer = SimpleImputer(strategy='most_frequent')

# One-hot encode every feature column. handle_unknown='ignore' encodes a
# category that was absent from the fitted (training) data as all zeros
# instead of raising when it shows up in a validation fold or the test set.
# NOTE(review): all columns are treated as categorical, as before; if
# mushroom.csv contains numeric columns, confirm that is intended.
encoder = OneHotEncoder(handle_unknown='ignore')

# Scale features without centering so a sparse one-hot matrix stays sparse
sc = StandardScaler(with_mean=False)

# Initialize Decision Tree Classifier
classifier = tree.DecisionTreeClassifier(random_state=42, criterion='entropy')

# Chain preprocessing and the estimator; `classifier` is fitted in place when
# `model.fit` is called, so it remains usable for tree inspection afterwards.
# Predictions on raw feature rows must go through `model`, not `classifier`.
model = Pipeline(steps=[
    ('imputer', imputer),
    ('onehot', encoder),
    ('scaler', sc),
    ('classifier', classifier),
])

# Perform cross-validation to validate the model. cross_val_score clones the
# whole pipeline per fold, so preprocessing is re-fitted on each fold's
# training part only.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

# Fit the full pipeline on the training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (Weighted):", f1)

# Classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Cross-Validation Accuracy Scores: [0.99602339 0.9954386  0.99672515 0.99660779 0.99508714]
Mean Cross-Validation Accuracy: 0.9959764139166101
Confusion Matrix:
[[ 8084    24]
 [   45 10168]]
Accuracy: 0.9962338300311119
F1 Score (Weighted): 0.9962343275098632
Classification Report:
              precision    recall  f1-score   support

           e       0.99      1.00      1.00      8108
           p       1.00      1.00      1.00     10213

    accuracy                           1.00     18321
   macro avg       1.00      1.00      1.00     18321
weighted avg       1.00      1.00      1.00     18321

