In [47]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

# Load the dataset
dataset = pd.read_csv('mushroom.csv')

# Independent variables (all features except "class")
X = dataset.iloc[:, 1:]

# Dependent variable (just the poisonous/edible feature)
Y = dataset.iloc[:, 0].values

# Fill in missing values with the most frequent values for categorical data
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)  

# Apply OneHotEncoder to handle categorical data
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), list(range(X.shape[1])))], remainder='passthrough')
X = ct.fit_transform(X)

# Split the dataset into training and testing sets (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Scale features for comparability (using StandardScaler)
sc = StandardScaler(with_mean=False)  # Keep sparse data sparse
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)

# Initialize a Random Forest classifier
rf_classifier = RandomForestClassifier(criterion='entropy', n_estimators=200, max_depth=None, min_samples_split=5, min_samples_leaf=1, random_state=0)

# Fit the model and find the best hyperparameters
rf_classifier.fit(X_train, y_train)

# Perform cross-validation
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5, scoring='accuracy')

# Make predictions on the test set with the best model
y_pred = rf_classifier.predict(X_test)

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Calculate confusion matrix, accuracy
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:", cm)
print("Accuracy:", accuracy)


Cross-Validation Accuracy Scores: [0.99508772 0.99649123 0.99578947 0.99684174 0.99660779]
Mean Accuracy: 0.9961635894626026
Confusion Matrix: [[ 7982    13]
 [   50 10276]]
Accuracy: 0.9965613230718847
