In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset
dataset = pd.read_csv('mushroom.csv')

X = dataset.iloc[:, 1:].values  # Independent variables, all features except "class"
Y = dataset.iloc[:, 0].values  # Dependent variable, just the poisonous/edible feature

# Fill in missing values with the most frequent value for categorical data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)  

from sklearn.preprocessing import LabelEncoder
# Use LabelEncoder to convert categorical features to integer labels
label_encoders = []
for col in range(X.shape[1]):
    le = LabelEncoder()  # Create a new label encoder for each column
    X[:, col] = le.fit_transform(X[:, col])  # Transform each column using the label encoder
    label_encoders.append(le)  # Save the label encoder for possible inverse transformation later

# Label encode the target variable as well
y_encoder = LabelEncoder()  # Encoder for the target variable
Y = y_encoder.fit_transform(Y)

# Split the dataset into training and testing sets, 80/20
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Scale features for comparability
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  # Now that it's dense, you can use with_mean=True
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)     

# Fit the Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Metrics to evaluate model performance
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score

# Cross-validation with 5-folds
cv_scores = cross_val_score(classifier, X, Y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Confusion Matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# F1 Score
f1 = f1_score(y_test, y_pred, average='weighted')  # 'Weighted' for class imbalance
print("F1 Score (Weighted):", f1)

# Classification Report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)



Cross-Validation Accuracy Scores: [0.53561487 0.46921565 0.61535942 0.6962502  0.55571932]
Mean Accuracy: 0.5744318932260599
Confusion Matrix:
[[4605  697]
 [4187 2725]]
Accuracy: 0.6001309972163091
F1 Score (Weighted): 0.5821174894357489
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.87      0.65      5302
           1       0.80      0.39      0.53      6912

    accuracy                           0.60     12214
   macro avg       0.66      0.63      0.59     12214
weighted avg       0.68      0.60      0.58     12214

