In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV

# Read the mushroom table from disk
dataset = pd.read_csv('mushroom.csv')

# Column 0 is the "class" label (poisonous/edible); every later column is a feature
target_column = dataset.iloc[:, 0]
feature_frame = dataset.iloc[:, 1:]

Y = target_column.values  # Dependent variable
X = feature_frame.values  # Independent variables

# Replace missing feature values with the most frequent value of their column
# NOTE(review): the imputer and the encoders are fit on the full dataset,
# i.e. before the train/test split further down -- confirm this is acceptable.
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

# One LabelEncoder per feature column, kept in column order so that
# label_encoders[i] can later inverse-transform column i
# NOTE(review): integer codes impose an ordering on the feature values;
# verify that this suits the downstream model.
n_features = X.shape[1]
label_encoders = [LabelEncoder() for _ in range(n_features)]
for idx, encoder in enumerate(label_encoders):
    # Overwrite the column in place with its integer codes
    X[:, idx] = encoder.fit_transform(X[:, idx])

# The target gets its own encoder so predictions can be mapped back to labels
y_encoder = LabelEncoder()
Y = y_encoder.fit_transform(Y)

# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Rescale the features using statistics learned from the training rows only;
# with_mean=False means the features are scaled but not centered.
# NOTE(review): with_mean=False is normally chosen to keep sparse input sparse,
# but X is a dense NumPy array at this point -- confirm that skipping the
# centering step is intentional.
sc = StandardScaler(with_mean=False)
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)  # Reuse the training statistics; never refit on test data

# RBF-kernel support vector classifier with fixed hyperparameters.
# C=1 and kernel='rbf' match scikit-learn's defaults; gamma=1 is an explicit
# override (the library default is 'scale').
classifier = SVC(C=1, gamma=1, kernel='rbf', random_state=0)

# Fit the SVM on the full training set
classifier.fit(X_train, y_train)

# 5-fold cross-validation on the training set as a sanity check of the fit
# NOTE(review): X_train was scaled before this call, so each fold's validation
# part has already influenced the scaling statistics -- confirm this is acceptable.
cv_scores = cross_val_score(
    classifier,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

# Predict labels for the held-out test rows with the fitted classifier
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", confusion_mat, sep="\n")

# Fraction of test rows classified correctly
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class F1 averaged with weights proportional to class support,
# so an imbalanced class distribution is accounted for
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score (Weighted):", f1)

# Per-class precision / recall / F1 table
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:", classification_rep, sep="\n")


Cross-Validation Accuracy Scores: [0.99959063 0.99969297 0.99969297 0.99989766 0.99979531]
Mean Cross-Validation Accuracy: 0.9997339064578856
Confusion Matrix:
[[5301    1]
 [   0 6912]]
Accuracy: 0.9999181267398067
F1 Score (Weighted): 0.9999181258400115
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5302
           1       1.00      1.00      1.00      6912

    accuracy                           1.00     12214
   macro avg       1.00      1.00      1.00     12214
weighted avg       1.00      1.00      1.00     12214

