In [17]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

# Load the dataset
dataset = pd.read_csv('mushroom.csv')

X = dataset.iloc[:, 1:].values  # Independent variables, all features except "class"
Y = dataset.iloc[:, 0].values  # Dependent variable, just the poisonous/edible feature

# Fill in missing values by most frequent variable for categorical data
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

# Handle categorical data with OneHotEncoder
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), list(range(X.shape[1])))],
    remainder='passthrough'
)

# Apply the transformation
X = ct.fit_transform(X)

# Split the dataset into training and testing sets, 70/30
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Scale features for comparability
sc = StandardScaler(with_mean=False)  # Don't center sparse data
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Initialize Logistic Regression classifier
classifier = LogisticRegression(random_state=42, max_iter=1500, C=0.01, solver='lbfgs', penalty='l2')

# Fit the model
classifier.fit(X_train, y_train)

# Cross-validation scores
cv_scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Confusion Matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# F1 Score
f1 = f1_score(y_test, y_pred, average='weighted')  # 'Weighted' for class imbalance
print("F1 Score (Weighted):", f1)

# Classification Report
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)

Cross-Validation Accuracy Scores: [0.82538012 0.82760234 0.81754386 0.83331384 0.82465785]
Mean Accuracy: 0.8256996016770198
Confusion Matrix:
[[6587 1521]
 [1493 8720]]
Accuracy: 0.8354893291850881
F1 Score (Weighted): 0.8354596554979984
Classification Report:
              precision    recall  f1-score   support

           e       0.82      0.81      0.81      8108
           p       0.85      0.85      0.85     10213

    accuracy                           0.84     18321
   macro avg       0.83      0.83      0.83     18321
weighted avg       0.84      0.84      0.84     18321

