In [None]:
%pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# UCI Machine Learning Repository id of the "Secondary Mushroom" dataset
SECONDARY_MUSHROOM_UCI_ID = 848

# fetch dataset
secondary_mushroom = fetch_ucirepo(id=SECONDARY_MUSHROOM_UCI_ID)

# data (as pandas dataframes)
X = secondary_mushroom.data.features
y = secondary_mushroom.data.targets

# metadata
print(secondary_mushroom.metadata)

# variable information: left as the cell's last expression so the notebook
# renders it with its rich representation instead of plain printed text
secondary_mushroom.variables


{'uci_id': 848, 'name': 'Secondary Mushroom', 'repository_url': 'https://archive.ics.uci.edu/dataset/848/secondary+mushroom+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/848/data.csv', 'abstract': 'Dataset of simulated mushrooms for binary classification into edible and poisonous.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 61068, 'num_features': 20, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2021, 'last_updated': 'Wed Apr 10 2024', 'dataset_doi': '10.24432/C5FP5Q', 'creators': ['Dennis Wagner', 'D. Heider', 'Georges Hattab'], 'intro_paper': {'ID': 259, 'type': 'NATIVE', 'title': 'Mushroom data creation, curation, and simulation to support classification tasks', 'authors': 'Dennis Wagner, D. Heider, Georges Hattab', 'venue': 'Scientific Reports', 'year': 2021, 'journal': None, '

### Mushroom Dataset Description

**Target Variable (Binary Class):**  
- `edible = e`  
- `poisonous = p` (also includes mushrooms of unknown edibility)  

---

### Features (20 variables: nominal = n, metrical = m)

1. **cap-diameter (m):** float number in cm  
2. **cap-shape (n):**  
   - bell = b  
   - conical = c  
   - convex = x  
   - flat = f  
   - sunken = s  
   - spherical = p  
   - others = o  
3. **cap-surface (n):**  
   - fibrous = i  
   - grooves = g  
   - scaly = y  
   - smooth = s  
   - shiny = h  
   - leathery = l  
   - silky = k  
   - sticky = t  
   - wrinkled = w  
   - fleshy = e  
4. **cap-color (n):**  
   - brown = n  
   - buff = b  
   - gray = g  
   - green = r  
   - pink = p  
   - purple = u  
   - red = e  
   - white = w  
   - yellow = y  
   - blue = l  
   - orange = o  
   - black = k  
5. **does-bruise-or-bleed (n):**  
   - bruises-or-bleeding = t  
   - no = f  
6. **gill-attachment (n):**  
   - adnate = a  
   - adnexed = x  
   - decurrent = d  
   - free = e  
   - sinuate = s  
   - pores = p  
   - none = f  
   - unknown = ?  
7. **gill-spacing (n):**  
   - close = c  
   - distant = d  
   - none = f  
8. **gill-color (n):** see `cap-color` + none = f  
9. **stem-height (m):** float number in cm  
10. **stem-width (m):** float number in mm  
11. **stem-root (n):**  
    - bulbous = b  
    - swollen = s  
    - club = c  
    - cup = u  
    - equal = e  
    - rhizomorphs = z  
    - rooted = r  
12. **stem-surface (n):** see `cap-surface` + none = f  
13. **stem-color (n):** see `cap-color` + none = f  
14. **veil-type (n):**  
    - partial = p  
    - universal = u  
15. **veil-color (n):** see `cap-color` + none = f  
16. **has-ring (n):**  
    - ring = t  
    - none = f  
17. **ring-type (n):**  
    - cobwebby = c  
    - evanescent = e  
    - flaring = r  
    - grooved = g  
    - large = l  
    - pendant = p  
    - sheathing = s  
    - zone = z  
    - scaly = y  
    - movable = m  
    - none = f  
    - unknown = ?  
18. **spore-print-color (n):** see `cap-color`  
19. **habitat (n):**  
    - grasses = g  
    - leaves = l  
    - meadows = m  
    - paths = p  
    - heaths = h  
    - urban = u  
    - waste = w  
    - woods = d  
20. **season (n):**  
    - spring = s  
    - summer = u  
    - autumn = a  
    - winter = w  

---

### Class Labels
- **edible = e**  
- **poisonous = p**  


In [None]:
# Preview the feature table; long frames are shown abbreviated (first and last rows).
X

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,15.26,x,g,o,f,e,,w,16.95,17.09,s,y,w,u,w,t,g,,d,w
1,16.60,x,g,o,f,e,,w,17.99,18.19,s,y,w,u,w,t,g,,d,u
2,14.07,x,g,o,f,e,,w,17.80,17.74,s,y,w,u,w,t,g,,d,w
3,14.17,f,h,e,f,e,,w,15.77,15.98,s,y,w,u,w,t,p,,d,w
4,14.64,x,h,o,f,e,,w,16.53,17.20,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,1.18,s,s,y,f,f,f,f,3.93,6.22,,,y,,,f,f,,d,a
61065,1.27,f,s,y,f,f,f,f,3.18,5.43,,,y,,,f,f,,d,a
61066,1.27,s,s,y,f,f,f,f,3.86,6.37,,,y,,,f,f,,d,u
61067,1.24,f,s,y,f,f,f,f,3.56,5.44,,,y,,,f,f,,d,u


In [None]:
# Preview the target table (single `class` column); long frames are shown abbreviated.
y

Unnamed: 0,class
0,p
1,p
2,p
3,p
4,p
...,...
61064,p
61065,p
61066,p
61067,p


## Mushroom Classification Project

### Problem Description
The goal of this project is to classify mushrooms as **edible (e)** or **poisonous (p)** based on 20 descriptive features such as cap shape, cap color, stem characteristics, habitat, and season.  
This is a **binary classification problem** where the target variable is the mushroom class (`edible` or `poisonous`).  

---

### Project Workflow

#### 1. Exploratory Data Analysis (EDA)
- Inspect the dataset structure (rows, columns, data types).  
- Check for missing values and duplicates.  
- Visualize distributions of categorical features (bar plots) and numeric features (histograms, boxplots).  
- Explore correlations between features and the target class.  
- Look for class imbalance in the target variable (edible vs poisonous).  

#### 2. Data Cleaning & Preprocessing
- Handle missing values or unknown entries if present.  
- Encode categorical features (e.g., **One-Hot Encoding** or **Label Encoding**).  
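- Scale numerical features (e.g., **StandardScaler** or **MinMaxScaler**), fitting the scaler on the training set only so that no test-set information leaks into training.  
- Stratify the split into **training and test sets** so that both preserve the original class proportions.  
- If class imbalance is significant, apply oversampling techniques (e.g., **SMOTE**) to the training set only; the test set must keep its original distribution.  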

#### 3. Feature Selection
To reduce dimensionality and improve generalization, apply feature selection techniques such as:  
- **Filter Methods:** Use statistical tests (e.g., Chi-square, ANOVA F-test, mutual information) to rank features.  
- **Wrapper Methods:** Apply Recursive Feature Elimination (**RFE**) with models like Logistic Regression or Decision Trees.  
- **Embedded Methods:** Leverage the feature importances or coefficients learned by the model itself (e.g., Decision Tree, Random Forest, L1-regularized (Lasso) Logistic Regression).  
- Compare model performance before and after feature selection to evaluate impact.  

#### 4. Model Training & Evaluation
Train and evaluate the following classification models:  
- **Logistic Regression**  
- **Decision Tree**  
- **Random Forest**  
- **Support Vector Machine (SVM)**  
- **K-Nearest Neighbors (KNN)**  
- **Naive Bayes**  

For each model:  
- Train on the **training set**.  
- Evaluate on the **test set** using:  
  - **Classification Report** (Precision, Recall, F1-score, Accuracy)  
  - **Confusion Matrix**  
  - **Cross-validation (optional)** for stability check  

#### 5. Hyperparameter Tuning
To improve model performance, apply hyperparameter optimization techniques:  
- **Grid Search:** Exhaustively search all combinations of parameters (good for small parameter spaces).  
- **Random Search:** Randomly sample parameter combinations (more efficient for large parameter spaces).  
- **Bayesian Optimization:** Iteratively choose hyperparameters based on past performance (efficient for complex models like Random Forest, SVM, or KNN).  

Compare tuned models with baseline models to measure improvement.  

#### 6. Feature Importance
- Extract **feature importance** from Decision Tree and Random Forest models.  
- Rank features to identify the most influential ones for classification.  
- Use feature importance as a basis for feature selection and model simplification.  

#### 7. Model Comparison
- Compare performance across all models (baseline, tuned, and with feature selection).  
- Use metrics such as accuracy, precision, recall, F1-score, and ROC-AUC to decide the best-performing model.
- Plot the ROC curve of every model on shared axes, using a different color per model and reporting each model's AUC, for direct comparison.  
- Summarize results in a **comparison table** or **bar plot**.  

---

#### 8. Save best models as pickle files

---

### Final Deliverables
- Preprocessed dataset ready for classification.  
- Performance evaluation of six classification models.  
- Insights on class imbalance and handling techniques.  
- Feature selection results (filter, wrapper, and embedded methods).  
- Hyperparameter tuning results from Grid Search, Random Search, and Bayesian Optimization.  
- Feature importance ranking for better model interpretability.  
- Final comparison and recommendation of the best-performing model for mushroom classification.  
- Pickle files containing the best saved models.  
