In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,f1_score, classification_report)
warnings.filterwarnings("ignore")

In [2]:
#Task 1: Load Dataset
df = pd.read_csv("mushroom.csv")  
out_dir = "figures"                
os.makedirs(out_dir, exist_ok=True)

In [3]:
#Load & Explore Data
df.shape

(2000, 26)

In [4]:
df.columns.tolist()

['Unnamed: 0',
 'cap_shape',
 'cap_surface',
 'cap_color',
 'bruises',
 'odor',
 'gill_attachment',
 'gill_spacing',
 'gill_size',
 'gill_color',
 'stalk_shape',
 'stalk_root',
 'stalk_surface_above_ring',
 'stalk_surface_below_ring',
 'stalk_color_above_ring',
 'stalk_color_below_ring',
 'veil_type',
 'veil_color',
 'ring_number',
 'ring_type',
 'spore_print_color',
 'population',
 'habitat',
 'class',
 'stalk_height',
 'cap_diameter']

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat,class,stalk_height,cap_diameter
0,1167,sunken,scaly,white,no,anise,descending,distant,broad,pink,...,partial,brown,two,sheathing,chocolate,clustered,waste,poisonous,14.276173,5.054983
1,1037,sunken,fibrous,red,no,anise,notched,crowded,narrow,chocolate,...,universal,brown,two,sheathing,brown,numerous,waste,edible,3.952715,19.068319
2,309,flat,grooves,purple,yes,foul,descending,crowded,broad,purple,...,universal,yellow,two,sheathing,purple,abundant,waste,poisonous,9.054265,7.205884
3,282,bell,scaly,pink,yes,fishy,notched,close,broad,orange,...,partial,yellow,two,cobwebby,green,clustered,grasses,poisonous,5.226499,20.932692
4,820,flat,smooth,yellow,yes,musty,free,crowded,narrow,orange,...,universal,white,none,none,yellow,clustered,urban,poisonous,14.037532,12.545245


In [6]:
df.isna().sum()

Unnamed: 0                  0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
class                       0
stalk_height                0
cap_diameter                0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                2000 non-null   int64  
 1   cap_shape                 2000 non-null   object 
 2   cap_surface               2000 non-null   object 
 3   cap_color                 2000 non-null   object 
 4   bruises                   2000 non-null   object 
 5   odor                      2000 non-null   object 
 6   gill_attachment           2000 non-null   object 
 7   gill_spacing              2000 non-null   object 
 8   gill_size                 2000 non-null   object 
 9   gill_color                2000 non-null   object 
 10  stalk_shape               2000 non-null   object 
 11  stalk_root                2000 non-null   object 
 12  stalk_surface_above_ring  2000 non-null   object 
 13  stalk_surface_below_ring  2000 non-null   object 
 14  stalk_co

In [8]:
df.describe(include="all")

Unnamed: 0.1,Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat,class,stalk_height,cap_diameter
count,2000.0,2000,2000,2000,2000,2000,2000,2000,2000,2000,...,2000,2000,2000,2000,2000,2000,2000,2000,2000.0,2000.0
unique,,5,4,9,2,9,4,3,2,12,...,2,4,3,8,9,6,7,2,,
top,,sunken,scaly,brown,yes,foul,free,distant,narrow,orange,...,universal,white,one,sheathing,brown,abundant,urban,poisonous,,
freq,,439,568,263,1023,274,540,677,1007,216,...,1010,553,689,305,255,360,316,1400,,
mean,624.974,,,,,,,,,,...,,,,,,,,,8.449118,12.314345
std,375.091938,,,,,,,,,,...,,,,,,,,,3.697217,7.048845
min,0.0,,,,,,,,,,...,,,,,,,,,2.0,1.0
25%,290.0,,,,,,,,,,...,,,,,,,,,5.291009,5.723521
50%,607.0,,,,,,,,,,...,,,,,,,,,8.318596,12.124902
75%,957.25,,,,,,,,,,...,,,,,,,,,11.781272,18.698605


In [9]:
# Distribution plots for categorical features
for col in df.select_dtypes(include="object").columns:
    plt.figure(figsize=(6,4))
    df[col].value_counts().plot(kind="bar", color="skyblue")
    plt.title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, f"dist_{col}.png"))
    plt.close()

In [10]:
# Numerical features distribution
for col in df.select_dtypes(include=["int64","float64"]).columns:
    plt.figure(figsize=(6,4))
    df[col].hist(bins=30, color="orange", edgecolor="black")
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, f"hist_{col}.png"))
    plt.close()

In [11]:
#Task 2: Preprocessing 
X = df.drop(columns=["class"])
y = df["class"]  
cat_cols = list(X.columns)
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)])
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)

In [12]:
 X_train.shape

(1600, 25)

In [13]:
 X_test.shape

(400, 25)

In [14]:
#Task 3: Class Balance Visualization
y.value_counts().plot(kind="bar", title="Class Distribution")
plt.savefig(os.path.join(out_dir, "class_distribution.png"))
plt.close()

In [15]:
#Task 4: SVM Training & Evaluation
kernels = ["linear", "rbf", "poly"]
results = []
for ker in kernels:
    clf = Pipeline([("pre", preprocess),("svc", SVC(kernel=ker, random_state=42))])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label="poisonous")
    rec = recall_score(y_test, y_pred, pos_label="poisonous")
    f1 = f1_score(y_test, y_pred, pos_label="poisonous")

    results.append([ker, acc, prec, rec, f1])

    print(f"\n - Kernel: {ker}")
    print(classification_report(y_test, y_pred))


 - Kernel: linear
              precision    recall  f1-score   support

      edible       0.87      0.61      0.72       120
   poisonous       0.85      0.96      0.90       280

    accuracy                           0.85       400
   macro avg       0.86      0.78      0.81       400
weighted avg       0.86      0.85      0.85       400


 - Kernel: rbf
              precision    recall  f1-score   support

      edible       0.95      0.33      0.49       120
   poisonous       0.78      0.99      0.87       280

    accuracy                           0.80       400
   macro avg       0.86      0.66      0.68       400
weighted avg       0.83      0.80      0.76       400


 - Kernel: poly
              precision    recall  f1-score   support

      edible       0.97      0.54      0.70       120
   poisonous       0.83      0.99      0.91       280

    accuracy                           0.86       400
   macro avg       0.90      0.77      0.80       400
weighted avg       0.8

In [16]:
#Task 6: Hyperparameter Tuning
param_grid = {
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__C": [0.1, 1, 10],
    "svc__gamma": ["scale", 0.1, 1]
}
svc_pipe = Pipeline([
    ("pre", preprocess),
    ("svc", SVC(random_state=42))
])
grid = GridSearchCV(svc_pipe, param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
y_pred_tuned = grid.predict(X_test)
print("\n -Tuned Model Report ")
print(classification_report(y_test, y_pred_tuned))

acc = accuracy_score(y_test, y_pred_tuned)
prec = precision_score(y_test, y_pred_tuned, pos_label="poisonous")
rec = recall_score(y_test, y_pred_tuned, pos_label="poisonous")
f1 = f1_score(y_test, y_pred_tuned, pos_label="poisonous")
results.append([f"tuned({grid.best_params_['svc__kernel']})", acc, prec, rec, f1])

Best params: {'svc__C': 1, 'svc__gamma': 1, 'svc__kernel': 'rbf'}

 -Tuned Model Report 
              precision    recall  f1-score   support

      edible       1.00      0.53      0.69       120
   poisonous       0.83      1.00      0.91       280

    accuracy                           0.86       400
   macro avg       0.92      0.76      0.80       400
weighted avg       0.88      0.86      0.84       400



In [17]:
#Task 7: Comparison 
comp_df = pd.DataFrame(results, columns=["Kernel", "Accuracy", "Precision(p)", "Recall(p)", "F1(p)"])
print("\n=== Kernel Comparison ===\n", comp_df)
comp_df.to_csv(os.path.join(out_dir, "svm_comparison.csv"), index=False)
print("\nDone. Plots saved in 'figures' folder.")


=== Kernel Comparison ===
        Kernel  Accuracy  Precision(p)  Recall(p)     F1(p)
0      linear    0.8550      0.851266   0.960714  0.902685
1         rbf    0.7950      0.776536   0.992857  0.871473
2        poly    0.8575      0.834835   0.992857  0.907015
3  tuned(rbf)    0.8575      0.830861   1.000000  0.907618

Done. Plots saved in 'figures' folder.


In [18]:
##### Thanks You ####