In [3]:
import pandas as pd
import numpy as np

## Step 1. Load "ModifiedEdibleMushroom.csv" data from the link below (note: this data set has been preliminarily prepared).

In [4]:
# Disabled alternative: read mushroom2020_dataset.csv straight from the course
# GitHub repository instead of from a local copy (requires network access).
#df = pd.read_csv('https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv')

In [5]:
# Load the mushroom data set.
# Prefer the local copy in the working directory; when it is absent (for
# example on a fresh checkout) fall back to the file hosted in the course
# repository so that "Restart Kernel -> Run All" still works.
DATA_FILE = 'mushroom2020_dataset.csv'
DATA_URL = 'https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv'

try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # No local copy available: read the hosted file (needs network access).
    df = pd.read_csv(DATA_URL)

In [6]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [7]:
# Size of the raw frame as a (rows, columns) tuple.
(len(df.index), len(df.columns))

(5824, 24)

## [From step 1] Before doing the data preparation, how many missing ("na") values are there in the "gill-size" variable?

In [8]:
# Count the missing entries in the 'gill-size' column before any cleaning.
df['gill-size'].isna().sum()

121

## Step 2. Drop rows where the target (label) variable is missing.

In [None]:
# Keep only the rows whose target ('label') value is present.
has_label = df['label'].notna()
df = df.loc[has_label]

In [None]:
# Missing-value count per column after removing the unlabeled rows.
df.isna().sum()

id                               0
label                            0
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

## Step 3. Drop the following variables:

'id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate',

'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type'

In [None]:
# Columns removed in Step 3: the id plus the gill, stalk and veil attributes listed there.
dropped_columns = [
    'id',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color-rate',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring-rate',
    'stalk-color-below-ring-rate',
    'veil-color-rate',
    'veil-type',
]
df = df.drop(columns=dropped_columns)

## Step 4. Examine the number of rows, the number of columns, and whether any values are missing.  [From step 4] How many rows of data and how many variables are there?

In [None]:
# Remaining (rows, columns) once the Step 3 columns are gone.
(len(df.index), len(df.columns))

(5764, 12)

In [None]:
# Number of distinct non-missing values in every column.
df.nunique(axis=0, dropna=True)

label                 2
cap-shape             5
cap-surface           4
bruises               2
odor                  9
stalk-shape           2
ring-number           3
ring-type             5
spore-print-color     8
population            6
habitat               7
cap-color-rate       10
dtype: int64

In [None]:
# Missing-value count per remaining column, before imputation.
df.isna().sum()

label                  0
cap-shape              0
cap-surface           27
bruises               99
odor                  99
stalk-shape          121
ring-number           62
ring-type             62
spore-print-color     56
population            56
habitat               31
cap-color-rate        27
dtype: int64

## Step 5. Fill missing values with the mean for numeric variables and the mode for nominal variables.

In [None]:
# Impute missing values: column mean for numeric variables, column mode for
# every other (nominal) variable.
#
# Columns are selected by the generic 'number' dtype family rather than a
# hard-coded ['float64', 'int64'] / ['object'] list, so numeric columns stored
# with another width and nominal columns stored as something other than
# `object` are not silently skipped.
#
# NOTE(review): the means/modes are computed on the whole frame, i.e. before
# the train/test split performed in Step 8, so test rows influence the imputed
# values. Kept as is to follow the step order of the exercise.
numeric_columns = df.select_dtypes(include='number').columns
nominal_columns = df.select_dtypes(exclude='number').columns

df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# mode() yields one row per tied mode; row 0 holds the first mode of each
# column. Skip the fill when there is nothing to take it from (no nominal
# columns, or no observed values at all) instead of failing on .iloc[0].
nominal_modes = df[nominal_columns].mode()
if not nominal_modes.empty:
    df[nominal_columns] = df[nominal_columns].fillna(nominal_modes.iloc[0])

In [None]:
# Confirm that imputation left no missing values behind.
df.isna().sum()

label                0
cap-shape            0
cap-surface          0
bruises              0
odor                 0
stalk-shape          0
ring-number          0
ring-type            0
spore-print-color    0
population           0
habitat              0
cap-color-rate       0
dtype: int64

## Step 6. Convert the label variable from e (edible) to 1 and from p (poisonous) to 0, then check the number of rows in each class (class0:class1).

In [None]:
# Encode the target: edible ('e') -> 1, poisonous ('p') -> 0.
#
# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time (labels already 0/1) would silently wipe the
# target column. Map only raw letter labels, leave an already-encoded column
# untouched, and fail loudly on anything else.
label_codes = {'e': 1, 'p': 0}

if df['label'].isin(list(label_codes.keys())).all():
    df['label'] = df['label'].map(label_codes)
elif not df['label'].isin(list(label_codes.values())).all():
    raise ValueError(
        "Unexpected values in 'label': "
        f"{sorted(map(str, df['label'].unique()))}; expected only 'e'/'p' (or already-encoded 1/0)."
    )

## [From step 6] What are the class quantities, class0:class1?

In [None]:
# Tally the rows per encoded class and report each class on its own line.
class_counts = df['label'].value_counts()
n_poisonous = class_counts.loc[0]
n_edible = class_counts.loc[1]
print("Class 0 (poisonous) quantity:", n_poisonous)
print("Class 1 (edible) quantity:", n_edible)

Class 0 (poisonous) quantity: 3660
Class 1 (edible) quantity: 2104


## Step 7. Convert the nominal variables to numeric using dummy coding with drop_first = True.

In [None]:
# Names of the columns still stored as Python objects; these get dummy-coded next.
nominal_columns = df.select_dtypes(include='object').columns

In [None]:
# One-hot encode the nominal columns, dropping the first level of each
# variable (drop_first=True) to avoid redundant indicator columns.
# dtype is pinned to 'uint8' so the indicators are 0/1 integers regardless of
# the installed pandas version (newer releases default to bool, which would
# render as True/False instead of the 0/1 values shown below).
df = pd.get_dummies(df, columns=nominal_columns, drop_first=True, dtype='uint8')

In [None]:
# Preview the first five rows of the dummy-coded frame.
df.head(n=5)

Unnamed: 0,label,cap-color-rate,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,bruises_t,...,population_n,population_s,population_v,population_y,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1.0,0,0,0,1,0,1,0,1,...,0,1,0,0,0,0,0,0,1,0
1,1,2.0,0,0,0,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,0
2,1,3.0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,0,1,0,0,0
3,0,3.0,0,0,0,1,0,0,1,1,...,0,1,0,0,0,0,0,0,1,0
4,1,4.0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


## Step 8. Split train/test with 20% test, stratify, and seed = 2020.

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the encoded label.
X, y = df.drop(columns='label'), df['label']

# Hold out 20% of the rows for testing, preserving the class ratio (stratify)
# and fixing the shuffle with seed 2020.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2020,
    stratify=y,
)

## [From step 8] How large are the training and testing sets?

In [None]:
# Report the dimensions of each piece produced by the split.
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

X_train shape: (4611, 42)
X_test shape: (1153, 42)
y_train shape: (4611,)
y_test shape: (1153,)


## Step 9. Create a Random Forest with GridSearch on the training data with 5-fold CV.

'criterion':['gini','entropy']
'max_depth': [2,3,6]
'min_samples_leaf':[2,5,10]
'n_estimators':[100,200]
'random_state': 2020

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid from Step 9. random_state is a single-value entry so
# every candidate forest is built with the same seed (and it shows up in
# best_params_).
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 6],
    min_samples_leaf=[2, 5, 10],
    n_estimators=[100, 200],
    random_state=[2020],
)

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
rf_classifier = RandomForestClassifier()
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)


## [From step 9] best params after doing random forest grid search

In [None]:
# Show the parameter combination that won the cross-validated search.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'n_estimators': 100, 'random_state': 2020}


## Step 10. Predict the testing data set and report the confusion_matrix and classification_report. [From step 10] What is the value of macro F1 (beware of the number of digits!) using scientific rounding (less than 0.5 is rounded down, more than 0.5 is rounded up)?

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Score the held-out test split with the best forest found by the grid search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute all three summaries first, then print them in order.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Confusion Matrix:")
print(conf_matrix)

print("\nClassification Report:")
print(class_report)

# Unweighted mean of the per-class F1 scores, rounded to two decimals.
print("Macro F1 Score:",round(macro_f1, 2))

Confusion Matrix:
[[731   1]
 [  5 416]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       732
           1       1.00      0.99      0.99       421

    accuracy                           0.99      1153
   macro avg       1.00      0.99      0.99      1153
weighted avg       0.99      0.99      0.99      1153

Macro F1 Score: 0.99
