In [135]:
# Install MLflow (used further down for run tracking) into the notebook environment.
# --quiet suppresses pip's progress output.
# NOTE(review): the package version is not pinned and the legacy resolver is requested — confirm both are intentional.
!pip install mlflow --quiet --use-deprecated=legacy-resolver

In [136]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

## Step 1. Load "ModifiedEdibleMushroom.csv" data from the link below (note: this data set has been preliminarily prepared.).

In [137]:
# Location of the raw mushroom CSV (fetched over HTTPS from raw.githubusercontent.com).
DATA_URL = 'https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv'
df = pd.read_csv(DATA_URL)

In [138]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [139]:
# (rows, columns) of the frame exactly as loaded, before any cleaning.
df.shape

(5824, 24)

## [From step 1] Before doing any data preparation, how many missing ("NA") values are there in the "gill-size" variable?

In [140]:
# Count the missing entries in the 'gill-size' column of the untouched data.
df['gill-size'].isna().sum()

121

## 2. Drop rows where the target (label) variable is missing.

In [141]:
# Keep only the rows whose target value is present; rows without a label cannot be used for training.
has_label = df['label'].notna()
df = df[has_label]

In [142]:
# Missing-value count per column after removing the unlabeled rows.
df.isna().sum()

id                               0
label                            0
cap-shape                        0
cap-surface                     27
bruises                         99
odor                            99
gill-attachment                 99
gill-spacing                   130
gill-size                      121
stalk-shape                    121
stalk-root                      31
stalk-surface-above-ring        31
stalk-surface-below-ring        31
veil-type                       62
ring-number                     62
ring-type                       62
spore-print-color               56
population                      56
habitat                         31
cap-color-rate                  27
gill-color-rate                121
veil-color-rate                 62
stalk-color-above-ring-rate     31
stalk-color-below-ring-rate     62
dtype: int64

## Step 3. Drop the following variables:

'id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate',

'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type'

In [143]:
# Columns removed in Step 3: the row identifier plus the listed gill, stalk and veil attributes.
columns_to_drop = [
    'id',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color-rate',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring-rate',
    'stalk-color-below-ring-rate',
    'veil-color-rate',
    'veil-type',
]
df = df.drop(columns=columns_to_drop)

## Step 4. Examine the number of rows, the number of columns, and whether any values are missing.  [From step 4] How many rows of data and how many variables are there?

In [144]:
# (rows, columns) left after dropping unlabeled rows and the Step 3 columns.
df.shape

(5764, 12)

In [145]:
# Number of distinct values in each remaining column (shows the cardinality of every nominal feature).
df.nunique()

label                 2
cap-shape             5
cap-surface           4
bruises               2
odor                  9
stalk-shape           2
ring-number           3
ring-type             5
spore-print-color     8
population            6
habitat               7
cap-color-rate       10
dtype: int64

In [146]:
# Missing-value count per remaining column; these gaps are what the imputers have to fill.
df.isna().sum()

label                  0
cap-shape              0
cap-surface           27
bruises               99
odor                  99
stalk-shape          121
ring-number           62
ring-type             62
spore-print-color     56
population            56
habitat               31
cap-color-rate        27
dtype: int64

## Step 5. Fill missing values by imputing the mean for numeric variables and the mode for nominal variables

In [147]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# The target is removed before the dtype lookup, so this cell yields the same
# column lists whether 'label' still holds 'e'/'p' strings or has already been
# encoded as integers (i.e. it is safe to re-run after Step 6).
feature_frame = df.drop(columns=['label'])
num_cols = feature_frame.select_dtypes(include=['float64', 'int64']).columns
cat_cols = feature_frame.select_dtypes(include=['object']).columns

# Numeric features: replace missing values with the column mean.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean'))])

# Nominal features: replace missing values with the most frequent category,
# then one-hot encode. drop='first' omits one indicator column per feature.
# handle_unknown='ignore' encodes a category that was absent when the encoder
# was fitted (possible inside a CV fold or on the test split) as all zeros
# instead of raising an error.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(drop='first', handle_unknown='ignore'))])

## Step 6. Convert the label variable from e (edible) to 1 and from p (poisonous) to 0, then check the class counts (class 0 : class 1)

In [148]:
# Encode the target: edible ('e') -> 1, poisonous ('p') -> 0.
# Series.map with a plain dict turns every value that is not a key into NaN,
# so running the cell a second time would wipe out the already-encoded labels.
# Falling back to the value itself keeps 0/1 untouched and makes the cell idempotent.
label_codes = {'e': 1, 'p': 0}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly here if the column holds anything other than the two expected classes.
assert df['label'].isin([0, 1]).all(), "unexpected value in 'label' column"

## [From step 6] Report the number of samples in class 0 versus class 1

In [149]:
# Tally the samples per encoded class and report both counts.
class_counts = df['label'].value_counts()
for class_id, description in ((0, "poisonous"), (1, "edible")):
    print(f"Class {class_id} ({description}) quantity:", class_counts[class_id])

Class 0 (poisonous) quantity: 3660
Class 1 (edible) quantity: 2104


## Step 7. Create column transformer

In [150]:
# Route each column group through its matching preprocessing pipeline.
# Any column that appears in neither list is passed through unchanged.
preprocessing_steps = [
    ('num_pipeline', num_pipeline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols),
]
col_trans = ColumnTransformer(transformers=preprocessing_steps, remainder='passthrough')

## Step 8. Split train/test with 20% test, stratify, and seed = 2020.

In [151]:
# Separate the target from the predictors.
y = df['label']
X = df.drop(columns=['label'])

# Hold out 20% of the rows. Stratifying on y keeps the class ratio equal in both
# partitions, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=2020)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## [From step 8] How large are the training and testing sets?

In [152]:
# Report the dimensions of every partition produced by the split.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_data.shape)

X_train shape: (4611, 11)
X_test shape: (1153, 11)
y_train shape: (4611,)
y_test shape: (1153,)


## Step 9-10. Grid search + fit + MLflow

- 'criterion': ['gini', 'entropy']
- 'max_depth': [2, 3, 6]
- 'min_samples_leaf': [2, 5, 10]
- 'n_estimators': [100, 200]
- 'random_state': 2020

In [153]:
# Tune a random forest with an exhaustive grid search and record the result in MLflow.
# Everything logged inside the `with` block is attached to a single MLflow run.
with mlflow.start_run():
    model = RandomForestClassifier()
    # Preprocessing is part of the pipeline, so the imputers and the encoder are
    # re-fitted on the training portion of every cross-validation fold.
    model_pipeline = Pipeline(steps=[
        ('col_trans', col_trans),
        ('model', model)
    ])

    # The 'model__' prefix routes each setting to the 'model' step of the pipeline.
    # random_state is a single-value entry so that every candidate uses the same seed.
    param_grid = {
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [2, 3, 6],
        'model__min_samples_leaf': [2, 5, 10],
        'model__n_estimators': [100, 200],
        'model__random_state': [2020]
    }

    # n_jobs=-1 evaluates the candidate/fold fits in parallel; the seed is fixed,
    # so the selected parameters do not depend on the degree of parallelism.
    gs_pipeline = GridSearchCV(
        model_pipeline,
        param_grid,
        cv=StratifiedKFold(n_splits=5),
        n_jobs=-1,
    )
    gs_pipeline.fit(X_train, y_train)

    # Record the winning hyper-parameters in one batched call.
    mlflow.log_params(gs_pipeline.best_params_)

    # Evaluate the refitted best pipeline on the held-out test split.
    y_pred = gs_pipeline.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=['Poisonous', 'Edible'], digits=4))
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric('accuracy', accuracy)
    mlflow.sklearn.log_model(gs_pipeline.best_estimator_, 'random_forest_model')

    print("Best parameter set:", gs_pipeline.best_params_)

# Display the table of recorded runs as the cell output.
mlflow.search_runs()



              precision    recall  f1-score   support

   Poisonous     0.9932    0.9986    0.9959       732
      Edible     0.9976    0.9881    0.9928       421

    accuracy                         0.9948      1153
   macro avg     0.9954    0.9934    0.9944      1153
weighted avg     0.9948    0.9948    0.9948      1153

Best parameter set: {'model__criterion': 'gini', 'model__max_depth': 6, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'model__random_state': 2020}


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,params.model__max_depth,params.model__criterion,params.model__n_estimators,params.model__random_state,params.model__min_samples_leaf,tags.mlflow.user,tags.mlflow.source.name,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.log-model.history
0,2df48206416e4b43ab9fd7df951fbc11,0,FINISHED,file:///content/mlruns/0/2df48206416e4b43ab9fd...,2024-02-10 14:58:28.643000+00:00,2024-02-10 15:00:03.054000+00:00,0.994796,6,gini,100,2020,2,root,/usr/local/lib/python3.10/dist-packages/colab_...,righteous-gull-459,LOCAL,"[{""run_id"": ""2df48206416e4b43ab9fd7df951fbc11""..."
1,5eb131076d8b4f009cee3101afb5e062,0,FINISHED,file:///content/mlruns/0/5eb131076d8b4f009cee3...,2024-02-10 14:55:06.328000+00:00,2024-02-10 14:56:29.123000+00:00,0.994796,6,gini,100,2020,2,root,/usr/local/lib/python3.10/dist-packages/colab_...,bald-colt-574,LOCAL,"[{""run_id"": ""5eb131076d8b4f009cee3101afb5e062""..."
