In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [20]:
df = pd.read_csv('mushrooms.csv')
pd.set_option('display.max_columns', None)
df.head(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g
5,e,x,y,y,t,a,f,c,b,n,e,c,s,s,w,w,p,w,o,p,k,n,g
6,e,b,s,w,t,a,f,c,b,g,e,c,s,s,w,w,p,w,o,p,k,n,m
7,e,b,y,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,s,m
8,p,x,y,w,t,p,f,c,n,p,e,e,s,s,w,w,p,w,o,p,k,v,g
9,e,b,s,y,t,a,f,c,b,g,e,c,s,s,w,w,p,w,o,p,k,s,m


In [21]:
df.shape

(8124, 23)

In [22]:
df.isna().sum() # Verifing if the DataFrame contains null values

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

## Encoding text values to their numeric representations

In [23]:
label_encoder = LabelEncoder()
df_encoded = df.apply(label_encoder.fit_transform)
df_encoded.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,0,3,2,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,0,2,2,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,0,2,2,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,0,3,2,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,1,3,2,2,7,7,0,2,1,0,3,0,1


## Spliting the data into train and test sets

In [24]:
X = df_encoded.iloc[:, 1:]
y = df_encoded.iloc[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Choosing the best parameters using Randomized Search

In [25]:
AdaBoost_params = {
    'learning_rate': uniform(0.01, 1),
    'n_estimators': randint(50, 500),
    'algorithm': ['SAMME']
}

XGBoost_params = {
    'max_depth':randint(2, 10),
    'learning_rate': uniform(0.01, 1),
    'n_estimators': randint(50, 500),
    'reg_lambda': randint(0, 2),
    'colsample_bytree': uniform(0.1, 0.5),
    'eta': uniform(0.01, 1)
}

LightGBM_params = {
    'learning_rate': uniform(0.01, 1),
    'n_estimators': randint(50, 500),
    'future_fraction': uniform(0.01, 1),
    'bagging_freq': randint(2, 10),
    'num_leaves': randint(5, 50)
}

CatBoost_params = {
    'iterations': randint(10, 1000),
    'learning_rate': uniform(0.01, 1),
    'depth': randint(2, 10),
    'bagging_temperature': randint(2, 10),
    'boosting_type': ['Plain', 'Ordered'],
    'random_strength': randint(2, 10)
}

AdaBoost = AdaBoostClassifier(random_state=42)
AdaBoost_search = RandomizedSearchCV(estimator=AdaBoost, param_distributions=AdaBoost_params, n_iter=50, cv=5, scoring='accuracy', n_jobs=-1)
AdaBoost_search.fit(X_train, y_train)

XGBoost = XGBClassifier(random_state=42)
XGBoost_search = RandomizedSearchCV(estimator=XGBoost, cv=5, param_distributions=XGBoost_params, n_iter=50, scoring='accuracy', n_jobs=-1)
XGBoost_search.fit(X_train, y_train)

LightGBM = LGBMClassifier(verbosity=-1, random_state=42)
LightGBM_search = RandomizedSearchCV(estimator=LightGBM, param_distributions=LightGBM_params, cv=5, n_iter=50, scoring='accuracy', n_jobs=-1)
LightGBM_search.fit(X_train, y_train)

CatBoost = CatBoostClassifier(verbose=False, random_state=42)
CatBoost_search = RandomizedSearchCV(estimator=CatBoost, param_distributions=CatBoost_params, n_iter=50, cv=5, scoring='accuracy', n_jobs=-1)
CatBoost_search.fit(X_train, y_train)

print(f'AdaBoost best params: {AdaBoost_search.best_params_}\nXGBoost best params: {XGBoost_search.best_params_}\nLightGBM best params: {LightGBM_search.best_params_}\nCatBoost best params: {CatBoost_search.best_params_}')

AdaBoost best params: {'algorithm': 'SAMME', 'learning_rate': 0.7254036739357599, 'n_estimators': 341}
XGBoost best params: {'colsample_bytree': 0.2406025303676053, 'eta': 0.7194782984517882, 'learning_rate': 0.7606115041105427, 'max_depth': 8, 'n_estimators': 423, 'reg_lambda': 1}
LightGBM best params: {'bagging_freq': 8, 'future_fraction': 0.6240752607382307, 'learning_rate': 0.8554094926802642, 'n_estimators': 385, 'num_leaves': 5}
CatBoost best params: {'bagging_temperature': 9, 'boosting_type': 'Plain', 'depth': 8, 'iterations': 575, 'learning_rate': 0.9321142106006699, 'random_strength': 4}


## Models comparison by cross-validation score

In [26]:
models = []
models.append(('AdaBoost', AdaBoost_search.best_estimator_))
models.append(('XGBoost', XGBoost_search.best_estimator_))
models.append(('LightGBM', LightGBM_search.best_estimator_))
models.append(('CatBoost', CatBoost_search.best_estimator_))

for name, model in models:
    model.fit(X_train, y_train)
    val_sc = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f'{name} accuracy: {np.mean(val_sc)*100:.2f}%')

AdaBoost accuracy: 91.53%
XGBoost accuracy: 88.24%
LightGBM accuracy: 90.05%
CatBoost accuracy: 96.49%
