# Activity 8.01: Is the Mushroom Poisonous?

Assume we want to develop a machine learning model capable of discerning whether a particular mushroom species is poisonous or not given attributes relating to its appearance.

The objective of this activity is to employ the grid and randomized search strategies to find an optimal model for this purpose.

In [1]:
import pandas as pd 

In [2]:
mushrooms = pd.read_csv('https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter08/Dataset/agaricus-lepiota.data', header=None)


In [3]:
X_raw = mushrooms.iloc[:,1:]
y_raw = mushrooms.iloc[:,0]
y = (y_raw == 'p') * 1

In [4]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
encoder.fit(X_raw)
X = encoder.transform(X_raw).toarray()

In [5]:
# using grid search to find optimal hyperparameterization for a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

rfc = RandomForestClassifier(n_estimators=100, random_state=100)

grid = {
    'criterion' : ['gini', 'entropy'],
    'max_features' : [2, 4, 6, 8, 10, 12, 14]
}
gscv = GridSearchCV(estimator=rfc, param_grid=grid, cv=5, scoring='accuracy')

gscv.fit(X,y)


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=100),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [2, 4, 6, 8, 10, 12, 14]},
             scoring='accuracy')

In [10]:
gscv_results =pd.DataFrame(gscv.cv_results_)
gscv_results.sort_values('rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,0.40989,0.017017,0.028983,0.003032,entropy,6,"{'criterion': 'entropy', 'max_features': 6}",0.842462,1.0,0.999385,1.0,0.83867,0.936103,0.078016,1
13,0.485587,0.005161,0.025185,0.000972,entropy,14,"{'criterion': 'entropy', 'max_features': 14}",0.842462,1.0,1.0,1.0,0.821429,0.932778,0.082598,2
6,0.492136,0.012008,0.025389,0.001025,gini,14,"{'criterion': 'gini', 'max_features': 14}",0.842462,1.0,1.0,1.0,0.819581,0.932409,0.083098,3
10,0.423497,0.010722,0.026114,0.001342,entropy,8,"{'criterion': 'entropy', 'max_features': 8}",0.842462,1.0,1.0,1.0,0.806034,0.929699,0.086868,4
4,0.456774,0.007299,0.026309,0.001563,gini,10,"{'criterion': 'gini', 'max_features': 10}",0.842462,1.0,1.0,1.0,0.79803,0.928098,0.089175,5


In [8]:
# using random search to find optimal hyperparameterization
from scipy import stats

max_features = X.shape[1]
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_features': stats.randint(low=1, high=max_features)
}
rscv = RandomizedSearchCV(estimator=rfc, param_distributions=param_dist, n_iter=50, cv=5, scoring='accuracy')

rscv.fit(X,y)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=100),
                   n_iter=50,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000019F64865F70>},
                   scoring='accuracy')

In [9]:
rscv_results = pd.DataFrame(rscv.cv_results_)
rscv_results.sort_values('rank_test_score', ascending=True).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
44,1.234103,0.115426,0.024788,0.00147,gini,79,"{'criterion': 'gini', 'max_features': 79}",1.0,1.0,0.996308,1.0,0.817118,0.962685,0.072798,1
19,1.352329,0.13669,0.023391,0.001195,gini,87,"{'criterion': 'gini', 'max_features': 87}",1.0,1.0,0.996308,1.0,0.745074,0.948276,0.101611,2
21,1.284165,0.118848,0.02363,0.000861,gini,82,"{'criterion': 'gini', 'max_features': 82}",1.0,1.0,0.996308,1.0,0.737685,0.946798,0.104567,3
32,1.426627,0.146538,0.02619,0.005908,gini,93,"{'criterion': 'gini', 'max_features': 93}",1.0,1.0,0.996308,1.0,0.737685,0.946798,0.104567,3
3,1.520823,0.204488,0.02338,0.00102,gini,98,"{'criterion': 'gini', 'max_features': 98}",1.0,1.0,0.996308,1.0,0.730296,0.945321,0.107522,5
