In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
path = 'https://raw.githubusercontent.com/dsrscientist/dataset1/master/mushrooms.csv'
data = pd.read_csv(path)
data.sample(10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
5854,p,x,y,n,f,f,f,c,n,b,...,k,p,w,p,w,o,e,w,v,d
5208,p,x,y,e,f,y,f,c,n,b,...,s,p,w,p,w,o,e,w,v,l
3110,p,x,f,g,f,c,f,c,n,u,...,s,w,w,p,w,o,p,k,v,d
4544,p,f,f,g,f,f,f,c,b,p,...,k,p,p,p,w,o,l,h,v,d
4121,e,f,y,e,t,n,f,c,b,u,...,s,g,g,p,w,o,p,k,v,d
1005,e,x,s,g,f,n,f,w,b,p,...,f,w,w,p,w,o,e,n,a,g
8043,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,n,o,p,b,v,l
5794,e,f,y,n,f,n,f,w,n,w,...,f,w,n,p,w,o,e,w,v,l
6324,p,f,y,e,f,s,f,c,n,b,...,s,p,w,p,w,o,e,w,v,p
879,e,b,s,y,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,g


In [3]:
data['class'].value_counts()

e    4208
p    3916
Name: class, dtype: int64

From the above observation we can say that there is not much difference between the classes of the target variable hence there is no problem of class imbalance.

In [4]:
data.shape

(8124, 23)

In [5]:
data.dtypes


class                       object
cap-shape                   object
cap-surface                 object
cap-color                   object
bruises                     object
odor                        object
gill-attachment             object
gill-spacing                object
gill-size                   object
gill-color                  object
stalk-shape                 object
stalk-root                  object
stalk-surface-above-ring    object
stalk-surface-below-ring    object
stalk-color-above-ring      object
stalk-color-below-ring      object
veil-type                   object
veil-color                  object
ring-number                 object
ring-type                   object
spore-print-color           object
population                  object
habitat                     object
dtype: object

In [6]:
data.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

From the above observations we can say that there are no null values in our dataset and also all the columns are of object type which needs to be converted.

# Split the Data into X and y

In [7]:
X = data.drop('class', axis = 1)
y = data['class']

# Label Encoding for all the object variable

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y  = le.fit_transform(y)

In [9]:
collist = X.columns

In [10]:
for i in collist:
    X[i] = le.fit_transform(X[i])

In [11]:
X

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,3,2,4,0,5,0,0,0,11,0,...,2,5,5,0,1,1,4,0,1,2
8120,5,2,4,0,5,0,0,0,11,0,...,2,5,5,0,0,1,4,0,4,2
8121,2,2,4,0,5,0,0,0,5,0,...,2,5,5,0,1,1,4,0,1,2
8122,3,3,4,0,8,1,0,1,0,1,...,1,7,7,0,2,1,0,7,4,2


# Getting the best random state

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

maxAccu = 0
maxRS = 0
for i in range(200):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=i)
    LR = LogisticRegression()
    LR.fit(X_train, y_train)
    pred = LR.predict(X_test)
    acc = accuracy_score(y_test, pred)
    if acc>maxAccu:
        maxAccu = acc
        maxRS = i
print('Best Accuracy is', maxAccu, 'on Random state:', maxRS)

Best Accuracy is 0.9585726004922067 on Random state: 21


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=21)

In [14]:
from sklearn.neighbors import KNeighborsClassifier
classifier_KNC = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier_KNC.fit(X_train, y_train)
y_pred = classifier_KNC.predict(X_test)
acc_KNC = accuracy_score(y_test, y_pred)
acc_KNC

0.9971287940935193

In [15]:
from sklearn.svm import SVC
classifier_SVC = SVC()
classifier_SVC.fit(X_train, y_train)
y_pred = classifier_SVC.predict(X_test)
acc_SVC = accuracy_score(y_test, y_pred)
acc_SVC

0.9885151763740772

In [16]:
from sklearn.naive_bayes import GaussianNB
classifier_NB = GaussianNB()
classifier_NB.fit(X_train, y_train)
y_pred = classifier_NB.predict(X_test)
acc_NB = accuracy_score(y_test, y_pred)
acc_NB

0.9232977850697293

In [17]:
LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
acc = accuracy_score(y_test, pred)
acc

0.9585726004922067

In [18]:
from sklearn.tree import DecisionTreeClassifier
classifier_DT = DecisionTreeClassifier()
classifier_DT.fit(X_train, y_train)
y_pred = classifier_DT.predict(X_test)
acc_DT = accuracy_score(y_test, y_pred)
acc_DT

1.0

In [19]:
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier()
classifier_RF.fit(X_train, y_train)
y_pred = classifier_RF.predict(X_test)
acc_RF = accuracy_score(y_test, y_pred)
acc_RF

1.0

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
cross_KNC = cross_val_score(classifier_KNC, X, y, cv = 5, scoring = 'accuracy').mean()
print('Cross validation score of KNeighborsClassifier: ',cross_KNC)

Cross validation score of KNeighborsClassifier:  0.8815549071618036


In [22]:
cross_SVC = cross_val_score(classifier_SVC, X, y, cv = 5, scoring = 'accuracy').mean()
print('Cross validation score of SVC: ',cross_SVC)

Cross validation score of SVC:  0.841292231906025


In [23]:
cross_NB = cross_val_score(classifier_NB, X, y, cv = 5, scoring = 'accuracy').mean()
print('Cross validation score of Naive Bayes: ',cross_NB)

Cross validation score of Naive Bayes:  0.7259685486926866


In [24]:
cross_LR = cross_val_score(LR, X, y, cv = 5, scoring = 'accuracy').mean()
print('Cross validation score of Logistic Regression: ',cross_LR)

Cross validation score of Logistic Regression:  0.8392256915498295


In [25]:
cross_DT = cross_val_score(classifier_DT, X, y, cv = 5, scoring = 'accuracy').mean()
print('Cross validation score of Decision Tree: ',cross_DT)

Cross validation score of Decision Tree:  0.9192263736263737


In [26]:
cross_RF = cross_val_score(classifier_RF, X, y, cv = 5, scoring = 'accuracy').mean()
print('Cross validation score of Random Forest Classifier: ',cross_RF)

Cross validation score of Random Forest Classifier:  0.8929999242137173


In [27]:
print('Difference between accuracy of KNeighborsClassifier: ',acc_KNC - cross_KNC )
print('Difference between accuracy of SVC: ',acc_SVC - cross_SVC)
print('Difference between accuracy of GaussianNB: ',acc_NB - cross_NB)
print('Difference between accuracy of LogisticRegression: ',acc - cross_LR)
print('Difference between accuracy of Decision Tree: ',acc_DT - cross_DT)
print('Difference between accuracy of Random Forest Classifier: ',acc_RF - cross_RF)

Difference between accuracy of KNeighborsClassifier:  0.11557388693171566
Difference between accuracy of SVC:  0.14722294446805217
Difference between accuracy of GaussianNB:  0.1973292363770427
Difference between accuracy of LogisticRegression:  0.1193469089423772
Difference between accuracy of Decision Tree:  0.08077362637362628
Difference between accuracy of Random Forest Classifier:  0.10700007578628268


From the above observation we can say that Decision tree clasifier has the least difference between the accuracy and cross val score so we select Decision tree clasifier as our best model.

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
classifier_DT = DecisionTreeClassifier()
classifier_DT.fit(X_train, y_train)
y_pred = classifier_DT.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


1.0
[[1268    0]
 [   0 1170]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1268
           1       1.00      1.00      1.00      1170

    accuracy                           1.00      2438
   macro avg       1.00      1.00      1.00      2438
weighted avg       1.00      1.00      1.00      2438



Hence from above observation we can say that Decision Tree is our best model

# Saving the model

In [29]:
import pickle
file='mushroom_DT.pkl'
pickle.dump(classifier_DT,open(file,'wb'))

In [32]:
#loading back the model
readme = open(file, 'rb')
model = pickle.load(readme)

In [33]:
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [34]:
model.predict(X_test)

array([0, 0, 1, ..., 0, 0, 1])