In [23]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score
from sklearn.tree import export_graphviz
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [24]:
# Read the mushroom dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="mushrooms.csv")

In [25]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [26]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [27]:
# Integer-encode every column (the features and the `class` target) in place.
# NOTE(review): LabelEncoder assigns arbitrary integer codes and scikit-learn
# documents it for target values; OrdinalEncoder / OneHotEncoder may suit the
# feature columns better -- confirm before relying on distance-based models.
labelencoder = LabelEncoder()
for column in df.columns:
    codes = labelencoder.fit_transform(df[column])
    df[column] = codes

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [28]:
# Remove `veil-type`: per the author's note it holds a single value, so it
# carries no information for the analysis.
df = df.drop(columns=["veil-type"])

In [29]:
# Class balance of the target column.
df.loc[:, "class"].value_counts()

0    4208
1    3916
Name: class, dtype: int64

In [30]:
# Separate the target (`class`) from the feature columns.
y = df["class"]
X = df.drop(columns=["class"])


In [31]:
# Hold out a test split with the default test fraction; the fixed seed keeps
# the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=23,
)

In [32]:
# Standardise the features using statistics learned from the training split
# only. Calling fit_transform on the full matrix would refit the scaler on the
# test rows as well, leaking their mean/variance into the model inputs.
scaler = StandardScaler()
scaler.fit(X_train)

# Scaled copy of every row, in the original row order of X. It is re-split
# downstream with the same random_state, which reproduces the same
# train/test rows as the split the scaler was fitted on.
X2 = scaler.transform(X)
X_test = scaler.transform(X_test)

In [33]:
# Re-split the scaled matrix; the same seed and row count as the earlier
# split of X give the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y,
    random_state=23,
)

In [34]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with default hyperparameters.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [35]:
# Held-out accuracy of the logistic regression, as a percentage.
lr_accuracy_pct = round(lr.score(X_test, y_test) * 100, 2)
print("test accuracy: {}%".format(lr_accuracy_pct))

test accuracy: 96.06%


In [36]:
# Metrics of the logistic regression on the rows it was fitted on.
lr_train_pred = lr.predict(X_train)
lr_train_accuracy = accuracy_score(y_train, lr_train_pred)
lr_train_report = classification_report(y_train, lr_train_pred)
lr_train_matrix = confusion_matrix(y_train, lr_train_pred)

print("TRAINING RESULTS:\n")
print(f'Accuracy Score: {lr_train_accuracy}')
print(f'Classification Report:{lr_train_report}')
print(f'Confusion Matrix: {lr_train_matrix}')



TRAINING RESULTS:

Accuracy Score: 0.9673395699983588
Classification Report:              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3149
           1       0.97      0.97      0.97      2944

    accuracy                           0.97      6093
   macro avg       0.97      0.97      0.97      6093
weighted avg       0.97      0.97      0.97      6093

Confusion Matrix: [[3051   98]
 [ 101 2843]]


In [37]:
# Metrics of the logistic regression on the held-out test split.
lr_test_pred = lr.predict(X_test)
lr_test_accuracy = accuracy_score(y_test, lr_test_pred)
lr_test_report = classification_report(y_test, lr_test_pred)
lr_test_matrix = confusion_matrix(y_test, lr_test_pred)

print("TEST RESULTS:\n")
print('Accuracy Score: {0:.4f}\n'.format(lr_test_accuracy))
print('Classification Report:\n{}\n'.format(lr_test_report))
print('Confusion Matrix:\n{}\n'.format(lr_test_matrix))

TEST RESULTS:

Accuracy Score: 0.9606

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      1059
           1       0.95      0.96      0.96       972

    accuracy                           0.96      2031
   macro avg       0.96      0.96      0.96      2031
weighted avg       0.96      0.96      0.96      2031


Confusion Matrix:
[[1014   45]
 [  35  937]]



SVM Classification

In [38]:
from sklearn.svm import SVC

In [40]:
# Support-vector classifier with gamma="auto" and a fixed seed.
svm = SVC(gamma="auto", random_state=23)
svm.fit(X=X_train, y=y_train)

In [41]:
# Held-out accuracy of the SVM, as a percentage.
svm_accuracy_pct = svm.score(X_test, y_test) * 100
print(f"Test Accuracy: {svm_accuracy_pct}")

Test Accuracy: 100.0


KNN

In [44]:
from sklearn.neighbors import KNeighborsClassifier

# Scan k = 1..9 and remember the k with the highest held-out accuracy.
# NOTE(review): k is selected on the test split, so the reported figure is an
# optimistic estimate; consider cross-validating on the training split.
best_Kvalue = 0
best_score = 0

for i in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    # Score once and store the same test-set value that is compared. The
    # previous code stored the *training* accuracy here, although the value
    # is later printed as "Test Accuracy".
    test_score = knn.score(X_test, y_test)
    if test_score > best_score:
        best_score = test_score
        best_Kvalue = i


In [18]:
# Report the selected k and its stored score as a percentage.
knn_summary = "Best KNN Value: {}\nTest Accuracy: {}%"
print(knn_summary.format(best_Kvalue, round(best_score * 100, 2)))

Best KNN Value: 1
Test Accuracy: 100.0%


Decision Tree Learning

In [19]:
from sklearn.tree import DecisionTreeClassifier as DT

# Entropy-criterion decision tree with a fixed seed.
dt = DT(random_state=23, criterion="entropy")
dt.fit(X=X_train, y=y_train)

In [20]:
# Metrics of the decision tree on the rows it was fitted on.
dt_train_pred = dt.predict(X_train)
dt_train_accuracy = accuracy_score(y_train, dt_train_pred)
dt_train_report = classification_report(y_train, dt_train_pred)
dt_train_matrix = confusion_matrix(y_train, dt_train_pred)

print("TRAINING RESULTS:\n")
print('Accuracy Score: {0:.4f}\n'.format(dt_train_accuracy))
print('Classification Report:\n{}\n'.format(dt_train_report))
print('Confusion Matrix:\n{}\n'.format(dt_train_matrix))


TRAINING RESULTS:

Accuracy Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3149
           1       1.00      1.00      1.00      2944

    accuracy                           1.00      6093
   macro avg       1.00      1.00      1.00      6093
weighted avg       1.00      1.00      1.00      6093


Confusion Matrix:
[[3149    0]
 [   0 2944]]



In [22]:
# Metrics of the decision tree on the held-out test split. The header
# previously read "TRAINING RESULTS:" even though every metric below is
# computed from X_test / y_test.
dt_test_pred = dt.predict(X_test)  # predict once, reuse for all three metrics

print("TEST RESULTS:\n")
print('Accuracy Score: {0:.4f}\n'.format(accuracy_score(y_test, dt_test_pred)))
print('Classification Report:\n{}\n'.format(classification_report(y_test, dt_test_pred)))
print('Confusion Matrix:\n{}\n'.format(confusion_matrix(y_test, dt_test_pred)))


TRAINING RESULTS:

Accuracy Score: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1059
           1       1.00      1.00      1.00       972

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031


Confusion Matrix:
[[1059    0]
 [   0  972]]



In [47]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed.
rf = RandomForestClassifier(random_state=23, n_estimators=100)
rf.fit(X=X_train, y=y_train)


In [49]:
# Held-out accuracy of the random forest, as a percentage.
rf_accuracy_pct = round(rf.score(X_test, y_test) * 100, 2)
print("Test Accuracy: {}%".format(rf_accuracy_pct))


Test Accuracy: 100.0%


In [50]:
# Pair each importance with its column name (names come from the unscaled
# frame X) and list them from most to least important.
importances = rf.feature_importances_
importances_sorted = sorted(
    zip(importances, X.columns),
    reverse=True,
)
importances_sorted

[(0.14896325937140995, 'odor'),
 (0.1358429677226285, 'gill-color'),
 (0.1162162977242784, 'gill-size'),
 (0.09817523699049019, 'spore-print-color'),
 (0.0789262356718558, 'ring-type'),
 (0.06141209628159484, 'stalk-root'),
 (0.06045778020796624, 'population'),
 (0.049918356351370076, 'stalk-surface-below-ring'),
 (0.047373192693100606, 'bruises'),
 (0.04704949099790416, 'stalk-surface-above-ring'),
 (0.030609782753304473, 'habitat'),
 (0.025664572716849552, 'gill-spacing'),
 (0.02527880859701683, 'stalk-shape'),
 (0.01796123000032634, 'stalk-color-below-ring'),
 (0.014191168419869822, 'stalk-color-above-ring'),
 (0.014088900946317857, 'cap-color'),
 (0.012228314461483642, 'ring-number'),
 (0.009588350873627352, 'cap-surface'),
 (0.004449059899513968, 'cap-shape'),
 (0.000824942101090296, 'gill-attachment'),
 (0.0007799552180010933, 'veil-color')]