In [1]:
# import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score
from sklearn.tree import export_graphviz
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# load the contents of mushrooms.csv into a dataframe
df = pd.read_csv(filepath_or_buffer="mushrooms.csv")

In [3]:
# preview the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# get column types, counts and column names
# (prints the index range plus each column's non-null count and dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [5]:
# integer-encode every column of df (features and target alike), overwriting
# the original string values column by column.
# The same LabelEncoder instance is refit on each column, so once the loop
# finishes it only holds the classes of the last column it processed.
labelencoder = LabelEncoder()
for column in df.columns:
    encoded = labelencoder.fit_transform(df[column])
    df[column] = encoded

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [6]:
# "veil-type" has only one value, so it is not needed for the analysis
df = df.drop(columns=["veil-type"])

In [7]:
# also exclude the "odor" and "gill-color" features
# NOTE(review): the notebook does not state why these two are removed -- confirm
df = df.drop(["odor", "gill-color"], axis=1)

In [8]:
# class balance of the target (y): number of rows per encoded label
df["class"].value_counts()

0    4208
1    3916
Name: class, dtype: int64

In [9]:
# target vector: the encoded "class" column
y = df["class"]
# feature matrix: every remaining column
X = df.drop(columns=["class"])


In [10]:
# first split of the unscaled features (no test_size given, so the library
# default applies); the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=23,
)

In [11]:
# Standardize the features using statistics learned from the training rows only.
# The previous `scaler.fit_transform(X)` refit the scaler on every row, which
# threw away the fit on X_train and let held-out rows shape the mean/variance
# (data leakage). The full matrix is now only *transformed*.
scaler = StandardScaler()
scaler.fit(X_train)
# X2 is the scaled copy of the whole feature matrix; it is what gets re-split
# into the final train/test sets.
# NOTE(review): that re-split uses a different test_size than the split that
# produced X_train -- confirm its test rows are disjoint from X_train, or refit
# the scaler on the final training rows.
X2 = scaler.transform(X)
# Scale the current hold-out rows with the same training-derived statistics.
X_test = scaler.transform(X_test)

In [12]:
# re-split using the scaled feature matrix X2, holding out 15% of the rows;
# these are the train/test sets the models in this notebook are fit and scored on
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y,
    test_size=0.15,
    random_state=23,
)

Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# logistic regression with the library's default settings, fit on the training split
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [14]:
# score the logistic regression on the rows it was trained on;
# predict once and reuse the labels for all three metrics
lr_train_pred = lr.predict(X_train)

print("TRAINING RESULTS:\n")
print(f'Accuracy Score: {accuracy_score(y_train, lr_train_pred)}')
print(f'Classification Report:\n{classification_report(y_train, lr_train_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_train, lr_train_pred)}')



TRAINING RESULTS:

Accuracy Score: 0.9595944967414917
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      3561
           1       0.96      0.95      0.96      3344

    accuracy                           0.96      6905
   macro avg       0.96      0.96      0.96      6905
weighted avg       0.96      0.96      0.96      6905

Confusion Matrix: 
[[3441  120]
 [ 159 3185]]


In [15]:
# score the logistic regression on the held-out rows;
# predict once and reuse the labels for all three metrics
lr_test_pred = lr.predict(X_test)

print("TEST RESULTS:\n")
print(f'Accuracy Score: {accuracy_score(y_test, lr_test_pred)}')
print(f'Classification Report: \n{classification_report(y_test, lr_test_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, lr_test_pred)}')

TEST RESULTS:

Accuracy Score: 0.9581624282198523
Classification Report: 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       647
           1       0.96      0.95      0.96       572

    accuracy                           0.96      1219
   macro avg       0.96      0.96      0.96      1219
weighted avg       0.96      0.96      0.96      1219

Confusion Matrix:
[[623  24]
 [ 27 545]]


SVM Classification

In [16]:
from sklearn.svm import SVC

In [17]:
# support vector classifier (default kernel, gamma="auto", fixed seed),
# fit on the training split
svm = SVC(gamma="auto", random_state=23)
svm.fit(X=X_train, y=y_train)

In [18]:

# score the SVM on the rows it was trained on; accuracy is shown as a percentage
svm_train_pred = svm.predict(X_train)

print("TRAINING RESULTS:\n")
print(f'Accuracy Score: {svm.score(X_train,y_train)*100}')
print(f'Classification Report: \n{classification_report(y_train, svm_train_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_train, svm_train_pred)}')

TRAINING RESULTS:

Accuracy Score: 100.0
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3561
           1       1.00      1.00      1.00      3344

    accuracy                           1.00      6905
   macro avg       1.00      1.00      1.00      6905
weighted avg       1.00      1.00      1.00      6905

Confusion Matrix: 
[[3561    0]
 [   0 3344]]


In [19]:

# score the SVM on the held-out rows; accuracy is shown as a percentage
svm_test_pred = svm.predict(X_test)

print("TEST RESULTS:\n")
print(f'Accuracy Score: {svm.score(X_test,y_test)*100}')
print(f'Classification Report: \n{classification_report(y_test, svm_test_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, svm_test_pred)}')

TEST RESULTS:

Accuracy Score: 100.0
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       647
           1       1.00      1.00      1.00       572

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219

Confusion Matrix: 
[[647   0]
 [  0 572]]


Decision Tree Learning

In [20]:
from sklearn.tree import DecisionTreeClassifier as DT

# decision tree that splits on entropy, seeded for repeatability,
# fit on the training split
dt = DT(random_state=23, criterion="entropy")
dt.fit(X=X_train, y=y_train)

In [21]:
# score the decision tree on the rows it was trained on;
# predict once and reuse the labels (accuracy is shown as a percentage)
dt_train_pred = dt.predict(X_train)

print("TRAINING RESULTS:\n")
print(f'Accuracy Score: {accuracy_score(y_train, dt_train_pred)*100}')
print(f'Classification Report:\n{classification_report(y_train, dt_train_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_train, dt_train_pred)}')


TRAINING RESULTS:

Accuracy Score: 100.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3561
           1       1.00      1.00      1.00      3344

    accuracy                           1.00      6905
   macro avg       1.00      1.00      1.00      6905
weighted avg       1.00      1.00      1.00      6905

Confusion Matrix:
[[3561    0]
 [   0 3344]]


In [22]:
# score the decision tree on the held-out rows;
# predict once and reuse the labels (accuracy is shown as a percentage)
dt_test_pred = dt.predict(X_test)

print("TESTING RESULTS:\n")
print(f'Accuracy Score: {accuracy_score(y_test, dt_test_pred)*100}')
print(f'Classification Report:\n{classification_report(y_test, dt_test_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, dt_test_pred)}')


TESTING RESULTS:

Accuracy Score: 100.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       647
           1       1.00      1.00      1.00       572

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219

Confusion Matrix:
[[647   0]
 [  0 572]]


Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier

# random forest of 100 trees, seeded for repeatability, fit on the training split
rf = RandomForestClassifier(random_state=23, n_estimators=100)
rf.fit(X=X_train, y=y_train)


In [24]:

# score the random forest on the rows it was trained on;
# predict once and reuse the labels (accuracy is shown as a percentage)
rf_train_pred = rf.predict(X_train)

print("TRAINING RESULTS:\n")
print(f'Accuracy Score: {accuracy_score(y_train, rf_train_pred)*100}')
print(f'Classification Report:\n{classification_report(y_train, rf_train_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_train, rf_train_pred)}')

TRAINING RESULTS:

Accuracy Score: 100.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3561
           1       1.00      1.00      1.00      3344

    accuracy                           1.00      6905
   macro avg       1.00      1.00      1.00      6905
weighted avg       1.00      1.00      1.00      6905

Confusion Matrix:
[[3561    0]
 [   0 3344]]


In [25]:

# score the random forest on the held-out rows;
# predict once and reuse the labels (accuracy is shown as a percentage)
rf_test_pred = rf.predict(X_test)

print("TESTING RESULTS:\n")
print(f'Accuracy Score: {accuracy_score(y_test, rf_test_pred)*100}')
print(f'Classification Report:\n{classification_report(y_test, rf_test_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(y_test, rf_test_pred)}')


TESTING RESULTS:

Accuracy Score: 100.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       647
           1       1.00      1.00      1.00       572

    accuracy                           1.00      1219
   macro avg       1.00      1.00      1.00      1219
weighted avg       1.00      1.00      1.00      1219

Confusion Matrix:
[[647   0]
 [  0 572]]


In [26]:
# pair each importance score from the fitted forest with its column name in X,
# then order the pairs from highest score to lowest
importances = rf.feature_importances_
importances_sorted = list(zip(rf.feature_importances_, X.columns))
importances_sorted.sort(reverse=True)
importances_sorted

[(0.1668312617145844, 'gill-size'),
 (0.14079612086553678, 'spore-print-color'),
 (0.10133313439589153, 'ring-type'),
 (0.08584271948110646, 'bruises'),
 (0.08053245606207678, 'population'),
 (0.07951942653106653, 'stalk-surface-above-ring'),
 (0.06426870288512322, 'stalk-root'),
 (0.059680213794349235, 'stalk-surface-below-ring'),
 (0.05603242089775631, 'gill-spacing'),
 (0.041055876615283496, 'habitat'),
 (0.026462707416065035, 'stalk-shape'),
 (0.019666779452170817, 'stalk-color-below-ring'),
 (0.018085155395709637, 'cap-color'),
 (0.01590480473229471, 'stalk-color-above-ring'),
 (0.015585636951570652, 'cap-surface'),
 (0.01280384924277071, 'ring-number'),
 (0.00602092735121927, 'gill-attachment'),
 (0.005022449836457602, 'cap-shape'),
 (0.004555356378966791, 'veil-color')]