In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Data Ingestion: read the mushroom CSV into a DataFrame and preview the first rows
csv_path = "mushrooms_data.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,class,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,poisonous,convex,smooth,brown,yes,pungent,free,close,narrow,black,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,yes,almond,free,close,broad,black,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,yes,anise,free,close,broad,brown,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,yes,pungent,free,close,narrow,brown,...,smooth,smooth,white,white,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,smooth,white,white,white,one,evanescent,brown,abundant,grasses


In [4]:
# Inspect the frame's structure: per-column dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap_shape                 8124 non-null   object
 2   cap_surface               8124 non-null   object
 3   cap_color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill_attachment           8124 non-null   object
 7   gill_spacing              8124 non-null   object
 8   gill_size                 8124 non-null   object
 9   gill_color                8124 non-null   object
 10  stalk_shape               8124 non-null   object
 11  stalk_root                8124 non-null   object
 12  stalk_surface_above_ring  8124 non-null   object
 13  stalk_surface_below_ring  8124 non-null   object
 14  stalk_color_above_ring  

In [5]:
# Instantiate the preprocessing helpers in one go:
# label encoder, one-hot encoder and standard scaler
le, one_hot, ss = LabelEncoder(), OneHotEncoder(), StandardScaler()

In [9]:
# Separate the predictors (X) from the label column (y)
target_column = 'class'
X = data.drop(columns=[target_column])
y = data[target_column]

In [10]:
# Encode every feature column as integer codes.
# Each column gets its own fitted LabelEncoder, kept in `feature_encoders`,
# so the code -> category mapping of every feature stays available afterwards
# (feature_encoders[col].classes_ / .inverse_transform). Re-fitting a single
# shared encoder would only retain the mapping of whatever it was fitted on last.
feature_encoders = {}
for col in X.columns:
    feature_encoders[col] = LabelEncoder()
    X[col] = feature_encoders[col].fit_transform(X[col])

In [11]:
# Encode the target labels as integers; `y` is rebound from the original
# column to the encoded array. Per the displayed rows/outputs in this notebook,
# 'edible' maps to 0 and 'poisonous' maps to 1.
y = le.fit_transform(y)

In [12]:
# Preview the feature matrix after integer encoding
X.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,...,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,2,3,0,1,7,1,0,1,0,0,...,3,3,7,7,2,1,4,0,3,4
1,2,3,9,1,0,1,0,0,0,0,...,3,3,7,7,2,1,4,1,2,0
2,0,3,8,1,1,1,0,0,1,0,...,3,3,7,7,2,1,4,1,2,2
3,2,2,8,1,7,1,0,1,1,0,...,3,3,7,7,2,1,4,0,3,4
4,2,3,3,0,6,1,1,0,0,1,...,3,3,7,7,2,1,0,1,0,0


In [13]:
# Display the encoded target array
y

array([1, 0, 0, ..., 0, 1, 0])

In [14]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=222
)

# Report the shape of each partition
for label, part in (
    ("X_train:", X_train),
    ("y_train:", y_train),
    ("X_test:", X_test),
    ("y_test:", y_test),
):
    print(label, part.shape)

X_train: (6093, 21)
y_train: (6093,)
X_test: (2031, 21)
y_test: (2031,)


In [15]:
# Import required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [16]:
# Helper that bundles the metrics reported after each model is trained
def evaluate_model(true, predicted):
    """Score one set of predictions against the ground truth.

    Parameters
    ----------
    true
        Ground-truth labels.
    predicted
        Labels predicted by a model, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, classification_report)`` exactly as
        produced by ``accuracy_score``, ``confusion_matrix`` and
        ``classification_report`` for the given labels.
    """
    return (
        accuracy_score(true, predicted),
        confusion_matrix(true, predicted),
        classification_report(true, predicted),
    )

In [18]:
# Candidate classifiers, keyed by the display name used in the printed reports
# and in the summary table built from model_list / accuracy_list.
# The estimators that draw random numbers while fitting get a fixed
# random_state so repeated runs of the notebook produce the same models.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=222),
    'Random Forest Classifier': RandomForestClassifier(random_state=222),
    'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=222),
    'AdaBoost Classifier': AdaBoostClassifier(random_state=222),
    'Support Vector Classifier': SVC(),
    'Gaussian Naive Bayes': GaussianNB(),
    'K-Neighbors Classifier': KNeighborsClassifier(),
}

# Parallel lists consumed by the summary table: model name and test accuracy
model_list = []
accuracy_list = []

# Walk the (name, estimator) pairs directly rather than indexing into
# freshly rebuilt key/value lists on every iteration.
for model_name, model in models.items():
    # train model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    train_accuracy, train_confusionmatrix, train_classificationreport = evaluate_model(y_train, y_train_pred)

    test_accuracy, test_confusionmatrix, test_classificationreport = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("**Accuracy Score:", train_accuracy)
    print("**Confusion Matrix: \n", train_confusionmatrix)
    print("**Classification Report: \n", train_classificationreport)

    print('-'*35)

    print('Model performance for Test set')
    print("**Accuracy Score:", test_accuracy)
    print("**Confusion Matrix: \n", test_confusionmatrix)
    print("**Classification Report: \n", test_classificationreport)

    # Only the test accuracy is kept for the final ranking
    accuracy_list.append(test_accuracy)

    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
**Accuracy Score: 0.9604464139176103
**Confusion Matrix: 
 [[3058   97]
 [ 144 2794]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.97      0.96      3155
           1       0.97      0.95      0.96      2938

    accuracy                           0.96      6093
   macro avg       0.96      0.96      0.96      6093
weighted avg       0.96      0.96      0.96      6093

-----------------------------------
Model performance for Test set
**Accuracy Score: 0.9532250123092073
**Confusion Matrix: 
 [[1018   35]
 [  60  918]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      1053
           1       0.96      0.94      0.95       978

    accuracy                           0.95      2031
   macro avg       0.95      0.95      0.95      2031
weighted avg       0.95      0.95      0.95      2031

In [19]:
# Pair each model name with its test accuracy and rank them, best first
summary_df = pd.DataFrame(
    list(zip(model_list, accuracy_list)),
    columns=['Model Name', 'Accuracy Score'],
)
summary_df.sort_values(by=["Accuracy Score"], ascending=False)

Unnamed: 0,Model Name,Accuracy Score
1,Decision Tree Classifier,1.0
2,Random Forest Classifier,1.0
3,Gradient Boosting Classifier,1.0
4,AdaBoost Classifier,1.0
7,K-Neighbors Classifier,1.0
5,Support Vector Classifier,0.996553
0,Logistic Regression,0.953225
6,Gaussian Naive Bayes,0.842442


In [20]:
# Refit every model and print its test-set metrics.
# NOTE: `predictions` deliberately stays a plain loop-level variable; after the
# loop it holds the predictions of the last model in `models`.
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Scale to a percentage first and round afterwards: rounding the fraction
    # and then multiplying by 100 re-introduces float noise
    # (e.g. 95.32000000000001). The builtin round() also works whether the
    # metric comes back as a NumPy scalar or a plain Python float.
    accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)

    # performance metrics
    print(name)
    print("="*40)
    print("Accuracy Score:", accuracy_pct)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    print("="*40,"\n")

Logistic Regression
Accuracy Score: 95.32000000000001
Confusion Matrix:
 [[1018   35]
 [  60  918]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      1053
           1       0.96      0.94      0.95       978

    accuracy                           0.95      2031
   macro avg       0.95      0.95      0.95      2031
weighted avg       0.95      0.95      0.95      2031


Decision Tree Classifier
Accuracy Score: 100.0
Confusion Matrix:
 [[1053    0]
 [   0  978]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1053
           1       1.00      1.00      1.00       978

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031


Random Forest Classifier
Accuracy Score: 100.0
Confusion Matrix:
 [[1053    0]
 [   0  978]]
Classifica

In [24]:
# Actual vs. predicted test labels, shown side by side (no difference is computed).
# NOTE(review): `predictions` is whatever the preceding evaluation loop left
# behind, i.e. the output of the last model it iterated over — confirm that
# this is the model intended here.
comparison_columns = {'Actual Value': y_test, 'Predicted Value': predictions}
pred_df = pd.DataFrame(comparison_columns)
pred_df.head(20)

Unnamed: 0,Actual Value,Predicted Value
0,0,0
1,0,0
2,0,0
3,1,1
4,0,0
5,1,1
6,1,1
7,0,0
8,1,1
9,0,0
