This project demonstrates the application of **decision tree algorithms** to classify mushrooms as poisonous or edible according to their characteristics. The dataset is obtained from the [machine learning repository of University of California Irvine (UCI)](https://archive.ics.uci.edu/ml/datasets/mushroom).
    
Data in csv format (zipped) is available in this [link](https://github.com/alineu/pyDataScintist-Notebooks/tree/main/data/mushrooms.csv.zip). **You can find the full project analysis at [Edible/Poisonous Mushrooms Classification Using Decision Trees](https://pydatascientist.com/tutorials/mushroom_classification_using_decision_trees/project.html)**

### Loading the libraries

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore','Warning')
pd.set_option('display.float_format', lambda x: '%.5f' % x)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

### Data exploration

In [None]:
mushroom_df = pd.read_csv('mushrooms.csv')

In [None]:
mushroom_df.iloc[:5,:13]

In [None]:
mushroom_df.iloc[:5,13:]

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
mushroom_df.info()

In [None]:
mushroom_df.dtypes

In [None]:
# NOTE(review): CategoricalDtype is not referenced in the visible cells.
from pandas.api.types import CategoricalDtype
# Cast every column of the frame (features and target alike) to pandas'
# "category" dtype.
mushroom_df = mushroom_df.astype("category")

In [None]:
mushroom_df.dtypes

In [None]:
# Every column except the label column 'target' is used as an input feature.
features = [column for column in mushroom_df.columns if column != 'target']
print(features)

In [None]:
# Separate the frame into model inputs (all feature columns) and the label
# column that the classifiers will learn to predict.
X = mushroom_df[features].values #features
y = mushroom_df['target'].values #target class

### Encoding the categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

# encode the input data
def prepare_inputs(X_train, X_test):
    """One-hot encode the training and test feature matrices.

    A fresh ``OneHotEncoder`` is fitted on ``X_train`` only and then applied
    to both sets, so the encoded columns are defined by the training data.

    ``handle_unknown="ignore"`` keeps the transform from failing when the
    test data contains a feature level that was not present during fitting.
    For example, if the model is trained with the colors "blue", "purple" and
    "yellow" and the color "red" appears in the test data, the one-hot
    columns of that feature are all zeros for that row.

    Returns the pair ``(X_train_enc, X_test_enc)``.
    """
    encoder = OneHotEncoder(handle_unknown="ignore")
    train_encoded = encoder.fit_transform(X_train)
    test_encoded = encoder.transform(X_test)
    return train_encoded, test_encoded

In [None]:
# encode the target
def prepare_targets(y_train, y_test):
    """Integer-encode the class labels of the training and test targets.

    A ``LabelEncoder`` is fitted on ``y_train``; it maps each distinct label
    to an integer code (one column of codes, not one column per class). The
    same mapping is then applied to ``y_test``.

    Returns the pair ``(y_train_enc, y_test_enc)``.
    """
    label_encoder = LabelEncoder()
    train_codes = label_encoder.fit_transform(y_train)
    test_codes = label_encoder.transform(y_test)
    return train_codes, test_codes

#### Splitting the dataset into training (70%) and test data (30%):

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

In [None]:
# Fit an encoder on the complete feature matrix purely to enumerate every
# one-hot column the dataset can produce.
oh_encoder_all = OneHotEncoder()
oh_encoder_all.fit(X)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(oh_encoder_all, "get_feature_names_out"):
    encoded_features = oh_encoder_all.get_feature_names_out(features)
else:
    encoded_features = oh_encoder_all.get_feature_names(features)
print(encoded_features)

In [None]:
print(f'The dataset has {X.shape[1]} features and {len(encoded_features)} One-Hot encoded features')

### Baseline classifier

In [None]:
from sklearn.linear_model import LogisticRegression

clf_logreg = LogisticRegression(solver='lbfgs')
clf_logreg.fit(X_train_enc, y_train_enc)
y_pred = clf_logreg.predict(X_test_enc)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y_test_enc, y_pred, digits=3))

In [None]:
accuracy = accuracy_score(y_test_enc, y_pred)
print('Accuracy: %.2f' % (accuracy*100))

#### Splitting the dataset to training (10%) and test data (90%)!

In [None]:
# We are going to use indices later!
indices = np.arange(len(mushroom_df))
X_train, X_test, y_train, y_test, tr_ids, test_ids = train_test_split(X, y, indices, test_size=0.9, random_state=10)

In [None]:
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
clf_logreg = LogisticRegression(solver='lbfgs')
clf_logreg.fit(X_train_enc, y_train_enc)
y_pred = clf_logreg.predict(X_test_enc)
print(classification_report(y_test_enc, y_pred, digits=3))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test_enc, y_pred)
cm

In [None]:
fig, ax = plt.subplots(1, 1, dpi=120)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['edible', 'poisonous'])
cm_plot.plot(ax=ax);

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz

# Build and train two decision tree classifiers that differ only in the
# split criterion (Gini impurity vs. entropy)
clf_gini = DecisionTreeClassifier(
    criterion="gini", random_state=10, max_depth=5, max_leaf_nodes=10
).fit(X_train_enc, y_train_enc)
clf_entropy = DecisionTreeClassifier(
    criterion="entropy", random_state=10, max_depth=5, max_leaf_nodes=10
).fit(X_train_enc, y_train_enc)

# Predict the classes of the test dataset with each tree
y_pred_gini = clf_gini.predict(X_test_enc)
y_pred_entropy = clf_entropy.predict(X_test_enc)

In [None]:
print(classification_report(y_test_enc, y_pred_gini, digits=3))

In [None]:
print(classification_report(y_test_enc, y_pred_entropy, digits=3))

In [None]:
fig, ax = plt.subplots(1, 1, dpi=120)
cm = confusion_matrix(y_test_enc, y_pred_gini)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['edible', 'poisonous'])
cm_plot.plot(ax=ax);

In [None]:
fig, ax = plt.subplots(1, 1, dpi=120)
cm = confusion_matrix(y_test_enc, y_pred_entropy)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['edible', 'poisonous'])
cm_plot.plot(ax=ax);

### Visualizing the tree

In [None]:
from sklearn.tree import export_graphviz 
from io import StringIO
from IPython.display import Image 
from pydot import graph_from_dot_data
import pydotplus
dot_data = StringIO()

# Training feature names: refit an encoder on the training split so the tree
# nodes can be labelled with the one-hot column names the model was trained on.
oh_encoder_tr = OneHotEncoder(handle_unknown="ignore")
oh_encoder_tr.fit(X_train)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(oh_encoder_tr, "get_feature_names_out"):
    encoded_tr_features = oh_encoder_tr.get_feature_names_out(features)
else:
    encoded_tr_features = oh_encoder_tr.get_feature_names(features)

# Export the tree (feature names are passed as a plain list, which every
# scikit-learn release accepts)
export_graphviz(clf_entropy, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=False,
                feature_names=list(encoded_tr_features),
                class_names=['edible', 'poisonous'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png(), width=800, height=400)

In [None]:
from sklearn.tree import export_text 
tree_text = export_text(clf_entropy, feature_names = list(encoded_tr_features))
print(tree_text)

### Model diagnosis

In [None]:
# Fit a separate encoder on the test split to see which one-hot columns the
# test data alone would produce, then compare the counts with the training set.
oh_encoder = OneHotEncoder()
oh_encoder.fit(X_test)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(oh_encoder, "get_feature_names_out"):
    encoded_test_features = oh_encoder.get_feature_names_out(features)
else:
    encoded_test_features = oh_encoder.get_feature_names(features)
len(encoded_test_features), len(encoded_tr_features)

#### Unseen instances

In [None]:
# One-hot columns that occur in the test split but not in the training split
list(set(encoded_test_features).difference(encoded_tr_features))

In [None]:
# Select every row that shows at least one of four hard-coded feature levels.
# NOTE(review): these levels presumably mirror the output of the previous cell
# (levels present in the test split but absent from the training split) —
# re-check them if the split or its random_state changes.
unseen_data = mushroom_df[
    (mushroom_df['cap-shape']=='conical') | (mushroom_df['cap-surface']=='grooves') | 
    (mushroom_df['veil-color']=='yellow') | (mushroom_df['stalk-color-above-ring']=='yellow')
]
unseen_data

In [None]:
# Encode the selected rows with an encoder fitted on X_train (prepare_inputs
# builds and fits a new encoder on every call); the re-encoded training matrix
# is not needed here and is discarded.
X_test_unseen = unseen_data[features].values
__, X_test_unseen_enc = prepare_inputs(X_train, X_test_unseen)
# Predictions of the entropy tree for the selected rows
y_pred_unseen = clf_entropy.predict(X_test_unseen_enc)
y_pred_unseen

In [None]:
false_predictions = np.where(y_test_enc!=y_pred_entropy)
false_predictions

In [None]:
np.sort(test_ids[false_predictions])

#### Misclassified instances

In [None]:
mushroom_df.iloc[np.sort(test_ids[false_predictions]), :12]

In [None]:
mushroom_df[
    (mushroom_df['cap-color']=='white') & (mushroom_df['bruises']=='bruises') 
    & (mushroom_df['odor']=='none') & (mushroom_df['gill-spacing']=='crowded')
]

In [None]:
mushroom_df_tr = mushroom_df.iloc[np.sort(tr_ids), :]

In [None]:
# Build the mask from the training subset itself so that its index matches the
# frame being filtered. A mask derived from the full `mushroom_df` has a
# different index, which forces pandas to reindex it and emit
# "Boolean Series key will be reindexed to match DataFrame index".
mushroom_df_tr[
    (mushroom_df_tr['cap-color']=='white')
  & (mushroom_df_tr['bruises']=='bruises')
  & (mushroom_df_tr['odor']=='none')
  & (mushroom_df_tr['gill-spacing']=='crowded')]

### Cross-validation

In [None]:
# Repeat the 10%/90% train/test split with several random seeds and keep, for
# each model family, the fitted model that scores best on its own test split.
# NOTE(review): this is repeated random sub-sampling with the winner picked on
# the test data, so the reported "best" accuracy is an optimistic estimate.
num_val_tests = 10
# Start from the models fitted earlier on the random_state=10 split.
best_tree_model = clf_entropy
best_tree_score = best_tree_model.score(X_test_enc, y_test_enc)
best_logreg_model = clf_logreg
best_logreg_score = best_logreg_model.score(X_test_enc, y_test_enc)
# Record the test data and seed belonging to those starting models. Without
# this, the variables below stay undefined whenever no iteration beats the
# starting score, and the cells that use them fail with a NameError.
# (10 is the random_state of the split the starting models were trained on;
# this assumes the notebook is executed top to bottom.)
X_enc_best_tree, y_enc_best_tree, seed_best_tree = X_test_enc, y_test_enc, 10
X_enc_best_logreg, y_enc_best_logreg, seed_best_logreg = X_test_enc, y_test_enc, 10
for i in range(num_val_tests):
    seed = np.random.randint(1000)
    # train/test split
    X_train, X_test, y_train, y_test, tr_ids, test_ids = train_test_split(
        X, y, indices, test_size=0.9, random_state=seed
    )
    # Instantiate the DecisionTreeClassifier and the LogisticRegression models
    clf_tree_tmp = DecisionTreeClassifier(
        criterion="entropy", random_state=10, max_depth=5, max_leaf_nodes=10
    )
    clf_logreg_tmp = LogisticRegression(solver='lbfgs')
    # Encode train/test
    X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
    y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
    # fit
    clf_tree_tmp = clf_tree_tmp.fit(X_train_enc, y_train_enc)
    clf_logreg_tmp = clf_logreg_tmp.fit(X_train_enc, y_train_enc)
    # evaluate
    tree_score = clf_tree_tmp.score(X_test_enc, y_test_enc)
    logreg_score = clf_logreg_tmp.score(X_test_enc, y_test_enc)
    # compare with the best tree model so far
    if tree_score > best_tree_score:
        best_tree_score = tree_score
        best_tree_model = clf_tree_tmp
        X_enc_best_tree = X_test_enc
        y_enc_best_tree = y_test_enc
        seed_best_tree = seed
    # repeat for the logistic regression model
    if logreg_score > best_logreg_score:
        best_logreg_score = logreg_score
        best_logreg_model = clf_logreg_tmp
        X_enc_best_logreg = X_test_enc
        y_enc_best_logreg = y_test_enc
        seed_best_logreg = seed

In [None]:
print(f"best decision tree model reaches the accuracy {np.round(best_tree_score*100, 3)}%")
print(f"best logistic regression model reaches the accuracy {np.round(best_logreg_score*100, 3)}%")

In [None]:
y_pred_tree = best_tree_model.predict(X_enc_best_tree)
y_pred_logreg = best_logreg_model.predict(X_enc_best_logreg)

In [None]:
print(classification_report(y_enc_best_tree, y_pred_tree, digits=3))

In [None]:
# Confusion matrix of the best decision tree on the test split it was scored on
fig, ax = plt.subplots(1, 1, dpi=120)
cm = confusion_matrix(y_enc_best_tree, y_pred_tree)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['edible', 'poisonous'])
# Title typo fixed ("consfusion" -> "confusion")
ax.set_title('Decision tree confusion matrix')
cm_plot.plot(ax=ax);

In [None]:
print(classification_report(y_enc_best_logreg, y_pred_logreg, digits=3))

In [None]:
# Confusion matrix of the best logistic regression model on the test split it
# was scored on
fig, ax = plt.subplots(1, 1, dpi=120)
cm = confusion_matrix(y_enc_best_logreg, y_pred_logreg)
cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['edible', 'poisonous'])
# Title typo fixed ("consfusion" -> "confusion")
ax.set_title('Logistic regression confusion matrix')
cm_plot.plot(ax=ax);

### Visualizing the best tree

In [None]:
dot_data = StringIO()

# Training feature names: reproduce the split the best tree was trained on
# (same seed) and refit an encoder on its training part, so the one-hot column
# names line up with the columns that tree saw.
oh_encoder_tr = OneHotEncoder(handle_unknown="ignore")
X_train, X_test, y_train, y_test, tr_ids, test_ids = train_test_split(
    X, y, indices, test_size=0.9, random_state=seed_best_tree
)
oh_encoder_tr.fit(X_train)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back only on older releases.
if hasattr(oh_encoder_tr, "get_feature_names_out"):
    encoded_tr_features = oh_encoder_tr.get_feature_names_out(features)
else:
    encoded_tr_features = oh_encoder_tr.get_feature_names(features)

# Export the tree (feature names are passed as a plain list, which every
# scikit-learn release accepts)
export_graphviz(best_tree_model, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=False,
                feature_names=list(encoded_tr_features),
                class_names=['edible', 'poisonous'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png(), width=800, height=400)