# **Mushroom Poison Classification**

# **Problem Set Up**

In [None]:
# Import libraries and load the dataset.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt


# Seed the randomness used by this cell and by the neural-network weight
# initialisation so that Restart & Run All reproduces the same split and metrics.
SEED = 42
torch.manual_seed(SEED)

# Fetch dataset.
mushroom = pd.read_csv('mushroom_cleaned.csv')

X = mushroom.drop(columns=['class'])
y = mushroom['class']

# Normalize data. norm='max' rescales each sample (row) by its own largest
# absolute value, so a single new sample can be normalized the same way later.
X = normalize(X, norm='max')

# Partition data into train (60%), test (20%) and validation (20%) sets.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=.4, random_state=SEED)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=.5, random_state=SEED)

# Create tensors for neural network
X_train_tensor = torch.from_numpy(X_train)
X_test_tensor = torch.from_numpy(X_test)
X_val_tensor = torch.from_numpy(X_val)

y_train_tensor = torch.tensor(y_train.values)
y_test_tensor = torch.tensor(y_test.values)
y_val_tensor = torch.tensor(y_val.values)

# Create DMatrix objects for XGBoost (the test matrix carries no labels; it is
# only used for prediction)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)
dval = xgb.DMatrix(X_val, label=y_val)

# **XGBoost**

In [None]:
# Defining parameters for XGBoost
# NOTE(review): with 'binary:hinge' predict() yields hard 0/1 labels, which is
# what accuracy_score below relies on. The 'logloss' metric is then tracked on
# those hard labels -- confirm the loss curve is meaningful for this objective.
params = {
  'objective': 'binary:hinge',
  'eval_metric': ['error', 'logloss'],
}

# Data sets whose metrics are recorded after every boosting round
evallist = [(dtrain, 'train'), (dval, 'eval')]

num_round = 250

# Filled by xgb.train with per-round metric histories, keyed by the names above
evals_result = {}

# Training model
bst = xgb.train(params, dtrain, num_round, evallist, evals_result=evals_result, verbose_eval=False)

# Testing model on training, validation and test sets
predictions_train = bst.predict(dtrain)
predictions_val = bst.predict(dval)
predictions_test = bst.predict(dtest)

xgb_train_accuracy = accuracy_score(y_train, predictions_train)
xgb_val_accuracy = accuracy_score(y_val, predictions_val)
xgb_test_accuracy = accuracy_score(y_test, predictions_test)

# (precision, recall, f1, support), macro-averaged over both classes
xgb_stats = precision_recall_fscore_support(y_test, predictions_test, average='macro')

# Convert error to accuracy
train_accuracy = [1 - err for err in evals_result['train']['error']]
eval_accuracy = [1 - err for err in evals_result['eval']['error']]

# Plot results
fig, ax = plt.subplots(1, 2, figsize=(9, 5))

ax[0].set_title("Accuracy Results")
ax[0].plot(train_accuracy, label='Training Accuracy')
ax[0].plot(eval_accuracy, label='Validation Accuracy')
ax[0].set_xlabel("Boosting round")
ax[0].set_ylabel("Accuracy")
ax[0].legend()

ax[1].set_title("Loss Results")
ax[1].plot(evals_result['train']['logloss'], label='Training Loss')
ax[1].plot(evals_result['eval']['logloss'], label='Validation Loss')
ax[1].set_xlabel("Boosting round")
ax[1].set_ylabel("Log loss")
ax[1].legend()

for axis in ax:
  axis.grid(True, color='lightgray', linestyle='--', linewidth=0.5)
  axis.spines[['right', 'top']].set_visible(False)

# plt.show() matches the other plotting cells and avoids the warning that
# Figure.show() can emit on non-GUI (inline) backends.
plt.show()

In [None]:
# Allowing user input to test model
# NOTE(review): the feature order and the category codes shown in the prompts are
# assumed to match the columns/encoding of mushroom_cleaned.csv -- verify.
# Measurements are parsed as float so decimal values such as "3.5" are accepted;
# categorical codes remain integers.
user_input = []
user_input.append(float(input("Enter cap diameter(cm):\n")))
user_input.append(int(input("Enter cap shape(bell=1, conical=2, convex=3, flat=4, sunken=5, spherical=6, others=7):\n")))
user_input.append(int(input("Enter gill attachment(adnate=1, adnexed=2, decurrent=3, free=4, sinuate=5, pores=6, none=7, unknown=8):\n")))
user_input.append(int(input("Enter gill color(brown=1, buff=2, gray=3, green=4, pink=5, purple=6, red=7, white=8, yellow=9, blue=10, orange=11, black=12, none=13):\n")))
user_input.append(float(input("Enter stem height(mm):\n")))
user_input.append(float(input("Enter stem width(cm):\n")))
user_input.append(int(input("Enter stem color(brown=1, buff=2, gray=3, green=4, pink=5, purple=6, red=7, white=8, yellow=9, blue=10, orange=11, black=12, none=13):\n")))
user_input.append(int(input("Enter season(spring=1, summer=2, autumn=3, winter=4):\n")))

# Shape the single sample as a (1, n_features) row and apply the same per-sample
# max normalization that was applied to the training data.
user_input = np.array(user_input, dtype=float).reshape(1, -1)
user_input = normalize(user_input, norm='max')
user_dmatrix = xgb.DMatrix(user_input)

# predict() returns one value per row; there is exactly one row here.
result = bst.predict(user_dmatrix)

if result[0] == 1:
  print("Mushroom is poisonous.")
else:
  print("Mushroom is edible.")

# **Neural Network**


In [None]:
# Use float64 everywhere so the network parameters match the dtype of the
# tensors built from the numpy feature arrays.
torch.set_default_dtype(torch.float64)

# Define neural network
# The layers are referenced as torch.nn.* rather than through the `nn` alias:
# a later cell rebinds the name `nn` to the model instance, after which
# `nn.Module` would no longer resolve if this cell were re-run.
class NeuralNet(torch.nn.Module):
    """Fully connected classifier: input -> 16 -> 64 -> 32 -> 16 -> output.

    Every hidden layer is followed by a ReLU; the final layer returns raw,
    un-normalized scores (one per output unit).

    Args:
        input_size: number of input features per sample.
        output_size: number of output scores per sample.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # Define each step of custom network for use in forward()
        self.fc1 = torch.nn.Linear(input_size, 16)
        self.relu1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(16, 64)
        self.relu2 = torch.nn.ReLU()
        self.fc3 = torch.nn.Linear(64, 32)
        self.relu3 = torch.nn.ReLU()
        self.fc4 = torch.nn.Linear(32, 16)
        self.relu4 = torch.nn.ReLU()
        self.fc5 = torch.nn.Linear(16, output_size)

    def forward(self, x):
        """Run the forward pass and return the output scores for `x`."""
        # Define forwards pass using each step defined in __init__()
        out = self.relu1(self.fc1(x))
        out = self.relu2(self.fc2(out))
        out = self.relu3(self.fc3(out))
        out = self.relu4(self.fc4(out))
        return self.fc5(out)

In [None]:

# Train the network
def train_network(model,optimizer,criterion,X_train_tensor,y_train_tensor,X_val_tensor,y_val_tensor,num_epochs,train_losses,val_losses):
  """Train `model` with full-batch updates and plot accuracy per epoch.

  Args:
    model: network called as model(inputs) to produce per-class scores.
    optimizer: optimizer bound to the model's parameters.
    criterion: callable (scores, labels) -> scalar loss tensor.
    X_train_tensor, y_train_tensor: training inputs and labels.
    X_val_tensor, y_val_tensor: validation inputs and labels.
    num_epochs: number of optimisation steps (one per epoch).
    train_losses, val_losses: indexable buffers of length >= num_epochs;
      filled in place with the per-epoch loss values.

  Returns:
    None. Results are delivered through the two loss buffers and the plot.
  """
  train_accuracies = []
  val_accuracies = []
  for epoch in range(num_epochs):
    # using training data to train
    optimizer.zero_grad()
    train_scores = model(X_train_tensor)
    loss = criterion(train_scores, y_train_tensor)
    loss.backward()
    optimizer.step()
    # .item() stores a plain float rather than a tensor still attached to the
    # autograd graph.
    train_losses[epoch] = loss.item()

    with torch.no_grad():
      # using training data to see accuracy (after this epoch's update)
      train_accuracies.append(get_accuracy_multiclass(model(X_train_tensor), y_train_tensor))

      # using validation data to see losses and accuracy; one forward pass
      # serves both the loss and the accuracy
      val_scores = model(X_val_tensor)
      val_losses[epoch] = criterion(val_scores, y_val_tensor).item()
      val_accuracies.append(get_accuracy_multiclass(val_scores, y_val_tensor))

  # Graphing train and validation results
  epoch_axis = np.arange(1, num_epochs + 1)
  plt.plot(epoch_axis, val_accuracies, color="r", label="validation")
  plt.plot(epoch_axis, train_accuracies, color="orange", label="training")
  plt.title("Train and Validation Accuracy Over Epochs")
  plt.legend(loc="upper right")
  plt.xlabel("Epochs")
  plt.ylabel("Accuracy")
  plt.grid(True, color='lightgray', linestyle='--', linewidth=0.5)
  plt.gca().spines[['right', 'top']].set_visible(False)
  plt.show()

In [None]:
# Function for calculating multiclass accuracy
def get_accuracy_multiclass(pred_arr,original_arr):
    """Return the fraction of samples whose arg-max score matches the label.

    Args:
        pred_arr: torch tensor of per-class scores, one row per sample.
        original_arr: torch tensor holding one integer class label per sample.

    Returns:
        The accuracy as a float in [0, 1]. Returns ``False`` when the two
        inputs differ in length (behaviour kept for existing callers) and
        ``0.0`` when both are empty.
    """
    if len(pred_arr)!=len(original_arr):
        return False
    if len(pred_arr) == 0:
        # Nothing to score; avoids dividing by zero.
        return 0.0
    # detach() lets this work on tensors that still track gradients.
    scores = pred_arr.detach().cpu().numpy()
    labels = original_arr.detach().cpu().numpy().reshape(-1)
    # Flatten each row before taking the arg-max, as np.argmax does per row.
    predicted = scores.reshape(len(scores), -1).argmax(axis=1)
    return float(np.mean(predicted == labels))

In [None]:
# Training network
# Hyperparameters
input_dim, output_dim = 8, 2
learning_rate = 0.01
num_epochs = 6000

# Loss function, model and optimizer.
# Note: this rebinds `nn` (the torch.nn alias from the import cell) to the model
# instance; the name is kept because the evaluation cell calls nn(...).
loss = torch.nn.functional.cross_entropy
nn = NeuralNet(input_dim, output_dim)
optimizer = torch.optim.Adam(nn.parameters(), lr=learning_rate)

# Per-epoch loss buffers, filled in place by train_network. `test_losses` is
# passed as the validation-loss buffer.
train_losses, test_losses = np.zeros(num_epochs), np.zeros(num_epochs)

train_network(
    model=nn,
    optimizer=optimizer,
    criterion=loss,
    X_train_tensor=X_train_tensor,
    y_train_tensor=y_train_tensor,
    X_val_tensor=X_val_tensor,
    y_val_tensor=y_val_tensor,
    num_epochs=num_epochs,
    train_losses=train_losses,
    val_losses=test_losses,
)

In [None]:
# Plot the per-epoch losses recorded during training. `test_losses` was filled
# from the validation set by train_network, so it is labelled as validation loss.
epoch_axis = np.arange(1, num_epochs + 1)
plt.plot(epoch_axis, train_losses, color='orange', label="Train Loss")
plt.plot(epoch_axis, test_losses, color='r', label="Validation Loss")
plt.title("Train and Validation Loss Over Epochs")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(loc="upper right")
plt.grid(True, color='lightgray', linestyle='--', linewidth=0.5)
plt.gca().spines[['right', 'top']].set_visible(False)
plt.show()

In [None]:
# calculate the accuracy
# Score every split with the trained network; no gradients are needed here.
with torch.no_grad():
    predictions_train, predictions_test, predictions_val = [
        nn(split) for split in (X_train_tensor, X_test_tensor, X_val_tensor)
    ]

nn_train_accuracy = get_accuracy_multiclass(predictions_train, y_train_tensor)
nn_val_accuracy = get_accuracy_multiclass(predictions_val, y_val_tensor)
nn_test_accuracy = get_accuracy_multiclass(predictions_test, y_test_tensor)

# Turn the two output scores into a hard 0/1 label per test sample: class 0 only
# when its score is strictly larger, class 1 otherwise (ties included).
numpy_predictions_test = predictions_test.numpy()
numpy_predictions_test_binary = [
    0 if scores[0] > scores[1] else 1 for scores in numpy_predictions_test
]

# (precision, recall, f1, support), macro-averaged over both classes
nn_stats = precision_recall_fscore_support(y_test, numpy_predictions_test_binary, average='macro')

# **Linear Regression**

In [None]:
# Initialize, train, and test linear regression model
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)

# The regressor outputs continuous values; threshold them at 0.5 to obtain
# hard 0/1 class labels for each split.
linear_train_results, linear_val_results, linear_test_results = [
    (linearRegression.predict(split) >= 0.5).astype(int)
    for split in (X_train, X_val, X_test)
]

lr_train_accuracy = accuracy_score(y_train, linear_train_results)
lr_val_accuracy = accuracy_score(y_val, linear_val_results)
lr_test_accuracy = accuracy_score(y_test, linear_test_results)

# (precision, recall, f1, support), macro-averaged over both classes
lr_stats = precision_recall_fscore_support(y_test, linear_test_results, average='macro')

# **Support Vector Machine**

In [None]:
# Initialize, train, and test SVM
svm = SVC()
svm.fit(X_train, y_train)

# Predicted labels for each split
svm_train_results, svm_val_results, svm_test_results = [
    svm.predict(split) for split in (X_train, X_val, X_test)
]

svm_train_accuracy = accuracy_score(y_train, svm_train_results)
svm_val_accuracy = accuracy_score(y_val, svm_val_results)
svm_test_accuracy = accuracy_score(y_test, svm_test_results)

# (precision, recall, f1, support), macro-averaged over both classes
svm_stats = precision_recall_fscore_support(y_test, svm_test_results, average='macro')

# Report accuracies in the order: train, test, validation
print(svm_train_accuracy, svm_test_accuracy, svm_val_accuracy, sep="\n")

# **K-Nearest Neighbors**

In [None]:
# Initialize, train, and test KNN
# NOTE(review): p=0.3 is a Minkowski power below 1 -- confirm this is intended
# and supported by the installed scikit-learn version.
knn = KNeighborsClassifier(n_neighbors=3, p=0.3)
knn.fit(X_train, y_train)

# Predicted labels for each split
knn_train_results, knn_val_results, knn_test_results = [
    knn.predict(split) for split in (X_train, X_val, X_test)
]

knn_train_accuracy = accuracy_score(y_train, knn_train_results)
knn_val_accuracy = accuracy_score(y_val, knn_val_results)
knn_test_accuracy = accuracy_score(y_test, knn_test_results)

# (precision, recall, f1, support), macro-averaged over both classes
knn_stats = precision_recall_fscore_support(y_test, knn_test_results, average='macro')

# **Results**

In [None]:
# Graph accuracy results
# One x position per model, in the same order as the accuracy lists below.
model_labels = ['XGBoost', 'Neural Network', 'Linear Regression', 'Support Vector Machine', 'K-Nearest Neighbors']
X_graph = list(range(1, len(model_labels) + 1))

y_val_graph = [xgb_val_accuracy, nn_val_accuracy, lr_val_accuracy, svm_val_accuracy, knn_val_accuracy]
y_test_graph = [xgb_test_accuracy, nn_test_accuracy, lr_test_accuracy, svm_test_accuracy, knn_test_accuracy]
y_train_graph = [xgb_train_accuracy, nn_train_accuracy, lr_train_accuracy, svm_train_accuracy, knn_train_accuracy]

plt.figure(figsize=(12, 6))

# (values, marker, color, legend label) for each curve, drawn in this order
curves = (
    (y_val_graph, '^', 'r', 'Validation Accuracy'),
    (y_test_graph, 's', 'orange', 'Test Accuracy'),
    (y_train_graph, 'o', 'y', 'Training Accuracy'),
)
for values, marker, color, label in curves:
    plt.plot(X_graph, values, marker=marker, color=color, label=label)

plt.xticks(X_graph, model_labels)
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.title('Training, Test, and Validation Accuracy for Different Models')
plt.legend()
plt.grid(True, color='lightgray', linestyle='--', linewidth=0.5)
plt.gca().spines[['right', 'top']].set_visible(False)
plt.show()

In [None]:
# Print metric results
# Each row: model name, test accuracy, then macro precision / recall / F1
# (the first three entries of the *_stats tuples), all shown as percentages.
header_format = "{:^30}{:^10}{:^10}{:^10}{:^10}"
row_format = "{:^30}{:^10.2f}{:^10.2f}{:^10.2f}{:^10.2f}"

print(header_format.format("Model", "Accuracy", "Precision", "Recall", "F1 Score"))
print("-"*70)

table_rows = (
    ("XGBoost", xgb_test_accuracy, xgb_stats),
    ("Neural Network", nn_test_accuracy, nn_stats),
    ("Linear Regression", lr_test_accuracy, lr_stats),
    ("SVM", svm_test_accuracy, svm_stats),
    ("K-NN", knn_test_accuracy, knn_stats),
)
for model_name, test_accuracy, stats in table_rows:
    print(row_format.format(model_name, test_accuracy*100, stats[0]*100, stats[1]*100, stats[2]*100))