<a href="https://colab.research.google.com/github/bridgetmanu/Mushroom_Prediction/blob/main/Mushroom_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive. The cells below read their CSV inputs from,
# and write the fitted models to, paths under /content/drive/MyDrive.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when Drive is already
# mounted, so re-running this cell is safe — confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# load libraries
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.neighbors import KNeighborsClassifier as knn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV as gscv
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as rfc
import joblib





Data info

- 17 features are categorical

- 3 features are numerical

In [None]:
# Load data
def load_data(path):
  """Read the CSV file located at ``path`` and hand it back as a DataFrame."""
  return pd.read_csv(path)


### data preprocessing for knn/ decision tree


#### Handling missing values

- no mean/median imputation is needed for the numerical features
- drop columns with significantly fewer observations (less than 20% non-missing values)
- mode imputation for categorical features


In [None]:

# 1. Preprocessing with mode imputation for categorical features

def preprocess(data):
  """Clean a raw frame: mark '?' as missing, drop sparse columns, mode-impute the rest.

  Steps, in order:
    1. every '?' cell is turned into NaN (on a new frame; the argument is not modified);
    2. columns with fewer than 20% non-missing values are dropped;
    3. every other column that still has gaps is filled with its most frequent value.

  Returns the cleaned DataFrame.
  """
  cleaned = data.replace('?', np.nan)
  n_rows = len(cleaned)

  # columns holding at least one missing value
  incomplete = [name for name in cleaned.columns if cleaned[name].count() < n_rows]

  # of those, the ones that are almost entirely empty are discarded outright
  sparse = {name for name in incomplete if cleaned[name].count()/n_rows < 0.2}
  cleaned = cleaned.drop(columns=list(sparse))

  # the surviving incomplete columns get their gaps filled with the column mode
  for name in incomplete:
    if name in sparse:
      continue
    most_common = cleaned[name].mode()[0]
    cleaned[name] = cleaned[name].fillna(most_common)
  return cleaned








#### One hot encoding for nominal variables


In [None]:
# one hot encoding

def one_hot(pred, reference_columns=None):
  """One-hot encode the object-typed columns of ``pred``.

  Boolean columns in the encoded frame are cast to float. When
  ``reference_columns`` is supplied (the training layout), the result is
  re-indexed to exactly those columns: columns missing from this frame are
  added and filled with 0, and columns not in the reference are dropped.
  """
  categorical = pred.select_dtypes(include='object').columns.tolist()
  encoded = pd.get_dummies(pred, columns=categorical)

  # indicator columns may come back as bool; turn them into floats
  indicator_cols = encoded.select_dtypes(include='bool').columns
  encoded[indicator_cols] = encoded[indicator_cols].astype(float)

  if reference_columns is None:
    return encoded
  # line the frame up with the reference layout (used for test data)
  return encoded.reindex(columns=reference_columns, fill_value=0)


#### Normalization for KNN

In [None]:
# normalize the variables for KNN

def normalize(new_data, pred):
  """Skew-correct and min-max scale the numerical columns of an encoded frame.

  Parameters
  ----------
  new_data : DataFrame
      Encoded feature frame (the callers pass the output of ``one_hot``).
  pred : DataFrame
      Frame whose float-typed columns name the numerical features to scale;
      those columns must also exist in ``new_data``.

  Returns
  -------
  DataFrame
      A scaled copy of ``new_data``. The argument itself is left untouched, so
      callers can keep using the unscaled frame (the decision-tree and
      random-forest cells are documented as using non-normalized data).

  Notes
  -----
  The skew test and the MinMaxScaler are fitted on whatever frame is passed
  in, so separate calls (e.g. train and test) are scaled independently.
  """
  # Work on a copy: writing into the caller's frame would silently scale the
  # same object that is later fed to the tree models.
  scaled = new_data.copy()
  ncolumn = pred.select_dtypes(include='float').columns # numerical features

  # Nothing numerical to scale; skip the scaler, which cannot be fitted on a
  # frame without columns.
  if len(ncolumn) == 0:
    return scaled

  # address skewness: log1p for strong skew, sqrt for moderate skew
  for i in ncolumn:
    skew_value = scaled[i].skew()
    if skew_value >= 3 or skew_value <= -3:
      scaled[i] = np.log1p(scaled[i])
    elif 1 < skew_value < 3 or -3 < skew_value <= -1:
      scaled[i] = np.sqrt(scaled[i])

  # normalize the data
  m = MinMaxScaler()
  scaled[ncolumn] = m.fit_transform(scaled[ncolumn])
  return scaled



In [None]:
# build the model with training data

def build_model(x, y, param_grid, algo):
  """Tune ``algo`` over ``param_grid`` with a 5-fold, accuracy-scored grid search.

  Prints the winning parameter combination and its mean cross-validated
  accuracy, then returns a pair:
    * the best estimator found by the search;
    * a DataFrame with one ``param_<name>`` column per key of ``param_grid``
      plus ``mean_test_score`` / ``std_test_score``, sorted best-first.
  """
  search = gscv(algo, param_grid, cv=5, scoring='accuracy')
  search.fit(x, y)

  print("Best parameter: ",search.best_params_) # winning hyper-parameter combination
  print("Best score: ", search.best_score_) # its mean accuracy over the 5 folds

  # columns to keep from the raw cv_results_ table
  wanted = []
  for key in param_grid.keys():
    wanted.append('param_' + key)
  wanted.extend(['mean_test_score', 'std_test_score'])

  results = pd.DataFrame(search.cv_results_)
  ranked = results[wanted].sort_values('mean_test_score', ascending=False)
  return search.best_estimator_, ranked



In [None]:
# evaluate model using test data

def accuracy_formula(predictions, test_label):
  """Print accuracy and error statistics for 'e'/'p' class predictions.

  ``predictions`` holds the predicted labels and ``test_label`` is the Series
  of actual labels; the two are paired row by row using ``test_label``'s
  index. Four lines are printed: the right/wrong counts, the accuracy, and the
  mean and standard deviation of the absolute error after mapping 'e' -> 1 and
  'p' -> 2. Nothing is returned.
  """
  eval_df = pd.DataFrame(predictions, columns=['p_y'], index=test_label.index)
  eval_df['act_y'] = test_label

  # count the rows where the predicted label equals the actual one
  total_right = sum(
      1 for y_pred, y_target in zip(eval_df['p_y'], eval_df['act_y'])
      if y_pred == y_target
  )
  total_wrong = len(eval_df) - total_right

  # absolute error on the numerically encoded labels
  encode = {'e': 1, 'p': 2}
  y_true_num = np.array([encode[val] for val in test_label])
  y_pred_num = np.array([encode[val] for val in predictions])
  errors = abs(y_true_num - y_pred_num)
  mean_errors = errors.mean()
  std_errors = errors.std()

  print("correct:",total_right, ", wrong:", total_wrong)
  print('Final Accuracy is ', total_right / (total_right + total_wrong))
  print("Mean of Errors:", mean_errors)
  print("Standard Deviation of Errors:", std_errors)


In [None]:
# preprocessing phase for training data

# load data
path = "/content/drive/MyDrive/mushroom_mixed_50000.csv"
data = load_data(path)

# Split into training and test sets (70% / 30%, fixed seed so the split is repeatable).
# X_test is the held-out 30%; it still contains the 'class' column and is
# appended to the external test file in the test-preprocessing cell.
train_raw, X_test = train_test_split(data, test_size=0.3, random_state=42)

# preprocess the training data
data = preprocess(train_raw)

# separate the predictors and target variable
label = data['class']
pred = data.drop(columns='class')
# info() writes its summary itself and returns None, so it is not wrapped in print()
pred.info()

# one-hot encoding; new_data is the unscaled frame used by the tree-based models
new_data = one_hot(pred)

# Save the training columns for later use (test frames are re-indexed to this layout)
train_columns = new_data.columns
joblib.dump(train_columns, '/content/drive/MyDrive/Models/train_columns.joblib')

# Scaled frame for KNN. A copy is handed over so that new_data is guaranteed
# to stay unscaled for the decision-tree / random-forest cells.
p = normalize(new_data.copy(), pred)



['/content/drive/MyDrive/Models/train_columns.joblib']

In [None]:
# preprocessing phase for test data

# load the data
test_data = load_data('/content/drive/MyDrive/mushroom_mixed_test.csv')

# append the 30% hold-out split from the training cell to the external test file
test_data = pd.concat([test_data, X_test])

# preprocessing
# NOTE(review): the sparse-column rule and the imputation modes are recomputed
# on the test rows here rather than reused from the training data — confirm
# that this is intended.
test_pro = preprocess(test_data)
test_label = test_pro['class']
test_pred = test_pro.drop(columns = ['class'])

# one-hot encoding, aligned to the training column layout
test_p = one_hot(test_pred, train_columns) # use this for decision tree

# Scaled frame for KNN. A copy is handed over so that test_p is guaranteed to
# stay unscaled for the tree-based models.
test = normalize(test_p.copy(), test_pred) # use this for knn







#### KNN Algorithm

In [None]:
# knn model: use normalized data (p for training, test for evaluation)

# build model: 5-fold grid search over neighbourhood size and vote weighting
algo = knn(metric='euclidean')
param_grid = {'n_neighbors': [1000, 2000, 5000],
                'weights': ['uniform', 'distance']}

best_model, table_knn = build_model(p, label, param_grid, algo)

# evaluate the tuned model on the test set
predictions = best_model.predict(test)

# Accuracy Formula
accuracy_formula(predictions, test_label)

# my own eval
test_accuracy = best_model.score(test, test_label)
print("test accuracy:", test_accuracy) # accuracy on the test set (not a cross-validation average)
# zero_division=0 keeps the 0.00 entries for a class that is never predicted
# but stops sklearn from emitting an undefined-metric warning for each of them
print(classification_report(test_label, predictions, zero_division=0)) # F-1 score

# five best parameter combinations from the grid search
table_knn.head(5)


Best parameter:  {'n_neighbors': 1000, 'weights': 'distance'}
Best score:  0.9780857142857142
correct: 6 , wrong: 1
Final Accuracy is  0.8571428571428571
Mean of Errors: 0.14285714285714285
Standard Deviation of Errors: 0.3499271061118826
test accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           e       0.00      0.00      0.00         1
           p       0.86      1.00      0.92         6

    accuracy                           0.86         7
   macro avg       0.43      0.50      0.46         7
weighted avg       0.73      0.86      0.79         7



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,param_n_neighbors,param_weights,mean_test_score,std_test_score
1,1000,distance,0.978086,0.004019
3,2000,distance,0.955771,0.005519
5,5000,distance,0.921657,0.005428
0,1000,uniform,0.777429,0.007118
2,2000,uniform,0.721743,0.007278


#### Decision Tree Algorithm

In [None]:
# Train the dtc model: use non-normalized data (new_data for training, test_p for evaluation)
# NOTE(review): new_data / test_p are the same objects that were earlier given
# to normalize(); confirm they were not scaled in place before relying on this.

algo = dtc(random_state=42)
param_grid_dtc = {'splitter': ['best', 'random'],
                'criterion': ['gini', 'entropy'],
                'min_samples_split': [5, 10],
                'max_depth': [5, 10]}


# 5-fold grid search on the training data
best_dtc_model, table = build_model(new_data, label, param_grid_dtc, algo)

# evaluate model using test data
predictions_dtc = best_dtc_model.predict(test_p)

# Accuracy Formula Given
accuracy_formula(predictions_dtc, test_label)

# my own accuracy eval
test_dtc_accuracy = best_dtc_model.score(test_p, test_label)
print("test accuracy:", test_dtc_accuracy)
# zero_division=0: report 0.00 without a warning when a class is never predicted
print(classification_report(test_label, predictions_dtc, zero_division=0)) # F-1 score

# five best parameter combinations from the grid search
table.head(5)




Best parameter:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5, 'splitter': 'random'}
Best score:  0.9139428571428571
correct: 5 , wrong: 2
Final Accuracy is  0.7142857142857143
Mean of Errors: 0.2857142857142857
Standard Deviation of Errors: 0.45175395145262565
test accuracy: 0.7142857142857143
              precision    recall  f1-score   support

           e       0.00      0.00      0.00         1
           p       0.83      0.83      0.83         6

    accuracy                           0.71         7
   macro avg       0.42      0.42      0.42         7
weighted avg       0.71      0.71      0.71         7



Unnamed: 0,param_splitter,param_criterion,param_min_samples_split,param_max_depth,mean_test_score,std_test_score
7,random,gini,10,10,0.913943,0.01562
5,random,gini,5,10,0.913943,0.01562
6,best,gini,10,10,0.908143,0.001927
4,best,gini,5,10,0.908114,0.001982
15,random,entropy,10,10,0.875943,0.00495


#### Random Forest Algorithm

In [None]:
# Train the random forest model: use non-normalized data (new_data for training, test_p for evaluation)
# NOTE(review): new_data / test_p are the same objects that were earlier given
# to normalize(); confirm they were not scaled in place before relying on this.

algo = rfc(random_state=42)
param_grid_rfc = {'criterion': ['gini', 'entropy'],
                'min_samples_split': [5, 10],
                'max_depth': [5, 10],
                'ccp_alpha': [0.0, 0.2]}

# 5-fold grid search on the training data
best_rfc_model, table_rfc = build_model(new_data, label, param_grid_rfc, algo)

# evaluate model using test data
predictions_rfc = best_rfc_model.predict(test_p)

# Accuracy Formula Given
accuracy_formula(predictions_rfc, test_label)

# my own accuracy eval
test_rfc_accuracy = best_rfc_model.score(test_p, test_label)
print("test accuracy:", test_rfc_accuracy)
# zero_division=0 keeps the 0.00 entries for a class that is never predicted
# but stops sklearn from emitting an undefined-metric warning for each of them
print(classification_report(test_label, predictions_rfc, zero_division=0)) # F-1 score

# five best parameter combinations from the grid search
table_rfc.head(5)

Best parameter:  {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 10}
Best score:  0.9740857142857143
correct: 6 , wrong: 1
Final Accuracy is  0.8571428571428571
Mean of Errors: 0.14285714285714285
Standard Deviation of Errors: 0.3499271061118826
test accuracy: 0.8571428571428571
              precision    recall  f1-score   support

           e       0.00      0.00      0.00         1
           p       0.86      1.00      0.92         6

    accuracy                           0.86         7
   macro avg       0.43      0.50      0.46         7
weighted avg       0.73      0.86      0.79         7



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,param_criterion,param_min_samples_split,param_max_depth,param_ccp_alpha,mean_test_score,std_test_score
3,gini,10,10,0.0,0.974086,0.003671
2,gini,5,10,0.0,0.972457,0.004399
6,entropy,5,10,0.0,0.964629,0.006745
7,entropy,10,10,0.0,0.962114,0.006965
0,gini,5,5,0.0,0.863257,0.004048


In [None]:
# save the models

# Persist the three tuned estimators under MyDrive/Models; the evaluation cell
# below reloads the KNN file from this location.
# NOTE(review): the names on the left receive joblib.dump's return value, not the
# estimators themselves — presumably the list of written file names; confirm
# against the joblib docs before using these variables as models.
knn_best_model = joblib.dump(best_model, '/content/drive/MyDrive/Models/knn_best_model.joblib')
dtc_best_model = joblib.dump(best_dtc_model, '/content/drive/MyDrive/Models/dtc_best_model.joblib')
rfc_best_model = joblib.dump(best_rfc_model, '/content/drive/MyDrive/Models/rfc_best_model.joblib')

In [None]:
""""""""""""""""""""""""""""""""""""""""""""""""""""""
"""
Evaluation begins
"""
""""""""""""""""""""""""""""""""""""""""""""""""""""""

def load_model(model_name):
    """Load a model stored with joblib.

    Returns the object read from ``model_name`` when the name ends in
    '.joblib'; any other file name yields None.
    """
    # only the .joblib format is handled here
    if not model_name.endswith('.joblib'):
        return None
    return joblib.load(model_name)



model_filename = "/content/drive/MyDrive/Models/knn_best_model.joblib"
columns_filename = "/content/drive/MyDrive/Models/train_columns.joblib"

#df = pd.read_csv(test_filename, header = 0)

# Prepare your data as needed.
# test_data is the combined test frame built in the test-preprocessing cell.

# preprocessing
test_pro = preprocess(test_data)
# Select the predictors by dropping the target by name (same as the
# test-preprocessing cell) instead of assuming 'class' is the first column.
X = test_pro.drop(columns=['class'])
Y_actual = test_pro['class']

# one-hot encoding (aligned to the saved training columns) + normalization
column = joblib.load(columns_filename)
test_p = one_hot(X, column)
test = normalize(test_p, X)

# Prepare your model as needed.
model = load_model(model_filename)

# evaluate model
Y_pred = model.predict(test)

############# Try not to change the accuracy formula ############
accuracy_formula(Y_pred, Y_actual)

correct: 14753 , wrong: 254
Final Accuracy is  0.9830745652029053
Mean of Errors: 0.01692543479709469
Standard Deviation of Errors: 0.12899211004563024
