## 1. Data Import and EDA

In [None]:
# Install the required packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Install the ucimlrepo package to access UCI Machine Learning datasets
pip install ucimlrepo

In [None]:
# Import the fetch_ucirepo function from the ucimlrepo package
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
mush_features = mushroom.data.features 
mush_class = mushroom.data.targets 

mush_features.head()
  
# # metadata 
# print(mushroom.metadata) 
  
# # variable information 
# print(mushroom.variables) 


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


In [None]:
mush_features.shape, mush_class.shape

mush_features.isnull().sum()

mush_features['stalk-root'].isnull().sum()

# Check the distribution of the 'stalk-root' feature
mush_features['stalk-root'].value_counts()


stalk-root
b    3776
e    1120
c     556
r     192
Name: count, dtype: int64

In [None]:
# Impute missing values in 'stalk-root' with the mode of the column
mush_features['stalk-root'].fillna(mush_features['stalk-root'].mode()[0], inplace=True)

# Verify that there are no more missing values
mush_features['stalk-root'].isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mush_features['stalk-root'].fillna(mush_features['stalk-root'].mode()[0], inplace=True)


## 2. Pre-process Data for Model 

In [21]:
# split the dataset into training and testing sets - 80% training and 20% testing, no stratification as e to p ratio is almost equal
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mush_features, mush_class, test_size=0.2, random_state=42)

In [25]:
# One hot encode the categorical features
from sklearn.preprocessing import OneHotEncoder

# Instantiate the OneHotEncoder
one_hot = OneHotEncoder(drop = 'first', sparse_output=False)

# Fit and transform the training data
X_train_encoded = one_hot.fit_transform(X_train)

# Convert the encoded data back to a DataFrame for better readability and add feature names
X_train_encoded = pd.DataFrame(X_train_encoded, columns=one_hot.get_feature_names_out())

# Tranform the test data
X_test_encoded = one_hot.transform(X_test)

# Convert the encoded test data back to a DataFrame for better readability and add feature names
X_test_encoded = pd.DataFrame(X_test_encoded, columns=one_hot.get_feature_names_out())

X_train_encoded.shape, X_test_encoded.shape


((6499, 94), (1625, 94))

In [None]:
# Label encode the target variable 
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder 
lab_encoder = LabelEncoder()

# Fit and transform the training target variable
y_train_encoded = lab_encoder.fit_transform(y_train)

# Transform the test target variable
y_test_encoded = lab_encoder.transform(y_test)

