In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Column headers for the dataset: the class label ("type") comes first,
# followed by the 22 descriptive attributes.
col_names = [
    "type",
    "capshape", "capsurface", "capcolor",
    "bruises", "odor",
    "gillattachment", "gillspacing", "gillsize", "gillcolor",
    "stalkshape", "stalkroot",
    "stalksurfaceabovering", "stalksurfacebelowring",
    "stalkcolorabovering", "stalkcolorbelowring",
    "veiltype", "veilcolor",
    "ringnumber", "ringtype",
    "sporeprintcolor", "population", "habitat",
]

# Load the comma-separated data file, labelling its columns with the headers above.
mushroom_data = pd.read_csv("agaricus-lepiota.data", names=col_names, sep=",")
# The resulting dataframe contains 8124 entries

In [2]:
## DATA CLEANING
# Step 1: discard every row that holds at least one null value.
# This is effectively a no-op: no value in the dataset is null,
# so X1 still contains all 8124 entries.
X1 = mushroom_data.dropna()

# Step 2: column 11 ("stalkroot") marks missing values with "?" instead of null,
# so keep only the rows whose "stalkroot" is something other than "?".
X2 = X1.loc[X1["stalkroot"].ne("?")]
# X2 contains 5644 entries,
# i.e. 2480 samples had no value for "stalkroot"


In [7]:
### Count of Edible vs Poisonous BEFORE data cleaning (figures for the pie chart)
## Edible ("e"):    4208 (51.8%)
## Poisonous ("p"): 3916 (48.2%)
## Total:           8124 instances
mushroom_data["type"].value_counts()

e    4208
p    3916
Name: type, dtype: int64

In [6]:
### Count of Edible vs Poisonous AFTER data cleaning (figures for the pie chart)
## Edible ("e"):    3488 (61.8%)
## Poisonous ("p"): 2156 (38.2%)
## Total:           5644 instances
X2["type"].value_counts()

e    3488
p    2156
Name: type, dtype: int64

In [51]:
## Cleaned training and target data, ready to split and train

# Feature matrix: every column except the target (22 attributes).
# The target column is named "type" (see col_names); there is no "class"
# column, so dropping "class" would raise a KeyError.
X = X2.drop(columns=["type"])

# Target column (edible "e" or poisonous "p") as its own Series
Y = X2["type"]

Unnamed: 0,type,capshape,capsurface,capcolor,bruises,odor,gillattachment,gillspacing,gillsize,gillcolor,...,stalksurfacebelowring,stalkcolorabovering,stalkcolorbelowring,veiltype,veilcolor,ringnumber,ringtype,sporeprintcolor,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7986,e,b,y,n,f,n,f,c,b,w,...,y,n,n,p,w,t,p,w,y,p
8001,e,x,y,n,f,n,f,c,b,w,...,y,n,n,p,w,t,p,w,y,p
8038,e,x,y,g,t,n,f,c,b,w,...,s,w,w,p,w,t,p,w,y,p
8095,p,x,y,c,f,m,f,c,b,y,...,y,c,c,p,w,n,n,w,c,d
