# Predict Whether a Mushroom is Edible

Using a logistic regression model

In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw mushroom table; the relative path means the CSV must sit in
# the notebook's working directory.
csv_path = 'mushrooms.csv'
df = pd.read_csv(csv_path)

In [3]:
# Dimensions of the raw table as (rows, columns).
df.shape

(8124, 23)

In [4]:
# List the column labels; `class` is the label we will predict, the others
# are candidate features.
df.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

Check the number of unique entries in each column; we can't build a model on all of them.

In [5]:
# Frequency of each `odor` code, most common first.
df.value_counts('odor')

odor
n    3528
f    2160
s     576
y     576
a     400
l     400
p     256
c     192
m      36
dtype: int64

We'll try these columns:

In [6]:
# Keep the target (`class`) plus a handful of candidate feature columns.
selected_columns = [
    'class',
    'cap-color',
    'bruises',
    'odor',
    'stalk-shape',
    'veil-type',
    'gill-spacing',
]
df_reduced = df[selected_columns]

In [7]:
# One-hot encode the selected columns: each category becomes its own
# `<column>_<value>` indicator column (e.g. `odor_n`, `class_e`).
data = pd.get_dummies(df_reduced)

If a mushroom isn't edible, it's poisonous, so we can drop the `class_p` column.

Similarly, because there are only two options for `bruises`, `stalk-shape`, and `gill-spacing`, we can drop the second category of each.

In [8]:
# `class_e` becomes the binary target `edible`; the complementary indicator
# of each two-valued column is redundant, so it is removed.
redundant_columns = ['class_p', 'bruises_t', 'stalk-shape_t', 'gill-spacing_w']
data = (
    data
    .rename(columns={'class_e': 'edible'})
    .drop(columns=redundant_columns)
)

In [9]:
# Peek at the first three encoded rows to confirm the rename and drops.
data.head(3)

Unnamed: 0,edible,cap-color_b,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,...,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,odor_y,stalk-shape_e,veil-type_p,gill-spacing_c
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,1,1


Let's start building our logistic regression model:

In [10]:
# Target: the `edible` indicator. Features: every column from `cap-color_b`
# through the last column of the encoded frame.
target_column = 'edible'
first_feature = 'cap-color_b'
y = data[target_column]
x = data.loc[:, first_feature:]

In [11]:
# train_test_split shuffles the rows before splitting, so without a fixed
# seed the split (and every score/prediction derived from it) changes on each
# run. Pinning random_state makes Restart & Run All reproducible.
# NOTE: outputs recorded in this notebook from an earlier unseeded run will
# differ once the cells are re-executed.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42)

In [12]:
# Unfitted logistic-regression classifier; every hyperparameter except the
# solver is left at its default.
# NOTE(review): the repr printed after fitting shows no non-default
# arguments, which suggests 'lbfgs' is already the default here - confirm.
classifier = LogisticRegression(solver='lbfgs')

In [13]:
# Fit on the training split only; the test split is held back for scoring.
classifier.fit(xtrain, ytrain)

LogisticRegression()

In [14]:
# Score the fitted model on the held-out rows (for scikit-learn classifiers
# `score` reports mean accuracy).
classifier.score(xtest, ytest)

0.9862136878385032

Let's predict a few really quick:

In [15]:
# First five held-out feature rows, used for the spot check.
xtest.head(5)

Unnamed: 0,cap-color_b,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,cap-color_y,...,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,odor_y,stalk-shape_e,veil-type_p,gill-spacing_c
6854,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
7220,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
2046,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1
488,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,1
2595,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1


In [16]:
# Predict the class for the first five held-out rows.
sample_rows = xtest.head(5)
predictions = classifier.predict(sample_rows)

In [17]:
# Show the predicted labels as plain booleans (True = edible).
list(map(bool, predictions))

[False, True, True, True, True]

In [18]:
# Show the true labels for the same five rows as plain booleans.
list(map(bool, ytest.head(5)))

[False, True, True, True, True]