In [1]:
# Import libraries

import pandas as pan
import numpy as num
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Column names for the UCI mushroom dataset, taken from the accompanying .names file.
# They are passed to read_csv explicitly via `names=`.

columns = [
    'class',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# Load the dataset straight from the UCI repository into a dataframe

mushroom_data = pan.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
    names=columns,
)

In [3]:
# Preview the first five rows of the freshly loaded dataframe

mushroom_data.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# Lookup tables for the class, odor, and cap-color columns.
# Each table works in two directions:
#   'numeric' : single-letter code from the dataset -> integer code
#   'name'    : integer code -> human-readable label
# Integer codes are assigned by position, so the letter string and the label
# list of a table must stay in the same order.

toxicity = {
    'numeric': {letter: code for code, letter in enumerate('ep')},
    'name': dict(enumerate(['edible', 'poisonous'])),
}

odor = {
    'numeric': {letter: code for code, letter in enumerate('alcyfmnps')},
    'name': dict(enumerate([
        'almond', 'anise', 'creosote', 'fishy', 'foul',
        'musty', 'none', 'pungent', 'spicy',
    ])),
}

cap_color = {
    'numeric': {letter: code for code, letter in enumerate('nbcgrpuewy')},
    'name': dict(enumerate([
        'brown', 'buff', 'cinnamon', 'gray', 'green',
        'pink', 'purple', 'red', 'white', 'yellow',
    ])),
}

In [5]:
# Encode the class, odor, and cap-color columns as integers and add a
# human-readable "<column>-name" companion column for each of them.
#
# The encoded columns overwrite the raw letter columns in place, so this cell
# must be safe to re-run: re-applying the letter -> integer map to a column that
# is already encoded would match nothing and turn the whole column into NaN.


def _encode_once(series, codes):
    """Map raw letter codes to integers, leaving an already-encoded column alone.

    Parameters
    ----------
    series : pandas.Series
        Column holding either raw single-letter codes or already-mapped integers.
    codes : dict
        Mapping from raw letter code to integer code.

    Returns
    -------
    pandas.Series
        `series` unchanged when every value is already one of the integer codes
        in `codes`; otherwise `series.map(codes)`, in which values missing from
        `codes` become NaN.
    """
    if series.isin(list(codes.values())).all():
        return series
    return series.map(codes)


for _column, _lookup in (('class', toxicity), ('odor', odor), ('cap-color', cap_color)):
    mushroom_data[_column] = _encode_once(mushroom_data[_column], _lookup['numeric'])
    # The label column is always derived from the integer codes, so rebuilding it is harmless
    mushroom_data[_column + '-name'] = mushroom_data[_column].map(_lookup['name'])

In [6]:
# Show each encoded column side by side with its human-readable label

encoded_view = ['class', 'class-name', 'odor', 'odor-name', 'cap-color', 'cap-color-name']
mushroom_data.loc[:, encoded_view]

Unnamed: 0,class,class-name,odor,odor-name,cap-color,cap-color-name
0,1,poisonous,7,pungent,0,brown
1,0,edible,0,almond,9,yellow
2,0,edible,1,anise,8,white
3,1,poisonous,7,pungent,8,white
4,0,edible,6,none,3,gray
...,...,...,...,...,...,...
8119,0,edible,6,none,0,brown
8120,0,edible,6,none,0,brown
8121,0,edible,6,none,0,brown
8122,1,poisonous,3,fishy,0,brown


In [7]:
# Build the model inputs: the shared target vector, plus one single-column
# (n_rows, 1) feature matrix for each candidate predictor

y = mushroom_data['class']
x_odor, x_cap_color = (
    mushroom_data[feature].to_numpy().reshape(-1, 1)
    for feature in ('odor', 'cap-color')
)

## Odor

In [8]:
# Hold out a test set for the odor feature; the fixed seed keeps the split reproducible

x_train, x_test, y_train, y_test = train_test_split(
    x_odor,
    y,
    random_state=1,
)

In [9]:
# Tune the number of neighbours for KNN on the odor feature using
# 10-fold cross-validated grid search

# Candidate values of k: 1 through 30
k_range = [k for k in range(1, 31)]
grid_params = {'n_neighbors': k_range}

# Search over every candidate k, scored by classification accuracy
knn = KNeighborsClassifier()
grid = GridSearchCV(estimator=knn, param_grid=grid_params, scoring='accuracy', cv=10)

# Run the search on the training split
grid.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             scoring='accuracy')

In [10]:
# Report the best mean cross-validated accuracy found by the grid search
# (this is a score, not a hyperparameter; the winning k is in grid.best_params_)

grid.best_score_

0.9862141645804734

In [11]:
# Fit a linear regression on the odor training split and predict the held-out rows

lin_reg = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)

# RMSE = square root of the mean squared error on the test split

test_mse = metrics.mean_squared_error(y_test, y_pred)
num.sqrt(test_mse)

0.49854329889107485

## Cap Color

In [12]:
# Hold out a test set for the cap-color feature; the fixed seed keeps the split reproducible

x_train, x_test, y_train, y_test = train_test_split(
    x_cap_color,
    y,
    random_state=1,
)

In [13]:
# Tune the number of neighbours for KNN on the cap-color feature using
# 10-fold cross-validated grid search

# Candidate values of k: 1 through 30
k_range = [k for k in range(1, 31)]
grid_params = {'n_neighbors': k_range}

# Search over every candidate k, scored by classification accuracy
knn = KNeighborsClassifier()
grid = GridSearchCV(estimator=knn, param_grid=grid_params, scoring='accuracy', cv=10)

# Run the search on the training split
grid.fit(x_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             scoring='accuracy')

In [14]:
# Report the best mean cross-validated accuracy found by the grid search
# (this is a score, not a hyperparameter; the winning k is in grid.best_params_)

grid.best_score_

0.5568629034428921

In [15]:
# Fit a linear regression on the cap-color training split and predict the held-out rows

lin_reg = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)

# RMSE = square root of the mean squared error on the test split

test_mse = metrics.mean_squared_error(y_test, y_pred)
num.sqrt(test_mse)

0.4994488239833032

## Conclusion

To determine whether odor or cap color would more accurately predict whether a mushroom was poisonous, the K-Nearest Neighbors (KNN) model was run on the data and root mean square errors (RMSE) were calculated using linear regression.

The best mean cross-validated KNN accuracy was approximately 0.986 for odor and 0.557 for cap color, which shows that odor would more accurately predict whether a mushroom was poisonous. 

RMSE is a measure of the amount of error in a model, so the model with the lower RMSE is the better predictor of whether a mushroom is poisonous. In this case, odor had an RMSE of .4985 and cap color had an RMSE of .4994, which points to odor as the more accurate predictor, although the gap between the two linear models is very small and the KNN accuracy scores are the much stronger evidence.