In [40]:
import pandas as pd
import urllib.request
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from matplotlib import pyplot as plt
import seaborn as sns

# Download the mushroom data

In [27]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',header=None)

In [28]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


# Pull out the Edible/Poisonous, Odor, and Population columns located at 0, 5, and 21. 

In [29]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data', sep=',', header=None, usecols=[0,5,21], names=['Edible/Poisonous','Odor','Population'])

In [39]:
df.head()

Unnamed: 0,Edible/Poisonous,Odor,Population
0,1,7,3
1,0,0,2
2,0,1,2
3,1,7,3
4,0,6,0


# Replace the letter codes with number codes for scikit-learn purposes

In [31]:
df['Edible/Poisonous'].replace({'e':0, 'p': 1}, inplace=True)
df['Odor'].replace({'a':0, 'l':1, 'c':2, 'y':3, 'f':4, 'm':5, 'n':6, 'p':7, 's':8}, inplace=True)
df['Population'].replace({'a':0, 'c':1, 'n':2, 's':3, 'v':4, 'y':5}, inplace=True)

In [32]:
df

Unnamed: 0,Edible/Poisonous,Odor,Population
0,1,7,3
1,0,0,2
2,0,1,2
3,1,7,3
4,0,6,0
...,...,...,...
8119,0,6,1
8120,0,6,4
8121,0,6,1
8122,1,3,4


# Use the get_dummies() method for predictor columns and append them to the dataframe

In [55]:
o_dummy = pd.get_dummies(df[('Odor')])
p_dummy = pd.get_dummies(df[('Population')])

In [53]:
df_two = pd.concat([df, o_dummy, p_dummy], axis = 1)
df_two

Unnamed: 0,Edible/Poisonous,Odor,Population,0,1,2,3,4,5,6,7,8,0.1,1.1,2.1,3.1,4.1,5.1
0,1,7,3,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,0,0,2,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,1,2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
3,1,7,3,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
4,0,6,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,6,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
8120,0,6,4,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
8121,0,6,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
8122,1,3,4,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


# Set X and Y values

In [46]:
X = o_dummy.iloc[:, :-1].values
y = p_dummy.iloc[:, 1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=1)

# Create true prediction values with the use of linear regression

In [47]:
linreg = LinearRegression()
linreg.fit(X_train, Y_train)
Y_pred = linreg.predict(X_test)
true = [1, 0]
pred = [1, 0]

print(metrics.mean_absolute_error(true, pred))
print(metrics.mean_squared_error(true, pred))
print(metrics.mean_squared_error(true, pred))

0.0
0.0
0.0


# Margin of error

In [48]:
print(metrics.mean_squared_error(Y_test, Y_pred))

0.04161325813204338


# Remove individual variables to determine importance

In [50]:
X = o_dummy.iloc[:, 5:6].values
y = p_dummy.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)

print(metrics.mean_squared_error(y_test, y_pred))

0.04427126327288063


# The margins in line 50 and 51 match which tells us that odor and population could help predict wh

In [51]:
X = o_dummy.iloc[:, 3:4].values
y = p_dummy.iloc[:, 1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
linreg.fit(X_train, y_train)
Y_pred = linreg.predict(X_test)

print(metrics.mean_squared_error(y_test, y_pred))

0.04427126327288063
