From Kaggle:
This dataset includes descriptions of hypothetical samples corresponding to **23 species** of gilled mushrooms in the Agaricus and Lepiota family, drawn from The Audubon Society Field Guide to North American Mushrooms (1981). Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like "leaflets three, let it be" for Poisonous Oak and Ivy.

Description of columns by Kaggle:

    Attribute Information: (classes: edible=e, poisonous=p)

    cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s

    cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

    cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

    bruises: bruises=t,no=f

    odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

    gill-attachment: attached=a,descending=d,free=f,notched=n

    gill-spacing: close=c,crowded=w,distant=d

    gill-size: broad=b,narrow=n

    gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

    stalk-shape: enlarging=e,tapering=t

    stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

    stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

    stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

    stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

    stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

    veil-type: partial=p,universal=u

    veil-color: brown=n,orange=o,white=w,yellow=y

    ring-number: none=n,one=o,two=t

    ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

    spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

    population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

    habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [2]:
# Show every column when a frame is displayed (the dataset is wide).
pd.set_option('display.max_columns', 120)

# Source: https://www.kaggle.com/uciml/mushroom-classification
data = pd.read_csv('mushrooms.csv')
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,e,?,s,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,t,?,s,k,w,w,p,w,o,e,w,v,l


In [3]:
# The data is claimed to be of very good quality, but let's check it anyway.
# A bare `data.dropna()` builds a filtered copy and throws it away, so it checks
# nothing; count the rows that contain missing values and report the number.
# Note: per the attribute list, a missing stalk-root is coded as '?', which is
# an ordinary string and is therefore not counted here.
rows_with_missing = int(data.isna().any(axis=1).sum())
print("Rows with missing values:", rows_with_missing)

def test_contains_only(colname, allowed):
    col = data[colname]
    for l in col:
        if l not in allowed:
            print(colname, l)
# Allowed one-letter codes for every column, checked in the dataset's column order.
allowed_codes_by_column = {
    'class': 'ep',
    'cap-shape': 'bcxfks',
    'cap-surface': 'fgys',
    'cap-color': 'nbcgrpuewy',
    'bruises': 'tf',
    'odor': 'alcyfmnps',
    'gill-attachment': 'fa',
    'gill-spacing': 'cwd',
    'gill-size': 'bn',
    'gill-color': 'knbhgropuewy',
    'stalk-shape': 'et',
    'stalk-root': 'bcuezr?',
    'stalk-surface-above-ring': 'fyks',
    'stalk-surface-below-ring': 'fyks',
    'stalk-color-above-ring': 'nbcgopewy',
    'stalk-color-below-ring': 'nbcgopewy',
    'veil-type': 'p',
    'veil-color': 'nowy',
    'ring-number': 'not',
    'ring-type': 'ceflnpsz',
    'spore-print-color': 'knbhrouwy',
    'population': 'acnsvy',
    'habitat': 'glmpuwd',
}
for column_name, allowed_codes in allowed_codes_by_column.items():
    test_contains_only(column_name, allowed_codes)


In [4]:
# Checking for non-unique rows in the dataset: one line is printed per
# duplicated row, and nothing is printed when every row is unique.
duplicate_flags = data.duplicated()
for flag in duplicate_flags[duplicate_flags]:
    print(flag)
        

In [5]:
# Columns with only two possible values are mapped to 0/1 directly, which
# slightly reduces the number of columns produced by one-hot encoding later:
#   class:       edible=1,    poisonous=0
#   bruises:     bruises=1,   no=0
#   gill-size:   broad=1,     narrow=0
#   stalk-shape: enlarging=1, tapering=0
#   veil-type:   partial=1,   universal=0
# The columns are overwritten in place, so run this cell once per fresh load.
binary_codes = {
    'class': {'e': 1, 'p': 0},
    'bruises': {'t': 1, 'f': 0},
    'gill-size': {'b': 1, 'n': 0},
    'stalk-shape': {'e': 1, 't': 0},
    'veil-type': {'p': 1, 'u': 0},
}
for column_name, code_map in binary_codes.items():
    data[column_name] = data[column_name].map(code_map)
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,x,s,n,1,p,f,c,0,k,1,e,s,s,w,w,1,w,o,p,k,s,u
1,1,x,s,y,1,a,f,c,1,k,1,c,s,s,w,w,1,w,o,p,n,n,g
2,1,b,s,w,1,l,f,c,1,n,1,c,s,s,w,w,1,w,o,p,n,n,m
3,0,x,y,w,1,p,f,c,0,n,1,e,s,s,w,w,1,w,o,p,k,s,u
4,1,x,s,g,0,n,f,w,1,k,0,e,s,s,w,w,1,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,k,s,n,0,n,a,c,1,y,1,?,s,s,o,o,1,o,o,p,b,c,l
8120,1,x,s,n,0,n,a,c,1,y,1,?,s,s,o,o,1,n,o,p,b,v,l
8121,1,f,s,n,0,n,a,c,1,n,1,?,s,s,o,o,1,o,o,p,b,c,l
8122,0,k,y,n,0,y,f,c,0,b,0,?,s,k,w,w,1,w,o,e,w,v,l


In [6]:
# For easier copy-pasting, print out all column names
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [7]:
# One-hot encoding, so that the models are trained on binary values.
# Every column that has more than two values is encoded.
multi_valued_columns = [
    'cap-shape', 'cap-surface', 'cap-color', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-color', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-color', 'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]
binary_df = pd.get_dummies(data, columns=multi_valued_columns)

In [8]:
# Splitting the data 50/50 into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    binary_df.drop('class', axis=1),
    binary_df['class'],
    test_size=0.5,
    random_state=1,
)

In [9]:
# Visually checking the training data
X_train

Unnamed: 0,bruises,gill-size,stalk-shape,veil-type,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_b,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,cap-color_y,odor_a,odor_c,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,odor_y,gill-attachment_a,gill-attachment_f,gill-spacing_c,gill-spacing_w,gill-color_b,gill-color_e,gill-color_g,gill-color_h,gill-color_k,gill-color_n,gill-color_o,gill-color_p,gill-color_r,gill-color_u,gill-color_w,gill-color_y,stalk-root_?,stalk-root_b,stalk-root_c,stalk-root_e,stalk-root_r,stalk-surface-above-ring_f,stalk-surface-above-ring_k,stalk-surface-above-ring_s,stalk-surface-above-ring_y,stalk-surface-below-ring_f,stalk-surface-below-ring_k,stalk-surface-below-ring_s,stalk-surface-below-ring_y,stalk-color-above-ring_b,stalk-color-above-ring_c,stalk-color-above-ring_e,stalk-color-above-ring_g,stalk-color-above-ring_n,stalk-color-above-ring_o,stalk-color-above-ring_p,stalk-color-above-ring_w,stalk-color-above-ring_y,stalk-color-below-ring_b,stalk-color-below-ring_c,stalk-color-below-ring_e,stalk-color-below-ring_g,stalk-color-below-ring_n,stalk-color-below-ring_o,stalk-color-below-ring_p,stalk-color-below-ring_w,stalk-color-below-ring_y,veil-color_n,veil-color_o,veil-color_w,veil-color_y,ring-number_n,ring-number_o,ring-number_t,ring-type_e,ring-type_f,ring-type_l,ring-type_n,ring-type_p,spore-print-color_b,spore-print-color_h,spore-print-color_k,spore-print-color_n,spore-print-color_o,spore-print-color_r,spore-print-color_u,spore-print-color_w,spore-print-color_y,population_a,population_c,population_n,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
6950,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0
2761,1,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
7090,0,1,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0
6130,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
2719,1,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7935,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
5192,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0
3980,0,1,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
235,1,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0


In [10]:
# Here we try Ridge and Lasso regression.
# Of course, regression does not classify by itself, but we are interested in the coefficients

def MSE(y_target, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_target : array-like
        True values.
    y_pred : array-like
        Predicted values, same shape as ``y_target``.

    Returns
    -------
    float
        Mean of the squared element-wise differences.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ. The previous version
        silently returned ``None`` on a length mismatch.
    """
    # Plain arrays make the comparison positional, so a pandas Series with a
    # shuffled index (e.g. y_test after train_test_split) is handled correctly.
    target = np.asarray(y_target, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if target.shape != predicted.shape:
        raise ValueError(
            "y_target and y_pred must have the same shape, got %s and %s"
            % (target.shape, predicted.shape)
        )
    if target.size == 0:
        raise ValueError("cannot compute the MSE of empty inputs")
    return float(np.mean((target - predicted) ** 2))

# Fit both regressions with their default regularisation strength.
ridge = linear_model.Ridge()
ridge.fit(X_train, y_train)
lasso = linear_model.Lasso()
lasso.fit(X_train, y_train)

# MSE is used only to make sure the models predict values close to the targets.
# Regression does not give binary outputs, but the MSE still shows how far the
# predictions are from the required values.
for fitted_model in (ridge, lasso):
    print(MSE(y_test.values, fitted_model.predict(X_test)))

0.000802602415317546
0.249836543909128


In [11]:
# Let's find the best alphas for the regressions
def get_best_alpha(model, X, y, alphas):
    """Pick the regularisation strength with the lowest cross-validated MSE.

    Parameters
    ----------
    model : estimator
        Object exposing ``set_params(alpha=...)``, ``fit`` and ``predict``.
        It is refitted repeatedly and left fitted on the last fold tried.
    X : array-like
        Feature matrix (a DataFrame or an array).
    y : array-like
        Target values aligned with the rows of ``X``.
    alphas : iterable of float
        Candidate values for ``alpha``.

    Returns
    -------
    tuple
        ``(best_alpha, mse)`` where ``mse`` is the MSE averaged over the three
        folds for ``best_alpha``. On ties the earliest candidate wins.

    Raises
    ------
    ValueError
        If ``alphas`` is empty.
    """
    alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one candidate value")

    # Plain arrays, so the fold indices produced by KFold are applied positionally.
    features = np.asarray(X)
    targets = np.asarray(y)

    # random_state is not passed: it has no effect without shuffling, and recent
    # scikit-learn releases reject the combination with a ValueError.
    kf = KFold(n_splits=3, shuffle=False)

    results = []
    for alpha in alphas:
        model.set_params(alpha=alpha)
        fold_errors = []
        for train_indexes, val_indexes in kf.split(features):
            model.fit(features[train_indexes], targets[train_indexes])
            predictions = model.predict(features[val_indexes])
            fold_errors.append(mean_squared_error(targets[val_indexes], predictions))
        # Score each alpha by its mean error over all folds; previously every
        # fold was ranked on its own, so the reported "Avg MSE" was really the
        # single luckiest fold.
        results.append((alpha, float(np.mean(fold_errors))))

    best_alpha, mse = min(results, key=lambda pair: pair[1])
    return best_alpha, mse

# Candidate regularisation strengths for each model.
lasso_alphas = np.linspace(0.001, 5, 100)
ridge_alphas = np.linspace(0.1, 10, 100)

lasso_best = get_best_alpha(Lasso(), X_train, y_train, lasso_alphas)
print("Lasso best alpha %.4f - Avg MSE %.4f " % lasso_best)

ridge_best = get_best_alpha(Ridge(), X_train, y_train, ridge_alphas)
print("Ridge best alpha %.4f - Avg MSE %.4f " % ridge_best)



Lasso best alpha 0.0010 - Avg MSE 0.0010 




Ridge best alpha 0.1000 - Avg MSE 0.0000 


In [12]:
# Now we can look for the largest coefficients, i.e. the features the Ridge
# model found most useful for making predictions.
ridge = linear_model.Ridge(alpha=0.1).fit(X_train, y_train)

# The threshold was chosen arbitrarily and then adjusted to leave 3-4 features.
treshold = 0.55

coeff = []
for i, weight in enumerate(ridge.coef_):
    if abs(weight) > treshold:
        coeff.append(i)
        print(i, X_train.columns[i])
        


25 odor_c
89 ring-type_l
97 spore-print-color_r


In [13]:
# Now we can look for the largest coefficients, i.e. the features the Lasso
# model found most useful for making predictions.
lasso = linear_model.Lasso(alpha=0.001)
lasso.fit(X_train, y_train)

# The threshold was chosen arbitrarily and then adjusted to leave a few features.
treshold = 0.4

# Positions of the coefficients whose magnitude exceeds the threshold.
coeff = [i for i, weight in enumerate(lasso.coef_) if abs(weight) > treshold]
for i in coeff:
    print(i, X_train.columns[i])

24 odor_a
27 odor_l
29 odor_n
79 stalk-color-below-ring_y
97 spore-print-color_r


In [14]:
def apriori_apply(data):
    """Mine association rules from a one-hot encoded frame.

    Frequent itemsets are found with a minimum support of 0.25, turned into
    rules with a minimum confidence of 0.5, and the rules are ordered by
    descending lift.

    Parameters
    ----------
    data : pandas.DataFrame
        One-hot encoded frame passed straight to ``apriori``.

    Returns
    -------
    pandas.DataFrame
        The association rules sorted by ``lift``, highest first. The previous
        version computed the rules, discarded them and printed a debug ``1``.
    """
    # Imported here because mlxtend is only needed when this helper is called.
    from mlxtend.frequent_patterns import apriori
    from mlxtend.frequent_patterns import association_rules
    freq_itemsets = apriori(data, min_support=0.25, use_colnames=True)
    rules = association_rules(freq_itemsets, metric='confidence', min_threshold=0.5)
    return rules.sort_values('lift', ascending=False)


if __name__ == '__main__':
    # Loading data
    data = pd.read_csv('mushrooms.csv')

    # One-hot encoding data: every column becomes a group of
    # <column>_<value> indicator columns.
    parts_hot_encoded = [pd.get_dummies(data[c], prefix=c) for c in data.columns]
    data = pd.concat(parts_hot_encoded, axis=1)

    # Dropping unnecessary columns: one redundant indicator per two-valued
    # column, plus veil-type, which only has one value.
    data = data.drop(columns=['class_p', 'bruises_f', 'gill-attachment_a',
                              'gill-size_n', 'stalk-shape_t', 'veil-type_p'])

    # Target: the "edible" indicator.
    data_edibility = data['class_e']
    data = data.drop(columns=['class_e'])

    # Drop not-so-important features: keep only the indicator columns of the
    # selected feature groups (done in one step instead of dropping columns
    # one at a time in place while looping over them).
    selected_features = ('odor', 'stalk-color-below-ring', 'spore-print-color')
    data = data[[c for c in data.columns
                 if any(feature in c for feature in selected_features)]]

    X_train, X_test, y_train, y_test = train_test_split(data, data_edibility, random_state=0)

    # See how well the models do
    # with parameters from Lasso regression
    print("Based on 'odor', 'spore-print-color' and 'stalk-color-below-ring' ")

    knn = KNeighborsClassifier(n_neighbors=5, leaf_size=1000, metric='minkowski', p=1)
    knn.fit(X_train, y_train)
    print("Knn prediction score: ", knn.score(X_test, y_test))

    # Seeded so that the reported score is reproducible between runs.
    rf = RandomForestClassifier(random_state=0)
    rf.fit(X_train, y_train)
    print("Random Forest prediction score: ", rf.score(X_test, y_test))

Based on 'odor', 'spore-print-color' and 'stalk-color-below-ring' 
Knn prediction score:  0.9975381585425899
Random Forest prediction score:  0.9975381585425899


In [15]:
if __name__ == '__main__':
    # Loading data
    data = pd.read_csv('mushrooms.csv')

    # One-hot encoding data: every column becomes a group of
    # <column>_<value> indicator columns.
    parts_hot_encoded = [pd.get_dummies(data[c], prefix=c) for c in data.columns]
    data = pd.concat(parts_hot_encoded, axis=1)

    # Dropping unnecessary columns: one redundant indicator per two-valued
    # column, plus veil-type, which only has one value.
    data = data.drop(columns=['class_p', 'bruises_f', 'gill-attachment_a',
                              'gill-size_n', 'stalk-shape_t', 'veil-type_p'])

    # Target: the "edible" indicator.
    data_edibility = data['class_e']
    data = data.drop(columns=['class_e'])

    # Keep only the indicator columns of the selected feature groups (done in
    # one step instead of dropping columns one at a time in place in a loop).
    selected_features = ('odor', 'ring-type', 'spore-print-color')
    data = data[[c for c in data.columns
                 if any(feature in c for feature in selected_features)]]

    X_train, X_test, y_train, y_test = train_test_split(data, data_edibility, random_state=0)

    # See how well the models do
    # with parameters from Ridge regression
    print("Based on 'odor', 'spore-print-color' and 'ring-type' ")

    knn = KNeighborsClassifier(n_neighbors=5, leaf_size=1000, metric='minkowski', p=1)
    knn.fit(X_train, y_train)
    print("Knn prediction score: ", knn.score(X_test, y_test))

    # Seeded so that the reported score is reproducible between runs.
    rf = RandomForestClassifier(random_state=0)
    rf.fit(X_train, y_train)
    print("Random Forest prediction score: ", rf.score(X_test, y_test))


Based on 'odor', 'spore-print-color' and 'ring-type' 
Knn prediction score:  0.9950763170851797
Random Forest prediction score:  0.9950763170851797


In [16]:
# Now we filter out the features that are hard to determine visually and keep
# only the ones that are easy to see.
hard_to_see_columns = [
    'odor', 'gill-attachment', 'gill-size',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'veil-type', 'ring-type', 'spore-print-color', 'population', 'habitat',
]
data = pd.read_csv('mushrooms.csv').drop(columns=hard_to_see_columns)

# Map the remaining two-valued columns to 0/1.
remaining_binary_codes = {
    'class': {'e': 1, 'p': 0},
    'bruises': {'t': 1, 'f': 0},
    'stalk-shape': {'e': 1, 't': 0},
}
for column_name, code_map in remaining_binary_codes.items():
    data[column_name] = data[column_name].map(code_map)

In [17]:
# List the columns currently present in `data`
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'gill-spacing', 'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-color',
       'ring-number'],
      dtype='object')

In [18]:
# One-hot encode the remaining multi-valued columns.
categorical_columns = [
    'cap-shape', 'cap-surface', 'cap-color',
    'gill-spacing', 'gill-color', 'stalk-root',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-color', 'ring-number',
]
binary_df = pd.get_dummies(data, columns=categorical_columns)

# Splitting dataset into Train(65%) Validation(20%) and Test(15%) data:
# 35% is held out first, then 43% of that hold-out becomes the test set.
X_train, X_preval, y_train, y_preval = train_test_split(
    binary_df.drop('class', axis=1),
    binary_df['class'],
    test_size=0.35,
    random_state=1,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_preval,
    y_preval,
    test_size=0.43,
    random_state=1,
)


In [19]:
# Searching for the most valuable features (Ridge)
ridge = linear_model.Ridge(alpha=0.1)
ridge.fit(X_train, y_train)

# The threshold was chosen arbitrarily and then adjusted to leave 3-4 features.
treshold = 0.55

coeff = []
for i, column_name in enumerate(X_train.columns):
    if abs(ridge.coef_[i]) > treshold:
        coeff.append(i)
        print(i, column_name)

1 stalk-shape
6 cap-shape_s
24 gill-color_b
38 stalk-root_c


In [20]:
# Searching for the most valuable features (Lasso)
lasso = linear_model.Lasso(alpha=0.001).fit(X_train, y_train)

# The threshold was chosen arbitrarily and then adjusted to leave a few features.
treshold = 0.4

coeff = []
for i, weight in enumerate(lasso.coef_):
    is_large = weight > treshold or weight < -treshold
    if is_large:
        coeff.append(i)
        print(i, X_train.columns[i])

1 stalk-shape
24 gill-color_b
37 stalk-root_b
39 stalk-root_e
42 stalk-color-above-ring_c
58 stalk-color-below-ring_y


In [21]:
# Testing Random Forest and KNN models on the best features only
if __name__ == '__main__':
    # Loading data
    data = pd.read_csv('mushrooms.csv')

    # One-hot encoding data: every column becomes a group of
    # <column>_<value> indicator columns.
    parts_hot_encoded = [pd.get_dummies(data[c], prefix=c) for c in data.columns]
    data = pd.concat(parts_hot_encoded, axis=1)

    # Target: the "edible" indicator.
    data_edibility = data['class_e']
    data = data.drop(columns=['class_e'])

    # Keep only the indicator columns of the selected feature groups (done in
    # one step instead of dropping columns one at a time in place in a loop).
    # 'class_p' matches none of the names, so it is discarded here as well and
    # the label does not leak into the features.
    selected_features = ('stalk-shape', 'gill-color', 'stalk-root',
                         'stalk-color-above-ring', 'stalk-color-below-ring')
    data = data[[c for c in data.columns
                 if any(feature in c for feature in selected_features)]]

    X_train, X_preval, y_train, y_preval = train_test_split(data, data_edibility, test_size=0.35, random_state=1)
    # 0.43 because 43% of 35% is 15% of the whole dataset
    X_val, X_test, y_val, y_test = train_test_split(X_preval, y_preval, test_size=0.43, random_state=1)

    # See how well the models do
    # with parameters from Lasso regression
    print("Based on 'stalk-shape', 'gill-color', 'stalk-root', 'stalk-color-above-ring' and 'stalk-color-below-ring' ")

    knn = KNeighborsClassifier(n_neighbors=5, leaf_size=1000, metric='minkowski', p=1)
    knn.fit(X_train, y_train)
    print("Knn prediction score: ", knn.score(X_test, y_test))

    # Seeded so that the reported score is reproducible between runs.
    rf = RandomForestClassifier(random_state=0)
    rf.fit(X_train, y_train)
    print("Random Forest prediction score: ", rf.score(X_test, y_test))

Based on 'stalk-shape', 'gill-color', 'stalk-root', 'stalk-color-above-ring' and 'stalk-color-below-ring' 
Knn prediction score:  0.9672935404742437
Random Forest prediction score:  0.9705641864268193


In [30]:
if __name__ == '__main__':
    # Loading data
    data = pd.read_csv('mushrooms.csv')

    # One-hot encoding data: every column becomes a group of
    # <column>_<value> indicator columns.
    parts_hot_encoded = [pd.get_dummies(data[c], prefix=c) for c in data.columns]
    data = pd.concat(parts_hot_encoded, axis=1)

    # Target: the "edible" indicator.
    data_edibility = data['class_e']
    data = data.drop(columns=['class_e'])

    # Keep only the indicator columns of the selected feature groups (done in
    # one step instead of dropping columns one at a time in place in a loop).
    # 'class_p' matches none of the names, so it is discarded here as well and
    # the label does not leak into the features.
    selected_features = ('stalk-shape', 'gill-color', 'stalk-root')
    data = data[[c for c in data.columns
                 if any(feature in c for feature in selected_features)]]

    X_train, X_preval, y_train, y_preval = train_test_split(data, data_edibility, test_size=0.35, random_state=1)
    # 0.43 because 43% of 35% is 15% of the whole dataset
    X_val, X_test, y_val, y_test = train_test_split(X_preval, y_preval, test_size=0.43, random_state=1)

    # See how well the models do
    # with parameters from Ridge regression
    print("Based on 'stalk-shape', 'gill-color' and 'stalk-root' ")

    knn = KNeighborsClassifier(n_neighbors=9, leaf_size=1000, metric='minkowski', p=1)
    knn.fit(X_train, y_train)
    print("Knn prediction score: ", knn.score(X_test, y_test))

    # Seeded so that the reported score is reproducible between runs.
    rf = RandomForestClassifier(random_state=0)
    rf.fit(X_train, y_train)
    print("Random Forest prediction score: ", rf.score(X_test, y_test))

Based on 'stalk-shape', 'gill-color' and 'stalk-root' 
Knn prediction score:  0.9533932951757972
Random Forest prediction score:  0.9533932951757972


We decided to build the model on 'stalk-shape', 'gill-color' and 'stalk-root' because these features are easier for a user to recognize visually. We also removed the 'cap-shape' parameter: tests showed that it affects accuracy by less than 1%, but there are six cap-shape types and some of them are pretty hard to tell apart.

In [31]:
# Finding the best number of neighbours for KNN on the validation data
for n_neighbors in range(1, 17, 2):
    knn = KNeighborsClassifier(
        n_neighbors=n_neighbors, leaf_size=1000, metric='minkowski', p=1
    ).fit(X_train, y_train)
    print(knn.score(X_val, y_val), n_neighbors)

0.9568167797655768 1
0.9568167797655768 3
0.9568167797655768 5
0.9568167797655768 7
0.9568167797655768 9
0.9568167797655768 11
0.9568167797655768 13
0.9568167797655768 15


In [29]:
# Test data also gives a good score for 7NN
knn = KNeighborsClassifier(
    n_neighbors=7, leaf_size=1000, metric='minkowski', p=1
).fit(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)
print(test_accuracy)

0.9599345870809485
