In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("mushrooms.csv")

y = data['class']
x = data.drop(['class', 'veil-type'], axis=1)

In [3]:

columns = np.array(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'])

# Preprocess Data

## Feature Selection

### Label Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

le = data.apply(LabelEncoder().fit_transform)
le.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [5]:
le.nunique()

class                        2
cap-shape                    6
cap-surface                  4
cap-color                   10
bruises                      2
odor                         9
gill-attachment              2
gill-spacing                 2
gill-size                    2
gill-color                  12
stalk-shape                  2
stalk-root                   5
stalk-surface-above-ring     4
stalk-surface-below-ring     4
stalk-color-above-ring       9
stalk-color-below-ring       9
veil-type                    1
veil-color                   4
ring-number                  3
ring-type                    5
spore-print-color            9
population                   6
habitat                      7
dtype: int64

### SelectKBest

In [6]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression

In [7]:
yle = le['class']
xle = le.drop(['class', 'veil-type'], axis=1)

In [8]:
select = SelectKBest(score_func=chi2, k=10).fit(x,y)
z = select.transform(x)

filter = select.get_support()
columns[filter]

ValueError: ignored

In [9]:
pd.DataFrame({'variable': columns,
              'score_f_anova': select.scores_}).sort_values(ascending=False, by='score_f_anova').reset_index(drop=True)

NameError: ignored

## Correlation

In [10]:
plt.figure(figsize=(20,10))
sns.heatmap(x.corr(), annot=True)

ValueError: ignored

<Figure size 1440x720 with 0 Axes>

### One Hot Encoder

In [11]:
y = data['class']
x = data.drop(['class', 'veil-type'], axis=1)

feature = columns[filter]
selected = x[feature]
selected

IndexError: ignored

In [None]:
ohe = pd.get_dummies(selected, columns=feature)
ohe.columns

# Classification

## Decision Tree

In [None]:
y = yle
x = ohe

from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2)

parameters = {'splitter' : ['best', 'random'],
              'criterion' :['gini', 'entropy'],
              'min_samples_split':[2, 3, 4, 5], 
              'max_depth':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
              'class_weight':('balanced', None),
             }


tr = tree.DecisionTreeClassifier()
gsearch = GridSearchCV(tr, parameters)
gsearch.fit(X_train, y_train)
model = gsearch.best_estimator_
model

In [None]:
score = model.score(X_train, y_train)
score

In [None]:
import graphviz
dot_data = tree.export_graphviz(model, out_file=None,
                                feature_names=X_test.columns,
                                class_names=['edible', 'poisonous'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph

In [None]:
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))