In [11]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report

In [12]:
mushroom = pd.read_csv('../data/agaricus-lepiota.data')

In [13]:
mushroom.head()

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
0,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


In [14]:
mushroom.columns

Index(['p', 'x', 's', 'n', 't', 'p.1', 'f', 'c', 'n.1', 'k', 'e', 'e.1', 's.1',
       's.2', 'w', 'w.1', 'p.2', 'w.2', 'o', 'p.3', 'k.1', 's.3', 'u'],
      dtype='object')

In [15]:
mushroom.rename(columns = {'p':'classes','x':'cap-shape','s':'cap-surface','n':'cap-color',
                           't':'bruises','p.1':'odor','f':'gill-attachment','c':'gill-spacing',
                           'n.1':'gill-size','k':'gill-color','e':'stalk-shape','e.1':'stalk-root',
                           's.1':'stalk-surface-above-ring','s.2':'stalk-surface-below-ring',
                           'w':'stalk-color-above-ring','w.1':'stalk-color-below-ring',
                           'p.2':'veil-type','w.2':'veil-color','o':'ring-number','p.3':'ring-type',
                           'k.1':'spore-print-color','s.3':'population','u':'habitat'},inplace = True)

In [16]:
mushroom = pd.get_dummies(mushroom)
mushroom

Unnamed: 0,classes_e,classes_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8118,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8119,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
8120,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8121,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [17]:
LABELS = ['classes_e', 'classes_p']
FEATURES = [a  for a in mushroom.columns if a not in LABELS ]
y = mushroom[LABELS[0]]
x= mushroom[FEATURES]

In [18]:
#create parameter grid for values of k
parameters = {'knn__n_neighbors': np.arange(1,10)}

#instatiate pipeline with KNNClassifier: pl
pl = Pipeline([('knn', KNeighborsClassifier())])

#split data into test and training data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

#instatiate GridsearchCV:cv
cv = GridSearchCV(pl, param_grid= parameters, cv = 3)

#fit model to training data
cv.fit(X_train, y_train)

#predict test data: y_pred
y_pred = cv.predict(X_test)

In [19]:
#print performance metrics
print (cv.best_params_)
print(classification_report(y_test, y_pred))

{'knn__n_neighbors': 1}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1163
           1       1.00      1.00      1.00      1274

    accuracy                           1.00      2437
   macro avg       1.00      1.00      1.00      2437
weighted avg       1.00      1.00      1.00      2437



In [23]:
import graphviz
from sklearn import tree
from sklearn.tree import export_graphviz

#export a graphic representation of tree to file

dot_data =export_graphviz(tree, out_file = None, feature_names =x.columns, class_names = ['edible', 'poisonous'])
graph = graphviz.Source(dot_data)
graph

TypeError: <module 'sklearn.tree' from '/home/recruit/Desktop/Umuzi/decision-tree/lib/python3.6/site-packages/sklearn/tree/__init__.py'> is not an estimator instance.