# Decision Tree를 활용한 Mushroom 데이터 분류

### 1) Mushroom Data Set 로드 및 scikit을 활용하기 위한 데이터 분리

In [13]:
import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# Download the UCI Mushroom data set and load it into a DataFrame.
# Explicit integer column labels 0..22 are supplied via `names`; column 0 is
# later used as the classification target and columns 1..22 as the features.
path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
col_names = list(range(23))
raw_csv = urllib2.urlopen(path)
try:
    df = pd.read_csv(raw_csv, names=col_names)
finally:
    # Always release the HTTP connection, even if parsing fails.
    raw_csv.close()

In [14]:
# Preview the first rows of the raw (still character-coded) data.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


- categorical 데이터를 ordered 데이터로 변경

In [15]:
# Turn the class label in column 0 into integers: 'p' -> 1, 'e' -> 0.
# (Presumably 'p' = poisonous and 'e' = edible -- confirm against the
# data set description.)
# NOTE: the generic encoding loop in the following cell iterates over every
# column, column 0 included, so these codes are re-numbered there in order
# of first appearance.
label_codes = {'p': 1, 'e': 0}
df[0] = df[0].map(label_codes)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,0,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,0,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,1,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,0,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [16]:
# Label-encode every column of df in place.
# Within each column, distinct values receive integer codes 0, 1, 2, ... in
# order of first appearance. The per-column mapping is kept in
# map_dic[column] so that codes can be translated back to the original
# values later.
# NOTE: the loop covers column 0 as well, so the target codes assigned in the
# previous cell are re-numbered here by first appearance.
map_dic = {}
num_columns = df.shape[1]
for i in range(num_columns):
    unique_array = df[i].unique()
    map_dic_sub = {value: code for code, value in enumerate(unique_array)}
    map_dic[i] = map_dic_sub
    df[i] = df[i].map(map_dic_sub)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,1,1,0,2,0,2,0,0,1,1,...,0,0,0,0,0,0,0,1,1,2
3,0,0,1,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,1,1,2,1


In [17]:
# Feature matrix: every column after the target column 0 (columns 1..22 of
# the 23-column frame).
attributes = df.iloc[:, 1:]
attributes.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1


In [18]:
# Extract the encoded features as a plain NumPy array for scikit-learn.
mushroom_data = attributes.values
mushroom_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 1, 1],
       [1, 0, 2, ..., 1, 1, 2],
       ..., 
       [3, 0, 0, ..., 8, 5, 6],
       [4, 1, 0, ..., 4, 3, 6],
       [0, 0, 0, ..., 6, 5, 6]])

In [19]:
# Target labels: the column labelled 0, which is also the first column.
target_series = df[0]
target_series.head()

0    0
1    1
2    1
3    0
4    1
Name: 0, dtype: int64

In [20]:
# Extract the encoded target labels as a 1-D NumPy array for scikit-learn.
mushroom_target = target_series.values
mushroom_target

array([0, 1, 1, ..., 1, 0, 1])

### 2) scikit의 DecisionTreeClassifier를 활용한 결정 트리 분류

In [23]:
from sklearn import tree

# Train a decision tree that uses entropy as its split criterion on the
# encoded features and labels. fit() returns the fitted estimator, so
# construction and training are chained into one expression.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(
    mushroom_data, mushroom_target)

In [25]:
# Write the fitted tree to mushroom.dot in Graphviz DOT format; the file is
# closed automatically when the with-block exits.
with open("mushroom.dot", 'w') as dot_file:
    tree.export_graphviz(clf, out_file=dot_file)

<img src="./mushroom.png"/>

- classifier (clf) 객체를 활용한 새로운 데이터에 대한 분류 추론

In [26]:
# Show the last row of the feature matrix (a 1-D array of 22 encoded features).
mushroom_data[-1]

array([ 0,  0,  0,  1,  3,  1,  0,  1, 10,  0,  4,  0,  0,  6,  7,  0,  2,
        0,  0,  6,  5,  6])

In [32]:
# Reshape the single sample into a 2-D array of shape (1, n_features), the
# form that is passed to clf.predict below.
mushroom_data[-1].reshape(1,-1)

array([[ 0,  0,  0,  1,  3,  1,  0,  1, 10,  0,  4,  0,  0,  6,  7,  0,  2,
         0,  0,  6,  5,  6]])

In [31]:
# Predict the class code for the last row.
# NOTE: this row is part of the data clf was fitted on, so this is a sanity
# check rather than a prediction on unseen data.
clf.predict(mushroom_data[-1].reshape(1,-1))

array([1])

In [33]:
# Predict the class code for the second-to-last row (also a training row).
clf.predict(mushroom_data[-2].reshape(1,-1))

array([0])