# Decision Tree를 활용한 Mushroom 데이터 분류

### 1) Mushroom Data Set 로드 및 scikit을 활용하기 위한 데이터 분리

In [2]:
import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data'
raw_csv = urllib2.urlopen(path)
col_names = range(23)
df = pd.read_csv(raw_csv, names = col_names)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


- categorical 데이터를 ordered 데이터로 변경

In [4]:
df[0] = df[0].map({'p': 1, 'e': 0})
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,0,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,0,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,1,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,0,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
map_dic = {}
num_columns = df.shape[1]
for i in range(num_columns):
    unique_array = df[i].unique()
    map_dic_sub = {}
    for j in range(len(unique_array)):
        map_dic_sub[unique_array[j]] = j
    df[i] = df[i].map(map_dic_sub)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,1,1,1
2,1,1,0,2,0,2,0,0,1,1,...,0,0,0,0,0,0,0,1,1,2
3,0,0,1,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,3,1,3,0,1,1,0,...,0,0,0,0,0,0,1,1,2,1


In [6]:
attributes = df.iloc[:, 1:23]
attributes.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1


In [7]:
mushroom_data = attributes.values
mushroom_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 1, 1],
       [1, 0, 2, ..., 1, 1, 2],
       ..., 
       [3, 0, 0, ..., 8, 5, 6],
       [4, 1, 0, ..., 4, 3, 6],
       [0, 0, 0, ..., 6, 5, 6]])

In [8]:
target_series = df.iloc[:, 0]
target_series.head()

0    0
1    1
2    1
3    0
4    1
Name: 0, dtype: int64

In [9]:
mushroom_target = target_series.values
mushroom_target

array([0, 1, 1, ..., 1, 0, 1])

### 2) scikit의 DecisionTreeClassifier를 활용한 결정 트리 분류

In [10]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = clf.fit(mushroom_data, mushroom_target)

In [11]:
with open("mushroom.dot", 'w') as f2:
    tree.export_graphviz(clf, out_file=f2)

<img src="./mushroom.png"/>

- classifier (clf2) 객체를 활용한 새로운 데이터에 대한 분류 추론

In [12]:
mushroom_data[-1]

array([ 0,  0,  0,  1,  3,  1,  0,  1, 10,  0,  4,  0,  0,  6,  7,  0,  2,
        0,  0,  6,  5,  6])

In [13]:
mushroom_data[-1].reshape(1,-1)

array([[ 0,  0,  0,  1,  3,  1,  0,  1, 10,  0,  4,  0,  0,  6,  7,  0,  2,
         0,  0,  6,  5,  6]])

In [14]:
clf.predict(mushroom_data[-1].reshape(1,-1))

array([1])

In [15]:
clf.predict(mushroom_data[-2].reshape(1,-1))

array([0])

### 3) Spark을 활용한 Mushroom 데이터 분류

In [81]:
import findspark     
 
findspark.init()  
from pyspark import SparkContext, SparkFiles, SQLContext

if not 'sc' in locals():  
    sc = SparkContext()  

sqlCtx = SQLContext(sc)
sdf = sqlCtx.createDataFrame(df)
sdf.show()
#sdf.printSchema()
print "Raw data size is %s" % sdf.count()

+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22|
+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|  1|  0|  0|  1|  0|  1|  0|  0|  1|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  1|
|  1|  1|  0|  2|  0|  2|  0|  0|  1|  1|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  2|
|  0|  0|  1|  2|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|  1|  0|  0|  3|  1|  3|  0|  1|  1|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|  2|  1|
|  1|  0|  1|  1|  0|  1|  0|  0|  1|  1|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  1|
|  1|  1|  0|  2|  0|  1|  0|  0|  1|  2|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|  1|  2|
|  1|  1|  1|  2|  0|  2|  0|  0|  1|  1|  0|  1|  0|  0|  0

In [82]:
from pyspark.mllib.tree import DecisionTree, LabeledPoint

result = sdf.rdd.map(lambda row: LabeledPoint(row[0], row[1:23]))
(trainingData, testData) = result.randomSplit([0.7, 0.3])

featuresTrainingData = trainingData.map(lambda x: x.features)
labelTrainingData = trainingData.map(lambda x: x.label)


print featuresTrainingData.take(10)
print labelTrainingData.take(10)
print testData.count()

[DenseVector([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0, 0.0, 3.0, 1.0, 3.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 2.0, 1.0]), DenseVector([0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0]), DenseVector([1.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0]), DenseVector([1.0, 1.0, 2.0, 0.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0]), DenseVector([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0]), DenseVector([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 4.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0]),

In [83]:
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                          impurity='gini', maxDepth=6, maxBins=200)

In [84]:
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)

#print labelsAndPredictions.take(100)

errorCount = labelsAndPredictions.filter(lambda (v, p): v != p).count()
print "errorCount = %s" % errorCount
testErr = errorCount / float(testData.count())
print 'Test Error = %s' % testErr
print 'Learned classification tree model:'
print model.toDebugString()

errorCount = 14
Test Error = 0.00567490879611
Learned classification tree model:
DecisionTreeModel classifier of depth 6 with 15 nodes
  If (feature 4 <= 3.0)
   If (feature 4 <= 0.0)
    Predict: 0.0
   Else (feature 4 > 0.0)
    If (feature 19 <= 4.0)
     If (feature 13 <= 6.0)
      If (feature 1 <= 2.0)
       If (feature 11 <= 1.0)
        Predict: 1.0
       Else (feature 11 > 1.0)
        Predict: 1.0
      Else (feature 1 > 2.0)
       Predict: 0.0
     Else (feature 13 > 6.0)
      Predict: 0.0
    Else (feature 19 > 4.0)
     If (feature 2 <= 0.0)
      Predict: 1.0
     Else (feature 2 > 0.0)
      Predict: 0.0
  Else (feature 4 > 3.0)
   Predict: 0.0



### 4) Spark의 Random Forest 라이브러리를 이용

In [85]:
from pyspark.mllib.tree import RandomForest

In [87]:
model2 = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=100, 
                                          impurity='gini', maxDepth=6, maxBins=200)

In [88]:
predictions = model2.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda x: x.label).zip(predictions)

#print labelsAndPredictions.take(100)

errorCount = labelsAndPredictions.filter(lambda (v, p): v != p).count()
print "errorCount = %s" % errorCount
testErr = errorCount / float(testData.count())
print 'Test Error = %s' % testErr
print 'Learned classification tree model2:'
print model2.toDebugString()

errorCount = 0
Test Error = 0.0
Learned classification tree model2:
TreeEnsembleModel classifier with 100 trees

  Tree 0:
    If (feature 7 <= 0.0)
     If (feature 6 <= 0.0)
      If (feature 4 <= 3.0)
       If (feature 18 <= 1.0)
        If (feature 1 <= 1.0)
         Predict: 0.0
        Else (feature 1 > 1.0)
         If (feature 14 <= 0.0)
          Predict: 1.0
         Else (feature 14 > 0.0)
          Predict: 0.0
       Else (feature 18 > 1.0)
        Predict: 1.0
      Else (feature 4 > 3.0)
       Predict: 0.0
     Else (feature 6 > 0.0)
      If (feature 0 <= 1.0)
       If (feature 4 <= 3.0)
        If (feature 8 <= 4.0)
         If (feature 2 <= 1.0)
          Predict: 1.0
         Else (feature 2 > 1.0)
          Predict: 1.0
        Else (feature 8 > 4.0)
         Predict: 0.0
       Else (feature 4 > 3.0)
        Predict: 0.0
      Else (feature 0 > 1.0)
       If (feature 10 <= 1.0)
        Predict: 0.0
       Else (feature 10 > 1.0)
        If (feature 20 <= 3.0)
 