In [1]:
#import Spark and MLlib packages
from pyspark import SparkContext, SparkConf
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

#import data analysis packages
import numpy as np
import pandas as pd
import sklearn

from pandas import Series, DataFrame
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from numpy import array

from sklearn import tree
from sklearn.externals.joblib import Memory
from sklearn.datasets import load_svmlight_file

#for sklearn decision tree pdf plotting
from sklearn.externals.six import StringIO
import pydot

#import data visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#misc packages
from __future__ import division
from __future__ import print_function

In [2]:
#I. Load dataset
#joblib Memory object that stores cached results under ./mycache
mem = Memory("./mycache")

#Decorate the loader so that repeated calls with the same path are served
#from the on-disk cache instead of re-parsing the file
@mem.cache
def get_data(path="/usr/local/spark/data/mllib/sample_libsvm_data.txt"):
    """Load an svmlight/LIBSVM-format file and return its features and labels.

    Parameters
    ----------
    path : str, optional
        Location of the svmlight-format file. Defaults to the sample data
        file this notebook was written against, so ``get_data()`` keeps
        working unchanged.

    Returns
    -------
    tuple
        ``(features, labels)`` -- the first two items returned by
        ``load_svmlight_file``. The features are later densified with
        ``.toarray()``.
    """
    data = load_svmlight_file(path)
    return data[0], data[1]

x, y = get_data()

In [3]:
#Have to convert to dense array to fit the model
dense_x = x.toarray()

#Split the training and testing set (30% held out for testing).
#A fixed random_state keeps the partition -- and therefore every accuracy
#figure computed from it -- identical on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    dense_x, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(70, 692) (30, 692) (70,) (30,)


In [4]:
#Training the model
#Two depth-5 trees that differ only in their split criterion. A fixed
#random_state makes the fitted trees repeatable from run to run.
#NOTE(review): scikit-learn documents its trees as CART-based regardless of
#criterion, so "ID3" here only labels the entropy-criterion tree -- confirm
#that this naming is intended.
ID3 = tree.DecisionTreeClassifier(
    criterion='entropy', max_depth=5, random_state=42).fit(X_train, Y_train)
CART = tree.DecisionTreeClassifier(
    criterion='gini', max_depth=5, random_state=42).fit(X_train, Y_train)

In [5]:
def cal_model_accuracy(list, names=None, X=None, Y=None):
    """Print the accuracy of each fitted classifier on a held-out set.

    Parameters
    ----------
    list : iterable of fitted classifiers
        Each item must expose ``predict(X)``. (The parameter name shadows
        the ``list`` builtin; it is kept so existing calls keep working.)
    names : sequence of str, optional
        Label printed for each classifier, in the same order as ``list``.
        When omitted, the first classifier is labelled "ID3" and every
        following one "CART", as before.
    X : array-like, optional
        Test features. Defaults to the notebook-level ``X_test``.
    Y : array-like, optional
        Expected test labels. Defaults to the notebook-level ``Y_test``.

    Raises
    ------
    ValueError
        If ``names`` is given but its length differs from the number of
        classifiers.
    """
    #Fall back to the notebook globals only when no data is passed in
    if X is None:
        X = X_test
    if Y is None:
        Y = Y_test

    models = tuple(list)
    if names is None:
        names = ["ID3"] + ["CART"] * (len(models) - 1)
    elif len(names) != len(models):
        raise ValueError("got {} names for {} models".format(len(names), len(models)))

    for name, clf in zip(names, models):
        predicted = clf.predict(X)

        #compare results
        accuracy = metrics.accuracy_score(Y, predicted)
        print("{} accuracy is {}".format(name, accuracy))

#Score both fitted sklearn trees; order matters because the first entry is
#reported as ID3 and the second as CART
l_list = ID3, CART
cal_model_accuracy(l_list)

ID3 accuracy is 0.966666666667
CART accuracy is 0.966666666667


In [7]:
#Generate pdf file of decision tree
#Write the Graphviz description of the tree into an in-memory buffer
dot_data = StringIO()
tree.export_graphviz(CART, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
#Newer pydot releases return a list of graphs here while older ones return
#a single graph object; accept both so .write_pdf is always called on a graph
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf("CART_decision_tree.pdf")

True

In [8]:
#IV. Use MLlib
#Create the Spark context used by every MLlib cell below
sc = SparkContext(master="local", appName="Decision_tree")

In [9]:
#Load the LIBSVM-format sample file through MLlib (take(1) below shows
#LabeledPoint records)
data = MLUtils.loadLibSVMFile(
    sc,
    path='/usr/local/spark/data/mllib/sample_libsvm_data.txt',
)

In [11]:
#Peek at a single record to check how the file was parsed
data.take(num=1)

[LabeledPoint(0.0, (692,[127,128,129,130,131,154,155,156,157,158,159,181,182,183,184,185,186,187,188,189,207,208,209,210,211,212,213,214,215,216,217,235,236,237,238,239,240,241,242,243,244,245,262,263,264,265,266,267,268,269,270,271,272,273,289,290,291,292,293,294,295,296,297,300,301,302,316,317,318,319,320,321,328,329,330,343,344,345,346,347,348,349,356,357,358,371,372,373,374,384,385,386,399,400,401,412,413,414,426,427,428,429,440,441,442,454,455,456,457,466,467,468,469,470,482,483,484,493,494,495,496,497,510,511,512,520,521,522,523,538,539,540,547,548,549,550,566,567,568,569,570,571,572,573,574,575,576,577,578,594,595,596,597,598,599,600,601,602,603,604,622,623,624,625,626,627,628,629,630,651,652,653,654,655,656,657],[51.0,159.0,253.0,159.0,50.0,48.0,238.0,252.0,252.0,252.0,237.0,54.0,227.0,253.0,252.0,239.0,233.0,252.0,57.0,6.0,10.0,60.0,224.0,252.0,253.0,252.0,202.0,84.0,252.0,253.0,122.0,163.0,252.0,252.0,252.0,253.0,252.0,252.0,96.0,189.0,253.0,167.0,51.0,238.0,253.0,253.0,190.0

In [12]:
#Split the training set and test set (weights 0.7 / 0.3).
#A fixed seed keeps the split -- and the accuracies reported below --
#repeatable across runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [13]:
#Training model
#Both classifiers are fitted on the same training RDD with identical
#settings (two classes, no features declared categorical, depth 5, 32 bins);
#only the impurity measure differs.
ID3_model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='entropy',
    maxDepth=5,
    maxBins=32,
)
CART_model = DecisionTree.trainClassifier(
    trainingData,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=5,
    maxBins=32,
)

In [14]:
#Prediction
def cal_mllib_accuracy(list, names=None, test_data=None):
    """Print the test-set accuracy of each fitted MLlib model.

    Parameters
    ----------
    list : iterable of fitted models
        Each item must expose ``predict(rdd_of_features)``. (The parameter
        name shadows the ``list`` builtin; it is kept so existing calls keep
        working.)
    names : sequence of str, optional
        Label printed for each model, in the same order as ``list``. When
        omitted, the first model is labelled "ID3" and every following one
        "CART", as before.
    test_data : RDD, optional
        Records exposing ``.label`` and ``.features``. Defaults to the
        notebook-level ``testData``.

    Raises
    ------
    ValueError
        If ``names`` is given but its length differs from the number of
        models, or if the test set is empty (accuracy would be undefined).
    """
    #Fall back to the notebook global only when no data is passed in
    if test_data is None:
        test_data = testData

    models = tuple(list)
    if names is None:
        names = ["ID3"] + ["CART"] * (len(models) - 1)
    elif len(names) != len(models):
        raise ValueError("got {} names for {} models".format(len(names), len(models)))

    #The test set is the same for every model, so count it and derive the
    #feature/label RDDs once instead of once per model
    total = test_data.count()
    if total == 0:
        raise ValueError("test set is empty; accuracy is undefined")
    features = test_data.map(lambda lp: lp.features)
    labels = test_data.map(lambda lp: lp.label)

    for name, clf in zip(names, models):
        #prediction with the features
        predictions = clf.predict(features)
        #pair each label with its prediction: (label, prediction)
        labelsAndPredictions = labels.zip(predictions)

        #Index the pair instead of unpacking it in the lambda signature:
        #tuple parameters are a syntax error on Python 3 (PEP 3113)
        correct = labelsAndPredictions.filter(lambda pair: pair[0] == pair[1]).count()
        #float() keeps this a true division even without the __future__ import
        accuracy = correct / float(total)

        #report results
        print("PySpark {} accuracy is {}".format(name, accuracy))
            
#Report test-set accuracy for both MLlib trees; the first entry is printed
#as ID3 and the second as CART
cal_mllib_accuracy([ID3_model, CART_model])

PySpark ID3 accuracy is 1.0
PySpark CART accuracy is 1.0
