In [11]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile('file:/usr/hdp/2.4.0.0-169/spark/final_patientyr_data_3.csv')
parsedData = data.map(parsePoint)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

In [12]:
# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

In [13]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

Test Error = 0.0253984063745
Learned classification tree model:
DecisionTreeModel classifier of depth 5 with 31 nodes
  If (feature 129 <= 0.0)
   Predict: 0.0
  Else (feature 129 > 0.0)
   If (feature 34 <= 2.0)
    If (feature 34 <= 0.0)
     If (feature 27 <= 30220.24)
      If (feature 29 <= 978.25)
       Predict: 0.0
      Else (feature 29 > 978.25)
       Predict: 0.0
     Else (feature 27 > 30220.24)
      Predict: 1.0
    Else (feature 34 > 0.0)
     If (feature 26 <= 132.6)
      If (feature 5 <= 0.0)
       Predict: 0.0
      Else (feature 5 > 0.0)
       Predict: 1.0
     Else (feature 26 > 132.6)
      If (feature 80 <= 0.0)
       Predict: 0.0
      Else (feature 80 > 0.0)
       Predict: 1.0
   Else (feature 34 > 2.0)
    If (feature 33 <= 0.0)
     If (feature 32 <= 0.0)
      If (feature 128 <= 0.0)
       Predict: 0.0
      Else (feature 128 > 0.0)
       Predict: 0.0
     Else (feature 32 > 0.0)
      If (feature 13 <= 0.0)
       Predict: 1.0
      Else (feature 13 

In [15]:
treeModel = model.stages[2]
print(treeModel)

AttributeError: 'DecisionTreeModel' object has no attribute 'stages'

In [14]:
# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")