## Set Up the PySpark JPMML-SparkML Library

In [1]:
import os
# Extra spark-submit arguments for the PySpark shell: ship the JPMML-SparkML
# jar to the JVM and the jpmml.py helper module to the Python side.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar --py-files /root/lib/jpmml.py pyspark-shell'

# os.environ stores strings verbatim: no shell is involved, so "$SPARK_HOME"
# and "$PYTHONPATH" inside a literal are never expanded. Build the value from
# the real environment instead, keeping any PYTHONPATH that was already set.
# NOTE: this affects processes launched from this kernel; it does not change
# sys.path of the already-running interpreter.
_pythonpath_entries = ['/root/lib/jpmml-sparkml-package-1.0-SNAPSHOT.jar']

_spark_home = os.environ.get('SPARK_HOME')
if _spark_home:
    # Only add the bundled py4j archive when SPARK_HOME is actually known.
    _pythonpath_entries.append(
        os.path.join(_spark_home, 'python', 'lib', 'py4j-0.10.3-src.zip')
    )

# Append pre-existing entries, skipping blanks and duplicates so that
# re-running this cell does not keep growing the variable.
for _entry in os.environ.get('PYTHONPATH', '').split(os.pathsep):
    if _entry and _entry not in _pythonpath_entries:
        _pythonpath_entries.append(_entry)

os.environ['PYTHONPATH'] = os.pathsep.join(_pythonpath_entries)

## Set Up Spark and SQL Contexts

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

# Obtain the SparkContext through getOrCreate() rather than constructing one
# directly.
sparkContext = SparkContext.getOrCreate()

# SQL entry point built on top of that context; later cells use it to read data.
sqlContext = SQLContext(sparkContext=sparkContext)

# Bare trailing expression so the notebook displays the object's repr.
sqlContext

<pyspark.sql.context.SQLContext at 0x7fb02f5ab050>

## Build Decision Tree (Regression) with Spark ML Pipeline

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.regression import DecisionTreeRegressor

# Read the wine CSV from the local filesystem, with the header and inferSchema
# reader options enabled.
data = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv('file:///root/notebooks/data/wine.csv')
)

# R-style model formula with "quality" on the left-hand side; "." stands for
# the remaining columns (see the RFormula documentation for exact semantics).
formula = RFormula().setFormula("quality ~ .")

# Decision tree regressor left at its default hyper-parameters.
regressor = DecisionTreeRegressor()

# Two-stage pipeline: the formula stage feeds the regressor stage.
pipeline = Pipeline().setStages([formula, regressor])
pipelineModel = pipeline.fit(data)

# Bare trailing expression so the notebook displays the fitted model.
pipelineModel

PipelineModel_4039849ff239708d3da9

## Convert Generated Spark ML Pipeline to PMML

In [4]:
from jpmml import toPMMLBytes

# Convert the fitted pipeline, together with the training DataFrame, into a
# serialized PMML document.
pmmlBytes = toPMMLBytes(sparkContext, data, pipelineModel)

# Decode explicitly instead of calling str(): on Python 3, str() of a bytes
# object yields its "b'...'" repr, not the XML text. UTF-8 is the encoding
# declared in the PMML prolog shown in this cell's output.
# NOTE(review): assumes toPMMLBytes returns a bytes-like object (bytes or
# bytearray), as its name suggests -- confirm against jpmml.py.
pmmlBytes.decode('UTF-8')

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">\n\t<Header>\n\t\t<Application/>\n\t\t<Timestamp>2016-09-25T23:13:38Z</Timestamp>\n\t</Header>\n\t<DataDictionary>\n\t\t<DataField name="quality" optype="continuous" dataType="double"/>\n\t\t<DataField name="volatile_acidity" optype="continuous" dataType="double"/>\n\t\t<DataField name="citric_acid" optype="continuous" dataType="double"/>\n\t\t<DataField name="residual_sugar" optype="continuous" dataType="double"/>\n\t\t<DataField name="chlorides" optype="continuous" dataType="double"/>\n\t\t<DataField name="free_sulfur_dioxide" optype="continuous" dataType="double"/>\n\t\t<DataField name="total_sulfur_dioxide" optype="continuous" dataType="double"/>\n\t\t<DataField name="density" optype="continuous" dataType="double"/>\n\t\t<DataField name="pH" optype="continuous" dataType="double"/>\n\t\t<DataField name="sulphates" optype="continuous" dataType="double"/>\n\t\t<DataField n