## Set Up SparkContext and SQLContext

In [1]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

# Attach to the SparkContext already running in this kernel, or start one.
sparkContext = SparkContext.getOrCreate()

# SQLContext wraps the SparkContext and is what the later cells use to read data.
sqlContext = SQLContext(sparkContext=sparkContext)

# Echo the object so the cell output confirms the context was created.
sqlContext

<pyspark.sql.context.SQLContext at 0x7fef58460898>

## Load Training Dataset from S3 into Spark

In [51]:
# Read the census CSV from S3. `header` takes column names from the first
# line; `inferSchema` lets Spark derive each column's type from the values.
data = sqlContext.read.format("csv") \
  .option("inferSchema", "true").option("header", "true") \
  .load("s3a://datapalooza/R/census.csv")

# Preview only a handful of rows. collect() would ship the entire dataset to
# the driver and flood the notebook output; take(5) returns the same
# list-of-Row structure, but bounded.
data.take(5)

[Row(age=39, workclass='State-gov', education='Bachelors', education_num=13, marital_status='Never-married', occupation='Adm-clerical', relationship='Not-in-family', race='White', sex='Male', capital_gain=2174, capital_loss=0, hours_per_week=40, native_country='United-States', income='<=50K'),
 Row(age=50, workclass='Self-emp-not-inc', education='Bachelors', education_num=13, marital_status='Married-civ-spouse', occupation='Exec-managerial', relationship='Husband', race='White', sex='Male', capital_gain=0, capital_loss=0, hours_per_week=13, native_country='United-States', income='<=50K'),
 Row(age=38, workclass='Private', education='HS-grad', education_num=9, marital_status='Divorced', occupation='Handlers-cleaners', relationship='Not-in-family', race='White', sex='Male', capital_gain=0, capital_loss=0, hours_per_week=40, native_country='United-States', income='<=50K'),
 Row(age=53, workclass='Private', education='11th', education_num=7, marital_status='Married-civ-spouse', occupation=

## Use Spark ML Pipeline to build Decision Tree Classifier

In [22]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import DecisionTreeClassifier

# R-style formula: `income` is the label, "." means every other column is a feature.
formula = RFormula(formula="income ~ .")

# Decision tree with library-default hyperparameters.
classifier = DecisionTreeClassifier()

# Chain feature encoding and the classifier so they are fitted together and
# yield a single PipelineModel.
pipeline = Pipeline(stages=[formula, classifier])
pipelineModel = pipeline.fit(data)

# Echo the fitted model so the cell output shows its identifier.
pipelineModel

PipelineModel_4660ba8d2ffc85487db1

## Convert Spark ML Pipeline to PMML

In [23]:
from jpmml import toPMMLBytes

# Serialise the fitted pipeline to PMML. The result is raw bytes, which the
# deployment cells below send over HTTP and write to disk unchanged.
pmmlBytes = toPMMLBytes(sparkContext, data, pipelineModel)

# Decode to text purely for display in the notebook.
str(pmmlBytes, "utf-8")

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">\n\t<Header>\n\t\t<Application/>\n\t\t<Timestamp>2016-10-14T07:06:06Z</Timestamp>\n\t</Header>\n\t<DataDictionary>\n\t\t<DataField name="income" optype="categorical" dataType="string">\n\t\t\t<Value value="&lt;=50K"/>\n\t\t\t<Value value="&gt;50K"/>\n\t\t</DataField>\n\t\t<DataField name="workclass" optype="categorical" dataType="string">\n\t\t\t<Value value="Private"/>\n\t\t\t<Value value="Self-emp-not-inc"/>\n\t\t\t<Value value="Local-gov"/>\n\t\t\t<Value value="State-gov"/>\n\t\t\t<Value value="Self-emp-inc"/>\n\t\t\t<Value value="Federal-gov"/>\n\t\t\t<Value value="Without-pay"/>\n\t\t</DataField>\n\t\t<DataField name="education" optype="categorical" dataType="string">\n\t\t\t<Value value="HS-grad"/>\n\t\t\t<Value value="Some-college"/>\n\t\t\t<Value value="Bachelors"/>\n\t\t\t<Value value="Masters"/>\n\t\t\t<Value value="Assoc-voc"/>\n\t\t\t<Value value="11th"/>\n\t\t\

## Deployment Option 1:  Mutable Model Deployment

### Deploy PMML to Live Prediction Server

In [10]:
import urllib.request

# Endpoint on the prediction server that replaces the live `census` model.
update_url = 'http://prediction:9040/update-pmml/census'

# NOTE(review): the request body is PMML (XML) but is labelled as JSON here.
# The recorded run returned 200 with this header, so it is left unchanged --
# confirm what the server expects before switching it to an XML media type.
update_headers = {'Content-type': 'application/json'}

# Supplying `data` makes urllib send a POST with the PMML bytes as the body.
req = urllib.request.Request(update_url, headers=update_headers, data=pmmlBytes)

# Use the response as a context manager so the HTTP connection is closed
# deterministically instead of leaking until garbage collection.
with urllib.request.urlopen(req) as resp:
  print(resp.status) # Should return Http Status 200

200


### Test New Model on Live Prediction Server

In [11]:
import urllib.parse
# Imported explicitly: this cell calls urllib.request itself and must not rely
# on an earlier cell having imported it.
import urllib.request
import json

# Endpoint on the prediction server that scores one record against the `census` model.
evaluate_url = 'http://prediction:9040/evaluate-pmml/census'

evaluate_headers = {'Content-type': 'application/json'}

# One record as a JSON object: the feature columns of the training data,
# without the `income` label that the model is asked to predict.
input_params = '{"age":39,"workclass":"State-gov","education":"Bachelors","education_num":13,"marital_status":"Never-married","occupation":"Adm-clerical","relationship":"Not-in-family","race":"White","sex":"Male","capital_gain":2174,"capital_loss":0,"hours_per_week":40,"native_country":"United-States"}'
encoded_input_params = input_params.encode('utf-8')

# Supplying `data` makes urllib send a POST with the JSON bytes as the body.
req = urllib.request.Request(evaluate_url, headers=evaluate_headers, data=encoded_input_params)

# Read the body inside the context manager; the connection is closed on exit
# instead of being leaked.
with urllib.request.urlopen(req) as resp:
  print(resp.read()) # Should return valid classification with probabilities

b'{"results":[{\'income\': \'NodeScoreDistribution{result=<=50K, probability_entries=[<=50K=0.9564524694636218, >50K=0.04354753053637812], entityId=7, confidence_entries=[]}\'}]}'


## Deployment Option 2:  Immutable Model Deployment

### Deploy New Prediction Service with New Model as a [Canary](http://martinfowler.com/bliki/CanaryRelease.html) Release

In [55]:
from pathlib import Path

# Pure-Python replacement for `!mkdir -p` / `!cat`: the cell no longer needs a
# POSIX shell and behaves the same on any platform.
pmml_path = Path('pmml') / 'census.pmml'

# Equivalent of `mkdir -p`: create the directory, no error if it already exists.
pmml_path.parent.mkdir(parents=True, exist_ok=True)

# Write the PMML bytes unchanged (binary mode, no re-encoding).
pmml_path.write_bytes(pmmlBytes)

# Read the file back from disk and show it, to confirm what was written.
print(pmml_path.read_text(encoding='utf-8'))

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
	<Header>
		<Application/>
		<Timestamp>2016-10-14T07:06:06Z</Timestamp>
	</Header>
	<DataDictionary>
		<DataField name="income" optype="categorical" dataType="string">
			<Value value="&lt;=50K"/>
			<Value value="&gt;50K"/>
		</DataField>
		<DataField name="workclass" optype="categorical" dataType="string">
			<Value value="Private"/>
			<Value value="Self-emp-not-inc"/>
			<Value value="Local-gov"/>
			<Value value="State-gov"/>
			<Value value="Self-emp-inc"/>
			<Value value="Federal-gov"/>
			<Value value="Without-pay"/>
		</DataField>
		<DataField name="education" optype="categorical" dataType="string">
			<Value value="HS-grad"/>
			<Value value="Some-college"/>
			<Value value="Bachelors"/>
			<Value value="Masters"/>
			<Value value="Assoc-voc"/>
			<Value value="11th"/>
			<Value value="Assoc-acdm"/>
			<Value value="10th"/>
			<Value 