In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.ml import Pipeline
import pandas as pd

In [2]:
# Reuse the active Spark session if there is one, otherwise start one named 'app'.
spark = (
    SparkSession.builder
    .appName('app')
    .getOrCreate()
)

## Create SparkML Model

In [3]:
# Load the Boston housing CSV: the first row holds the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('./data/boston_housing.csv')
)
data.printSchema()
data.count()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- bb: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



506

In [4]:
# 70/30 train/test split. The seed is fixed so that the split, and therefore the
# fitted model and every prediction below, is the same on each run of the notebook.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [5]:
from pyspark.ml.feature import VectorAssembler

# Every column except the last one (`medv`, the label) is a model input.
feature_columns = data.columns[:-1]
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [6]:
from pyspark.ml.regression import LinearRegression

# Regularised linear regression that predicts `medv` from the assembled feature vector.
algo = LinearRegression(
    featuresCol="features",
    labelCol="medv",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

## Create and Save Model Pipeline

In [7]:
# Chain feature assembly and the regressor, then fit both stages on the training split.
pipeline = Pipeline(stages=[assembler, algo])
fitted_pipeline = pipeline.fit(train)

# Preview the fitted pipeline's predictions for the held-out split.
(
    fitted_pipeline
    .transform(test)
    .select('prediction')
    .show()
)

+------------------+
|        prediction|
+------------------+
| 31.96544812305129|
| 30.48653104413399|
| 17.64777116159539|
|31.588668432303127|
| 41.70368572994125|
|31.639110686320922|
|26.305448554170525|
|25.890322199404977|
| 20.82280120313411|
|26.232771861001403|
| 25.53682610340246|
| 29.72646774659044|
|19.952750916394383|
|31.095275660945667|
|29.384591317369377|
|26.153917196276012|
|27.318130050039855|
|  35.9186521802321|
| 28.40004962565269|
| 27.17441279733336|
+------------------+
only showing top 20 rows



In [8]:
# Persist the fitted pipeline, replacing anything already saved at this path.
(
    fitted_pipeline.write()
    .overwrite()
    .save('./data/models/boton_housing_spark_model_pipeline')
)

In [9]:
from pyspark.ml import PipelineModel

# Reload the saved pipeline from disk and score the test split with it, so the
# predictions can be compared with those of the in-memory pipeline.
loaded_model_pipeline = PipelineModel.load("./data/models/boton_housing_spark_model_pipeline")

(
    loaded_model_pipeline
    .transform(test)
    .select('prediction')
    .show()
)

+------------------+
|        prediction|
+------------------+
| 31.96544812305129|
| 30.48653104413399|
| 17.64777116159539|
|31.588668432303127|
| 41.70368572994125|
|31.639110686320922|
|26.305448554170525|
|25.890322199404977|
| 20.82280120313411|
|26.232771861001403|
| 25.53682610340246|
| 29.72646774659044|
|19.952750916394383|
|31.095275660945667|
|29.384591317369377|
|26.153917196276012|
|27.318130050039855|
|  35.9186521802321|
| 28.40004962565269|
| 27.17441279733336|
+------------------+
only showing top 20 rows



## On-board to Arthur

In [10]:
from arthurai import ArthurAI
from arthurai.common.constants import InputType, OutputType, Stage, ValueType

In [11]:
import os

# Read the API key from the ARTHUR_API_KEY environment variable so that a real key
# never has to be typed into (and saved with) the notebook. The placeholder is
# only used when the variable is not set.
client = ArthurAI(
    url='app.arthur.ai',
    access_key=os.environ.get('ARTHUR_API_KEY', '<YOUR-API-KEY>'),
)

In [12]:
# Build a pandas frame from the training split for on-boarding the model metadata.
# The `medv` target column is removed so that only the pipeline inputs remain.
train_df = train.toPandas().drop(columns=['medv'])
train_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,bb,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.00906,90.0,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85
2,0.01096,55.0,2.25,0,0.389,6.453,31.9,7.3073,1,300,15.3,394.72,8.23
3,0.01381,80.0,0.46,0,0.422,7.875,32.0,5.6484,4,255,14.4,394.23,2.97
4,0.01439,60.0,2.93,0,0.401,6.604,18.8,6.2196,1,265,15.6,376.7,4.38


In [13]:
# Registration details passed to the Arthur client when the model is created.
MODEL_METADATA = {
    "partner_model_id": 'Spark Boston Housing Model',
    "description": "Spark Boston Housing Model",
    "input_type": InputType.Tabular,
    "model_type": OutputType.Regression,
    "tags": ['Spark'],
    "is_batch": True
}

model = client.model(**MODEL_METADATA)

# Infer the pipeline-input attributes from the training frame. Every column of
# train_df is an input, so the frame is passed as-is (the previous
# `list(train_df.columns)[0:]` selection picked all columns anyway).
model.from_dataframe(train_df, Stage.ModelPipelineInput)

# Register the predicted attribute and link it to its ground-truth attribute.
model.add_regression_output_attributes({"medv_prediction": "medv_gt"}, value_type=ValueType.Float)

{'medv_prediction': ArthurAttribute(name='medv_prediction', value_type='FLOAT', stage='PREDICTED_VALUE', id=None, label=None, position=0, categorical=False, min_range=None, max_range=None, monitor_for_bias=False, categories=None, bins=None, is_unique=False, is_positive_predicted_attribute=False, attribute_link='medv_gt'),
 'medv_gt': ArthurAttribute(name='medv_gt', value_type='FLOAT', stage='GROUND_TRUTH', id=None, label=None, position=0, categorical=False, min_range=None, max_range=None, monitor_for_bias=False, categories=None, bins=None, is_unique=False, is_positive_predicted_attribute=False, attribute_link='medv_prediction')}

In [14]:
# Review the model to ensure all attributes were inferred correctly: check each
# attribute's stage, value type, categories and range in the table shown below.
model.review()

Unnamed: 0,name,stage,value_type,categorical,is_unique,categories,bins,range,monitor_for_bias
0,crim,PIPELINE_INPUT,FLOAT,False,True,[],,"[0.00632, 73.5341]",False
1,zn,PIPELINE_INPUT,FLOAT,False,False,[],,"[0.0, 95.0]",False
2,indus,PIPELINE_INPUT,FLOAT,False,False,[],,"[0.46, 27.74]",False
3,chas,PIPELINE_INPUT,INTEGER,True,False,"[{value: 0}, {value: 1}]",,"[None, None]",False
4,nox,PIPELINE_INPUT,FLOAT,False,False,[],,"[0.389, 0.871]",False
5,rm,PIPELINE_INPUT,FLOAT,False,False,[],,"[3.561, 8.78]",False
6,age,PIPELINE_INPUT,FLOAT,False,False,[],,"[6.0, 100.0]",False
7,dis,PIPELINE_INPUT,FLOAT,False,False,[],,"[1.1296, 12.1265]",False
8,rad,PIPELINE_INPUT,INTEGER,True,False,"[{value: 1}, {value: 2}, {value: 3}, {value: 4...",,"[None, None]",False
9,tax,PIPELINE_INPUT,INTEGER,False,False,[],,"[187, 711]",False


In [15]:
# chas and rad were inferred as categorical; print the categories found for each.
for categorical_name in ('chas', 'rad'):
    categorical_attr = model.get_attribute(categorical_name, Stage.ModelPipelineInput)
    print(categorical_attr.categories)

[AttributeCategory(value='0', label=None), AttributeCategory(value='1', label=None)]
[AttributeCategory(value='1', label=None), AttributeCategory(value='2', label=None), AttributeCategory(value='3', label=None), AttributeCategory(value='4', label=None), AttributeCategory(value='5', label=None), AttributeCategory(value='6', label=None), AttributeCategory(value='7', label=None), AttributeCategory(value='8', label=None), AttributeCategory(value='24', label=None)]


In [None]:
# Save the model definition built in the cells above.
# NOTE(review): presumably this registers the model with the Arthur platform - confirm against the SDK docs.
model.save()

In [19]:
# A Spark-backed model needs at least 2 CPUs allocated to the model server.
# Scale this number together with any changes to the Spark session
# configuration in your entrypoint script.
model.enable_explainability(
    df=train_df,
    project_directory='.',
    user_predict_function_import_path='entrypoint',
    requirements_file='requirements.txt',
    python_version="3.7",
    sdk_version='3.1.0',
    streaming_explainability_enabled=False,
    model_server_num_cpu='2',
)

## Send an inference batch:

In [20]:
# Ground truth must be sent separately when sending batch data, so let's create a ground truth column to break out
# and upload later. The column is renamed to "medv_gt", the ground-truth attribute name that was linked to
# "medv_prediction" when the regression output attributes were added to the model.
test = test.withColumnRenamed("medv","medv_gt")
test.show()

+-------+-----+-----+----+------+-----+----+------+---+---+-------+------+-----+-------+
|   crim|   zn|indus|chas|   nox|   rm| age|   dis|rad|tax|ptratio|    bb|lstat|medv_gt|
+-------+-----+-----+----+------+-----+----+------+---+---+-------+------+-----+-------+
|0.01301| 35.0| 1.52|   0| 0.442|7.241|49.3|7.0379|  1|284|   15.5|394.74| 5.49|   32.7|
|0.01311| 90.0| 1.22|   0| 0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|   35.4|
| 0.0136| 75.0|  4.0|   0|  0.41|5.888|47.6|7.3197|  3|469|   21.1| 396.9| 14.8|   18.9|
|0.01432|100.0| 1.32|   0| 0.411|6.816|40.5|8.3248|  5|256|   15.1| 392.9| 3.95|   31.6|
|0.01501| 90.0| 1.21|   1| 0.401|7.923|24.8| 5.885|  1|198|   13.6|395.52| 3.16|   50.0|
|0.01778| 95.0| 1.47|   0| 0.403|7.135|13.9|7.6534|  3|402|   17.0| 384.3| 4.45|   32.9|
| 0.0187| 85.0| 4.15|   0| 0.429|6.516|27.7|8.5353|  4|351|   17.9|392.43| 6.36|   23.1|
|0.01951| 17.5| 1.38|   0|0.4161|7.104|59.5|9.2229|  3|216|   18.6|393.24| 8.05|   33.0|
|0.01965| 80.0| 1.76|

In [21]:
import uuid
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Make predictions and give the prediction column the name of the model's
# predicted-value attribute.
predicted_dataframe = loaded_model_pipeline.transform(test).withColumnRenamed("prediction", "medv_prediction")

# In order to send ground truth we must use an external id to match up rows in
# the ground truth dataframe and the inferences dataframe.
# uuid4 returns a new value on every call, so the UDF is declared
# non-deterministic to keep Spark's optimiser from duplicating or re-ordering
# its invocations.
uuidUdf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

# Spark evaluates lazily: without materialising the result here, every later
# action (show, the inference write, the ground-truth write) would re-run the
# UDF and produce different ids, so ground truth rows could never be matched to
# their inferences. The eager local checkpoint computes the ids and the
# timestamp once and cuts the lineage, so all later reads see the same values.
predicted_dataframe = (
    predicted_dataframe
    .withColumn('partner_inference_id', uuidUdf())
    .withColumn('inference_timestamp', F.unix_timestamp())
    .localCheckpoint(eager=True)
)
predicted_dataframe.show()

+-------+-----+-----+----+------+-----+----+------+---+---+-------+------+-----+-------+--------------------+------------------+--------------------+-------------------+
|   crim|   zn|indus|chas|   nox|   rm| age|   dis|rad|tax|ptratio|    bb|lstat|medv_gt|            features|   medv_prediction|partner_inference_id|inference_timestamp|
+-------+-----+-----+----+------+-----+----+------+---+---+-------+------+-----+-------+--------------------+------------------+--------------------+-------------------+
|0.01301| 35.0| 1.52|   0| 0.442|7.241|49.3|7.0379|  1|284|   15.5|394.74| 5.49|   32.7|[0.01301,35.0,1.5...| 31.96544812305129|dc520798-c475-42f...|         1610561059|
|0.01311| 90.0| 1.22|   0| 0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|   35.4|[0.01311,90.0,1.2...| 30.48653104413399|aab565db-2d8c-4ae...|         1610561059|
| 0.0136| 75.0|  4.0|   0|  0.41|5.888|47.6|7.3197|  3|469|   21.1| 396.9| 14.8|   18.9|[0.0136,75.0,4.0,...| 17.64777116159539|6660604f-92f4-418...| 

In [22]:
# Split out the inference batch: every pipeline-input attribute registered on
# the model, plus the prediction, the matching id and the inference timestamp.
pipeline_input_attr_names = [
    attribute.name
    for attribute in model.get_attributes(Stage.ModelPipelineInput)
]
columns_to_select = [
    *pipeline_input_attr_names,
    'medv_prediction',
    'partner_inference_id',
    'inference_timestamp',
]
batch_inferences = predicted_dataframe.select(columns_to_select)

In [23]:
# Ground truth batch: the true value and the id that ties it to its inference,
# with a timestamp column added for when the ground truth is recorded.
columns_to_select = ['medv_gt', 'partner_inference_id']
ground_truth_batch = (
    predicted_dataframe
    .select(columns_to_select)
    .withColumn('ground_truth_timestamp', F.unix_timestamp())
)

In [24]:
# Write the inference batch and the ground truth batch as parquet, each into
# its own directory, overwriting any output left by a previous run.
(
    batch_inferences.write
    .mode('overwrite')
    .parquet("./data/batch_inference_files/batch_inferences.parquet")
)
(
    ground_truth_batch.write
    .mode('overwrite')
    .parquet("./data/batch_ground_truth_files/ground_truth.parquet")
)

In [25]:
# Upload the inference files written above as the batch "batch1".
model.send_batch_inferences(
    directory_path='./data/batch_inference_files/',
    batch_id="batch1",
)

({'counts': {'success': 153, 'failure': 0, 'total': 153}, 'failures': [[]]},
 {'dataset_close_result': 'success'})

In [26]:
# Upload the ground truth files written above; rows are tied to their
# inferences through the partner_inference_id column.
model.send_batch_ground_truths(
    directory_path='./data/batch_ground_truth_files/',
)

{'counts': {'success': 153, 'failure': 0, 'total': 153}, 'failures': [[]]}