In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
import pandas as pd

In [2]:
# Reuse the active Spark session if one exists, otherwise start a new one named 'app'.
spark = (
    SparkSession.builder
    .appName('app')
    .getOrCreate()
)

## Create SparkML Model

In [3]:
# Load the housing CSV; the first row holds column names and column types are inferred.
data = spark.read.csv(
    './data/boston_housing.csv',
    header=True,
    inferSchema=True,
)
data.printSchema()  # schema as inferred from the file
data.count()        # last expression: row count is displayed as the cell result

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



506

In [4]:
# 70/30 train/test split. A fixed seed keeps the split (and therefore the fitted model and
# every prediction shown below) the same across "Restart & Run All"; without it each run
# samples different rows.
train, test = data.randomSplit([0.7, 0.3], seed=42)

In [5]:
from pyspark.ml.feature import VectorAssembler

# Every column except the last one is a model input; the final column (`medv` in the
# schema printed above) is the regression label and is left out.
feature_columns = data.columns[:-1]

# Pack the individual input columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
)

In [6]:
from pyspark.ml.regression import LinearRegression

# Elastic-net regularized linear regression predicting `medv` from the assembled vector.
algo = LinearRegression(
    featuresCol="features",
    labelCol="medv",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

## Create and Save Model Pipeline

In [7]:
# Chain feature assembly and the regressor into one estimator and fit it on the training split.
pipeline = Pipeline(stages=[assembler, algo])
fitted_pipeline = pipeline.fit(train)

# Spot-check the fitted pipeline by showing its predictions for the held-out rows.
fitted_pipeline.transform(test).select('prediction').show()

+------------------+
|        prediction|
+------------------+
|30.481707803483047|
| 16.89827799730775|
|40.212911231135195|
| 33.97810421299857|
|  31.0127510212452|
|29.605011549181675|
| 29.89424834671887|
|20.987956345840807|
|27.596850682188567|
| 36.79490216562817|
|22.783675426549465|
|36.401465521459905|
|22.124403711703017|
|27.751813493172925|
|17.306536637397006|
| 24.27501240378283|
|30.074080207076484|
|25.999155030858645|
|31.640696850526474|
|26.691771062721898|
+------------------+
only showing top 20 rows



In [8]:
# Persist the fitted pipeline to disk, overwriting any earlier save at the same path.
# NOTE(review): the directory name is spelled "boton_housing"; the cell that loads the
# pipeline uses the same spelling, so the two must be changed together if it is corrected.
fitted_pipeline.write().overwrite().save('./data/models/boton_housing_spark_model_pipeline')

In [9]:
from pyspark.ml import PipelineModel

# Read the saved pipeline back from disk.
loaded_model_pipeline = PipelineModel.load(
    "./data/models/boton_housing_spark_model_pipeline"
)

# Round-trip check: these predictions should match the ones produced before saving.
loaded_model_pipeline.transform(test).select('prediction').show()

+------------------+
|        prediction|
+------------------+
|30.481707803483047|
| 16.89827799730775|
|40.212911231135195|
| 33.97810421299857|
|  31.0127510212452|
|29.605011549181675|
| 29.89424834671887|
|20.987956345840807|
|27.596850682188567|
| 36.79490216562817|
|22.783675426549465|
|36.401465521459905|
|22.124403711703017|
|27.751813493172925|
|17.306536637397006|
| 24.27501240378283|
|30.074080207076484|
|25.999155030858645|
|31.640696850526474|
|26.691771062721898|
+------------------+
only showing top 20 rows



## On-board to Arthur

In [10]:
from arthurai import ArthurAI
from arthurai import ModelType, InputType, Stage, DataType, ArthurModel
from arthurai.client.apiv2.arthur_explainer import ArthurExplainer

In [11]:
import os

# Take the access key from the ARTHUR_ACCESS_KEY environment variable so a real credential
# never has to be typed into (and committed with) the notebook. The literal placeholder is
# kept as the fallback so the cell behaves as before when the variable is not set.
client = ArthurAI(
    url='dashboard.arthur.ai',
    access_key=os.environ.get('ARTHUR_ACCESS_KEY', '<access_key>'),
)

In [12]:
# Convert the training split to pandas for on-boarding the model metadata.
# `medv` is the regression target, so it is removed to leave only the pipeline inputs.
train_df = train.toPandas().drop(columns=['medv'])
train_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00906,90.0,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85
1,0.01096,55.0,2.25,0,0.389,6.453,31.9,7.3073,1,300,15.3,394.72,8.23
2,0.01301,35.0,1.52,0,0.442,7.241,49.3,7.0379,1,284,15.5,394.74,5.49
3,0.01311,90.0,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81
4,0.01381,80.0,0.46,0,0.422,7.875,32.0,5.6484,4,255,14.4,394.23,2.97


In [13]:
# Registration details passed to the Arthur client when creating the model object.
MODEL_METADATA = {
    "name": 'Spark Boston Housing Model',
    "description": "Spark Boston Housing Model",
    "input_type": InputType.Tabular,
    "model_type": ModelType.Regression,
    "tags": ['Spark'],
    "is_batch": True
}

model = client.model(**MODEL_METADATA)

# Register every column of the training frame as a pipeline-input attribute.
model.from_dataframe(train_df[list(train_df.columns)], Stage.ModelPipelineInput)

# `medv` is not part of train_df, so it is declared by hand as the ground-truth attribute.
model.attribute(name='medv', stage=Stage.GroundTruth, data_type=DataType.Float,
                categorical=False, position=0)

In [14]:
# Review the model's attribute table (name, stage, data type, categorical flag, uniqueness)
# to ensure everything inferred from the dataframe is correct before saving.
model.review_model()

name               stage                     data_type         categorical  is_unique  
crim               Stage.ModelPipelineInput  DataType.Float    False        True       
zn                 Stage.ModelPipelineInput  DataType.Float    False        True       
indus              Stage.ModelPipelineInput  DataType.Float    False        True       
chas               Stage.ModelPipelineInput  DataType.Integer  True         False      
nox                Stage.ModelPipelineInput  DataType.Float    False        True       
rm                 Stage.ModelPipelineInput  DataType.Float    False        True       
age                Stage.ModelPipelineInput  DataType.Float    False        True       
dis                Stage.ModelPipelineInput  DataType.Float    False        True       
rad                Stage.ModelPipelineInput  DataType.Integer  True         False      
tax                Stage.ModelPipelineInput  DataType.Integer  False        True       
ptratio            Stage.ModelPi

In [15]:
# `chas` and `rad` were inferred as categorical; print the categories found for each
# so they can be checked.
for attribute_name in ('chas', 'rad'):
    print(model.get_attribute(attribute_name, Stage.ModelPipelineInput).categories)

[0, 1]
[1, 2, 3, 4, 5, 6, 7, 8, 24]


In [16]:
# Spark models need at least 2 CPUs on the model server. Raise this value if the Spark
# session configured in the entrypoint script is given more resources.
model.enable_explainability(
    df=train_df,
    project_directory='.',
    user_predict_function_import_path='entrypoint',
    requirements_file='requirements.txt',
    model_server_num_cpu='2',
)

In [17]:
# Save the model; the call's return value (a UUID-formatted string) is displayed as the
# cell result. NOTE(review): this looks like the id assigned to the saved model - confirm.
model.save()

'b4f289a5-37cc-4514-a0c5-2e16d22bd987'

## Send an inference batch:

In [18]:
# Ground truth must be sent separately when sending batch data, so let's create a ground
# truth column to break out and upload later. Note the convention: the ground truth column
# must be named the same as its corresponding predicted value attribute with
# "_ground_truth" appended.
test = test.withColumnRenamed("medv","medv_ground_truth")
test.show()

+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+-----------------+
|   crim|  zn|indus|chas|   nox|   rm| age|    dis|rad|tax|ptratio|     b|lstat|medv_ground_truth|
+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+-----------------+
|0.00632|18.0| 2.31|   0| 0.538|6.575|65.2|   4.09|  1|296|   15.3| 396.9| 4.98|             24.0|
| 0.0136|75.0|  4.0|   0|  0.41|5.888|47.6| 7.3197|  3|469|   21.1| 396.9| 14.8|             18.9|
|0.01501|90.0| 1.21|   1| 0.401|7.923|24.8|  5.885|  1|198|   13.6|395.52| 3.16|             50.0|
|0.01538|90.0| 3.75|   0| 0.394|7.454|34.2| 6.3361|  3|244|   15.9|386.34| 3.11|             44.0|
|0.02729| 0.0| 7.07|   0| 0.469|7.185|61.1| 4.9671|  2|242|   17.8|392.83| 4.03|             34.7|
| 0.0315|95.0| 1.47|   0| 0.403|6.975|15.3| 7.6534|  3|402|   17.0| 396.9| 4.56|             34.9|
|0.03237| 0.0| 2.18|   0| 0.458|6.998|45.8| 6.0622|  3|222|   18.7|394.63| 2.94|             33.4|
|0.03427| 

In [19]:
import uuid
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Make predictions with the reloaded pipeline and rename the output column to `medv`.
predicted_dataframe = loaded_model_pipeline.transform(test).withColumnRenamed("prediction", "medv")

# In order to send ground truth we must use an external id to match up rows in the ground
# truth dataframe and the inferences dataframe. uuid4() returns a different value on every
# call, so the UDF is declared non-deterministic to stop the optimizer from treating
# repeated evaluations as interchangeable.
uuidUdf = udf(lambda: str(uuid.uuid4()), StringType()).asNondeterministic()

# Spark DataFrames are lazy: each action (show, and each of the later parquet writes)
# re-runs the plan, which would call the UDF again and hand out brand-new ids every time.
# The inference rows and ground-truth rows would then carry ids that never match.
# localCheckpoint() computes the rows once and cuts the lineage, so every later use of
# predicted_dataframe sees the same external_id values.
predicted_dataframe = predicted_dataframe.withColumn('external_id', uuidUdf()).localCheckpoint()
predicted_dataframe.show()

+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+-----------------+--------------------+------------------+--------------------+
|   crim|  zn|indus|chas|   nox|   rm| age|    dis|rad|tax|ptratio|     b|lstat|medv_ground_truth|            features|              medv|         external_id|
+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+-----------------+--------------------+------------------+--------------------+
|0.00632|18.0| 2.31|   0| 0.538|6.575|65.2|   4.09|  1|296|   15.3| 396.9| 4.98|             24.0|[0.00632,18.0,2.3...|30.481707803483047|d545f0ca-73ae-456...|
| 0.0136|75.0|  4.0|   0|  0.41|5.888|47.6| 7.3197|  3|469|   21.1| 396.9| 14.8|             18.9|[0.0136,75.0,4.0,...| 16.89827799730775|0ce4fdf0-6b77-4b4...|
|0.01501|90.0| 1.21|   1| 0.401|7.923|24.8|  5.885|  1|198|   13.6|395.52| 3.16|             50.0|[0.01501,90.0,1.2...|40.212911231135195|c13c119e-8752-46b...|
|0.01538|90.0| 3.75|   0| 0.394|7.454|34

In [20]:
# Build the inference batch: the model's pipeline-input attributes plus the predicted
# value (`medv`) and the external id used to join ground truth later.
pipeline_input_attr_names = [
    attribute.as_dict()['name']
    for attribute in model.get_attributes_for_stage(Stage.ModelPipelineInput)
]
columns_to_select = [*pipeline_input_attr_names, 'medv', 'external_id']
batch_inferences = predicted_dataframe.select(columns_to_select)

In [21]:
# Build the ground truth batch: just the true value and the external id that ties each
# row back to its inference.
columns_to_select = ['medv_ground_truth', 'external_id']
ground_truth_batch = predicted_dataframe.select(*columns_to_select)

In [22]:
# Write the inference and ground truth dataframes to parquet.
# mode("overwrite") replaces the output of an earlier run; the default save mode raises an
# error when the target path already exists, so this cell could not be re-run.
batch_inferences.write.mode("overwrite").parquet("./data/batch_inference_files/batch_inferences.parquet")
ground_truth_batch.write.mode("overwrite").parquet("./data/batch_ground_truth_files/ground_truth.parquet")

In [23]:
# Send the batch inferences from the directory the inference parquet output was written to.
# NOTE(review): presumably every parquet file under this directory is uploaded - confirm.
model.send_batch_inferences(directory_path='./data/batch_inference_files/')

In [None]:
# Send the matching ground truth from the directory the ground truth parquet output was
# written to. NOTE(review): rows are presumably matched to inferences by `external_id` - confirm.
model.send_batch_ground_truth(directory_path='./data/batch_ground_truth_files/')