In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
import pandas as pd

In [2]:
# Start (or reuse) the Spark session, registered under the application name 'app',
# that every DataFrame in this notebook is created from.
session_builder = SparkSession.builder.appName('app')
spark = session_builder.getOrCreate()

## Create SparkML Model

In [3]:
# Load the housing CSV: column names come from the header row and column types
# are inferred by Spark. Then show the resulting schema and the row count.
housing_csv_path = './data/boston_housing.csv'
data = spark.read.csv(housing_csv_path, header=True, inferSchema=True)
data.printSchema()
data.count()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



506

In [4]:
# Split the rows roughly 70% / 30% into training and test sets.
# A fixed seed is passed so that the sampling, and therefore the fitted model
# and the predictions shown below, use the same random stream on every
# Restart & Run All instead of a fresh one each time.
SPLIT_SEED = 42
train, test = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [5]:
from pyspark.ml.feature import VectorAssembler

# Every column except the last one is used as a model input; the last column
# of the loaded frame is the `medv` target.
all_columns = data.columns
feature_columns = all_columns[:-1]

# Pack the individual feature columns into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_columns,
)

In [6]:
from pyspark.ml.regression import LinearRegression

# Linear regression that reads the assembled "features" vector and is trained
# against the "medv" label, with the regularisation settings listed below.
algo = LinearRegression(
    featuresCol="features",
    labelCol="medv",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

## Create and Save Model Pipeline

In [7]:
# Chain feature assembly and the regressor into one pipeline, fit it on the
# training split, and preview the predictions it produces for the test split.
pipeline_stages = [assembler, algo]
pipeline = Pipeline(stages=pipeline_stages)
fitted_pipeline = pipeline.fit(train)

test_predictions = fitted_pipeline.transform(test)
test_predictions.select('prediction').show()

+------------------+
|        prediction|
+------------------+
| 37.78720362816466|
|29.879873686483087|
|25.849931584306574|
|26.826383257489635|
| 31.27114848618972|
|26.124461013987293|
|26.244138502778384|
|29.863952431849345|
| 27.29795362946603|
| 37.38115182191164|
|23.959925082092433|
| 36.88136520458576|
| 32.46044591915444|
| 35.33705593923822|
|24.349457513192164|
| 25.98978696899479|
| 31.98961846899827|
|22.431738444828603|
| 28.57496931837754|
|21.645267684350816|
+------------------+
only showing top 20 rows



In [8]:
# Persist the fitted pipeline, replacing any previously saved copy at this path.
# NOTE(review): the directory name is spelled "boton"; the load cell uses the same
# spelling, so the two must be changed together if it is ever corrected.
pipeline_writer = fitted_pipeline.write().overwrite()
pipeline_writer.save('./data/models/boton_housing_spark_model_pipeline')

In [9]:
from pyspark.ml import PipelineModel

# Reload the saved pipeline from disk and score the test split with it, to
# compare against the predictions of the in-memory pipeline.
saved_pipeline_path = "./data/models/boton_housing_spark_model_pipeline"
loaded_model_pipeline = PipelineModel.load(saved_pipeline_path)

reloaded_predictions = loaded_model_pipeline.transform(test)
reloaded_predictions.select('prediction').show()

+------------------+
|        prediction|
+------------------+
| 37.78720362816466|
|29.879873686483087|
|25.849931584306574|
|26.826383257489635|
| 31.27114848618972|
|26.124461013987293|
|26.244138502778384|
|29.863952431849345|
| 27.29795362946603|
| 37.38115182191164|
|23.959925082092433|
| 36.88136520458576|
| 32.46044591915444|
| 35.33705593923822|
|24.349457513192164|
| 25.98978696899479|
| 31.98961846899827|
|22.431738444828603|
| 28.57496931837754|
|21.645267684350816|
+------------------+
only showing top 20 rows



## On-board to Arthur

In [10]:
from arthurai import ArthurAI
from arthurai import ModelType, InputType, Stage, DataType, ArthurModel
from arthurai.client.apiv2.arthur_explainer import ArthurExplainer

In [11]:
import os

# Create the Arthur client. The access key is read from the ARTHUR_API_KEY
# environment variable so a real credential never has to be typed into (and
# saved with) the notebook. If the variable is not set, the original
# '<access_key>' placeholder is used, exactly as before.
client = ArthurAI(
    url='dashboard.arthur.ai',
    access_key=os.environ.get('ARTHUR_API_KEY', '<access_key>'),
)

In [12]:
# Build a pandas copy of the training split to register the model's inputs with Arthur.
# The `medv` target column is removed so only the pipeline input columns remain.
train_df = train.toPandas().drop(columns=['medv'])
train_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.00906,90.0,2.97,0,0.4,7.088,20.8,7.3073,1,285,15.3,394.72,7.85
2,0.01096,55.0,2.25,0,0.389,6.453,31.9,7.3073,1,300,15.3,394.72,8.23
3,0.01301,35.0,1.52,0,0.442,7.241,49.3,7.0379,1,284,15.5,394.74,5.49
4,0.01311,90.0,1.22,0,0.403,7.249,21.9,8.6966,5,226,17.9,395.93,4.81


In [13]:
# Descriptive metadata used to create the Arthur model object.
MODEL_METADATA = {
    "name": 'boston_housing_model_2',
    "description": "Spark Boston Housing Model",
    "input_type": InputType.Tabular,
    "model_type": ModelType.Regression,
    "tags": ['Spark'],
    "is_batch": True
}

model = client.model(**MODEL_METADATA)

# Register every column of the training frame as a pipeline-input attribute.
input_column_names = list(train_df.columns)
model.from_dataframe(train_df[input_column_names], Stage.ModelPipelineInput)

# Add `medv` by hand as a non-categorical float attribute in the ground-truth stage.
model.attribute(
    name='medv',
    stage=Stage.GroundTruth,
    data_type=DataType.Float,
    categorical=False,
    position=0,
)

In [14]:
# Review the attributes registered on the model so far (name, stage, data type,
# categorical flag, uniqueness) to ensure they were all inferred correctly.
model.review_model()

name               stage                     data_type         categorical  is_unique  
crim               Stage.ModelPipelineInput  DataType.Float    False        True       
zn                 Stage.ModelPipelineInput  DataType.Float    False        True       
indus              Stage.ModelPipelineInput  DataType.Float    False        True       
chas               Stage.ModelPipelineInput  DataType.Integer  True         False      
nox                Stage.ModelPipelineInput  DataType.Float    False        True       
rm                 Stage.ModelPipelineInput  DataType.Float    False        True       
age                Stage.ModelPipelineInput  DataType.Float    False        True       
dis                Stage.ModelPipelineInput  DataType.Float    False        True       
rad                Stage.ModelPipelineInput  DataType.Integer  True         False      
tax                Stage.ModelPipelineInput  DataType.Integer  False        True       
ptratio            Stage.ModelPi

In [15]:
# `chas` and `rad` were inferred as categorical; let's mark both as continuous
# instead, then review the attribute table again to confirm the change.
for attribute_name in ('chas', 'rad'):
    model.get_attribute(attribute_name, Stage.ModelPipelineInput).set(categorical=False)
model.review_model()

name               stage                     data_type         categorical  is_unique  
crim               Stage.ModelPipelineInput  DataType.Float    False        True       
zn                 Stage.ModelPipelineInput  DataType.Float    False        True       
indus              Stage.ModelPipelineInput  DataType.Float    False        True       
chas               Stage.ModelPipelineInput  DataType.Integer  False        False      
nox                Stage.ModelPipelineInput  DataType.Float    False        True       
rm                 Stage.ModelPipelineInput  DataType.Float    False        True       
age                Stage.ModelPipelineInput  DataType.Float    False        True       
dis                Stage.ModelPipelineInput  DataType.Float    False        True       
rad                Stage.ModelPipelineInput  DataType.Integer  False        False      
tax                Stage.ModelPipelineInput  DataType.Integer  False        True       
ptratio            Stage.ModelPi

In [17]:
# When using a Spark model, be sure to allocate at least 2 CPUs to the model server.
# This can be scaled up as you change the Spark session configuration in your
# entrypoint script.
explainability_options = {
    'df': train_df,
    'project_directory': '.',
    'user_predict_function_import_path': 'entrypoint',
    'requirements_file': 'requirements.txt',
    'model_server_num_cpu': '2',
}
model.enable_explainability(**explainability_options)

In [18]:
# Save the model to Arthur and display whatever the call returns
# (the recorded output looks like the new model's identifier).
save_result = model.save()
save_result

'1565847e-2673-4552-9d80-13197fd7da60'

## Send an inference batch:

In [19]:
# Let's make inferences on the test data and then send them to Arthur.
# First, rename the `medv` column so it becomes the ground-truth column,
# which frees the name `medv` for the model's prediction.
label_column, ground_truth_column = "medv", "medv_ground_truth"
test = test.withColumnRenamed(label_column, ground_truth_column)
test.show()

+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+-----------------+
|   crim|  zn|indus|chas|   nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|medv_ground_truth|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+-----------------+
|0.01381|80.0| 0.46|   0| 0.422|7.875|32.0|5.6484|  4|255|   14.4|394.23| 2.97|             50.0|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|             29.1|
| 0.0187|85.0| 4.15|   0| 0.429|6.516|27.7|8.5353|  4|351|   17.9|392.43| 6.36|             23.1|
|0.02498| 0.0| 1.89|   0| 0.518| 6.54|59.7|6.2669|  1|422|   15.9|389.96| 8.65|             16.5|
|0.02729| 0.0| 7.07|   0| 0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|             34.7|
|0.02875|28.0|15.04|   0| 0.464|6.211|28.9|3.6659|  4|270|   18.2|396.33| 6.21|             25.0|
|0.02899|40.0| 1.25|   0| 0.429|6.939|34.5|8.7921|  1|335|   19.7|389.85| 5.89|             26.6|
| 0.0315|95.0| 1.47|

In [20]:
# Collect the names of all attributes registered in the pipeline-input stage.
pipeline_input_attrs = model.get_attributes_for_stage(Stage.ModelPipelineInput)
pipeline_input_attr_names = [
    input_attr.as_dict()['name'] for input_attr in pipeline_input_attrs
]

In [21]:
# Score the test split with the reloaded pipeline and expose the prediction
# under the name `medv`.
scored_dataframe = loaded_model_pipeline.transform(test)
scored_dataframe = scored_dataframe.withColumnRenamed("prediction", "medv")

# Keep only the registered input attributes, the prediction and the ground truth.
columns_to_select = [*pipeline_input_attr_names, 'medv', 'medv_ground_truth']
predicted_dataframe = scored_dataframe.select(columns_to_select)
predicted_dataframe.show()

+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+------------------+-----------------+
|   crim|  zn|indus|chas|   nox|   rm| age|   dis|rad|tax|ptratio|     b|lstat|              medv|medv_ground_truth|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+------------------+-----------------+
|0.01381|80.0| 0.46|   0| 0.422|7.875|32.0|5.6484|  4|255|   14.4|394.23| 2.97| 37.78720362816466|             50.0|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|29.879873686483087|             29.1|
| 0.0187|85.0| 4.15|   0| 0.429|6.516|27.7|8.5353|  4|351|   17.9|392.43| 6.36|25.849931584306574|             23.1|
|0.02498| 0.0| 1.89|   0| 0.518| 6.54|59.7|6.2669|  1|422|   15.9|389.96| 8.65|26.826383257489635|             16.5|
|0.02729| 0.0| 7.07|   0| 0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03| 31.27114848618972|             34.7|
|0.02875|28.0|15.04|   0| 0.464|6.211|28.9|3.6659|  4|270|   18.

In [22]:
import numpy as np
import pandas as pd
import os

# Write the inferences dataframe to a parquet file inside the batch directory.
# The directory is created first because `to_parquet` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs("./data/batch_inference_files", exist_ok=True)
pd_df = predicted_dataframe.toPandas()
pd_df.to_parquet("./data/batch_inference_files/inferences.parquet")
pd_df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,medv_ground_truth
0,0.01381,80.0,0.46,0,0.422,7.875,32.0,5.6484,4,255,14.4,394.23,2.97,37.787204,50.0
1,0.01439,60.0,2.93,0,0.401,6.604,18.8,6.2196,1,265,15.6,376.7,4.38,29.879874,29.1
2,0.0187,85.0,4.15,0,0.429,6.516,27.7,8.5353,4,351,17.9,392.43,6.36,25.849932,23.1
3,0.02498,0.0,1.89,0,0.518,6.54,59.7,6.2669,1,422,15.9,389.96,8.65,26.826383,16.5
4,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,31.271148,34.7


In [23]:
# Send the contents of the batch directory (where the parquet file of
# inferences was written) to Arthur as a batch of inferences.
batch_directory = './data/batch_inference_files/'
model.send_batch_inferences(directory_path=batch_directory)