In [0]:
# Show the lab title in the cell output.
print("Manage machine learning in production with Azure Databricks")

Manage machine learning in production with Azure Databricks


# Provision an Azure Databricks workspace
In the Azure portal (https://portal.azure.com), open Cloud Shell and choose a PowerShell session.

## Remove any existing copy of the repository and download the setup files
-      rm -r mslearn-databricks -f
-      git clone https://github.com/MicrosoftLearning/mslearn-databricks

## Set up an Azure Databricks workspace
-      ./mslearn-databricks/setup.ps1
-      ./mslearn-databricks/setup.ps1 eastus

In [0]:
 %sh
 # Recreate the lab folder from scratch and download the penguins dataset into it.
 LAB_DIR=/Workspace/MicrosoftLearnings/dbfs/ml_lab
 # -f: do not report an error on the first run, when the folder does not exist yet.
 rm -rf "$LAB_DIR"
 # -p: also create any missing parent folders.
 mkdir -p "$LAB_DIR"
 wget -O "$LAB_DIR/penguins.csv" https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv

The next cell loads the downloaded file and prepares the data:

- Remove any incomplete rows.
- Apply appropriate data types.
- View a random sample of the data.
- Split the data into two datasets: one for training, and another for testing.

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
# Seed shared by the random operations below, so the preview sample and the
# train/test split (and therefore every metric computed later) are the same
# each time this cell is run.
SEED = 42

# Load the CSV with a header row; no schema is supplied, so the columns are
# cast to explicit types below.
raw = spark.read.format("csv").option("header", "true").load("file:/Workspace/MicrosoftLearnings/dbfs/ml_lab/penguins.csv")

data = (
    raw.dropna()  # discard rows with missing values
    .select(
        col("Island").astype("string"),
        col("CulmenLength").astype("float"),
        col("CulmenDepth").astype("float"),
        col("FlipperLength").astype("float"),
        col("BodyMass").astype("float"),
        col("Species").astype("int"),
    )
    # A value that cannot be parsed may become null when cast, so drop
    # incomplete rows once more after the casts.
    .dropna()
)

# Preview a few randomly chosen rows.
display(data.sample(fraction=0.2, seed=SEED).limit(9))

# 70% of the rows for training, 30% held back for testing.
splits = data.randomSplit([0.7, 0.3], seed=SEED)
train = splits[0]
test = splits[1]
print("Training Rows:", train.count(), " Testing Rows:", test.count())

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,36.6,17.8,185.0,3700.0,0
Torgersen,46.0,21.5,194.0,4200.0,0
Biscoe,40.5,17.9,187.0,3200.0,0
Dream,40.9,18.9,184.0,3900.0,0
Dream,36.4,17.0,195.0,3325.0,0
Dream,39.6,18.8,190.0,4600.0,0
Dream,36.0,17.9,190.0,3450.0,0
Biscoe,34.5,18.1,187.0,2900.0,0


Training Rows: 232  Testing Rows: 110


# Run a pipeline to preprocess the data and train an ML model
Create a pipeline that encapsulates the data preparation and model training steps.

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression

# Input columns: one categorical feature and four numeric measurements.
catFeature = "Island"
numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]

# Feature engineering stages. Each stage reads the output column of the stage
# it depends on, so the column names are written only once.
catIndexer = StringIndexer(inputCol=catFeature, outputCol=f"{catFeature}Idx")
numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
numScaler = MinMaxScaler(
    inputCol=numVector.getOutputCol(),
    outputCol="normalizedFeatures",
)
featureVector = VectorAssembler(
    inputCols=[catIndexer.getOutputCol(), numScaler.getOutputCol()],
    outputCol="Features",
)

# Classifier trained on the assembled feature vector to predict the species.
algo = LogisticRegression(
    labelCol="Species",
    featuresCol=featureVector.getOutputCol(),
    maxIter=10,
    regParam=0.3,
)

# Stages run in list order when the pipeline is fitted or applied.
stages = [catIndexer, numVector, numScaler, featureVector, algo]
pipeline = Pipeline(stages=stages)

# Fit every stage on the training data; the result is the trained model.
model = pipeline.fit(train)
print("Model trained!")

Downloading artifacts:   0%|          | 0/45 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Model trained!


Since the feature engineering steps are now encapsulated in the model trained by the pipeline, you can use the model with the test data without needing to apply each transformation (they’ll be applied automatically by the model).

In [0]:
# Score the test data with the trained pipeline, then report evaluation metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

prediction = model.transform(test)
predicted = prediction.select(
    "Features",
    "probability",
    col("prediction").astype("Int"),
    col("Species").alias("trueLabel"),
)
display(predicted.limit(9))

# A single evaluator is reused; the metric is chosen per call through the
# parameter map passed to evaluate().
evaluator = MulticlassClassificationEvaluator(labelCol="Species", predictionCol="prediction")

# Overall accuracy
accuracy = evaluator.evaluate(prediction, {evaluator.metricName: "accuracy"})
print("Accuracy:", accuracy)

# Per-class precision, recall and F1
labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print("Class %s" % (label))
    # Restrict the by-label metrics below to the current class.
    forLabel = {evaluator.metricLabel: label}

    precision = evaluator.evaluate(prediction, {**forLabel, evaluator.metricName: "precisionByLabel"})
    print("\tPrecision:", precision)

    recall = evaluator.evaluate(prediction, {**forLabel, evaluator.metricName: "recallByLabel"})
    print("\tRecall:", recall)

    f1 = evaluator.evaluate(prediction, {**forLabel, evaluator.metricName: "fMeasureByLabel"})
    print("\tF1 Score:", f1)

# Weighted (overall) metrics, evaluated in the same order as they are printed
overallPrecision, overallRecall, overallF1 = [
    evaluator.evaluate(prediction, {evaluator.metricName: metric})
    for metric in ("weightedPrecision", "weightedRecall", "weightedFMeasure")
]
print("Overall Precision:", overallPrecision)
print("Overall Recall:", overallRecall)
print("Overall F1 Score:", overallF1)

Features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.11196916428677685, 0.5662650242480307, 0.3050847457627119, 0.17391304347826086))","Map(vectorType -> dense, length -> 3, values -> List(0.8132791379729964, 0.08841165912898039, 0.09830920289802322))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.13899621917864713, 0.44578309930587445, 0.22033898305084745, 0.08695652173913043))","Map(vectorType -> dense, length -> 3, values -> List(0.7979285681723355, 0.0911639776714057, 0.1109074541562589))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2239383285735537, 0.6506024636279539, 0.3559322033898305, 0.021739130434782608))","Map(vectorType -> dense, length -> 3, values -> List(0.7587373093298683, 0.07965878354444202, 0.16160390712568964))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23552131001704094, 0.5903615011568758, 0.22033898305084745, 0.3188405797101449))","Map(vectorType -> dense, length -> 3, values -> List(0.7464927703180938, 0.11808419061109462, 0.13542303907081163))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23552131001704094, 0.8192771125867657, 0.3050847457627119, 0.30434782608695654))","Map(vectorType -> dense, length -> 3, values -> List(0.7793882120251684, 0.07901757557520417, 0.14159421239962738))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.26640930962820697, 0.5180723002313752, 0.23728813559322035, 0.20289855072463767))","Map(vectorType -> dense, length -> 3, values -> List(0.7120211413673391, 0.12748277120140786, 0.16049608743125304))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.28957527251518145, 0.5421687771402204, 0.23728813559322035, 0.18840579710144928))","Map(vectorType -> dense, length -> 3, values -> List(0.701795379099593, 0.12316419338682807, 0.1750404275135789))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.28957527251518145, 0.9036145519666889, 0.3220338983050847, 0.30434782608695654))","Map(vectorType -> dense, length -> 3, values -> List(0.7540350409831782, 0.07485546990712295, 0.17110948910969895))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2934363645200772, 0.5421687771402204, 0.3559322033898305, 0.10144927536231883))","Map(vectorType -> dense, length -> 3, values -> List(0.6732003750792119, 0.13610095268767874, 0.19069867223310918))",0,0


Accuracy: 0.9545454545454546

Individual class metrics:
Class 0
	Precision: 0.9074074074074074
	Recall: 1.0
	F1 Score: 0.9514563106796117
Class 1
	Precision: 1.0
	Recall: 1.0
	F1 Score: 1.0
Class 2
	Precision: 1.0
	Recall: 0.7619047619047619
	F1 Score: 0.8648648648648648
Overall Precision: 0.9587542087542087
Overall Recall: 0.9545454545454545
Overall F1 Score: 0.952577467140574


# Register and deploy the model


You’ve already logged the model trained by each experiment run when you ran the pipeline. You can also register models and deploy them so they can be served to client applications.