## Evaluate a machine learning model

In [0]:
#Evaluating regression models
from pyspark.ml.evaluation import RegressionEvaluator

# Inference predicted labels from validation data
predictions_df = model.transform(validation_df)

# Assume predictions_df includes a 'prediction' column with the predicted labels
# and a 'label' column with the actual known label values

# Use an evaluator to get metrics
evaluator = RegressionEvaluator()
evaluator.setPredictionCol("prediction")
mse = evaluator.evaluate(predictions_df, {evaluator.metricName: "mse"})
rmse = evaluator.evaluate(predictions_df, {evaluator.metricName: "rmse"})
r2 = evaluator.evaluate(predictions_df, {evaluator.metricName: "r2"})
print("MSE:", str(mse))
print("RMSE:", str(rmse))
print("R2", str(r2))

In [0]:
#Evaluating classification models

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Inference predicted labels from validation data
predictions_df = model.transform(validation_df)

# Assume predictions_df includes a 'prediction' column with the predicted labels
# and a 'label' column with the actual known label values

# Use an evaluator to get metrics
accuracy = evaluator.evaluate(predictions_df, {evaluator.metricName:"accuracy"})
print("Accuracy:", accuracy)

labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print ("Class %s" % (label))
    precision = evaluator.evaluate(predictions_df, {evaluator.metricLabel:label,
                                                    evaluator.metricName:"precisionByLabel"})
    print("\tPrecision:", precision)
    recall = evaluator.evaluate(predictions_df, {evaluator.metricLabel:label,
                                                 evaluator.metricName:"recallByLabel"})
    print("\tRecall:", recall)
    f1 = evaluator.evaluate(predictions_df, {evaluator.metricLabel:label,
                                             evaluator.metricName:"fMeasureByLabel"})
    print("\tF1 Score:", f1)
    
overallPrecision = evaluator.evaluate(predictions_df, {evaluator.metricName:"weightedPrecision"})
print("Overall Precision:", overallPrecision)
overallRecall = evaluator.evaluate(predictions_df, {evaluator.metricName:"weightedRecall"})
print("Overall Recall:", overallRecall)
overallF1 = evaluator.evaluate(predictions_df, {evaluator.metricName:"weightedFMeasure"})
print("Overall F1 Score:", overallF1)

In [0]:
# Evaluating unsupervised clustering models

# The Spark MLlib library provides the ClusteringEvaluator class, which computes the Silhouette for the predictions made by a clustering model as shown here:

from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors

# Inference predicted labels from validation data
predictions_df = model.transform(validation_df)

# Assume predictions_df includes a 'prediction' column with the predicted cluster

# Use an evaluator to get metrics
evaluator = ClusteringEvaluator(predictionCol="prediction")
silhouetteVal = evaluator.evaluate(predictions_df)
print(silhouetteVal)


# Exercise

## Ingest data

In [0]:
%sh
rm -r dbfs/ml_lab
mkdir dbfs/ml_lab
wget -O dbfs/ml_lab/penguins.csv https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv

--2025-09-21 07:28:28--  https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9533 (9.3K) [text/plain]
Saving to: ‘dbfs/ml_lab/penguins.csv’

     0K .........                                             100% 1.94M=0.005s

2025-09-21 07:28:28 (1.94 MB/s) - ‘dbfs/ml_lab/penguins.csv’ saved [9533/9533]



## Explore and clean up the data

The data itself consists of measurements of the following details of penguins that have been observed in Antarctica:

- Island: The island in Antarctica where the penguin was observed.
- CulmenLength: The length in mm of the penguin’s culmen (bill).
- CulmenDepth: The depth in mm of the penguin’s culmen.
- FlipperLength: The length in mm of the penguin’s flipper.
- BodyMass: The body mass of the penguin in grams.
- Species: An integer value that represents the species of the penguin:
0: Adelie
1: Gentoo
2: Chinstrap

Our goal in this project is to use the observed characteristics of a penguin (its features) in order to predict its species (which in machine learning terminology, we call the label).

In [0]:
df = spark.read.format("csv").option("header", "true").load("file:/Workspace/MicrosoftLearnings/dbfs/ml_lab/penguins.csv")
display(df.limit(9))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,,,,,0
Torgersen,36.7,19.3,193.0,3450.0,0
Torgersen,39.3,20.6,190.0,3650.0,0
Torgersen,38.9,17.8,181.0,3625.0,0
Torgersen,39.2,19.6,195.0,4675.0,0
Torgersen,34.1,18.1,193.0,3475.0,0


Run the following cell to remove the rows with incomplete data by using the dropna method, and to apply appropriate data types to the data by using the select method with the col and astype functions.

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
data = df.dropna().select(col("Island").astype("string"),
                           col("CulmenLength").astype("float"),
                          col("CulmenDepth").astype("float"),
                          col("FlipperLength").astype("float"),
                          col("BodyMass").astype("float"),
                          col("Species").astype("int")
                          )
display(data.limit(9))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,36.7,19.3,193.0,3450.0,0
Torgersen,39.3,20.6,190.0,3650.0,0
Torgersen,38.9,17.8,181.0,3625.0,0
Torgersen,39.2,19.6,195.0,4675.0,0
Torgersen,34.1,18.1,193.0,3475.0,0
Torgersen,42.0,20.2,190.0,4250.0,0


## Split the data

In [0]:
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

Training Rows: 219  Testing Rows: 123


## Perform feature engineering

### Encode categorical features - eg: Island Name in this case

In [0]:
display(train.limit(9))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Biscoe,35.0,17.9,190.0,3450.0,0
Biscoe,35.0,17.9,192.0,3725.0,0
Biscoe,35.3,18.9,187.0,3800.0,0
Biscoe,35.7,16.9,185.0,3150.0,0
Biscoe,35.9,19.2,189.0,3800.0,0
Biscoe,36.4,17.1,184.0,2850.0,0
Biscoe,36.5,16.6,181.0,2850.0,0
Biscoe,37.6,17.0,185.0,3600.0,0
Biscoe,37.6,19.1,194.0,3750.0,0


In [0]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Island", outputCol="IslandIdx")
indexedData = indexer.fit(train).transform(train).drop("Island")
display(indexedData.limit(9))

CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species,IslandIdx
35.0,17.9,190.0,3450.0,0,0.0
35.0,17.9,192.0,3725.0,0,0.0
35.3,18.9,187.0,3800.0,0,0.0
35.7,16.9,185.0,3150.0,0,0.0
35.9,19.2,189.0,3800.0,0,0.0
36.4,17.1,184.0,2850.0,0,0.0
36.5,16.6,181.0,2850.0,0,0.0
37.6,17.0,185.0,3600.0,0,0.0
37.6,19.1,194.0,3750.0,0,0.0


### Normalize (scale) numeric features

In [0]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Create a vector column containing all numeric features
numericFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
numericColVector = VectorAssembler(inputCols=numericFeatures, outputCol="numericFeatures")
vectorizedData = numericColVector.transform(indexedData)

# Use a MinMax scaler to normalize the numeric values in the vector
minMax = MinMaxScaler(inputCol = numericColVector.getOutputCol(), outputCol="normalizedFeatures")
scaledData = minMax.fit(vectorizedData).transform(vectorizedData)

# Display the data with numeric feature vectors (before and after scaling)
compareNumerics = scaledData.select("numericFeatures", "normalizedFeatures")
display(compareNumerics.limit(9))

"""
The numericFeatures column in the results contains a vector for each row. The vector includes four unscaled numeric values (the original measurements of the penguin). You can use the ▸ toggle to see the discrete values more clearly.

The normalizedFeatures column also contains a vector for each penguin observation, but this time the values in the vector are normalized to a relative scale based on the minimum and maximum values for each measurement.
"""



numericFeatures,normalizedFeatures
"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 190.0, 3450.0))","Map(vectorType -> dense, length -> 4, values -> List(0.07169817078788325, 0.5949366416252734, 0.3050847457627119, 0.20833333333333334))"
"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 192.0, 3725.0))","Map(vectorType -> dense, length -> 4, values -> List(0.07169817078788325, 0.5949366416252734, 0.3389830508474576, 0.2847222222222222))"
"Map(vectorType -> dense, length -> 4, values -> List(35.29999923706055, 18.899999618530273, 187.0, 3800.0))","Map(vectorType -> dense, length -> 4, values -> List(0.08301889671469634, 0.7215189109378106, 0.2542372881355932, 0.3055555555555556))"
"Map(vectorType -> dense, length -> 4, values -> List(35.70000076293945, 16.899999618530273, 185.0, 3150.0))","Map(vectorType -> dense, length -> 4, values -> List(0.09811329391767394, 0.4683543723127361, 0.22033898305084745, 0.125))"
"Map(vectorType -> dense, length -> 4, values -> List(35.900001525878906, 19.200000762939453, 189.0, 3800.0))","Map(vectorType -> dense, length -> 4, values -> List(0.10566049251916274, 0.7594937365934827, 0.288135593220339, 0.3055555555555556))"
"Map(vectorType -> dense, length -> 4, values -> List(36.400001525878906, 17.100000381469727, 184.0, 2850.0))","Map(vectorType -> dense, length -> 4, values -> List(0.12452841704746462, 0.49367092274985086, 0.2033898305084746, 0.041666666666666664))"
"Map(vectorType -> dense, length -> 4, values -> List(36.5, 16.600000381469727, 181.0, 2850.0))","Map(vectorType -> dense, length -> 4, values -> List(0.1283019443727889, 0.43037978809358224, 0.15254237288135594, 0.041666666666666664))"
"Map(vectorType -> dense, length -> 4, values -> List(37.599998474121094, 17.0, 185.0, 3600.0))","Map(vectorType -> dense, length -> 4, values -> List(0.16981132075471697, 0.48101264753129347, 0.22033898305084745, 0.25))"
"Map(vectorType -> dense, length -> 4, values -> List(37.599998474121094, 19.100000381469727, 194.0, 3750.0))","Map(vectorType -> dense, length -> 4, values -> List(0.16981132075471697, 0.7468354613749254, 0.3728813559322034, 0.2916666666666667))"


'\nThe numericFeatures column in the results contains a vector for each row. The vector includes four unscaled numeric values (the original measurements of the penguin). You can use the ▸ toggle to see the discrete values more clearly.\n\nThe normalizedFeatures column also contains a vector for each penguin observation, but this time the values in the vector are normalized to a relative scale based on the minimum and maximum values for each measurement.\n'

### Prepare features and labels for training

In [0]:
featVect = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="featuresVector")
preppedData = featVect.transform(scaledData)[col("featuresVector").alias("features"), col("Species").alias("label")]
display(preppedData.limit(9))

features,label
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.07169817078788325, 0.5949366416252734, 0.3050847457627119, 0.20833333333333334))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.07169817078788325, 0.5949366416252734, 0.3389830508474576, 0.2847222222222222))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.08301889671469634, 0.7215189109378106, 0.2542372881355932, 0.3055555555555556))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.09811329391767394, 0.4683543723127361, 0.22033898305084745, 0.125))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10566049251916274, 0.7594937365934827, 0.288135593220339, 0.3055555555555556))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.12452841704746462, 0.49367092274985086, 0.2033898305084746, 0.041666666666666664))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.1283019443727889, 0.43037978809358224, 0.15254237288135594, 0.041666666666666664))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.16981132075471697, 0.48101264753129347, 0.22033898305084745, 0.25))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.16981132075471697, 0.7468354613749254, 0.3728813559322034, 0.2916666666666667))",0


## Train a machine learning model

Now that the training data is prepared, you can use it to train a model. Models are trained using an algorithm that tries to establish a relationship between the features and labels. Since in this case you want to train a model that predicts a category of class , you need to use a classification algorithm. There are many algorithms for classification - let’s start with a well-establish one: logistic regression, which iteratively attempts to find the optimal coefficients that can be applied to the features data in a logistic calculation that predicts the probability for each class label value. To train the model, you will fit the logistic regression algorithm to the training data.

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3)
model = lr.fit(preppedData)
print ("Model trained!")

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Model trained!


### Test the model

You need to perform the same feature engineering transformations to the test data as you applied to the training data (in this case, encode the island name and normalize the measurements). Then, you can use the model to predict labels for the features in the test data and compare the predicted labels to the actual known labels.

In [0]:
# Prepare the test data
indexedTestData = indexer.fit(test).transform(test).drop("Island")
vectorizedTestData = numericColVector.transform(indexedTestData)
scaledTestData = minMax.fit(vectorizedTestData).transform(vectorizedTestData)
preppedTestData = featVect.transform(scaledTestData)[col("featuresVector").alias("features"), col("Species").alias("label")]
   
# Get predictions
prediction = model.transform(preppedTestData)
predicted = prediction.select("features", "probability", col("prediction").astype("Int"), col("label").alias("trueLabel"))
display(predicted.limit(9))

features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.10126588390798179, 0.5952381222696814, 0.23214285714285712, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(0.8294058579580259, 0.038026612709396015, 0.13256752933257793))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.1434599753766933, 0.3690476812202672, 0.375, 0.14516129032258066))","Map(vectorType -> dense, length -> 3, values -> List(0.7236973028229982, 0.1263491713921393, 0.14995352578486257))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.24050635356319272, 0.6190475109212744, 0.0, 0.16129032258064516))","Map(vectorType -> dense, length -> 3, values -> List(0.7795747520120259, 0.039322401425116504, 0.18110284656285766))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.25316454881226913, 0.40476187773031863, 0.42857142857142855, 0.29838709677419356))","Map(vectorType -> dense, length -> 3, values -> List(0.601306138564913, 0.20571754327737657, 0.19297631815771044))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.27426159454662485, 0.4880953056742035, 0.4464285714285714, 0.27419354838709675))","Map(vectorType -> dense, length -> 3, values -> List(0.606325134177877, 0.17895352859875285, 0.21472133722337003))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.2911392955171836, 0.5238095021842549, 0.21428571428571427, 0.20967741935483872))","Map(vectorType -> dense, length -> 3, values -> List(0.6781942736556413, 0.0954020053245875, 0.2264037210197712))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.31645568601533636, 0.5476191179011717, 0.21428571428571427, 0.1935483870967742))","Map(vectorType -> dense, length -> 3, values -> List(0.6610112799404089, 0.09180046890129213, 0.24718825115829887))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.3544304327202509, 0.5714285065527647, 0.23214285714285712, 0.0967741935483871))","Map(vectorType -> dense, length -> 3, values -> List(0.6276471031174391, 0.08030532299218829, 0.2920475738903726))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(1.0, 0.3586497774840479, 0.6785713231482425, 0.33928571428571425, 0.2903225806451613))","Map(vectorType -> dense, length -> 3, values -> List(0.6092332064208497, 0.11349471375481081, 0.27727207982433943))",0,0


The results include the following columns:

features: The prepared features data from the test dataset.

probability: The probability calculated by the model for each class. This consists of a vector containing three probability values (because there are three classes) which add up to a total of 1.0 (its assumed that there’s a 100% probability that the penguin belongs to one of the three species classes).

prediction: The predicted class label (the one with the highest probability).

trueLabel: The actual known label value from the test data.

To evaluate the effectiveness of the model, you could simply compare the predicted and true labels in these results. However, you can get more meaningful metrics by using a model evaluator - in this case, a multiclass (because there are multiple possible class labels) classification evaluator.

Use the following code to get evaluation metrics for a classification model based on the results from the test data:

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
   
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
   
# Simple accuracy
accuracy = evaluator.evaluate(prediction, {evaluator.metricName:"accuracy"})
print("Accuracy:", accuracy)
   
# Individual class metrics
labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print ("Class %s" % (label))
   
    # Precision
    precision = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                                evaluator.metricName:"precisionByLabel"})
    print("\tPrecision:", precision)
   
    # Recall
    recall = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                             evaluator.metricName:"recallByLabel"})
    print("\tRecall:", recall)
   
    # F1 score
    f1 = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                         evaluator.metricName:"fMeasureByLabel"})
    print("\tF1 Score:", f1)
   
# Weighted (overall) metrics
overallPrecision = evaluator.evaluate(prediction, {evaluator.metricName:"weightedPrecision"})
print("Overall Precision:", overallPrecision)
overallRecall = evaluator.evaluate(prediction, {evaluator.metricName:"weightedRecall"})
print("Overall Recall:", overallRecall)
overallF1 = evaluator.evaluate(prediction, {evaluator.metricName:"weightedFMeasure"})
print("Overall F1 Score:", overallF1)

Accuracy: 0.943089430894309

Individual class metrics:
Class 0
	Precision: 0.9538461538461539
	Recall: 0.96875
	F1 Score: 0.9612403100775193
Class 1
	Precision: 0.9459459459459459
	Recall: 1.0
	F1 Score: 0.9722222222222222
Class 2
	Precision: 0.9047619047619048
	Recall: 0.7916666666666666
	F1 Score: 0.8444444444444444
Overall Precision: 0.94202071275242
Overall Recall: 0.943089430894309
Overall F1 Score: 0.9415758072309405


The evaluation metrics that are calculated for multiclass classification include:

- Accuracy: The proportion of overall predictions that were correct.
- Per-class metrics:
  - Precision: The proportion of predictions of this class that were correct.
  - Recall: The proportion of actual instances of this class that were correctly predicted.
  - F1 score: A combined metric for precision and recall
- Combined (weighted) precision, recall, and F1 metrics for all classes.

## Use a pipeline

You trained your model by performing the required feature engineering steps and then fitting an algorithm to the data. To use the model with some test data to generate predictions (referred to as inferencing), you had to apply the same feature engineering steps to the test data. 

A more efficient way to build and use models is to encapsulate the transformers used to prepare the data and the model used to train it in a pipeline.

### Use the following code to create a pipeline that 
### encapsulates the data preparation and model training steps:

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
   
catFeature = "Island"
numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
   
# Define the feature engineering and model training algorithm steps
catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
numScaler = MinMaxScaler(inputCol = numVector.getOutputCol(), outputCol="normalizedFeatures")
featureVector = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="Features")
algo = LogisticRegression(labelCol="Species", featuresCol="Features", maxIter=10, regParam=0.3)
   
# Chain the steps as stages in a pipeline
pipeline = Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, algo])
   
# Use the pipeline to prepare data and fit the model algorithm
model = pipeline.fit(train)
print ("Model trained!")

Downloading artifacts:   0%|          | 0/45 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Model trained!


### Use the following code to apply the pipeline to the test data:



In [0]:
prediction = model.transform(test)
predicted = prediction.select("Features", "probability", col("prediction").astype("Int"), col("Species").alias("trueLabel"))
display(predicted.limit(9))

Features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.05283024625958137, 0.6202531920623882, 0.2542372881355932, 0.05555555555555555))","Map(vectorType -> dense, length -> 3, values -> List(0.8352270587863826, 0.06458507412353533, 0.10018786709008196))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.09056609531618513, 0.37974692865587095, 0.3898305084745763, 0.18055555555555555))","Map(vectorType -> dense, length -> 3, values -> List(0.6952581926243824, 0.19791951109897707, 0.10682229627664055))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.17735851935620578, 0.6455695010629846, 0.03389830508474576, 0.19444444444444445))","Map(vectorType -> dense, length -> 3, values -> List(0.8023228784136749, 0.06413523113393665, 0.1335418904523884))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.18867924528301885, 0.41772151287502485, 0.4406779661016949, 0.3125))","Map(vectorType -> dense, length -> 3, values -> List(0.5752420607128577, 0.2936451474692828, 0.1311127918178596))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.20754716981132074, 0.5063291979684083, 0.4576271186440678, 0.2916666666666667))","Map(vectorType -> dense, length -> 3, values -> List(0.5946825189574656, 0.2576849905543497, 0.14763249048818478))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.22264156701429835, 0.5443037821875621, 0.23728813559322035, 0.2361111111111111))","Map(vectorType -> dense, length -> 3, values -> List(0.6911358111445404, 0.1478564325835306, 0.16100775627192898))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.24528301886792453, 0.5696203326246769, 0.23728813559322035, 0.2222222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.6816041754642713, 0.14283495209084515, 0.17556087244488358))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.279245340599204, 0.5949366416252734, 0.2542372881355932, 0.1388888888888889))","Map(vectorType -> dense, length -> 3, values -> List(0.6635413661071513, 0.12900274364052758, 0.20745589025232106))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2830188679245283, 0.7088606357192532, 0.3559322033898305, 0.3055555555555556))","Map(vectorType -> dense, length -> 3, values -> List(0.6377639044336078, 0.16612188030888317, 0.19611421525750913))",0,0


# Run the following code to create a pipeline that uses a 
# Decision tree algorithm:

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import DecisionTreeClassifier
   
catFeature = "Island"
numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
   
# Define the feature engineering and model steps
catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
numScaler = MinMaxScaler(inputCol = numVector.getOutputCol(), outputCol="normalizedFeatures")
featureVector = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="Features")
algo = DecisionTreeClassifier(labelCol="Species", featuresCol="Features", maxDepth=10)
   
# Chain the steps as stages in a pipeline
pipeline = Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, algo])
   
# Use the pipeline to prepare data and fit the model algorithm
model = pipeline.fit(train)
print ("Model trained! using DecisionTreeClassifier")

Downloading artifacts:   0%|          | 0/45 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Model trained! using DecisionTreeClassifier


### Run the following code to use the new pipeline using Decision tree algorithm, with the test data:

In [0]:
# Get predictions
prediction = model.transform(test)
predicted = prediction.select("Features", "probability", col("prediction").astype("Int"), col("Species").alias("trueLabel"))
   
# Generate evaluation metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
   
evaluator = MulticlassClassificationEvaluator(labelCol="Species", predictionCol="prediction")
   
# Simple accuracy
accuracy = evaluator.evaluate(prediction, {evaluator.metricName:"accuracy"})
print("Accuracy:", accuracy)
   
# Class metrics
labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print ("Class %s" % (label))
   
    # Precision
    precision = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                                    evaluator.metricName:"precisionByLabel"})
    print("\tPrecision:", precision)
   
    # Recall
    recall = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                             evaluator.metricName:"recallByLabel"})
    print("\tRecall:", recall)
   
    # F1 score
    f1 = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                         evaluator.metricName:"fMeasureByLabel"})
    print("\tF1 Score:", f1)
   
# Weighed (overall) metrics
overallPrecision = evaluator.evaluate(prediction, {evaluator.metricName:"weightedPrecision"})
print("Overall Precision:", overallPrecision)
overallRecall = evaluator.evaluate(prediction, {evaluator.metricName:"weightedRecall"})
print("Overall Recall:", overallRecall)
overallF1 = evaluator.evaluate(prediction, {evaluator.metricName:"weightedFMeasure"})
print("Overall F1 Score:", overallF1)

Accuracy: 0.9512195121951219

Individual class metrics:
Class 0
	Precision: 1.0
	Recall: 0.90625
	F1 Score: 0.9508196721311475
Class 1
	Precision: 0.9722222222222222
	Recall: 1.0
	F1 Score: 0.9859154929577464
Class 2
	Precision: 0.8275862068965517
	Recall: 1.0
	F1 Score: 0.9056603773584906
Overall Precision: 0.9584540385633742
Overall Recall: 0.9512195121951219
Overall F1 Score: 0.9519947181017752


# Save the model

In [0]:
model.save("/models/penguin.model")

In [0]:
model.save("/Workspace/MicrosoftLearnings/models/penguin.model")

In [0]:
%sh

pwd
cd models
pwd
ls -l

/Workspace/MicrosoftLearnings
/Workspace/MicrosoftLearnings/models
total 0


In [0]:
%python
%fs ls /Workspace/MicrosoftLearnings/models/

path,name,size,modificationTime
dbfs:/Workspace/MicrosoftLearnings/models/penguin.model/,penguin.model/,0,1758453247000


Now, when you’ve been out and spotted a new penguin, you can load the saved model and use it to predict the penguin’s species based on your measurements of its features. Using a model to generate predictions from new data is called inferencing.

Run the following code to load the model and use it to predict the species for a new penguin observation:

In [0]:
from pyspark.ml.pipeline import PipelineModel

persistedModel = PipelineModel.load("/models/penguin.model")
   
newData = spark.createDataFrame ([{"Island": "Biscoe",
                                  "CulmenLength": 47.6,
                                  "CulmenDepth": 14.5,
                                  "FlipperLength": 215,
                                  "BodyMass": 5400}])


predictions = persistedModel.transform(newData)
display(predictions.select("Island", "CulmenDepth", "CulmenLength", "FlipperLength", "BodyMass", col("prediction").alias("PredictedSpecies")))

Island,CulmenDepth,CulmenLength,FlipperLength,BodyMass,PredictedSpecies
Biscoe,14.5,47.6,215,5400,1.0
