In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Load the penguin measurements. No schema is supplied and inferSchema is not
# enabled, so every column is read as a string and cast explicitly below.
data = spark.read.format("csv").option("header", "true").load("file:/Workspace/Users/anshu.india@outlook.com/databricks/data/penguins.csv")

# Drop incomplete rows, cast each column to its modelling type, then drop
# incomplete rows again: a string that cannot be parsed as a number may be cast
# to null (non-ANSI SQL mode) instead of raising, and such nulls would otherwise
# flow into the feature pipeline downstream.
data = (
    data.dropna()
    .select(
        col("Island").astype("string"),
        col("CulmenLength").astype("float"),
        col("CulmenDepth").astype("float"),
        col("FlipperLength").astype("float"),
        col("BodyMass").astype("float"),
        col("Species").astype("int"),
    )
    .dropna()
)

# Preview roughly 20% of the cleaned rows.
display(data.sample(0.2))

# 70/30 train/test split. The fixed seed makes the split (and therefore the
# row counts and every metric computed from it) repeatable across re-runs.
splits = data.randomSplit([0.7, 0.3], seed=42)
train = splits[0]
test = splits[1]
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,42.0,20.2,190.0,4250.0,0
Torgersen,38.6,21.2,191.0,3800.0,0
Torgersen,36.6,17.8,185.0,3700.0,0
Torgersen,42.5,20.7,197.0,4500.0,0
Dream,40.9,18.9,184.0,3900.0,0
Dream,38.8,20.0,190.0,3950.0,0
Dream,42.2,18.5,180.0,3550.0,0
Dream,40.8,18.4,195.0,3900.0,0
Biscoe,42.0,19.5,200.0,4050.0,0
Biscoe,41.4,18.6,191.0,3700.0,0


Training Rows: 233  Testing Rows: 98


In [0]:
import mlflow
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

# Record this training run (parameters, metrics and the fitted pipeline) with MLflow
with mlflow.start_run():
    # Hyperparameters for the classifier
    maxIterations = 5
    regularization = 0.5

    # Columns used as model inputs
    catFeature = "Island"
    numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]

    # Feature engineering: index the categorical column, assemble and min-max
    # scale the numeric columns, then combine both into a single feature vector
    catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
    numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
    numScaler = MinMaxScaler(inputCol="numericFeatures", outputCol="normalizedFeatures")
    featureVector = VectorAssembler(
        inputCols=[catIndexer.getOutputCol(), numScaler.getOutputCol()],
        outputCol="Features",
    )
    algo = LogisticRegression(
        labelCol="Species",
        featuresCol="Features",
        maxIter=maxIterations,
        regParam=regularization,
    )

    # Assemble the stages in the order they must execute
    pipeline = Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, algo])

    # Log the hyperparameter values held by the estimator, then fit on the training split
    print ("Training Logistic Regression model...")
    for paramName, paramValue in (("maxIter", algo.getMaxIter()), ("regParam", algo.getRegParam())):
        mlflow.log_param(paramName, paramValue)
    model = pipeline.fit(train)

    # Score the held-out split and log one MLflow metric per evaluation measure
    prediction = model.transform(test)
    metrics = ["accuracy", "weightedRecall", "weightedPrecision"]
    for metric in metrics:
        evaluator = MulticlassClassificationEvaluator(
            labelCol="Species",
            predictionCol="prediction",
            metricName=metric,
        )
        metricValue = evaluator.evaluate(prediction)
        print("%s: %s" % (metric, metricValue))
        mlflow.log_metric(metric, metricValue)

    # Persist the fitted pipeline twice: as a run artifact and under /model/<name>.
    # The timestamp suffix keeps each run's model name distinct.
    unique_model_name = "classifier-" + str(time.time())
    mlflow.spark.log_model(model, unique_model_name, mlflow.spark.get_default_conda_env())
    modelpath = "/model/%s" % (unique_model_name)
    mlflow.spark.save_model(model, modelpath)

    print("Experiment run complete.")

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-1496208477121782>, line 1[0m
[0;32m----> 1[0m [38;5;28;01mimport[39;00m [38;5;21;01mmlflow[39;00m
[1;32m      2[0m [38;5;28;01mimport[39;00m [38;5;21;01mmlflow[39;00m[38;5;21;01m.[39;00m[38;5;21;01mspark[39;00m
[1;32m      3[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01mml[39;00m [38;5;28;01mimport[39;00m Pipeline

[0;31mModuleNotFoundError[0m: No module named 'mlflow'

In [0]:
def train_penguin_model(training_data, test_data, maxIterations, regularization):
    """Fit, evaluate and record a penguin species classifier in one MLflow run.

    A Spark ML pipeline is built (string-index ``Island``, assemble and min-max
    scale the four numeric measurement columns, combine both into ``Features``,
    then logistic regression on ``Species``). The pipeline is fitted on
    ``training_data`` and used to transform ``test_data``. Hyperparameters and
    the accuracy / weightedRecall / weightedPrecision metrics are printed and
    logged to MLflow, and the fitted pipeline is both logged to the run and
    saved under ``/model/classifier-<timestamp>``.

    Args:
        training_data: DataFrame passed to ``Pipeline.fit``.
        test_data: DataFrame passed to the fitted model's ``transform``.
        maxIterations: Forwarded to ``LogisticRegression`` as ``maxIter``.
        regularization: Forwarded to ``LogisticRegression`` as ``regParam``.

    Returns:
        None.
    """
    import time

    import mlflow
    import mlflow.spark
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.feature import MinMaxScaler, StringIndexer, VectorAssembler

    # Column roles
    label_col = "Species"
    category_col = "Island"
    measurement_cols = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]

    with mlflow.start_run():
        # Feature engineering stages followed by the classifier
        island_indexer = StringIndexer(inputCol=category_col, outputCol=category_col + "Idx")
        measurement_assembler = VectorAssembler(inputCols=measurement_cols, outputCol="numericFeatures")
        measurement_scaler = MinMaxScaler(
            inputCol=measurement_assembler.getOutputCol(),
            outputCol="normalizedFeatures",
        )
        feature_assembler = VectorAssembler(
            inputCols=["IslandIdx", "normalizedFeatures"],
            outputCol="Features",
        )
        classifier = LogisticRegression(
            labelCol=label_col,
            featuresCol="Features",
            maxIter=maxIterations,
            regParam=regularization,
        )
        stages = [
            island_indexer,
            measurement_assembler,
            measurement_scaler,
            feature_assembler,
            classifier,
        ]
        training_pipeline = Pipeline(stages=stages)

        # Log the hyperparameter values held by the estimator, then fit
        print("Training Logistic Regression model...")
        mlflow.log_param('maxIter', classifier.getMaxIter())
        mlflow.log_param('regParam', classifier.getRegParam())
        fitted_pipeline = training_pipeline.fit(training_data)

        # Score the test data and log one MLflow metric per evaluation measure
        scored = fitted_pipeline.transform(test_data)
        for metric_name in ("accuracy", "weightedRecall", "weightedPrecision"):
            scorer = MulticlassClassificationEvaluator(
                labelCol=label_col,
                predictionCol="prediction",
                metricName=metric_name,
            )
            metric_value = scorer.evaluate(scored)
            print("%s: %s" % (metric_name, metric_value))
            mlflow.log_metric(metric_name, metric_value)

        # Persist the fitted pipeline: once as a run artifact, once under /model/.
        # The timestamp suffix keeps each run's model name distinct.
        run_model_name = "classifier-" + str(time.time())
        mlflow.spark.log_model(fitted_pipeline, run_model_name, mlflow.spark.get_default_conda_env())
        saved_model_path = "/model/%s" % (run_model_name)
        mlflow.spark.save_model(fitted_pipeline, saved_model_path)

        print("Experiment run complete.")

In [0]:
# Run one tracked experiment: 10 iterations, regularization parameter 0.2
train_penguin_model(training_data=train, test_data=test, maxIterations=10, regularization=0.2)

[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-1496208477121784>, line 1[0m
[0;32m----> 1[0m [43mtrain_penguin_model[49m[43m([49m[43mtrain[49m[43m,[49m[43m [49m[43mtest[49m[43m,[49m[43m [49m[38;5;241;43m10[39;49m[43m,[49m[43m [49m[38;5;241;43m0.2[39;49m[43m)[49m

File [0;32m<command-1496208477121783>, line 2[0m, in [0;36mtrain_penguin_model[0;34m(training_data, test_data, maxIterations, regularization)[0m
[1;32m      1[0m [38;5;28;01mdef[39;00m [38;5;21mtrain_penguin_model[39m(training_data, test_data, maxIterations, regularization):
[0;32m----> 2[0m    [38;5;28;01mimport[39;00m [38;5;21;01mmlflow[39;00m
[1;32m      3[0m    [38;5;28;01mimport[39;00m [38;5;21;01mmlflow[39;00m[38;5;21;01m.[39;00m[38;5;21;01mspark[39;00m
[1;32m      4[0m    [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark