In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
import mlflow
import mlflow.spark
import sys
import time
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Obtain (or reuse) the Spark session for this job
spark = SparkSession.builder.appName("mlflow_example").getOrCreate()

# Location of the semicolon-separated input CSV
filename = "/FileStore/tables/bank_full.csv"

target_variable_name = "y"
from pyspark.sql import functions as F

# Read the file, derive a 0/1 'label' column from 'y' ('yes' -> 1, anything
# else -> 0), then discard the original 'y' column.
df = (
    spark.read.csv(filename, header=True, inferSchema=True, sep=';')
    .withColumn('label', F.when(F.col("y") == 'yes', 1).otherwise(0))
    .drop('y')
)

# 70/30 split with a fixed seed; note the split frames still carry every
# column that df has at this point.
train, test = df.randomSplit([0.7, 0.3], seed=12345)

# Print the name of every column whose inferred type is not 'string'
for k, v in df.dtypes:
    if v != 'string':
        print(k)

# Narrow df to the non-string columns (this includes 'label')
df = df.select('age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'label')


from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

#assemble individual columns to one column - 'features'
def assemble_vectors(df, features_list, target_variable_name):
    """Pack the feature columns of ``df`` into a single 'features' vector column.

    Args:
        df: DataFrame containing every column named in ``features_list`` as
            well as the ``target_variable_name`` column.
        features_list: Names of the input columns given to ``VectorAssembler``.
        target_variable_name: Name of the target column to carry through.

    Returns:
        The transformed DataFrame restricted to exactly two columns, in this
        order: ``target_variable_name`` and 'features'.
    """
    assembler = VectorAssembler(inputCols=features_list, outputCol='features')
    # Only the target and the assembled vector are kept; the individual
    # feature columns are not part of the result.
    selectedCols = [target_variable_name, 'features']
    # The assembler runs as a one-stage Pipeline that is fitted on df and then
    # applied to that same df.
    pipeline = Pipeline(stages=[assembler])
    assembleModel = pipeline.fit(df)
    return assembleModel.transform(df).select(selectedCols)



age
balance
day
duration
campaign
pdays
previous
label


In [0]:
# Model inputs: every column of df other than the target column 'label'
#features_list = char_vars #this option is used only for ChiSqselector
features_list = list(df.columns)
features_list.remove('label')



In [0]:
# Build the (label, features) frame for the training split
assembled_train_df = assemble_vectors(
    df=train,
    features_list=features_list,
    target_variable_name='label',
)


Downloading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Build the (label, features) frame for the test split
assembled_test_df = assemble_vectors(
    df=test,
    features_list=features_list,
    target_variable_name='label',
)


Downloading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Show the first job argument; report its absence instead of raising
# IndexError when the notebook is run without arguments.
print(sys.argv[1] if len(sys.argv) > 1 else "sys.argv[1] not provided")

In [0]:
# Show the second job argument; report its absence instead of raising
# IndexError when fewer arguments were supplied.
print(sys.argv[2] if len(sys.argv) > 2 else "sys.argv[2] not provided")

In [0]:

# Show the third job argument; report its absence instead of raising
# IndexError when fewer arguments were supplied.
print(sys.argv[3] if len(sys.argv) > 3 else "sys.argv[3] not provided")

In [0]:
# The raw 'y' column is dropped right after 'label' is derived from it, and the
# train/test split is taken afterwards, so `test` only carries 'label'.
# Passing target_variable_name ("y") here would make the final select inside
# assemble_vectors reference a column that no longer exists.
assembled_test_df = assemble_vectors(test, features_list, 'label')

In [0]:
# Point MLflow at the Databricks tracking server and select the experiment
# that the run below is recorded under.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Shared/MLflow Example")

# Enable MLflow's Spark autologging.
# NOTE(review): the MLflow docs describe mlflow.spark.autolog() as recording
# Spark datasource information (paths/versions/formats) rather than
# parameters, metrics or models — confirm. The hyperparameters, the metric and
# the model are logged explicitly further down in this cell.
mlflow.spark.autolog()

# Model hyperparameters (currently hard-coded; the trailing comments keep the
# earlier command-line driven variants for reference)
maxBinsVal = 32 #float(sys.argv[1]) if len(sys.argv) > 3 else 20
maxDepthVal = 5 #float(sys.argv[2]) if len(sys.argv) > 3 else 5

# Everything inside this block is recorded under a single MLflow run
with mlflow.start_run():
    # Random forest reading the 'features' vector and the 'label' column,
    # wrapped in a one-stage Pipeline
    classifier = RandomForestClassifier(labelCol='label', featuresCol='features', maxBins=maxBinsVal, maxDepth=maxDepthVal)
    stages_tree = [classifier]
    pipeline_tree = Pipeline(stages=stages_tree)

    # Fit on the assembled training frame
    print('Running RFModel')
    RFmodel = pipeline_tree.fit(assembled_train_df)
    print('Completed training RFModel')

    # Score the assembled test frame
    predictions = RFmodel.transform(assembled_test_df)
    # Evaluator is built with its default column settings
    # (presumably 'label' / 'rawPrediction' — TODO confirm)
    evaluator = BinaryClassificationEvaluator()

    # Compute area under the ROC curve on the test predictions
    roc_auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    print("Test Area Under ROC: " + str(roc_auc))

    # Explicitly record both hyperparameters, the metric (under the name
    # "ROC") and the fitted pipeline model (artifact path "model"), then print
    # the run's artifact URI.
    mlflow.log_param("maxBins", maxBinsVal)
    mlflow.log_param("maxDepth", maxDepthVal)
    mlflow.log_metric("ROC", roc_auc)
    mlflow.spark.log_model(RFmodel, "model")
    print(mlflow.get_artifact_uri())




Running RFModel
Completed training RFModel
Test Area Under ROC: 0.8452161588891514


2024/05/16 01:36:55 INFO mlflow.spark: Inferring pip requirements by reloading the logged model from the databricks artifact repository, which can be time-consuming. To speed up, explicitly specify the conda_env or pip_requirements when calling log_model().


Downloading artifacts:   0%|          | 0/14 [00:00<?, ?it/s]

2024/05/16 01:36:56 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

dbfs:/databricks/mlflow-tracking/2770898139939067/9dacb2f386984eb99371005a025be97e/artifacts
