In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 5, Finished, Available, Finished)

In [23]:
import mlflow

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 25, Finished, Available, Finished)

In [4]:
# Disabled (commented-out) preview of the first 5 rows of lakehouseSilver.dfcontratosfinal:
# df = spark.sql("SELECT * FROM lakehouseSilver.dfcontratosfinal")
# display(df.limit(5))

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 6, Finished, Available, Finished)

In [5]:
# Read the churn dataset from the "ML_Contratos_Churn" table through the ambient `spark` session.
churnTableName = "ML_Contratos_Churn"
dfChurn = spark.table(churnTableName)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 7, Finished, Available, Finished)

In [6]:
# 70/30 train/test split; a fixed seed is passed to randomSplit.
splitWeights = [0.7, 0.3]
trainDF, testDF = dfChurn.randomSplit(splitWeights, seed=42)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 8, Finished, Available, Finished)

In [7]:
# Baseline model: logistic regression.
# Features are read from "scaledFeatures" and the target from "label".
lr = LogisticRegression(
    labelCol="label",
    featuresCol="scaledFeatures",
)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 9, Finished, Available, Finished)

In [8]:
# Fit the logistic-regression baseline on the training split.
lrModel = lr.fit(dataset=trainDF)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 10, Finished, Available, Finished)

In [9]:
# Score the held-out test split with the fitted baseline.
lrPredictions = lrModel.transform(dataset=testDF)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 11, Finished, Available, Finished)

In [10]:
# Evaluators for the baseline (reused later for the random forest).
# ROC-AUC comes from the binary evaluator and F1 from the multiclass evaluator;
# both compare predictions against the "label" column.
binaryEval = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="label",
)
multiEval = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 12, Finished, Available, Finished)

In [11]:
# Area under the ROC curve for the baseline's test predictions.
roc_auc_lr = binaryEval.evaluate(dataset=lrPredictions)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 13, Finished, Available, Finished)

In [12]:
# F1 score for the baseline's test predictions.
f1_lr = multiEval.evaluate(dataset=lrPredictions)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 14, Finished, Available, Finished)

In [13]:
# Report both baseline metrics rounded to four decimals.
lrSummary = f"Logistic Regression -> ROC-AUC: {roc_auc_lr:.4f}, F1-score: {f1_lr:.4f}"
print(lrSummary)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 15, Finished, Available, Finished)

Logistic Regression -> ROC-AUC: 0.8167, F1-score: 0.7662


In [14]:
# Random Forest
# Candidate model: 100 trees capped at depth 10, using the same feature and
# label columns as the logistic-regression baseline.
# The seed is pinned explicitly (same value as the train/test split) so the
# forest's training does not depend on the library's default seed.
rf = RandomForestClassifier(
    featuresCol="scaledFeatures",
    labelCol="label",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 16, Finished, Available, Finished)

In [15]:
# Fit the random forest on the same training split as the baseline.
rfModel = rf.fit(dataset=trainDF)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 17, Finished, Available, Finished)

In [16]:
# Score the held-out test split with the fitted random forest.
rfPredictions = rfModel.transform(dataset=testDF)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 18, Finished, Available, Finished)

In [18]:
# Area under the ROC curve for the random forest's test predictions.
roc_auc_rf = binaryEval.evaluate(dataset=rfPredictions)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 20, Finished, Available, Finished)

In [19]:
# F1 score for the random forest's test predictions.
f1_rf = multiEval.evaluate(dataset=rfPredictions)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 21, Finished, Available, Finished)

In [20]:
# Report both random-forest metrics rounded to four decimals.
rfSummary = f"Random Forest -> ROC-AUC: {roc_auc_rf:.4f}, F1-score: {f1_rf:.4f}"
print(rfSummary)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 22, Finished, Available, Finished)

Random Forest -> ROC-AUC: 0.8997, F1-score: 0.8178


In [21]:
# Compare models
# The random forest wins only when its ROC-AUC is strictly higher; a tie
# keeps the logistic-regression baseline.
if roc_auc_rf > roc_auc_lr:
    bestModel = "RandomForest"
else:
    bestModel = "LogisticRegression"

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 23, Finished, Available, Finished)

In [22]:
# Show which model won the ROC-AUC comparison.
bestModelMessage = f"Mejor modelo: {bestModel}"
print(bestModelMessage)

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 24, Finished, Available, Finished)

Mejor modelo: RandomForest


In [25]:
# Register in MLflow
# Select the "ML_Contratos_Churn" experiment for subsequent runs. The call is
# left as the cell's last expression so the returned experiment is displayed.
mlflow.set_experiment(experiment_name="ML_Contratos_Churn")

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 27, Finished, Available, Finished)

2025/08/02 04:12:53 INFO mlflow.tracking.fluent: Experiment with name 'ML_Contratos_Churn' does not exist. Creating a new experiment.


<Experiment: artifact_location='', creation_time=1754107974616, experiment_id='21754eda-6a1c-4f8c-922c-cd12ab633754', last_update_time=None, lifecycle_stage='active', name='ML_Contratos_Churn', tags={}>

In [26]:
# Log the winning model and its test-set metrics in a single MLflow run.
with mlflow.start_run():
    # Any value of bestModel other than "RandomForest" falls back to the
    # logistic-regression artifacts.
    if bestModel == "RandomForest":
        winner, artifactName = rfModel, "RF_Churn_Model"
        winnerAuc, winnerF1 = roc_auc_rf, f1_rf
    else:
        winner, artifactName = lrModel, "LR_Churn_Model"
        winnerAuc, winnerF1 = roc_auc_lr, f1_lr

    mlflow.spark.log_model(winner, artifactName)
    mlflow.log_metric("ROC-AUC", winnerAuc)
    mlflow.log_metric("F1", winnerF1)


print("Modelo registrado en MLflow correctamente.")

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 28, Finished, Available, Finished)



Modelo registrado en MLflow correctamente.


In [33]:
# Save model in the Lakehouse
# Persist whichever model won the comparison, overwriting any previous copy.
# Both branches use the same relative "Files/Models/..." root: the
# logistic-regression branch previously used an absolute "/Files/..." path,
# which would not land next to the random-forest model.
# NOTE(review): presumably the relative "Files/" prefix resolves against the
# notebook's default lakehouse — confirm for this workspace.
if bestModel == "RandomForest":
    rfModel.write().overwrite().save("Files/Models/Churn_RF")
else:
    lrModel.write().overwrite().save("Files/Models/Churn_LR")

StatementMeta(, 81bbc644-31f7-489e-8cf8-e579db89b251, 35, Finished, Available, Finished)