In [1]:
from pyspark.sql.functions import when, col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 3, Finished, Available, Finished)

**Objetivos de Predicción**

1. Predicción de cancelación de contrato.
2. Predicción de tipo de arrendamiento.
3. Segmentación de clientes.


In [2]:
# Load every column of the contracts table from the lakehouseSilver schema.
contratos_query = "SELECT * FROM lakehouseSilver.dfcontratosfinal"
dfContratosFinal = spark.sql(contratos_query)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 4, Finished, Available, Finished)

In [3]:
# Preview the first five rows to sanity-check the load.
# NOTE(review): the frame presumably still carries nombreCliente here (it is only
# dropped in a later cell) — confirm the rendered preview is safe to share.
contratos_preview = dfContratosFinal.limit(5)
display(contratos_preview)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 87ae6dd4-1c08-413f-83da-bd0fe8a527d4)

In [4]:
# Convert the churn target to binary: 1 when churn is exactly "Si", 0 otherwise.
# NOTE(review): nulls and any other spelling (e.g. "Sí", "SI") fall into the
# otherwise-branch and become 0 — confirm that matches the source data.
es_churn = col("churn") == "Si"
dfContratosFinal = dfContratosFinal.withColumn("churn_bin", when(es_churn, 1).otherwise(0))

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 6, Finished, Available, Finished)

In [5]:
# Categorical columns: string-indexed and one-hot encoded by the cells below.
categorical_cols = [
    "moneda",
    "tipoArrendamiento",
    "desTipoArrendamiento",
    "Canal",
    "Industria",
    "Segmento",
    "valorCliente",
]

# Numeric columns: passed to the VectorAssembler unchanged.
numeric_cols = [
    "montoSaldoAnterior",
    "montoLeasing",
    "plazoOriginal",
    "plazo",
    "tasa",
    "montoDeposito",
    "montoDepositoAbonado",
    "montoSinSeguro",
    "montoSeguroContrato",
    "montoMantenimiento",
    "montoSeguroMantenimiento",
    "montoCompraFinal",
    "montoProveedor",
    "montoValorFiscal",
    "diasGraciaPrimerPago",
    "montoiva",
]

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 7, Finished, Available, Finished)

In [6]:
# Drop the columns that are not part of either feature list.
cols_to_drop = [
    "numContrato",
    "idCliente",
    "nombreCliente",
    "fechaInicio",
    "fechaPrimerPago",
    "fechaUltimoPago",
    "fechaCancelacion",
    "anioCancelacion",
]
dfContratosFinal = dfContratosFinal.drop(*cols_to_drop)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 8, Finished, Available, Finished)

In [7]:
# Index and one-hot encode the categorical variables.
# One StringIndexer + OneHotEncoder pair is created per categorical column;
# handleInvalid="keep" makes the indexer keep rows with invalid labels instead of failing.
indexers = []
encoders = []
for cat_col in categorical_cols:
    idx_col = cat_col + "_index"
    indexers.append(StringIndexer(inputCol=cat_col, outputCol=idx_col, handleInvalid="keep"))
    encoders.append(OneHotEncoder(inputCols=[idx_col], outputCols=[cat_col + "_vec"]))

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 9, Finished, Available, Finished)

In [8]:
# VectorAssembler: combine the one-hot vectors and the numeric columns
# (in that order) into a single "features" vector column.
encoded_cols = [name + "_vec" for name in categorical_cols]
assembler = VectorAssembler(inputCols=encoded_cols + numeric_cols, outputCol="features")

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 10, Finished, Available, Finished)

In [9]:
# Scaling: StandardScaler (with its default settings) reads "features"
# and writes the result to "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 11, Finished, Available, Finished)

In [10]:
# 6. Pipeline: indexers -> encoders -> assembler -> scaler, fitted on the contracts frame.
# NOTE(review): the fit uses the whole DataFrame; if a train/test split happens
# downstream, the indexer labels and scaler statistics will have seen the test rows.
preprocessing_stages = [*indexers, *encoders, assembler, scaler]
pipeline = Pipeline(stages=preprocessing_stages)
pipelineModel = pipeline.fit(dfContratosFinal)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 12, Finished, Available, Finished)

In [11]:
# Apply the fitted preprocessing pipeline. The result keeps the input columns and
# adds the *_index, *_vec, "features" and "scaledFeatures" columns.
dfPrepared = pipelineModel.transform(dfContratosFinal)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 13, Finished, Available, Finished)

#### Datasets finales

In [12]:
# Binary classification (churn): scaled features plus churn_bin exposed as "label".
churn_label = col("churn_bin").alias("label")
dfChurn = dfPrepared.select("scaledFeatures", churn_label)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 14, Finished, Available, Finished)

In [13]:
# Multiclass classification (lease type): the label is the StringIndexer index
# created for desTipoArrendamiento.
#
# The shared "scaledFeatures" vector includes desTipoArrendamiento_vec, which is the
# one-hot encoding of this very label, so a model trained on it would simply read the
# answer back (target leakage). Build a task-specific feature vector without the target.
# NOTE(review): tipoArrendamiento is presumably the code for the same attribute, so it
# is excluded too — remove it from this set if it is an independent feature.
tipoArrExcludedCols = {"tipoArrendamiento", "desTipoArrendamiento"}

tipoArrAssembler = VectorAssembler(
    inputCols=[c + "_vec" for c in categorical_cols if c not in tipoArrExcludedCols] + numeric_cols,
    outputCol="featuresTipoArr",
)
tipoArrScaler = StandardScaler(inputCol="featuresTipoArr", outputCol="scaledFeaturesTipoArr")

# dfPrepared already carries the *_vec columns, so only assembling and scaling are redone.
# NOTE(review): this fitted model is not part of pipelineModel; persist it as well if
# this task needs the same preprocessing at prediction time.
tipoArrModel = Pipeline(stages=[tipoArrAssembler, tipoArrScaler]).fit(dfPrepared)

# Keep the original output schema (scaledFeatures, label) for downstream consumers.
dfTipoArr = tipoArrModel.transform(dfPrepared).select(
    col("scaledFeaturesTipoArr").alias("scaledFeatures"),
    col("desTipoArrendamiento_index").alias("label"),
)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 15, Finished, Available, Finished)

In [14]:
# Clustering: unlabeled dataset holding only the scaled feature vector.
scaled_features_col = col("scaledFeatures")
dfCluster = dfPrepared.select(scaled_features_col)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 16, Finished, Available, Finished)

In [15]:
# Save each dataset as a Delta table in the Lakehouse, overwriting any existing table.
ml_tables = [
    (dfChurn, "ML_Contratos_Churn"),
    (dfTipoArr, "ML_Contratos_TipoArrendamiento"),
    (dfCluster, "ML_Contratos_Clustering"),
]
for ml_df, table_name in ml_tables:
    ml_df.write.format("delta").mode("overwrite").saveAsTable(table_name)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 17, Finished, Available, Finished)

In [16]:
# Save the fitted pipeline so it can be reused for future predictions;
# overwrite() replaces whatever is already stored at the target path.
pipeline_path = "Files/Models/pipeline_preprocesamiento"
pipeline_writer = pipelineModel.write().overwrite()
pipeline_writer.save(pipeline_path)

StatementMeta(, e4343899-3633-40d3-82b0-c1116e1dedb6, 18, Finished, Available, Finished)