In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Start (or reuse) the Spark session, running locally on all available cores.
spark = (
    SparkSession.builder
    .appName("Fase4_ValidacionCruzada")
    .master("local[*]")
    .getOrCreate()
)

# 2. Load the preprocessed dataset and rename the target / feature columns
#    to "label" and "features", the names used by the estimator below.
df = spark.read.parquet("/opt/spark-data/processed/secop_final_ready.parquet")
df_final = (
    df
    .withColumnRenamed("valor_del_contrato", "label")
    .withColumnRenamed("features_scaled", "features")
)

# 3. Seeded 70/30 split into training and test sets.
train, test = df_final.randomSplit([0.7, 0.3], seed=42)

# 4. Base model whose hyper-parameters will be tuned.
lr = LinearRegression(labelCol="label", featuresCol="features")

# 5. Build the hyper-parameter grid.
# Every combination is evaluated: 2 regParam values x 3 elasticNetParam
# values = 6 candidate configurations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# 6. Configure the cross validator.
# numFolds=3 splits the training set into 3 parts; each candidate is trained
# on two parts and scored (RMSE) on the remaining one.
# seed=42 pins the fold assignment so the selected hyper-parameters are
# repeatable between runs (same seed value as the train/test split).
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(metricName="rmse"),
    numFolds=3,
    seed=42,
)

# 7. Run the search. This is slower than a single fit because one model is
#    trained per (candidate, fold) pair before the best one is refit.
print("Buscando la mejor configuración con Validación Cruzada...")
cv_model = crossval.fit(train)

# 8. Report the hyper-parameters of the winning model.
# Read them through the model's public Param getters rather than the private
# `_java_obj` py4j handle, which is an implementation detail of PySpark.
best_model = cv_model.bestModel
print(f" Mejor regParam: {best_model.getRegParam()}")
print(f" Mejor elasticNetParam: {best_model.getElasticNetParam()}")

# 9. Score the best model on the held-out test set.
# The evaluator's default column names ("label" / "prediction") are used.
predictions = cv_model.transform(test)
rmse = RegressionEvaluator(metricName="rmse").evaluate(predictions)
print(f" RMSE Final del mejor modelo: {rmse}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/14 16:23:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/14 16:37:50 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/14 16:37:50 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/02/14 16:37:50 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
                                                                                

Buscando la mejor configuración con Validación Cruzada...


26/02/14 16:38:01 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
26/02/14 16:38:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/02/14 16:38:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/02/14 16:38:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


 Mejor regParam: 0.1
 Mejor elasticNetParam: 0.0
 RMSE Final del mejor modelo: 33.4953328899712
