In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

# Phase 4 - hyper-parameter tuning of a linear regression with a single
# train/validation split, followed by evaluation on a held-out test set and
# persistence of the winning model.

# 1. Spark session (local mode, using every available core).
spark = SparkSession.builder.appName("Fase4_Optimizacion").master("local[*]").getOrCreate()

# 2. Load the prepared dataset and rename the target / feature columns to the
#    names the estimator and evaluator below are configured with.
#    NOTE(review): assumes the parquet file has no pre-existing "label" or
#    "features" column; a clash would produce duplicate column names - confirm.
df = spark.read.parquet("/opt/spark-data/processed/secop_final_ready.parquet")
df_final = df.withColumnRenamed("valor_del_contrato", "label").withColumnRenamed("features_scaled", "features")
train, test = df_final.randomSplit([0.7, 0.3], seed=42)

# 3. Model and hyper-parameter grid: 2 regParam values x 3 elasticNetParam
#    values = 6 candidate configurations.
lr = LinearRegression(featuresCol="features", labelCol="label")
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# A single evaluator is shared by model selection and by the final test-set
# scoring, so both are guaranteed to use the same metric and columns.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")

# 4. FAST STRATEGY: TrainValidationSplit.
#    Unlike CrossValidator, it splits the training data only once
#    (80% inner training, 20% validation).
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio=0.8,  # proportion of the single split used for inner training
    seed=42,  # pin the inner split so model selection is reproducible, like randomSplit above
)

# 5. Train every candidate in the grid and keep the best one.
print("Ejecutando TrainValidationSplit...")
tvs_model = tvs.fit(train)
best_model = tvs_model.bestModel

# 6. Final evaluation on the held-out test set.
predictions = tvs_model.transform(test)
rmse = evaluator.evaluate(predictions)

print(f" RMSE con TrainValidationSplit: {rmse}")
# The mis-encoded "parámetros" in the original literal is repaired here.
print(" Mejores parámetros encontrados: ")
# Use the model's public param getters instead of reaching into the private
# Py4J handle (`_java_obj`), which is an implementation detail.
print(f"- RegParam: {best_model.getRegParam()}")
print(f"- ElasticNetParam: {best_model.getElasticNetParam()}")

# 7. SAVE THE FINAL MODEL.
#    This is the model to be used in Phase 5 (MLOps). Any model already stored
#    at the target path is overwritten.
best_model.write().overwrite().save("/opt/spark-data/models/mejor_modelo_secop")
print(" Modelo definitivo guardado en la carpeta 'models'")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/14 16:45:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/14 16:45:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/14 16:45:55 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/02/14 16:45:55 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
26/02/14 16:45:55 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
                                                                                

Ejecutando TrainValidationSplit...


26/02/14 16:46:07 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
26/02/14 16:46:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/02/14 16:46:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/02/14 16:46:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK



📊 RMSE con TrainValidationSplit: 33.4953328899712
🏆 Mejores parámetros encontrados: 
- RegParam: 0.1
- ElasticNetParam: 0.0


                                                                                

 Modelo definitivo guardado en la carpeta 'models'
