In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 1. Start (or reuse) a Spark session running with the "local[*]" master.
spark = (
    SparkSession.builder
    .appName("Fase3_Regularizacion")
    .master("local[*]")
    .getOrCreate()
)

# 2. Load the processed dataset and rename the target and feature-vector
#    columns to the "label" / "features" names used by the estimators and
#    the evaluator below.
df = spark.read.parquet("/opt/spark-data/processed/secop_final_ready.parquet")
df_final = (
    df
    .withColumnRenamed("valor_del_contrato", "label")
    .withColumnRenamed("features_scaled", "features")
)

# 3. 70/30 train/test split with a fixed seed (42).
split_weights = [0.7, 0.3]
train, test = df_final.randomSplit(split_weights, seed=42)

# 4. Three estimators to compare. They share the same input columns and
#    differ only in regParam / elasticNetParam.
column_args = {"featuresCol": "features", "labelCol": "label"}

# Model A: no regularization (regParam=0).
lr_normal = LinearRegression(elasticNetParam=0, regParam=0, **column_args)

# Model B: Lasso (L1) - elasticNetParam=1, helps with variable selection.
lr_lasso = LinearRegression(elasticNetParam=1, regParam=0.1, **column_args)

# Model C: Ridge (L2) - elasticNetParam=0, helps smooth the model.
lr_ridge = LinearRegression(elasticNetParam=0, regParam=0.1, **column_args)

# 5. Fit each estimator on the training split, in A -> B -> C order.
print("Entrenando modelos...")
model_normal = lr_normal.fit(train)
model_lasso = lr_lasso.fit(train)
model_ridge = lr_ridge.fit(train)

# 6. Shared RMSE evaluator, applied to each model's predictions on the
#    held-out test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)

print("\n=== COMPARACIÓN DE RESULTADOS (RMSE) ===")
fitted_models = (
    ("Normal", model_normal),
    ("Lasso (L1)", model_lasso),
    ("Ridge (L2)", model_ridge),
)
for title, fitted in fitted_models:
    rmse = evaluator.evaluate(fitted.transform(test))
    print(f"{title}: {rmse}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/14 16:12:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/14 16:12:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/14 16:12:48 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/02/14 16:12:48 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
26/02/14 16:12:48 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
                                                                                

Entrenando modelos...


26/02/14 16:12:58 WARN Instrumentation: [91da457e] regParam is zero, which might cause numerical instability and overfitting.
26/02/14 16:12:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
26/02/14 16:12:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/02/14 16:12:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
26/02/14 16:12:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
26/02/14 16:12:59 WARN Instrumentation: [91da457e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.



=== COMPARACIÓN DE RESULTADOS (RMSE) ===
Normal: 2.6744819836137257
Lasso (L1): 13484.182739836348
Ridge (L2): 33.4953328899712
