In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, avg
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1. Start (or reuse) a local Spark session that uses every available core.
spark = SparkSession.builder.appName("Fase3_Logistica").master("local[*]").getOrCreate()

# 2. Load the pre-processed dataset.
df = spark.read.parquet("/opt/spark-data/processed/secop_final_ready.parquet")

# 3. BUILD THE BINARY TARGET
# label = 1 for contracts whose value is above the mean, 0 for the rest.
#
# Rows with a null contract value are dropped first: `null > mean` evaluates
# to null, so `when(...).otherwise(0)` would silently label them 0 and turn
# "unknown value" into "below average". `avg` already ignores nulls, so the
# threshold itself is the same whether it is computed before or after the
# filter.
df_valid = df.filter(df["valor_del_contrato"].isNotNull())
mean_val = df_valid.select(avg("valor_del_contrato")).first()[0]

# `avg` yields null when there are no non-null rows. Comparing against None
# would label every row 0 and train on a single-class target, so fail loudly.
if mean_val is None:
    raise ValueError(
        "Cannot compute the mean of 'valor_del_contrato': "
        "the dataset is empty or the column contains only nulls."
    )

df_log = df_valid.withColumn(
    "label",
    when(df_valid["valor_del_contrato"] > mean_val, 1).otherwise(0),
)

# NOTE(review): the label is a deterministic function of `valor_del_contrato`.
# If `features_scaled` was assembled from that column (not visible in this
# cell), the model is handed its own target, which would explain a perfect
# AUC of 1.0. Confirm, and exclude the column from the feature vector if so.

# Expose the scaled feature vector under the column name the model is
# configured to read.
df_log = df_log.withColumnRenamed("features_scaled", "features")

# 4. Split the data (70% train, 30% test); the fixed seed keeps the split
# reproducible across runs.
train, test = df_log.randomSplit([0.7, 0.3], seed=42)

# 5. Configure and fit the logistic regression.
log_reg = LogisticRegression(featuresCol="features", labelCol="label")
log_model = log_reg.fit(train)

# 6. Score the held-out set.
predictions = log_model.transform(test)

# 7. Evaluate with the area under the ROC curve.
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)

print(f" Área bajo la curva (AUC): {auc}")
print("\n=== MUESTRA DE CLASIFICACIÓN (Real vs Predicción) ===")
predictions.select("label", "prediction", "probability").show(10)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/14 16:04:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/14 16:04:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/14 16:04:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
26/02/14 16:04:25 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
26/02/14 16:04:37 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
26/02/14 16:04:40 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/02/14 16:04:40 WARN InstanceBuilder: Failed to load

 Área bajo la curva (AUC): 1.0

=== MUESTRA DE CLASIFICACIÓN (Real vs Predicción) ===
+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    0|       0.0|[0.99999999990605...|
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|[0.99999999957685...|
|    0|       0.0|[0.99999999999999...|
|    0|       0.0|[0.99999999999992...|
|    0|       0.0|[0.99999999999999...|
|    0|       0.0|[0.99999999995757...|
+-----+----------+--------------------+
only showing top 10 rows

