In [None]:
# Smoke test: print a greeting to confirm the kernel executes cells.
print("Hola Spark desde VS Code 🚀")

In [17]:
from pyspark.sql import SparkSession

# Stop the session left over from a previous run of this cell, if there is one.
try:
    spark.stop()
    print("✅ Sesión anterior cerrada")
except NameError:
    # `spark` is not defined yet, i.e. this is the first run on a fresh kernel.
    print("ℹ️ No había sesión previa")
except Exception as stop_error:
    # Stopping is best-effort: report the problem and continue with a new session,
    # but do not swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
    print(f"⚠️ No se pudo cerrar la sesión previa: {stop_error}")

# Single definition of the SQL Server JDBC driver JAR used by the three
# classpath-related options below.
MSSQL_JDBC_JAR = "/opt/spark/jars/mssql-jdbc-13.2.0.jre8.jar"

# Recreate the session with the JDBC driver JAR configured explicitly.
# NOTE(review): driver-side options (spark.driver.memory,
# spark.driver.extraClassPath) are read when the driver JVM starts; if the JVM
# from an earlier session is still alive in this kernel they may not take
# effect until the kernel is restarted — confirm.
# NOTE(review): spark.executor.instances may not be honoured by a standalone
# master (spark://...) — verify, and consider spark.cores.max if it is not.
spark = (SparkSession.builder
    .appName("Validacion-incremental")
    .master("spark://spark-master:7077")
    .config("spark.executor.instances", "1")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .config("spark.jars", MSSQL_JDBC_JAR)
    .config("spark.driver.extraClassPath", MSSQL_JDBC_JAR)
    .config("spark.executor.extraClassPath", MSSQL_JDBC_JAR)
    .enableHiveSupport()
    .getOrCreate())

# Report what the session is actually connected to.
print("✅ Spark version:", spark.version)
print("✅ Master URL:", spark.sparkContext.master)
print("✅ Spark reiniciado con driver JDBC")

✅ Sesión anterior cerrada
✅ Spark version: 2.4.5
✅ Master URL: spark://spark-master:7077
✅ Spark reiniciado con driver JDBC


In [18]:
# Importar configuración
import sys
sys.path.append("/scripts/config")
from db_config import db_config

# Connectivity check: run a constant query through the JDBC source and show it.
try:
    jdbc_reader = spark.read.format("jdbc")
    # Connection settings come from the project's db_config mapping.
    jdbc_options = {
        "url": db_config["jdbc_url"],
        "user": db_config["user"],
        "password": db_config["password"],
        "driver": db_config["driver"],
        "query": "SELECT 1 AS test_col",
    }
    df_test = jdbc_reader.options(**jdbc_options).load()

    print("✅ ¡Conexión JDBC exitosa!")
    df_test.show()
except Exception as err:
    # Any failure (configuration, driver, network) is reported instead of raised.
    print(f"❌ Error: {err!s}")



✅ ¡Conexión JDBC exitosa!


[Stage 0:>                                                          (0 + 1) / 1]

+--------+
|test_col|
+--------+
|       1|
+--------+



                                                                                

In [19]:
def read_sql_query(query: str):
    """
    Run a SQL query on SQL Server through the project's JDBC settings.

    The text is handed to Spark's JDBC ``query`` option. Spark parenthesises
    that text and uses it as a subquery, so a trailing ``;`` (common when SQL
    is pasted from an editor) would produce invalid SQL; surrounding
    whitespace and trailing semicolons are therefore removed first.

    Relies on the notebook-level ``spark`` session and ``db_config`` mapping
    (keys ``jdbc_url``, ``user``, ``password``, ``driver``).

    Args:
        query: Text of the SELECT statement to execute.

    Returns:
        The Spark DataFrame produced by the JDBC reader's ``load()``.

    Raises:
        TypeError: If ``query`` is not a string.
        ValueError: If ``query`` is empty once whitespace and trailing
            semicolons are removed.
    """
    if not isinstance(query, str):
        raise TypeError("query debe ser una cadena de texto (str)")

    # Drop surrounding whitespace and any trailing statement terminators.
    cleaned_query = query.strip().rstrip("; \t\r\n")
    if not cleaned_query:
        raise ValueError("query no puede estar vacío")

    return (spark.read.format("jdbc")
        .option("url", db_config["jdbc_url"])
        .option("user", db_config["user"])
        .option("password", db_config["password"])
        .option("driver", db_config["driver"])
        .option("query", cleaned_query)  # free-form SELECT text, not a table name
        .load())

In [20]:
# Build a DataFrame over every column and row of dbo.Pedidos via read_sql_query.
df_pedidos = read_sql_query("SELECT  * FROM dbo.Pedidos")

In [21]:
# Display the first rows of the DataFrame for a quick visual inspection.
df_pedidos.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------+---------+------+-----------+--------------------+----------+
|PedidoID|ClienteID| Monto|FechaPedido|          CreateTime|UpdateTime|
+--------+---------+------+-----------+--------------------+----------+
|       1|        1|150.50| 2025-09-01|2025-09-14 19:36:...|      null|
|       2|        2|200.00| 2025-09-02|2025-09-14 19:36:...|      null|
|       3|        3| 50.00| 2025-09-03|2025-09-14 19:36:...|      null|
|       4|        4|300.00| 2025-09-04|2025-09-14 19:36:...|      null|
|       5|        5|120.00| 2025-09-05|2025-09-14 19:36:...|      null|
|       6|        1| 80.00| 2025-09-06|2025-09-14 19:36:...|      null|
|       7|        2| 60.00| 2025-09-07|2025-09-14 19:36:...|      null|
|       8|        3| 90.00| 2025-09-08|2025-09-14 19:36:...|      null|
|       9|        4|110.00| 2025-09-09|2025-09-14 19:36:...|      null|
|      10|        5| 75.00| 2025-09-10|2025-09-14 19:36:...|      null|
|      11|        7| 95.00| 2025-09-11|2025-09-14 19:36:...|    

                                                                                

In [22]:
# Save df_pedidos as Parquet under bronze/df_pedidos/ on HDFS, replacing any
# output already present at that path.
(df_pedidos.write
    .format("parquet")
    .mode("overwrite")
    .save("hdfs://namenode:8020/bronze/df_pedidos/"))

                                                                                

In [23]:
# Read the Parquet output back from HDFS and display up to 30 rows so the
# written data can be compared by eye with the source rows.
df_validacion = (spark.read
    .format("parquet")
    .load("hdfs://namenode:8020/bronze/df_pedidos"))
df_validacion.show(n=30)



[Stage 4:>                                                          (0 + 1) / 1]

+--------+---------+------+-----------+--------------------+----------+
|PedidoID|ClienteID| Monto|FechaPedido|          CreateTime|UpdateTime|
+--------+---------+------+-----------+--------------------+----------+
|       1|        1|150.50| 2025-09-01|2025-09-14 19:36:...|      null|
|       2|        2|200.00| 2025-09-02|2025-09-14 19:36:...|      null|
|       3|        3| 50.00| 2025-09-03|2025-09-14 19:36:...|      null|
|       4|        4|300.00| 2025-09-04|2025-09-14 19:36:...|      null|
|       5|        5|120.00| 2025-09-05|2025-09-14 19:36:...|      null|
|       6|        1| 80.00| 2025-09-06|2025-09-14 19:36:...|      null|
|       7|        2| 60.00| 2025-09-07|2025-09-14 19:36:...|      null|
|       8|        3| 90.00| 2025-09-08|2025-09-14 19:36:...|      null|
|       9|        4|110.00| 2025-09-09|2025-09-14 19:36:...|      null|
|      10|        5| 75.00| 2025-09-10|2025-09-14 19:36:...|      null|
|      11|        7| 95.00| 2025-09-11|2025-09-14 19:36:...|    

                                                                                

In [24]:
# Stop the Spark session used by this notebook now that the run is finished.
spark.stop()
