In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's Spark session. The PostgreSQL JDBC driver is
# requested via spark.jars.packages, given as a Maven coordinate.
session_builder = SparkSession.builder.appName("pz4-ingesta-test")
session_builder = session_builder.config(
    "spark.jars.packages",
    "org.postgresql:postgresql:42.7.4",
)
spark = session_builder.getOrCreate()

print("Spark listo ")


Spark listo 


In [6]:
import os
from pyspark.sql import functions as F

# === 1) Connection settings, read from the environment ===
# NOTE(review): the fallbacks below (including the password) are hard-coded
# development defaults; confirm they are never relied on outside a local setup.
PG_HOST = os.environ.get("PG_HOST", "postgres")
PG_PORT = os.environ.get("PG_PORT", "5432")
PG_DB   = os.environ.get("PG_DB", "nyc_taxi")
PG_USER = os.environ.get("PG_USER", "nyc_user")
PG_PWD  = os.environ.get("PG_PASSWORD", "nyc_password")
RAW     = os.environ.get("PG_SCHEMA_RAW", "raw")

jdbc_url = f"jdbc:postgresql://{PG_HOST}:{PG_PORT}/{PG_DB}"
props = {
    "driver": "org.postgresql.Driver",
    "user": PG_USER,
    "password": PG_PWD,
    "stringtype": "unspecified",  # avoids type issues on write
    "sslmode": "disable"
}

# === 2) Write a minimal test table to <RAW>.t_ping ===
df = spark.createDataFrame(
    [(1, "ok"), (2, "ok")],
    ["id", "status"]
).withColumn("ts_utc", F.current_timestamp())

target_table = f"{RAW}.t_ping"

# Connectivity check: issuing a JDBC read forces the driver to be loaded.
# This does NOT drop the target table; replacing an existing table is left to
# mode("overwrite") in the write below.
spark.read.jdbc(jdbc_url, "information_schema.tables", properties=props)
spark.sql("select 1").collect()  # trivial job to warm up the session

# Write (overwrite) through JDBC
(df.write
   .mode("overwrite")
   .jdbc(jdbc_url, target_table, properties=props))

# === 3) Read the table back to validate the round trip ===
out = spark.read.jdbc(jdbc_url, target_table, properties=props)
# Report the table that was actually written, so the message stays correct
# when PG_SCHEMA_RAW overrides the default schema.
print(f"Filas en {target_table} =", out.count())
out.orderBy("id").show(truncate=False)


Filas en raw.t_ping = 2
+---+------+--------------------------+
|id |status|ts_utc                    |
+---+------+--------------------------+
|1  |ok    |2025-11-08 22:30:49.271737|
|2  |ok    |2025-11-08 22:30:49.271737|
+---+------+--------------------------+



In [4]:
from pyspark.sql import SparkSession
# Obtain the Spark session again (getOrCreate hands back a session, creating
# one only if needed), requesting the same PostgreSQL JDBC package.
jdbc_package = "org.postgresql:postgresql:42.7.4"
reinit_builder = SparkSession.builder.appName("pz4-reinit")
spark = reinit_builder.config("spark.jars.packages", jdbc_package).getOrCreate()

# Print the Spark version and the row count of a one-row range as a liveness check.
spark_version = spark.version
ping_rows = spark.range(1).count()
print("Spark OK:", spark_version, "| Ping:", ping_rows)


Spark OK: 3.5.3 | Ping: 1


In [8]:
import os
# Data root directory taken from the environment; None when DATA_ROOT is unset.
DATA_ROOT = os.getenv("DATA_ROOT")
# Schema name for raw tables, falling back to "raw".
RAW = os.getenv("PG_SCHEMA_RAW", "raw")

print("DATA_ROOT =", DATA_ROOT)


DATA_ROOT = /home/jovyan/work/data/trip-data


In [9]:
from pyspark.sql import functions as F

# Load one month of green-taxi trips from parquet and append it to the raw table.
service = "green"
year = 2015
month = 1
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

# Tag every row with its provenance. source_path is included so this load
# carries the same lineage columns as the other monthly loads in this notebook.
df = (spark.read.parquet(file)
      .withColumn("service_type", F.lit(service))
      .withColumn("source_year",  F.lit(year))
      .withColumn("source_month", F.lit(month))
      .withColumn("source_path",  F.lit(file)))

target = f"{RAW}.{service}_taxi_trip"

# Write in moderate batches over few partitions so the JVM is not overwhelmed.
# NOTE: mode("append") is not idempotent; re-running this cell inserts the
# same month a second time.
(df.coalesce(8)
   .write
   .mode("append")
   .option("truncate", "false")
   .option("batchsize", "20000")
   .jdbc(jdbc_url, target, properties=props))

# Derive the message from the variables so it cannot drift from what was loaded.
print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/green_tripdata_2015-01.parquet
✅ green 2015-01 cargado en raw.green_taxi_trip


In [13]:
from pyspark.sql import functions as F

# Append yellow-taxi trips for 2015-02 to the raw schema.
service, year, month = "yellow", 2015, 2
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

# Read the parquet file and add the provenance columns one step at a time.
df = spark.read.parquet(file)
df = df.withColumn("service_type", F.lit(service))
df = df.withColumn("source_year", F.lit(year))
df = df.withColumn("source_month", F.lit(month))

target = f"{RAW}.{service}_taxi_trip"
df = df.withColumn("source_path", F.lit(file))

# Safe micro-batch: 8 partitions, JDBC batch size of 20000.
jdbc_writer = df.coalesce(8).write.mode("append")
jdbc_writer = jdbc_writer.option("truncate", "false").option("batchsize", "20000")
jdbc_writer.jdbc(jdbc_url, target, properties=props)

print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/yellow_tripdata_2015-02.parquet
✅ yellow 2015-02 cargado en raw.yellow_taxi_trip


In [14]:
from pyspark.sql import functions as F

# Append yellow-taxi trips for 2015-03 to the raw schema.
service, year, month = "yellow", 2015, 3
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

df = spark.read.parquet(file)
df = df.withColumn("service_type", F.lit(service))
df = df.withColumn("source_year", F.lit(year))
df = df.withColumn("source_month", F.lit(month))
# Traceability column required by the table.
df = df.withColumn("source_path", F.lit(file))

target = f"{RAW}.{service}_taxi_trip"

# Append over 8 partitions with a JDBC batch size of 20000.
jdbc_writer = df.coalesce(8).write.mode("append")
jdbc_writer = jdbc_writer.option("truncate", "false")
jdbc_writer = jdbc_writer.option("batchsize", "20000")
jdbc_writer.jdbc(jdbc_url, target, properties=props)

print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/yellow_tripdata_2015-03.parquet
✅ yellow 2015-03 cargado en raw.yellow_taxi_trip


In [18]:
from pyspark.sql import functions as F

# This cell loads green 2015-02.
service, year, month = "green", 2015, 2
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

# 1) Rebuild df from the parquet file, including source_path.
df = spark.read.parquet(file)
df = df.withColumn("service_type", F.lit(service))
df = df.withColumn("source_year", F.lit(year))
df = df.withColumn("source_month", F.lit(month))
df = df.withColumn("source_path", F.lit(file))

# 2) Print the schema to confirm source_path is present in the DataFrame.
df.printSchema()

# 3) Insert: append over 4 partitions with a JDBC batch size of 20000.
target = f"{RAW}.{service}_taxi_trip"
jdbc_writer = df.coalesce(4).write.mode("append")
jdbc_writer = jdbc_writer.option("truncate", "false").option("batchsize", "20000")
jdbc_writer.jdbc(jdbc_url, target, properties=props)

print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/green_tripdata_2015-02.parquet
root
 |-- VendorID: long (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: integer (nullable = true)
 |-- service_type: st

In [19]:
from pyspark.sql import functions as F

# Append green-taxi trips for 2015-03 to the raw schema.
service, year, month = "green", 2015, 3
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

# Read the month and attach the four provenance columns.
df = spark.read.parquet(file)
df = df.withColumn("service_type", F.lit(service))
df = df.withColumn("source_year", F.lit(year))
df = df.withColumn("source_month", F.lit(month))
df = df.withColumn("source_path", F.lit(file))

target = f"{RAW}.{service}_taxi_trip"

# Append over 4 partitions with a JDBC batch size of 20000.
jdbc_writer = (
    df.coalesce(4)
    .write
    .mode("append")
    .option("truncate", "false")
    .option("batchsize", "20000")
)
jdbc_writer.jdbc(jdbc_url, target, properties=props)

print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/green_tripdata_2015-03.parquet
✅ green 2015-03 cargado en raw.green_taxi_trip


In [20]:
from pyspark.sql import functions as F

# Append yellow-taxi trips for 2015-04 to the raw schema.
service, year, month = "yellow", 2015, 4
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

# Read the month and attach the four provenance columns.
df = spark.read.parquet(file)
df = df.withColumn("service_type", F.lit(service))
df = df.withColumn("source_year", F.lit(year))
df = df.withColumn("source_month", F.lit(month))
df = df.withColumn("source_path", F.lit(file))

target = f"{RAW}.{service}_taxi_trip"

# Append over 8 partitions with a JDBC batch size of 20000.
jdbc_writer = (
    df.coalesce(8)
    .write
    .mode("append")
    .option("truncate", "false")
    .option("batchsize", "20000")
)
jdbc_writer.jdbc(jdbc_url, target, properties=props)

print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/yellow_tripdata_2015-04.parquet
✅ yellow 2015-04 cargado en raw.yellow_taxi_trip


In [21]:
from pyspark.sql import functions as F

# Append green-taxi trips for 2015-04 to the raw schema.
service, year, month = "green", 2015, 4
file = f"{DATA_ROOT}/{service}_tripdata_{year}-{month:02d}.parquet"
print("Leyendo:", file)

# Read the month and attach the four provenance columns.
df = spark.read.parquet(file)
df = df.withColumn("service_type", F.lit(service))
df = df.withColumn("source_year", F.lit(year))
df = df.withColumn("source_month", F.lit(month))
df = df.withColumn("source_path", F.lit(file))

target = f"{RAW}.{service}_taxi_trip"

# Append over 4 partitions with a JDBC batch size of 20000.
jdbc_writer = df.coalesce(4).write.mode("append")
jdbc_writer = jdbc_writer.option("truncate", "false")
jdbc_writer = jdbc_writer.option("batchsize", "20000")
jdbc_writer.jdbc(jdbc_url, target, properties=props)

print(f"✅ {service} {year}-{month:02d} cargado en", target)


Leyendo: /home/jovyan/work/data/trip-data/green_tripdata_2015-04.parquet
✅ green 2015-04 cargado en raw.green_taxi_trip
