In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("SmallTestDataset")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Echo the session object so the notebook displays its representation.
spark

In [4]:
# INPUT DATA
# Directory holding the taxi parquet files; change it here when the data moves.
input_data_dir = "/home/jupyter/proyect/ds_taxi_NY"
# Green and yellow taxi trip files for 2024-01.
input_data_g = f"{input_data_dir}/green_tripdata_2024-01.parquet"
input_data_y = f"{input_data_dir}/yellow_tripdata_2024-01.parquet"

In [5]:
# Record filter: month and year, kept as strings.
# Presumably consumed by a later cell; the usage is not visible in this section.
# NOTE(review): the input parquet paths are named 2024-01 while the year here is "2025" - confirm this is intended.
registros_mes = "01"
registros_year = "2025"

In [6]:
# HELPER FUNCTIONS
def sumar_missing_per_var(x):
    """Print the number of null values found in each column of a DataFrame.

    Args:
        x: Spark DataFrame to inspect.

    Returns:
        None. The one-row table of per-column null counts is printed by ``show()``.
    """
    # when() without otherwise() yields null for non-null cells and count() skips
    # nulls, so each aggregate counts only the rows where that column is null.
    null_counts = [
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in x.columns
    ]
    x.select(null_counts).show()

In [7]:
# Enable adaptive query execution for Spark SQL.
spark.conf.set("spark.sql.adaptive.enabled","True")
# Use a single shuffle partition, so shuffled results (e.g. after a groupBy) end up in one partition.
spark.conf.set("spark.sql.shuffle.partitions", 1)

In [8]:

# Load the green-taxi parquet file.
df_g = spark.read.parquet(input_data_g)

# Columns kept from the green dataset.
lista_vars = [
    'VendorID',
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
    'total_amount',
]

# Work on the reduced column set from here on.
dfg = df_g.select(lista_vars)


# FEATURE duracion_s: trip duration in seconds (dropoff minus pickup).
# The absolute value is taken, so rows whose dropoff precedes the pickup end up
# with a positive duration instead of a negative one.
dfg = dfg.withColumn(
    'duracion_s',
    F.abs(F.unix_timestamp('lpep_dropoff_datetime') - F.unix_timestamp('lpep_pickup_datetime')),
)

# Green taxi: derive the pickup date (formatted as yyyy-MM-dd) and the pickup hour,
# then discard the raw pickup/dropoff timestamps.
dfg = (
    dfg
    .withColumn("date_init_trip", F.date_format("lpep_pickup_datetime", "yyyy-MM-dd"))
    .withColumn("hour_init_trip", F.hour("lpep_pickup_datetime"))
    .drop("lpep_pickup_datetime", "lpep_dropoff_datetime")
)
         

# FEATURE tipoVehiculo: constant label marking every row of this dataset as 'Green'.
# total_amount is then overwritten with total_amount + tip_amount, rounded to 2 decimals.
# NOTE(review): if the source total_amount already includes tip_amount (check the
# dataset's data dictionary), this sum counts the tip twice - confirm it is intended.
dfg = (
    dfg
    .withColumn('tipoVehiculo', F.lit('Green'))
    .withColumn('total_amount', F.round(F.col('total_amount') + F.col('tip_amount'), 2))
)

# Aggregate per pickup date and pickup/dropoff location pair: total and mean
# amount plus total passengers. (An explicit repartition by date_init_trip was
# tried before this step and is left disabled.)
dfg1 = (
    dfg
    .groupBy("date_init_trip", "PULocationID", "DOLocationID")
    .agg(
        F.sum('total_amount'),
        F.avg('total_amount'),
        F.sum("passenger_count"),
    )
)

# Inspect the result: partition count, physical plan, first 20 rows, row count.
print(dfg1.rdd.getNumPartitions())
dfg1.explain()

dfg1.show(20)
print(dfg1.count())



1
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(2) HashAggregate(keys=[date_init_trip#72, PULocationID#5, DOLocationID#6], functions=[sum(total_amount#130), avg(total_amount#130), sum(passenger_count#7L)])
   +- ShuffleQueryStage 0
      +- Exchange hashpartitioning(date_init_trip#72, PULocationID#5, DOLocationID#6, 1), ENSURE_REQUIREMENTS, [id=#31]
         +- *(1) HashAggregate(keys=[date_init_trip#72, PULocationID#5, DOLocationID#6], functions=[partial_sum(total_amount#130), partial_avg(total_amount#130), partial_sum(passenger_count#7L)])
            +- *(1) Project [PULocationID#5, DOLocationID#6, passenger_count#7L, round((total_amount#16 + tip_amount#12), 2) AS total_amount#130, date_format(lpep_pickup_datetime#1, yyyy-MM-dd, Some(Etc/UTC)) AS date_init_trip#72]
               +- *(1) ColumnarToRow
                  +- FileScan parquet [lpep_pickup_datetime#1,PULocationID#5,DOLocationID#6,passenger_count#7L,tip_amount#12,total_amount#16] Batched:

In [9]:
# Read the setting back to confirm the adaptive query execution value currently in effect.
spark.conf.get("spark.sql.adaptive.enabled")

'True'

In [19]:
# List every (key, value) pair of the SparkContext configuration.
spark.sparkContext.getConf().getAll() 

[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.driver.host', '24f1e701133a'),
 ('spark.app.submitTime', '1763308527964'),
 ('spark.executor.id', 'driver'),
 ('spark.app.startTime', '1763308528122'),
 ('spark.sql.warehouse.dir',
  'file:/home/jupyter/proyect/NYtaxi_dataEn