In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Build (or reuse, via getOrCreate) a Spark session named "SmallTestDataset"
# that runs against the local[*] master.
spark = (
    SparkSession.builder
    .appName("SmallTestDataset")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's last expression so the notebook
# renders its representation.
spark

In [4]:
# Input data: green and yellow trip-data parquet files for 2024-01.
# Both files live in the same directory, so the prefix is stated once.
data_dir = "/home/jupyter/proyect/ds_taxi_NY"
input_data_g = f"{data_dir}/green_tripdata_2024-01.parquet"
input_data_y = f"{data_dir}/yellow_tripdata_2024-01.parquet"

In [5]:
# Record filter: month and year (as strings) used to select records.
# The year is kept in line with the 2024-01 input files this notebook loads;
# the previous value ("2025") did not match them.
# NOTE(review): where this filter is applied is not visible here — confirm.
registros_mes = "01"
registros_year = "2024"

In [6]:
#FUNCIONES
def sumar_missing_per_var(x):
    """Print, for every column of a DataFrame, how many of its values are null.

    One aggregate is built per column: a conditional expression that is
    non-null only where the column is null, wrapped in ``count`` and aliased
    back to the column's own name. The resulting one-row table is displayed
    with ``show()``.

    Args:
        x: DataFrame whose columns (``x.columns``) are inspected.

    Returns:
        Whatever ``show()`` returns; the counts are printed, not returned.
    """
    null_counters = []
    for col_name in x.columns:
        is_missing = F.col(col_name).isNull()
        # count() ignores nulls, so only rows flagged as missing are counted.
        null_counters.append(F.count(F.when(is_missing, col_name)).alias(col_name))

    return x.select(null_counters).show()

In [7]:
# Turn off the "spark.sql.adaptive.enabled" setting for this session.
# NOTE(review): presumably done so the partition count and physical plan
# printed later are not changed by adaptive execution — confirm intent.
spark.conf.set("spark.sql.adaptive.enabled","False")

In [None]:

# Read the green-taxi parquet file.
df_g = spark.read.parquet(input_data_g)

# Columns kept from the green dataset.
lista_vars = [
    'VendorID',
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
    'total_amount',
]

dfg = df_g.select(lista_vars)

# Feature duracion_s: trip duration in seconds (dropoff minus pickup).
# A dropoff earlier than the pickup gives a negative difference, so the
# absolute value is stored.
dfg = dfg.withColumn(
    'duracion_s',
    F.abs(
        F.unix_timestamp('lpep_dropoff_datetime')
        - F.unix_timestamp('lpep_pickup_datetime')
    ),
)

# Derive the pickup date (formatted as a 'yyyy-MM-dd' string) and the pickup
# hour, then drop the raw timestamp columns.
dfg = (
    dfg
    .withColumn('date_init_trip', F.date_format(F.col('lpep_pickup_datetime'), 'yyyy-MM-dd'))
    .withColumn('hour_init_trip', F.hour(F.col('lpep_pickup_datetime')))
    .drop('lpep_pickup_datetime', 'lpep_dropoff_datetime')
)

# Feature tipoVehiculo: vehicle type, constant for this dataset.
dfg = dfg.withColumn('tipoVehiculo', F.lit('Green'))

# Overwrite total_amount with total_amount + tip_amount, rounded to 2 decimals.
# NOTE(review): if total_amount in the source data already includes
# tip_amount, this counts the tip twice — confirm against the dataset's
# data dictionary before relying on the aggregates below.
dfg = dfg.withColumn('total_amount', F.round(F.col('total_amount') + F.col('tip_amount'), 2))

# Repartition by pickup date before aggregating.
# NOTE(review): the groupBy below also keys on the two location ids; check in
# the printed plan whether this extra repartition is actually wanted.
dfg = dfg.repartition("date_init_trip")

# Per (date, pickup zone, dropoff zone): summed and mean total_amount, and
# summed passenger_count.
dfg1 = dfg.groupBy("date_init_trip", "PULocationID", "DOLocationID").agg(
    F.sum('total_amount'),
    F.mean('total_amount'),
    F.sum("passenger_count"),
)

# Inspect the partitioning and physical plan of the aggregate, then preview it.
print(dfg1.rdd.getNumPartitions())
dfg1.explain()

dfg1.show(20)




200
== Physical Plan ==
*(2) HashAggregate(keys=[date_init_trip#601, PULocationID#534, DOLocationID#535], functions=[sum(total_amount#659), avg(total_amount#659), sum(passenger_count#536L)])
+- Exchange hashpartitioning(date_init_trip#601, PULocationID#534, DOLocationID#535, 200), ENSURE_REQUIREMENTS, [id=#182]
   +- *(1) HashAggregate(keys=[date_init_trip#601, PULocationID#534, DOLocationID#535], functions=[partial_sum(total_amount#659), partial_avg(total_amount#659), partial_sum(passenger_count#536L)])
      +- *(1) Project [PULocationID#534, DOLocationID#535, passenger_count#536L, round((total_amount#545 + tip_amount#541), 2) AS total_amount#659, date_format(lpep_pickup_datetime#530, yyyy-MM-dd, Some(Etc/UTC)) AS date_init_trip#601]
         +- *(1) ColumnarToRow
            +- FileScan parquet [lpep_pickup_datetime#530,PULocationID#534,DOLocationID#535,passenger_count#536L,tip_amount#541,total_amount#545] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(

In [8]:
# Read the "spark.sql.adaptive.enabled" setting back to show its current value.
spark.conf.get("spark.sql.adaptive.enabled")

'False'