In [2]:
import snowflake.connector
import os
import pandas as pd
from snowflake.connector.errors import NotSupportedError

def sf_query(sql: str):
    """
    Run a single SQL statement against Snowflake.

    A new connection is opened from the SNOWFLAKE_* environment variables on
    every call and closed before returning, so session-level state (for
    example ALTER SESSION settings) does not carry over between calls.

    Args:
        sql: One SQL statement to execute.

    Returns:
        A pandas DataFrame when the statement produces a result set
        (the cursor description is set); otherwise None, after printing a
        confirmation message.

    Raises:
        RuntimeError: if SNOWFLAKE_HOST is not set or is empty.
        Any error raised by the connector while connecting or executing
        propagates to the caller.
    """
    host = os.getenv("SNOWFLAKE_HOST")
    if not host:
        # Fail with a clear message instead of AttributeError on None.split().
        raise RuntimeError("SNOWFLAKE_HOST environment variable is not set")

    conn = snowflake.connector.connect(
        user=os.getenv("SNOWFLAKE_USER"),
        password=os.getenv("SNOWFLAKE_PASSWORD"),
        # The account identifier is the host without the Snowflake domain suffix.
        account=host.split(".snowflakecomputing.com")[0],
        warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
        database=os.getenv("SNOWFLAKE_DATABASE"),
        role=os.getenv("SNOWFLAKE_ROLE"),
    )
    try:
        # The context manager closes the cursor on every exit path,
        # including when execute() raises.
        with conn.cursor() as cur:
            # Best effort: ask for Arrow results so fetch_pandas_all can work.
            try:
                cur.execute("ALTER SESSION SET QUERY_RESULT_FORMAT=ARROW")
            except Exception:
                pass  # deliberate: the non-Arrow fallback below still applies

            cur.execute(sql)

            if not cur.description:
                # No result set: nothing to return.
                print("Executed successfully.")
                return None

            try:
                return cur.fetch_pandas_all()
            except NotSupportedError:
                # Fallback without Arrow: build the frame from plain rows.
                rows = cur.fetchall()
                cols = [d[0] for d in cur.description]
                return pd.DataFrame(rows, columns=cols)
    finally:
        conn.close()


In [19]:
# Smoke test: run a trivial query end to end through sf_query.
sf_query("SELECT 1;")


Unnamed: 0,1
0,1


In [20]:
# Show the database, schema, warehouse and role of the session sf_query opens.
# The recorded output has an empty CURRENT_SCHEMA(), which is why later
# statements schema-qualify object names (ANALYTICS.*, RAW.*).
sf_query("SELECT CURRENT_DATABASE(), CURRENT_SCHEMA(), CURRENT_WAREHOUSE(), CURRENT_ROLE();")


Unnamed: 0,CURRENT_DATABASE(),CURRENT_SCHEMA(),CURRENT_WAREHOUSE(),CURRENT_ROLE()
0,DM_PSET3,,WH_DM,SYSADMIN


In [21]:
# Inspect the WH_DM warehouse (state, size, cluster counts).
sf_query("SHOW WAREHOUSES LIKE 'WH_DM';")


Unnamed: 0,name,state,type,size,min_cluster_count,max_cluster_count,started_clusters,running,queued,is_default,...,pendings,failed,suspended,uuid,scaling_policy,owner_role_type,resource_constraint,warehouse_credit_limit,target_statement_size,disabled_reasons
0,WH_DM,STARTED,STANDARD,X-Small,1,1,1,0,0,N,...,0,0,0,1946880808,STANDARD,ROLE,STANDARD_GEN_1,,,


In [22]:
# NOTE(review): sf_query opens a new connection per call and closes it before
# returning, so this session-level timeout applies only to the session created
# for this one call; it does not affect the queries issued by later cells.
sf_query("ALTER SESSION SET STATEMENT_TIMEOUT_IN_SECONDS=120;")


Unnamed: 0,status
0,Statement executed successfully.


In [23]:
# Create the cleaned one-big-table view over RAW.TRIPS_ALL.
# Rows are kept only when the fare is positive, the distance is within
# 0.1-100 and the trip lasts 1-240 minutes.
# is_weekend uses DAYOFWEEKISO (1 = Monday ... 7 = Sunday) rather than
# DAYOFWEEK, whose numbering depends on the WEEK_START session parameter of
# whichever session queries the view.
sf_query("""
CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS AS
SELECT
  service_type,
  pickup_datetime,
  dropoff_datetime,
  DATEDIFF('minute', pickup_datetime, dropoff_datetime) AS trip_duration_min,
  EXTRACT(HOUR FROM pickup_datetime) AS pickup_hour,
  TO_CHAR(pickup_datetime, 'DY') AS pickup_dow,
  IFF(DAYOFWEEKISO(pickup_datetime) IN (6,7), TRUE, FALSE) AS is_weekend,
  trip_distance,
  pu_location_id,
  do_location_id,
  fare_amount,
  tip_amount,
  tolls_amount,
  mta_tax,
  improvement_surcharge,
  congestion_surcharge,
  total_amount,
  payment_type,
  vendor_id,
  rate_code_id,
  store_and_fwd_flag,
  year,
  month,
  IFF(NULLIFZERO(fare_amount) IS NULL, NULL, tip_amount / NULLIFZERO(fare_amount)) AS tip_pct
FROM RAW.TRIPS_ALL
WHERE fare_amount > 0
  AND trip_distance BETWEEN 0.1 AND 100
  AND DATEDIFF('minute', pickup_datetime, dropoff_datetime) BETWEEN 1 AND 240;
""")


Unnamed: 0,status
0,View OBT_TRIPS successfully created.


In [24]:
# List the views in ANALYTICS to confirm OBT_TRIPS was created.
sf_query("SHOW VIEWS IN SCHEMA ANALYTICS;")


Unnamed: 0,created_on,name,reserved,database_name,schema_name,owner,comment,text,is_secure,is_materialized,owner_role_type,change_tracking
0,2025-10-23 09:35:54.855000-07:00,OBT_TRIPS,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS AS\...,False,False,ROLE,OFF
1,2025-10-23 08:54:42.260000-07:00,OBT_TRIPS_CLEAN_V,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS_CLE...,False,False,ROLE,OFF
2,2025-10-23 08:41:54.807000-07:00,OBT_TRIPS_V,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS_V A...,False,False,ROLE,OFF


In [5]:
from pyspark.sql import SparkSession

# Get the active SparkSession for this notebook, or start one named "DM_PSET3".
spark = SparkSession.builder.appName("DM_PSET3").getOrCreate()

print("✅ Spark inicializado correctamente")


✅ Spark inicializado correctamente


In [6]:
import os

# Location of the taxi zone lookup CSV. The default is the path inside the
# notebook container; set TAXI_ZONES_CSV to point elsewhere without editing
# the notebook.
csv_path = os.getenv(
    "TAXI_ZONES_CSV", "/home/jovyan/work/datasets/taxi_zone_lookup.csv"
)
# Report whether the file exists before Spark tries to read it.
print("Existe el archivo?:", os.path.exists(csv_path), "->", csv_path)


Existe el archivo?: True -> /home/jovyan/work/datasets/taxi_zone_lookup.csv


In [7]:
# Read the zone lookup CSV, treating its first line as column names.
zones_df = spark.read.csv(csv_path, header=True)
# Row count and a five-row preview with untruncated values.
print("Filas leídas por Spark:", zones_df.count())
zones_df.show(n=5, truncate=False)


Filas leídas por Spark: 265
+----------+-------------+-----------------------+------------+
|LocationID|Borough      |Zone                   |service_zone|
+----------+-------------+-----------------------+------------+
|1         |EWR          |Newark Airport         |EWR         |
|2         |Queens       |Jamaica Bay            |Boro Zone   |
|3         |Bronx        |Allerton/Pelham Gardens|Boro Zone   |
|4         |Manhattan    |Alphabet City          |Yellow Zone |
|5         |Staten Island|Arden Heights          |Boro Zone   |
+----------+-------------+-----------------------+------------+
only showing top 5 rows



In [9]:
import os

# Optional: load variables from a .env file when python-dotenv is installed.
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional; fall back to the process environment as-is.
    pass
else:
    # Errors raised while loading .env are left to surface, not swallowed.
    load_dotenv()

# Quick check: these should print real values, not None.
# The user name and password are deliberately not printed.
print("SF_HOST:", os.getenv("SNOWFLAKE_HOST"))
print("SF_DB:", os.getenv("SNOWFLAKE_DATABASE"))
print("SF_WH:", os.getenv("SNOWFLAKE_WAREHOUSE"))
print("SF_ROLE:", os.getenv("SNOWFLAKE_ROLE"))
print("SF_USER set?:", os.getenv("SNOWFLAKE_USER") is not None)

def build_sf_options(schema: str = "ANALYTICS"):
    """
    Build the options dictionary for the Spark <-> Snowflake connector.

    Connection values are read from the SNOWFLAKE_* environment variables at
    call time; a variable that is not set yields None for its option.

    Args:
        schema: Value for the "sfSchema" option (defaults to "ANALYTICS").

    Returns:
        dict with the keys sfURL, sfUser, sfPassword, sfWarehouse,
        sfDatabase, sfSchema and sfRole.
    """
    env = os.environ.get
    options = dict(
        sfURL=env("SNOWFLAKE_HOST"),            # e.g. <account>.<region>.snowflakecomputing.com
        sfUser=env("SNOWFLAKE_USER"),
        sfPassword=env("SNOWFLAKE_PASSWORD"),
        sfWarehouse=env("SNOWFLAKE_WAREHOUSE"),
        sfDatabase=env("SNOWFLAKE_DATABASE"),
        sfSchema=schema,                        # supplied by the caller, not the environment
        sfRole=env("SNOWFLAKE_ROLE"),
    )
    return options


SF_HOST: xpc24435.us-east-1.snowflakecomputing.com
SF_DB: DM_PSET3
SF_WH: WH_DM
SF_ROLE: SYSADMIN
SF_USER set?: True


In [10]:
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import col, trim

# Trim every column; LocationID becomes an integer and the three text columns
# keep string type under their upper-cased names.
zones_clean = zones_df.select(
    trim(col("LocationID")).cast(IntegerType()).alias("LOCATIONID"),
    *(
        trim(col(source_name)).cast(StringType()).alias(source_name.upper())
        for source_name in ("Borough", "Zone", "service_zone")
    ),
)

# Connector options targeting the ANALYTICS schema.
sfOptions = build_sf_options(schema="ANALYTICS")

# Overwrite ANALYTICS.TAXI_ZONES with the cleaned lookup data.
zones_clean.write.format("snowflake").options(**sfOptions).option(
    "dbtable", "ANALYTICS.TAXI_ZONES"
).mode("overwrite").save()

print("✅ Tabla ANALYTICS.TAXI_ZONES escrita correctamente.")


✅ Tabla ANALYTICS.TAXI_ZONES escrita correctamente.


In [11]:
# List the tables in ANALYTICS to confirm TAXI_ZONES was written.
sf_query("SHOW TABLES IN SCHEMA ANALYTICS;")

Unnamed: 0,created_on,name,database_name,schema_name,kind,comment,cluster_by,rows,bytes,owner,...,search_optimization_progress,search_optimization_bytes,is_external,enable_schema_evolution,owner_role_type,is_event,is_hybrid,is_iceberg,is_dynamic,is_immutable
0,2025-10-23 10:39:16.161000-07:00,TAXI_ZONES,DM_PSET3,ANALYTICS,TABLE,,,265,6144,SYSADMIN,...,,,N,N,ROLE,N,N,N,N,N


In [12]:
# Row count of the loaded table; the recorded 265 matches the Spark row count.
sf_query("SELECT COUNT(*) AS n FROM ANALYTICS.TAXI_ZONES;")

Unnamed: 0,N
0,265


In [13]:
# Preview five rows of the loaded zone lookup.
sf_query("SELECT * FROM ANALYTICS.TAXI_ZONES LIMIT 5;")

Unnamed: 0,LOCATIONID,BOROUGH,ZONE,SERVICE_ZONE
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [14]:
# Create the payment-type lookup table from inline (code, description) pairs.
# NOTE(review): the payment-type tip summary recorded later in this notebook
# has a group with a NULL description, so some trips carry a payment_type that
# is NULL or not listed here — confirm the code list against the source data.
sf_query("""
CREATE OR REPLACE TABLE ANALYTICS.PAYMENT_TYPE_DIM AS
SELECT * FROM VALUES
  (1,'Credit card'),(2,'Cash'),(3,'No charge'),(4,'Dispute'),(5,'Unknown'),(6,'Voided')
AS T(payment_type, payment_type_desc);
""")


Unnamed: 0,status
0,Table PAYMENT_TYPE_DIM successfully created.


In [15]:
# Create the rate-code lookup table from inline (code, description) pairs.
sf_query("""
CREATE OR REPLACE TABLE ANALYTICS.RATE_CODE_DIM AS
SELECT * FROM VALUES
  (1,'Standard rate'),(2,'JFK'),(3,'Newark'),(4,'Nassau/Westchester'),
  (5,'Negotiated fare'),(6,'Group ride')
AS T(rate_code_id, rate_code_desc);
""")


Unnamed: 0,status
0,Table RATE_CODE_DIM successfully created.


In [16]:
# Create the vendor lookup table from inline (id, name) pairs.
sf_query("""
CREATE OR REPLACE TABLE ANALYTICS.VENDOR_DIM AS
SELECT * FROM VALUES
  (1,'Creative Mobile Technologies (CMT)'),
  (2,'VeriFone (VTS)')
AS T(vendor_id, vendor_name);
""")


Unnamed: 0,status
0,Table VENDOR_DIM successfully created.


In [17]:
# Verify the contents of the payment-type lookup.
sf_query("SELECT * FROM ANALYTICS.PAYMENT_TYPE_DIM;")



Unnamed: 0,PAYMENT_TYPE,PAYMENT_TYPE_DESC
0,1,Credit card
1,2,Cash
2,3,No charge
3,4,Dispute
4,5,Unknown
5,6,Voided


In [18]:

# Verify the contents of the rate-code lookup.
sf_query("SELECT * FROM ANALYTICS.RATE_CODE_DIM;")



Unnamed: 0,RATE_CODE_ID,RATE_CODE_DESC
0,1,Standard rate
1,2,JFK
2,3,Newark
3,4,Nassau/Westchester
4,5,Negotiated fare
5,6,Group ride


In [19]:

# Verify the contents of the vendor lookup.
sf_query("SELECT * FROM ANALYTICS.VENDOR_DIM;")


Unnamed: 0,VENDOR_ID,VENDOR_NAME
0,1,Creative Mobile Technologies (CMT)
1,2,VeriFone (VTS)


In [21]:
# Create the enriched view on top of ANALYTICS.OBT_TRIPS:
# - adds pickup/drop-off dates, hours and the day-of-week label;
# - LEFT JOINs TAXI_ZONES twice (pickup and drop-off) for zone and borough names;
# - LEFT JOINs the payment, vendor and rate-code lookups for descriptions;
# - derives avg_speed_mph as trip_distance divided by the duration in hours,
#   yielding NULL when the duration in seconds is zero or NULL;
# - exposes trip_type as a typed NULL placeholder.
# Because the joins are LEFT JOINs, trips whose codes have no match are kept
# and their description columns come back NULL.
sf_query("""
CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS_ENRICHED AS
SELECT
  t.service_type,

  /* Tiempo */
  t.pickup_datetime,
  t.dropoff_datetime,
  TO_DATE(t.pickup_datetime)  AS pickup_date,
  TO_DATE(t.dropoff_datetime) AS dropoff_date,
  EXTRACT(HOUR FROM t.pickup_datetime)   AS pickup_hour,
  EXTRACT(HOUR FROM t.dropoff_datetime)  AS dropoff_hour,
  TO_CHAR(t.pickup_datetime,'DY')        AS day_of_week,
  t.year, t.month,

  /* Ubicación con nombres */
  t.pu_location_id,
  zpu.ZONE    AS pu_zone,
  zpu.BOROUGH AS pu_borough,
  t.do_location_id,
  zdo.ZONE    AS do_zone,
  zdo.BOROUGH AS do_borough,

  /* Viaje y tarifas */
  t.trip_distance,
  t.store_and_fwd_flag,
  t.fare_amount,
  t.tip_amount,
  t.tolls_amount,
  t.mta_tax,
  t.improvement_surcharge,
  t.congestion_surcharge,
  t.total_amount,

  /* Catálogos con descripción */
  t.payment_type,  p.payment_type_desc,
  t.vendor_id,     v.vendor_name,
  t.rate_code_id,  r.rate_code_desc,

  /* Placeholder porque t.trip_type NO está en OBT_TRIPS */
  NULL::NUMBER AS trip_type,

  /* Derivadas */
  t.trip_duration_min,
  t.tip_pct,
  IFF(
    NULLIFZERO(DATEDIFF('second', t.pickup_datetime, t.dropoff_datetime)) IS NULL,
    NULL,
    t.trip_distance / NULLIF( DATEDIFF('second', t.pickup_datetime, t.dropoff_datetime)/3600.0 , 0)
  ) AS avg_speed_mph

FROM ANALYTICS.OBT_TRIPS t
LEFT JOIN ANALYTICS.TAXI_ZONES       zpu ON zpu.LOCATIONID = t.pu_location_id
LEFT JOIN ANALYTICS.TAXI_ZONES       zdo ON zdo.LOCATIONID = t.do_location_id
LEFT JOIN ANALYTICS.PAYMENT_TYPE_DIM p   ON p.payment_type = t.payment_type
LEFT JOIN ANALYTICS.VENDOR_DIM       v   ON v.vendor_id     = t.vendor_id
LEFT JOIN ANALYTICS.RATE_CODE_DIM    r   ON r.rate_code_id  = t.rate_code_id;
""")


Unnamed: 0,status
0,View OBT_TRIPS_ENRICHED successfully created.


In [22]:
# List the views in ANALYTICS to confirm OBT_TRIPS_ENRICHED was created.
sf_query("SHOW VIEWS IN SCHEMA ANALYTICS;")



Unnamed: 0,created_on,name,reserved,database_name,schema_name,owner,comment,text,is_secure,is_materialized,owner_role_type,change_tracking
0,2025-10-23 09:35:54.855000-07:00,OBT_TRIPS,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS AS\...,False,False,ROLE,OFF
1,2025-10-23 08:54:42.260000-07:00,OBT_TRIPS_CLEAN_V,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS_CLE...,False,False,ROLE,OFF
2,2025-10-23 10:53:15.823000-07:00,OBT_TRIPS_ENRICHED,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS_ENR...,False,False,ROLE,OFF
3,2025-10-23 08:41:54.807000-07:00,OBT_TRIPS_V,,DM_PSET3,ANALYTICS,SYSADMIN,,CREATE OR REPLACE VIEW ANALYTICS.OBT_TRIPS_V A...,False,False,ROLE,OFF


In [23]:

# Preview up to 20 rows of the enriched view.
sf_query("SELECT * FROM ANALYTICS.OBT_TRIPS_ENRICHED LIMIT 20;")


Unnamed: 0,SERVICE_TYPE,PICKUP_DATETIME,DROPOFF_DATETIME,PICKUP_DATE,DROPOFF_DATE,PICKUP_HOUR,DROPOFF_HOUR,DAY_OF_WEEK,YEAR,MONTH,...,PAYMENT_TYPE,PAYMENT_TYPE_DESC,VENDOR_ID,VENDOR_NAME,RATE_CODE_ID,RATE_CODE_DESC,TRIP_TYPE,TRIP_DURATION_MIN,TIP_PCT,AVG_SPEED_MPH
0,yellow,2015-01-01 00:11:33,2015-01-01 00:16:48,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,1,Credit card,1,Creative Mobile Technologies (CMT),1,Standard rate,,5,0.245614,11.428571
1,yellow,2015-01-01 00:18:24,2015-01-01 00:24:20,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,3,No charge,1,Creative Mobile Technologies (CMT),1,Standard rate,,6,0.0,9.101113
2,yellow,2015-01-01 00:26:19,2015-01-01 00:41:06,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,1,Credit card,1,Creative Mobile Technologies (CMT),1,Standard rate,,15,0.219697,14.20518
3,yellow,2015-01-01 00:45:26,2015-01-01 00:53:20,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,1,Credit card,1,Creative Mobile Technologies (CMT),1,Standard rate,,8,0.289024,15.949327
4,yellow,2015-01-01 00:59:21,2015-01-01 01:05:24,2015-01-01,2015-01-01,0,1,Thu,2015,1,...,3,No charge,1,Creative Mobile Technologies (CMT),1,Standard rate,,6,0.0,9.917388
5,yellow,2015-01-01 00:07:31,2015-01-01 00:11:32,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,2,Cash,1,Creative Mobile Technologies (CMT),1,Standard rate,,4,0.0,11.950287
6,yellow,2015-01-01 00:47:08,2015-01-01 00:54:50,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,2,Cash,1,Creative Mobile Technologies (CMT),1,Standard rate,,7,0.0,8.571451
7,yellow,2015-01-01 00:58:04,2015-01-01 01:11:56,2015-01-01,2015-01-01,0,1,Thu,2015,1,...,1,Credit card,1,Creative Mobile Technologies (CMT),1,Standard rate,,13,0.221311,12.548083
8,yellow,2015-01-01 00:29:25,2015-01-01 00:37:25,2015-01-01,2015-01-01,0,0,Thu,2015,1,...,2,Cash,1,Creative Mobile Technologies (CMT),1,Standard rate,,8,0.0,9.750024
9,yellow,2015-01-01 00:39:02,2015-01-01 01:02:37,2015-01-01,2015-01-01,0,1,Thu,2015,1,...,2,Cash,1,Creative Mobile Technologies (CMT),1,Standard rate,,23,0.0,10.939917


In [24]:
# Top 10 pickup zones per (year, month) by trip count.
# QUALIFY keeps the rows whose ROW_NUMBER within each year/month partition,
# ordered by trips descending, is at most 10.
sf_query("""
SELECT
  year, month, pu_borough, pu_zone,
  COUNT(*) AS trips
FROM ANALYTICS.OBT_TRIPS_ENRICHED
GROUP BY 1,2,3,4
QUALIFY ROW_NUMBER() OVER (PARTITION BY year, month ORDER BY trips DESC) <= 10
ORDER BY year, month, trips DESC;
""")


Unnamed: 0,YEAR,MONTH,PU_BOROUGH,PU_ZONE,TRIPS
0,2015,1,Manhattan,Upper East Side South,464896
1,2015,1,Manhattan,Midtown Center,443712
2,2015,1,Manhattan,Upper East Side North,443005
3,2015,1,Manhattan,East Village,435128
4,2015,1,Manhattan,Times Sq/Theatre District,426120
...,...,...,...,...,...
1275,2025,8,Manhattan,Times Sq/Theatre District,98151
1276,2025,8,Manhattan,Upper East Side North,94894
1277,2025,8,Queens,LaGuardia Airport,93864
1278,2025,8,Manhattan,Murray Hill,89073


In [25]:
# Top 10 drop-off zones per (year, month) by trip count; same shape as the
# pickup query but grouped on the drop-off borough and zone.
sf_query("""
SELECT
  year, month, do_borough, do_zone,
  COUNT(*) AS trips
FROM ANALYTICS.OBT_TRIPS_ENRICHED
GROUP BY 1,2,3,4
QUALIFY ROW_NUMBER() OVER (PARTITION BY year, month ORDER BY trips DESC) <= 10
ORDER BY year, month, trips DESC;
""")


Unnamed: 0,YEAR,MONTH,DO_BOROUGH,DO_ZONE,TRIPS
0,2015,1,Manhattan,Midtown Center,470763
1,2015,1,Manhattan,Upper East Side North,455323
2,2015,1,Manhattan,Upper East Side South,409400
3,2015,1,Manhattan,Murray Hill,405025
4,2015,1,Manhattan,Times Sq/Theatre District,399599
...,...,...,...,...,...
1275,2025,8,Manhattan,Midtown East,86278
1276,2025,8,Manhattan,East Chelsea,80972
1277,2025,8,Manhattan,Clinton East,75495
1278,2025,8,Manhattan,Union Sq,74284


In [26]:
# Per (year, month, pickup borough): summed total_amount, mean tip_pct and
# trip count. Trips with no matching pickup zone form a NULL-borough group.
sf_query("""
SELECT
  year, month, pu_borough,
  ROUND(SUM(total_amount),2) AS total_amount_sum,
  ROUND(AVG(tip_pct),3)      AS tip_pct_avg,
  COUNT(*)                   AS n
FROM ANALYTICS.OBT_TRIPS_ENRICHED
GROUP BY 1,2,3
ORDER BY 1,2,3;
""")


Unnamed: 0,YEAR,MONTH,PU_BOROUGH,TOTAL_AMOUNT_SUM,TIP_PCT_AVG,N
0,2015,1,Bronx,1.209429e+06,0.031,92259
1,2015,1,Brooklyn,1.231765e+07,0.124,786643
2,2015,1,EWR,1.087899e+04,0.456,142
3,2015,1,Manhattan,1.645825e+08,0.145,11964887
4,2015,1,,1.654462e+05,1.709,4504
...,...,...,...,...,...,...
1019,2025,8,Manhattan,6.334510e+07,0.185,2649360
1020,2025,8,,5.434201e+04,0.104,562
1021,2025,8,Queens,2.395135e+07,0.131,371073
1022,2025,8,Staten Island,1.031046e+04,0.029,253


In [27]:
# Average, minimum and maximum avg_speed_mph per pickup borough, fastest first.
# NOTE(review): the recorded output shows maximum speeds in the tens of
# thousands of mph and an EWR average near 400, so implausible speeds are not
# filtered out and the averages are skewed by outliers — consider bounding
# avg_speed_mph before aggregating.
sf_query("""
SELECT
  pu_borough,
  ROUND(AVG(avg_speed_mph),2) AS avg_speed,
  ROUND(MIN(avg_speed_mph),2) AS min_speed,
  ROUND(MAX(avg_speed_mph),2) AS max_speed
FROM ANALYTICS.OBT_TRIPS_ENRICHED
WHERE avg_speed_mph IS NOT NULL
GROUP BY 1
ORDER BY 2 DESC;
""")


Unnamed: 0,PU_BOROUGH,AVG_SPEED,MIN_SPEED,MAX_SPEED
0,EWR,398.71,0.06,19927.97
1,,100.86,0.04,96582.73
2,Staten Island,31.49,0.05,19567.83
3,Queens,20.34,0.03,145539.57
4,Bronx,15.3,0.03,27697.84
5,Brooklyn,13.27,0.03,65467.63
6,Unknown,12.43,0.03,66618.71
7,Manhattan,10.93,0.03,97122.3


In [28]:
# Mean tip_pct and trip count per payment-type description, highest tip first.
# Trips whose payment_type has no match in PAYMENT_TYPE_DIM fall into the
# NULL-description group.
sf_query("""
SELECT
  payment_type_desc,
  ROUND(AVG(tip_pct),3) AS avg_tip_pct,
  COUNT(*) AS n
FROM ANALYTICS.OBT_TRIPS_ENRICHED
WHERE tip_pct IS NOT NULL
GROUP BY 1
ORDER BY 2 DESC;
""")


Unnamed: 0,PAYMENT_TYPE_DESC,AVG_TIP_PCT,N
0,Credit card,0.234,564201865
1,,0.061,15941307
2,Unknown,0.025,2099
3,Dispute,0.001,1827273
4,No charge,0.0,2646757
5,Cash,0.0,249833049


In [29]:
# Ten pickup/drop-off zone pairs with the highest mean tip_pct, restricted to
# pairs with more than 500 trips.
# NOTE(review): tip_pct is tip_amount / fare_amount, and the recorded averages
# exceed 10 (tips more than ten times the fare), so this ranking is driven by
# extreme tip values — consider capping tip_pct or using a median.
sf_query("""
SELECT
  pu_borough, pu_zone,
  do_borough, do_zone,
  ROUND(AVG(tip_pct),3) AS avg_tip_pct,
  COUNT(*) AS n
FROM ANALYTICS.OBT_TRIPS_ENRICHED
WHERE tip_pct IS NOT NULL
GROUP BY 1,2,3,4
HAVING COUNT(*) > 500
ORDER BY avg_tip_pct DESC
LIMIT 10;
""")


Unnamed: 0,PU_BOROUGH,PU_ZONE,DO_BOROUGH,DO_ZONE,AVG_TIP_PCT,N
0,Queens,North Corona,,Outside of NYC,37.425,589
1,Bronx,Schuylerville/Edgewater Park,,Outside of NYC,27.068,815
2,Queens,Whitestone,,Outside of NYC,22.359,809
3,Queens,Cambria Heights,,Outside of NYC,22.057,614
4,Bronx,Mount Hope,,Outside of NYC,21.465,840
5,Queens,South Jamaica,,Outside of NYC,14.501,2904
6,Manhattan,Bloomingdale,,Outside of NYC,13.649,1146
7,Queens,South Ozone Park,,Outside of NYC,12.799,3341
8,Queens,Springfield Gardens South,,Outside of NYC,11.917,6483
9,Queens,Willets Point,,Outside of NYC,11.65,695


In [30]:
# Profile by pickup hour: mean duration (minutes), mean trip_distance,
# mean tip_pct and trip count, ordered by hour.
sf_query("""
SELECT
  pickup_hour,
  ROUND(AVG(trip_duration_min),2) AS avg_duration_min,
  ROUND(AVG(trip_distance),2)     AS avg_distance_mi,
  ROUND(AVG(tip_pct),3)           AS avg_tip_pct,
  COUNT(*) AS n
FROM ANALYTICS.OBT_TRIPS_ENRICHED
GROUP BY 1
ORDER BY 1;
""")


Unnamed: 0,PICKUP_HOUR,AVG_DURATION_MIN,AVG_DISTANCE_MI,AVG_TIP_PCT,N
0,0,13.1,3.51,0.163,27450290
1,1,12.25,3.35,0.163,19558090
2,2,11.71,3.28,0.16,13944481
3,3,11.82,3.52,0.153,10012448
4,4,12.61,4.37,0.143,7681595
5,5,12.74,4.95,0.144,7688749
6,6,12.31,3.84,0.149,16773048
7,7,13.4,3.13,0.16,28921297
8,8,14.54,2.82,0.162,36699258
9,9,14.66,2.77,0.158,38126891
