In [1]:
import darkwing as dw

In [2]:
# Source file for this notebook: yellow-cab trip records for January 2010.
TRIPS_PARQUET = 'data/yellow_tripdata_2010-01.parquet'

# Wrap the parquet file in a darkwing Table and show its column names.
t0 = dw.Table(TRIPS_PARQUET)
t0.columns

['vendor_id',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'pickup_longitude',
 'pickup_latitude',
 'rate_code',
 'store_and_fwd_flag',
 'dropoff_longitude',
 'dropoff_latitude',
 'payment_type',
 'fare_amount',
 'surcharge',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'total_amount']

In [3]:
# Reusable query step: total row count, rendered in scientific notation
# with two decimals (the 1.0* turns the integer count into a float so the
# '{:.2e}' format applies).
count_rows = (
    "select format('{:.2e}', 1.0*count(*)) as num_rows"
)

# Size of the raw table.
t0.do(count_rows)

┌──────────┐
│ num_rows │
│ varchar  │
├──────────┤
│ 1.49e+07 │
└──────────┘

In [4]:
# Steps that reduce the raw trips to three columns: ts, hexid, amt.
# Each entry is handed to `do` as its own positional argument, in order.
t1_steps = (
    # drop rows whose pickup longitude or latitude is exactly 0
    'where (pickup_longitude != 0) and (pickup_latitude != 0)',
    # keep only rows with a positive total amount
    'where total_amount > 0',
    # add the H3 cell of the pickup point (third argument 12 is presumably
    # the H3 resolution -- confirm against the H3 extension docs)
    'select *, h3_latlng_to_cell(pickup_latitude, pickup_longitude, 12) as hexid',
    # replace the cell id with its string form, keeping the column name
    'select * replace ( h3_h3_to_string(hexid) as hexid )',
    # final projection: pickup time as a timestamp, the cell, and the amount
    'select cast(pickup_datetime as timestamp) as ts, hexid, total_amount as amt',
)
t1 = t0.do(*t1_steps)

In [5]:
# Show t1 (columns ts, hexid, amt) as the cell's output.
t1

┌─────────────────────┬─────────────────┬───────────────────┐
│         ts          │      hexid      │        amt        │
│      timestamp      │     varchar     │      double       │
├─────────────────────┼─────────────────┼───────────────────┤
│ 2010-01-26 07:41:00 │ 8c2a100d45b01ff │               5.0 │
│ 2010-01-30 23:31:00 │ 8c2a107258e61ff │              16.3 │
│ 2010-01-18 20:22:20 │ 8c2a1008b82b5ff │              12.7 │
│ 2010-01-09 01:18:00 │ 8c2a100d65653ff │              14.3 │
│ 2010-01-18 19:10:14 │ 8c2a100d22945ff │              6.67 │
│ 2010-01-17 09:18:00 │ 8c2a10725ac5bff │               6.6 │
│ 2010-01-09 13:49:00 │ 8c2a100d620b7ff │               7.4 │
│ 2010-01-09 00:25:00 │ 8c2a1072c86abff │              12.3 │
│ 2010-01-27 18:15:00 │ 8c2a100d2bb69ff │              12.0 │
│ 2010-01-08 16:05:00 │ 8c2a107250403ff │              10.2 │
│          ·          │        ·        │                ·  │
│          ·          │        ·        │                ·  │
│       

In [6]:
# Self-join of tbl1 on hexid: every row is paired with every row that has
# the same hexid (including itself). Columns ending in 1 come from side `a`,
# columns ending in 2 from side `b`.
pair_sql = """
select
      a.hexid
    , a.ts as ts1
    , b.ts as ts2
    , a.amt as amt1
    , b.amt as amt2
from
    tbl1 as a
inner join
    tbl1 as b
using
    (hexid)
"""

# Register t1 under the name the SQL refers to, then apply the join.
t2 = t1.alias('tbl1').do(pair_sql)

In [7]:
# Displaying t2 renders quickly because DuckDB computes only a few rows
# to build the preview.
t2

┌─────────────────┬─────────────────────┬─────────────────────┬───────────────────┬───────────────────┐
│      hexid      │         ts1         │         ts2         │       amt1        │       amt2        │
│     varchar     │      timestamp      │      timestamp      │      double       │      double       │
├─────────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┤
│ 8c2a1008b0231ff │ 2010-01-02 13:28:13 │ 2010-01-14 19:35:00 │               7.2 │               6.8 │
│ 8c2a100d6c999ff │ 2010-01-02 11:48:34 │ 2010-01-07 14:38:33 │               5.4 │               4.2 │
│ 8c2a100888e3dff │ 2010-01-10 14:49:00 │ 2010-01-31 12:54:00 │               5.4 │               6.2 │
│ 8c2a1008bb9b1ff │ 2010-01-08 14:46:12 │ 2010-01-16 13:27:51 │              19.4 │               5.8 │
│ 8c2a103b0374dff │ 2010-01-08 16:03:15 │ 2010-01-04 23:48:48 │              51.0 │              32.3 │
│ 8c2a100892491ff │ 2010-01-06 18:56:38 │ 2010-01-09 08:44:00 │ 

In [8]:
# Counting the rows renders slowly because the full join has to be
# executed before a total can be reported.
t2.do(count_rows)

┌──────────┐
│ num_rows │
│ varchar  │
├──────────┤
│ 1.05e+10 │
└──────────┘

In [9]:
# print(str(t2.rel.explain()))

In [10]:
# For each hexid, find the largest difference in amt between two pickups
# that happened less than one minute apart, sorted ascending by that
# difference. Each string below is a separate positional argument to `do`.
#
# Keep the trailing comma on every step: Python silently concatenates
# adjacent string literals, so a missing comma fuses two steps into one
# string (e.g. 'where diff > 0order by diff') instead of raising an error.
t3 = t2.do(
    # keep pairs where side 1 is strictly the earlier pickup; this also
    # drops rows paired with themselves
    'where ts1 < ts2',
    # ...and the later pickup is strictly less than one minute after it
    'where ts2 < ts1 + interval 1 minute',
    # one row per hexid with the largest absolute amount difference
    'select hexid, max(abs(amt1-amt2)) as diff group by 1',
    # discard hexids whose paired amounts never differ
    'where diff > 0',
    # smallest difference first
    'order by diff',
)

In [11]:
# Count the rows of t3 using the shared count_rows query; t3 is grouped by
# hexid, so this is the number of hexids that survived the filters.
t3.do(count_rows)

┌──────────┐
│ num_rows │
│ varchar  │
├──────────┤
│ 2.86e+04 │
└──────────┘

In [12]:
# Convert t3 to a DataFrame and display it.
# NOTE(review): presumably a pandas DataFrame -- confirm against darkwing's `df`.
t3.df()

Unnamed: 0,hexid,diff
0,8c2a100d2d12bff,0.01
1,8c2a100d2a94dff,0.02
2,8c2a107253359ff,0.02
3,8c2a100d67a93ff,0.02
4,8c2a100891611ff,0.02
...,...,...
28563,8c2a10aa2cb13ff,175.88
28564,8c2a100f52815ff,180.45
28565,8c2a108f664e7ff,203.00
28566,8c2a100d676d7ff,212.37


In [13]:
# note the df will be different from what you'd get if you preview t3
# t3
# meh, maybe we don't care here