# DuckDB unleashed: Scratching the surface a little deeper

In [1]:
import duckdb
import pandas as pd
# Let pandas render up to 50 columns so the wide result frames below are not truncated.
pd.options.display.max_columns = 50

## Working with DuckDB connections

In [2]:
# connect() with no arguments opens a database that lives only in memory.
c = duckdb.connect()
# execute() hands back the connection itself, so running the query and
# fetching its result can be written as two separate statements.
c.execute("select * from '../data/transactions.parquet' limit 5")
# Last expression of the cell: the query result as a pandas DataFrame.
c.df()

Unnamed: 0,transaction_time,id,shop
0,2022-02-11 00:28:50,480571,46004
1,2022-03-20 10:41:34,113107,746
2,2022-08-17 03:59:32,668523,49808
3,2022-09-08 06:24:04,731871,30563
4,2022-12-13 19:01:15,415373,18267


## Querying online sources

In [3]:
%%time
duckdb.sql(
    """
    select
    *
    from read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-09.parquet') as yc
    limit 5
    """
).df()

CPU times: user 1.02 s, sys: 38 ms, total: 1.06 s
Wall time: 1.09 s


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2024-09-01 00:05:51,2024-09-01 00:45:03,1,9.8,1,N,138,48,1,47.8,10.25,0.5,13.3,6.94,1.0,79.79,2.5,1.75
1,1,2024-09-01 00:59:35,2024-09-01 01:03:43,1,0.5,1,N,140,141,1,5.1,3.5,0.5,3.0,0.0,1.0,13.1,2.5,0.0
2,2,2024-09-01 00:25:00,2024-09-01 00:34:37,2,2.29,1,N,238,152,2,13.5,1.0,0.5,0.0,0.0,1.0,16.0,0.0,0.0
3,2,2024-09-01 00:31:00,2024-09-01 00:46:52,1,5.2,1,N,93,130,1,24.7,1.0,0.5,4.55,0.0,1.0,31.75,0.0,0.0
4,2,2024-09-01 00:11:57,2024-09-01 00:30:41,2,2.26,1,N,79,231,1,17.0,1.0,0.5,4.4,0.0,1.0,26.4,2.5,0.0


In [4]:
%%time
duckdb.sql(
    """
    select
    dayofweek(tpep_pickup_datetime) as dow
    , round(sum(yc.tip_amount)/sum(yc.total_amount - yc.tip_amount), 3) as tip_pct
    , round(mean(case when yc.tip_amount/(yc.total_amount - yc.tip_amount) < 0.1 then 1 else 0 end), 3) as share_under_10
    from read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-09.parquet') as yc
    group by dayofweek(tpep_pickup_datetime)
    """
)

CPU times: user 9.37 ms, sys: 1.71 ms, total: 11.1 ms
Wall time: 167 ms


┌───────┬─────────┬────────────────┐
│  dow  │ tip_pct │ share_under_10 │
│ int64 │ double  │     double     │
├───────┼─────────┼────────────────┤
│     0 │   0.125 │           0.43 │
│     1 │   0.135 │          0.375 │
│     2 │   0.135 │           0.37 │
│     3 │   0.137 │          0.362 │
│     4 │   0.135 │          0.371 │
│     5 │    0.13 │            0.4 │
│     6 │   0.122 │          0.425 │
└───────┴─────────┴────────────────┘

All 2024 data have also been uploaded to https://oss.buz.dev/ - courtesy of https://bsky.app/profile/jakthom.bsky.social

In [5]:
# Fresh in-memory connection with the remote database attached under the alias nyc_taxi.
c = duckdb.connect()
c.execute("ATTACH 'https://hive.buz.dev/nyc_taxi' AS nyc_taxi;")
# A notebook renders only the last expression of a cell, so the table listing
# must go through display() explicitly; otherwise its DataFrame is built and dropped.
# (display is provided by the IPython kernel without an import.)
display(c.execute("SHOW ALL TABLES;").df())
# Column names and types of the yellow_trips table; rendered as the cell's result.
c.execute("DESC nyc_taxi.yellow_trips").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,VendorID,INTEGER,YES,,,
1,tpep_pickup_datetime,TIMESTAMP,YES,,,
2,tpep_dropoff_datetime,TIMESTAMP,YES,,,
3,passenger_count,BIGINT,YES,,,
4,trip_distance,DOUBLE,YES,,,
5,RatecodeID,BIGINT,YES,,,
6,store_and_fwd_flag,VARCHAR,YES,,,
7,PULocationID,INTEGER,YES,,,
8,DOLocationID,INTEGER,YES,,,
9,payment_type,BIGINT,YES,,,


### Foursquare place data

Again thanks to https://bsky.app/profile/jakthom.bsky.social

In [6]:
# New in-memory connection dedicated to the Foursquare data.
c = duckdb.connect()
# Attach the remote database, then make it the default so later queries on
# this connection can use unqualified table names.
for setup_statement in (
    "ATTACH 'https://hive.buz.dev/foursquare' AS foursquare;",
    "USE foursquare;",
):
    c.execute(setup_statement)
# List the tables available in the attached database as a DataFrame.
c.execute("SHOW TABLES").df()

Unnamed: 0,name
0,categories
1,places


In [7]:
%time
c.execute("select * from places where country='DK' limit 7").df()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


Unnamed: 0,fsq_place_id,name,latitude,longitude,address,locality,region,postcode,admin_region,post_town,po_box,country,date_created,date_refreshed,date_closed,tel,website,email,facebook_id,instagram,twitter,fsq_category_ids,fsq_category_labels,dt
0,1ca1519f47f14c4f0f7910a9,Freepete,55.683642,12.53274,"Rolfsvej 33, 4. sal. tv",Frederiksberg,Hovedstaden,2000,,,,DK,2015-11-26,2023-12-06,,23 29 00 00,,peter@hingebjerg.dk,,,,[63be6904847c3692a84b9b72],[Business and Professional Services > Media Ag...,2024-11-19
1,bfeb38ad0a564eb2aa55445b,Salon Cherie,55.637152,12.086013,Køgevej 44,Roskilde,Sjælland,4000,,,,DK,2013-10-24,2023-12-06,,46 35 28 88,,,,,,[4bf58dd8d48988d110951735],[Business and Professional Services > Health a...,2024-11-19
2,4ef8724e0e61a0846e3f4347,Hedelykke,56.442661,8.957332,Herningvej 66,Skive,,7800,,,,DK,2011-12-26,2020-11-23,,,,,,,,[4bf58dd8d48988d15b941735],[Landmarks and Outdoors > Farm],2024-11-19
3,fb77559581614f90ed4dbabb,Vintur.dk ApS,55.531863,12.204529,Bæktoften 10,Solrød Strand,Sjælland,2680,,,,DK,2017-09-01,2023-08-17,,22 38 78 08,,,,,,[4f04b08c2fb6e1c99f3db0bd],[Travel and Transportation > Travel Agency],2024-11-19
4,5a3d1dcc22cb41af42059da2,Danofix-Tape ApS,55.646326,12.1246,Betonvej 13,Roskilde,Sjælland,4000,,,,DK,2014-06-01,2023-08-19,,46 75 43 00,http://www.danofix.dk,danofix@danofix.dk,,,,,,2024-11-19
5,81becd8d369946547d3feb34,Grøndahl Trafikteknik,55.641527,12.594715,Stenlandsvej 25,København,Hovedstaden,2300,,,,DK,2015-05-06,2024-05-31,,32 55 05 01,http://www.atkins.dk,knudgm@image.dk,,,,[4d4b7105d754a06375d81259],[Business and Professional Services],2024-11-19
6,2359dbb22c134a5b12cbdde1,Egel-Data,54.948393,10.727639,"Peløkkevej 34, Peløkke",Rudkøbing,Syddanmark,5900,,,,DK,2014-09-08,2023-12-06,,62 51 14 50,,,,,,[63be6904847c3692a84b9b3e],[Business and Professional Services > Financia...,2024-11-19


### The problem that ChatGPT just can't get right

In [8]:
# Case-insensitive search for the letter "r" in US state names, reading the
# CSV straight from GitHub. The query is split over adjacent literals for
# readability; Python joins them into the same single string.
states_with_r = duckdb.sql(
    "select state from "
    "'https://raw.githubusercontent.com/jasonong/List-of-US-States/refs/heads/master/states.csv' "
    "where lower(state) LIKE '%r%';"
)
states_with_r

┌──────────────────────┐
│        State         │
│       varchar        │
├──────────────────────┤
│ Arizona              │
│ Arkansas             │
│ California           │
│ Colorado             │
│ Delaware             │
│ District of Columbia │
│ Florida              │
│ Georgia              │
│ Nebraska             │
│ New Hampshire        │
│ New Jersey           │
│ New York             │
│ North Carolina       │
│ North Dakota         │
│ Oregon               │
│ Maryland             │
│ Missouri             │
│ Rhode Island         │
│ South Carolina       │
│ Vermont              │
│ Virginia             │
│ West Virginia        │
├──────────────────────┤
│       22 rows        │
└──────────────────────┘

## DuckDB utilities

[Smart querying](https://duckdb.org/2022/05/04/friendlier-sql.html)

In [9]:
%%time
duckdb.sql("select * EXCLUDE(shop) from '../data/transactions.parquet' limit 5").df()

CPU times: user 16.4 ms, sys: 510 µs, total: 16.9 ms
Wall time: 9.74 ms


Unnamed: 0,transaction_time,id
0,2022-02-11 00:28:50,480571
1,2022-03-20 10:41:34,113107
2,2022-08-17 03:59:32,668523
3,2022-09-08 06:24:04,731871
4,2022-12-13 19:01:15,415373


[Statistical aggregates](https://duckdb.org/docs/sql/functions/aggregates.html#statistical-aggregates).

Yes, we are running 7 OLS regressions in SQL on an online data source. Because we can. 

In [10]:
%%time
duckdb.sql(
    """
    select
    dayname(tpep_pickup_datetime) as dow
    , quantile_disc(total_amount, 0.25) as p25
    , median(total_amount) as median_amt
    , quantile_disc(total_amount, 0.75) as p75
    , var_samp(total_amount) as sample_variance_amt
    , regr_slope(fare_amount, trip_distance) as beta_price_distance -- THE POWAAAAAAAAAAR
    from read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-09.parquet') as yc
    where trip_distance<50 and month(tpep_pickup_datetime)=9
    group by dayname(tpep_pickup_datetime)
    """
).df()

CPU times: user 1.3 s, sys: 184 ms, total: 1.48 s
Wall time: 2.05 s


Unnamed: 0,dow,p25,median_amt,p75,sample_variance_amt,beta_price_distance
0,Tuesday,16.4,21.9,32.0,581.918545,3.422733
1,Friday,16.3,21.48,30.72,564.106764,3.483349
2,Saturday,15.48,21.0,30.2,471.57398,3.432718
3,Thursday,16.5,22.21,32.3,583.601516,3.465931
4,Sunday,15.29,20.9,31.8,664.458268,3.438178
5,Monday,15.96,21.45,32.6,704.211246,3.416824
6,Wednesday,16.44,21.89,31.56,579.682262,3.460189


Backup in case the NYC website puts me on a waitlist...

In [11]:
%%time
c =  duckdb.connect()
c.execute("ATTACH 'https://hive.buz.dev/nyc_taxi' AS nyc_taxi;")
c.execute(
    """
    select
    dayname(tpep_pickup_datetime) as dow
    , quantile_disc(total_amount, 0.25) as p25
    , median(total_amount) as median_amt
    , quantile_disc(total_amount, 0.75) as p75
    , var_samp(total_amount) as sample_variance_amt
    , regr_slope(fare_amount, trip_distance) as beta_price_distance -- THE POWAAAAAAAAAAR
    from nyc_taxi.yellow_trips as yc
    where trip_distance<50 and month(tpep_pickup_datetime)=10
    group by dayname(tpep_pickup_datetime)
    """
).df()

CPU times: user 3.39 s, sys: 1.16 s, total: 4.55 s
Wall time: 26.3 s


<duckdb.duckdb.DuckDBPyConnection at 0x7f325e899530>