# show and tell: duckdb csv query
This is a quick recipe for using DuckDB to query multiple CSV files as a single dataset.
[more duckdb info](https://duckdb.org/docs/)

In [1]:
import duckdb
from bs4 import BeautifulSoup
import requests
import os


## Setup duckdb database and load required extensions

In [2]:
# In-memory database: nothing is written to disk, so every run of the notebook
# starts from an empty catalogue.
conn = duckdb.connect(":memory:")

In [3]:
# Each extension is installed first and then loaded into this connection.
# httpfs is the one needed here; the spatial extension can be enabled the same
# way by adding "spatial" to the tuple.
for extension_name in ("httpfs",):
    conn.install_extension(extension_name)
    conn.load_extension(extension_name)

## Get some meaningful files for demo

In [4]:
# Directory listing page that links to the 2023 csv files.
url = 'https://www.for.gov.bc.ca/ftp/HPR/external/!publish/BCWS_DATA_MART/2023/'
# Fetch the same `url` that is used as the prefix below, so the page requested
# and the links built from it cannot drift apart. The 30 second timeout stops
# the cell from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an empty list.
r.raise_for_status()
s = BeautifulSoup(r.content, "html.parser")
# Keep every link whose href mentions '.csv' and rebuild it as an absolute URL
# from the listing prefix plus the file name.
csv_list = [url+os.path.basename(a['href']) for a in s.find_all('a', href=True) if '.csv' in a['href']]

## Filter 2023 files for November
If the CSV files are local, DuckDB can use glob syntax to filter files, e.g. `'tmp\2023\2023-11-*.csv'`,
instead of a list. If you have Parquet files, glob syntax also works over https. This was not the case for
CSV (as of 2023-11-28).

In [5]:
# Keep only the links whose text contains the November 2023 date prefix.
nov_data = [file_url for file_url in csv_list if '2023-11-' in file_url]

## Read the data
For performance with CSV it is better to create a table in your DuckDB database straight away, but one can also query a list of CSV files directly by using `read_csv_auto({nov_data})` in place of `weather` in the SQL,  
e.g.  
``` conn.sql(f'SELECT * FROM read_csv_auto({nov_data})') ```

In [6]:
# Stop early with a clear message rather than handing DuckDB an empty file list.
if not nov_data:
    raise ValueError("nov_data is empty: no csv files to load into the 'weather' table")

# Build the SQL list literal explicitly. Each URL is wrapped in single quotes
# with any embedded quote doubled, so a link containing a quote cannot break
# (or alter) the statement. The links come from a scraped web page.
file_list_sql = ", ".join("'" + csv_url.replace("'", "''") + "'" for csv_url in nov_data)

# CREATE OR REPLACE swaps the table in one statement, so re-running the cell is
# safe and a failed load does not leave the notebook without a 'weather' table.
conn.sql(f"CREATE OR REPLACE TABLE weather AS SELECT * FROM read_csv_auto([{file_list_sql}])")

In [7]:
# Show the column names and the types DuckDB inferred for the 'weather' table.
conn.sql('DESCRIBE weather')

┌────────────────────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│          column_name           │ column_type │  null   │   key   │ default │ extra │
│            varchar             │   varchar   │ varchar │ varchar │ varchar │ int32 │
├────────────────────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ STATION_CODE                   │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ STATION_NAME                   │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ DATE_TIME                      │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ HOURLY_PRECIPITATION           │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ HOURLY_TEMPERATURE             │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ HOURLY_RELATIVE_HUMIDITY       │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ HOURLY_WIND_SPEED              │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ HOURLY_WIND_DIRECTION          │ BIGINT  

## Top 3 precipitation events  
Looks like some wet days at station 956.

In [8]:
# Three rows with the largest PRECIPITATION value, highest first.
conn.sql(
    "SELECT STATION_CODE,DATE_TIME,PRECIPITATION FROM weather "
    "ORDER BY PRECIPITATION DESC LIMIT 3"
)

┌──────────────┬────────────┬───────────────┐
│ STATION_CODE │ DATE_TIME  │ PRECIPITATION │
│    int64     │   int64    │    double     │
├──────────────┼────────────┼───────────────┤
│          956 │ 2023111112 │         174.0 │
│          956 │ 2023110412 │         157.2 │
│         1398 │ 2023110412 │         156.2 │
└──────────────┴────────────┴───────────────┘

## Where to next, Pandas dataframe?

In [9]:
# Select every row and column of 'weather' as a DuckDB relation, then convert
# that relation to a pandas DataFrame.
weather_relation = conn.sql("SELECT * FROM weather")
df = weather_relation.to_df()
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,STATION_CODE,STATION_NAME,DATE_TIME,HOURLY_PRECIPITATION,HOURLY_TEMPERATURE,HOURLY_RELATIVE_HUMIDITY,HOURLY_WIND_SPEED,HOURLY_WIND_DIRECTION,HOURLY_WIND_GUST,HOURLY_FINE_FUEL_MOISTURE_CODE,...,PRECIP_PLUVIO1_TOTAL,RN_1_PLUVIO2,PRECIP_PLUVIO2_STATUS,PRECIP_PLUVIO2_TOTAL,RN_1_RIT,PRECIP_RIT_STATUS,PRECIP_RIT_TOTAL,PRECIP_RGT,SOLAR_RADIATION_LICOR,SOLAR_RADIATION_CM3
0,11,SUMMIT,2023110100,0.0,6.4,58.0,0.0,129.0,0.0,88.927,...,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,
1,11,SUMMIT,2023110101,0.0,6.4,50.0,0.0,63.0,0.0,88.864,...,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,
2,11,SUMMIT,2023110102,0.0,7.5,46.0,0.0,352.0,0.0,88.848,...,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,
3,11,SUMMIT,2023110103,0.0,6.7,55.0,0.0,131.0,0.0,88.738,...,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,
4,11,SUMMIT,2023110104,0.0,6.8,58.0,0.0,125.0,0.0,88.600,...,0.0,0.0,0.0,0.0,,0.0,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170881,3191,GOATLICK,2023112805,0.0,-3.2,69.0,9.8,141.0,28.5,,...,,,,,,,,224.4,,
170882,3191,GOATLICK,2023112806,0.0,-3.3,69.0,9.0,144.0,26.4,,...,,,,,,,,224.4,,
170883,3191,GOATLICK,2023112807,0.0,-3.5,69.0,12.0,147.0,28.2,,...,,,,,,,,224.4,,
170884,3191,GOATLICK,2023112808,0.0,-3.6,70.0,9.3,145.0,27.1,,...,,,,,,,,224.4,,
