# Library Import

In [1]:
import duckdb
import pandas as pd

# Installing Extensions

In [2]:
# Alternative: duckdb.connect() with no arguments opens an in-memory database
# instead of a file. Here the database file "python_api.db" is used.
conn = duckdb.connect("python_api.db")
# Install the httpfs extension, then load it into this connection.
conn.install_extension("httpfs")
conn.load_extension("httpfs")

# Data Input

In [3]:
# Sanity check: run a trivial query and print its result table with .show().
conn.sql('SELECT 42').show()

┌───────┐
│  42   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [4]:
# Read the cities CSV through the connection; as the cell's last expression the
# result is displayed as a preview table with column names and types.
conn.read_csv('./data/cities.csv')

┌───────┬──────────────────┬─────────┬───────────┬───────────┬────────────┐
│  id   │       name       │ country │ latitude  │ longitude │ population │
│ int64 │     varchar      │ varchar │  double   │  double   │   int64    │
├───────┼──────────────────┼─────────┼───────────┼───────────┼────────────┤
│     1 │ Bombo            │ UGA     │    0.5833 │   32.5333 │      75000 │
│     2 │ Fort Portal      │ UGA     │     0.671 │    30.275 │      42670 │
│     3 │ Potenza          │ ITA     │    40.642 │    15.799 │      69060 │
│     4 │ Campobasso       │ ITA     │    41.563 │    14.656 │      50762 │
│     5 │ Aosta            │ ITA     │    45.737 │     7.315 │      34062 │
│     6 │ Mariehamn        │ ALD     │    60.097 │    19.949 │      10682 │
│     7 │ Ramallah         │ PSE     │  31.90294 │  35.20621 │      24599 │
│     8 │ Vatican City     │ VAT     │  41.90001 │  12.44781 │        832 │
│     9 │ Poitier          │ FRA     │  46.58329 │   0.33328 │      85960 │
│    10 │ Cl

In [5]:
# Read the countries CSV the same way; the preview shows each column's name and type.
conn.read_csv('./data/countries.csv')

┌───────┬─────────────────────────┬─────────────┬─────────────┬──────────────┬──────────┬───────────┐
│  id   │         Country         │ Alpha2_code │ Alpha3_code │ Numeric_code │ Latitude │ Longitude │
│ int64 │         varchar         │   varchar   │   varchar   │    int64     │  double  │  double   │
├───────┼─────────────────────────┼─────────────┼─────────────┼──────────────┼──────────┼───────────┤
│     1 │ Afghanistan             │ AF          │ AFG         │            4 │     33.0 │      65.0 │
│     2 │ Albania                 │ AL          │ ALB         │            8 │     41.0 │      20.0 │
│     3 │ Algeria                 │ DZ          │ DZA         │           12 │     28.0 │       3.0 │
│     4 │ American Samoa          │ AS          │ ASM         │           16 │ -14.3333 │    -170.0 │
│     5 │ Andorra                 │ AD          │ AND         │           20 │     42.5 │       1.6 │
│     6 │ Angola                  │ AO          │ AGO         │           24 │    

# DataFrames

In [6]:
# Build a one-row DataFrame, then query it from SQL. The SQL text refers to the
# Python variable by its name (`pandas_df`), so the variable name and the table
# name in the query must stay in sync.
pandas_df = pd.DataFrame({'a': [42]})
conn.query('SELECT * FROM pandas_df')

┌───────┐
│   a   │
│ int64 │
├───────┤
│    42 │
└───────┘

In [7]:
# Read the CSV with DuckDB and convert the result to a pandas DataFrame via .df().
df = conn.read_csv('./data/cities.csv').df()
# Show only the first rows rather than the whole frame.
df.head()

Unnamed: 0,id,name,country,latitude,longitude,population
0,1,Bombo,UGA,0.5833,32.5333,75000
1,2,Fort Portal,UGA,0.671,30.275,42670
2,3,Potenza,ITA,40.642,15.799,69060
3,4,Campobasso,ITA,41.563,14.656,50762
4,5,Aosta,ITA,45.737,7.315,34062


# Result Conversion

In [8]:
conn.sql('SELECT 42').fetchall()  # Python objects: a list with one tuple per row

[(42,)]

In [10]:
conn.sql('SELECT 42').df()  # pandas DataFrame

Unnamed: 0,42
0,42


In [11]:
conn.sql('SELECT 42').fetchnumpy()  # dict mapping each column name to a NumPy array

{'42': array([42], dtype=int32)}

# Writing Data to Disk

In [12]:
conn.sql('SELECT 42').write_parquet('out.parquet')  # Write to a Parquet file
conn.sql('SELECT 42').write_csv('out.csv')  # Write to a CSV file
# SQL alternative using COPY. Note: this targets the same 'out.parquet' path as
# the write_parquet call above.
conn.sql("COPY (SELECT 42) TO 'out.parquet'")  # Copy to a Parquet file

# Persistent Storage

In [13]:
# Create a connection to the on-disk database file 'python_api.db'.
conn = duckdb.connect('python_api.db')

# Create the table from the CSV only if it does not already exist, so this cell
# can be re-run without error. The file path is passed as a single-quoted SQL
# string literal (double quotes denote identifiers in SQL).
conn.sql(
    "CREATE TABLE IF NOT EXISTS cities AS FROM read_csv_auto('./data/cities.csv')"
)

# Query the table and print it.
conn.table('cities').show()

# Note: connections are also closed implicitly when they go out of scope.

┌───────┬──────────────────┬─────────┬───────────┬───────────┬────────────┐
│  id   │       name       │ country │ latitude  │ longitude │ population │
│ int64 │     varchar      │ varchar │  double   │  double   │   int64    │
├───────┼──────────────────┼─────────┼───────────┼───────────┼────────────┤
│     1 │ Bombo            │ UGA     │    0.5833 │   32.5333 │      75000 │
│     2 │ Fort Portal      │ UGA     │     0.671 │    30.275 │      42670 │
│     3 │ Potenza          │ ITA     │    40.642 │    15.799 │      69060 │
│     4 │ Campobasso       │ ITA     │    41.563 │    14.656 │      50762 │
│     5 │ Aosta            │ ITA     │    45.737 │     7.315 │      34062 │
│     6 │ Mariehamn        │ ALD     │    60.097 │    19.949 │      10682 │
│     7 │ Ramallah         │ PSE     │  31.90294 │  35.20621 │      24599 │
│     8 │ Vatican City     │ VAT     │  41.90001 │  12.44781 │        832 │
│     9 │ Poitier          │ FRA     │  46.58329 │   0.33328 │      85960 │
│    10 │ Cl

In [14]:
# Explicitly close the connection instead of relying on it going out of scope.
conn.close()

In [15]:
# Open the database inside a `with` block so the connection is closed
# automatically when the block exits.
with duckdb.connect('python_api.db') as conn:
    # Create the table only if it is missing; the CSV path is a single-quoted
    # SQL string literal (double quotes denote identifiers in SQL).
    conn.sql(
        "CREATE TABLE IF NOT EXISTS cities AS FROM read_csv_auto('./data/cities.csv')"
    )
    conn.table('cities').show()
    # the context manager closes the connection automatically

┌───────┬──────────────────┬─────────┬───────────┬───────────┬────────────┐
│  id   │       name       │ country │ latitude  │ longitude │ population │
│ int64 │     varchar      │ varchar │  double   │  double   │   int64    │
├───────┼──────────────────┼─────────┼───────────┼───────────┼────────────┤
│     1 │ Bombo            │ UGA     │    0.5833 │   32.5333 │      75000 │
│     2 │ Fort Portal      │ UGA     │     0.671 │    30.275 │      42670 │
│     3 │ Potenza          │ ITA     │    40.642 │    15.799 │      69060 │
│     4 │ Campobasso       │ ITA     │    41.563 │    14.656 │      50762 │
│     5 │ Aosta            │ ITA     │    45.737 │     7.315 │      34062 │
│     6 │ Mariehamn        │ ALD     │    60.097 │    19.949 │      10682 │
│     7 │ Ramallah         │ PSE     │  31.90294 │  35.20621 │      24599 │
│     8 │ Vatican City     │ VAT     │  41.90001 │  12.44781 │        832 │
│     9 │ Poitier          │ FRA     │  46.58329 │   0.33328 │      85960 │
│    10 │ Cl

# Connection Object and Module

The connection object and the `duckdb` module can be used interchangeably – they support the same methods. The only difference is that when using the `duckdb` module a global in-memory database is used.

Note that if you are developing a package designed for others to use, and use DuckDB in the package, it is recommended that you create connection objects instead of using the methods on the `duckdb` module. That is because the `duckdb` module uses a shared global database – which can cause hard-to-debug issues if used from within multiple different packages.

In [16]:
# Module-level call: runs against the `duckdb` module's global in-memory database.
duckdb.sql('SELECT 42')

┌───────┐
│  42   │
│ int32 │
├───────┤
│    42 │
└───────┘

In [17]:
# Same query through an explicit connection object. No database path is given,
# so this is an in-memory database (DuckDB's default) rather than a file.
con = duckdb.connect()
con.sql('SELECT 42')

┌───────┐
│  42   │
│ int32 │
├───────┤
│    42 │
└───────┘