# PQD: Helper functions to run SQL on Pandas DataFrames

Leverages DuckDB, which has some nice SQL language extensions: https://duckdb.org/2022/05/04/friendlier-sql.html

In [1]:
import pdq
import pandas as pd

iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [2]:
iris.pdq.sql?

[0;31mSignature:[0m [0miris[0m[0;34m.[0m[0mpdq[0m[0;34m.[0m[0msql[0m[0;34m([0m[0ms[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m [0mtbl_name[0m[0;34m=[0m[0;34m'tbl'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Run a SQL query against Pandas DataFrame.

DataFrame will be referred to by string given in `tbl_name`.

Examples
--------
>>> df.sql('select * from tbl')
>>> df.sql('select * from new_tbl', tbl_name='new_tbl')
[0;31mFile:[0m      ~/work/pdq/src/pdq/_pandas.py
[0;31mType:[0m      method


In [3]:
iris.pdq.sql('select * from tbl')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
iris.pdq.sql("""
select
    species,
    count(*)
        as num,
from
    tbl
group by
    1
""")

Unnamed: 0,species,num
0,setosa,50
1,versicolor,50
2,virginica,50


Equivalently, can also run the function and specify the DataFrame/table explicitly.

In [5]:
pdq.sql("""
select
    species,
    count(*)
        as num,
from
    tbl
group by
    1
""", tbl=iris)

Unnamed: 0,species,num
0,setosa,50
1,versicolor,50
2,virginica,50


# Joining multiple tables/dataframes

In [6]:
s = """
select
    species,
    avg(petal_width)
        as avg_petal_width,
from
    tbl
group by
    1
"""

df2 = iris.pdq.sql(s)

df2

Unnamed: 0,species,avg_petal_width
0,setosa,0.246
1,versicolor,1.326
2,virginica,2.026


In [7]:
s = """
select
      iris.*
    , df2.avg_petal_width
        as species_avg_petal_width
from
    iris
left join
    df2
on
    iris.species = df2.species
"""

pdq.sql(s, iris=iris, df2=df2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_avg_petal_width
0,5.1,3.5,1.4,0.2,setosa,0.246
1,4.9,3.0,1.4,0.2,setosa,0.246
2,4.7,3.2,1.3,0.2,setosa,0.246
3,4.6,3.1,1.5,0.2,setosa,0.246
4,5.0,3.6,1.4,0.2,setosa,0.246
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,2.026
146,6.3,2.5,5.0,1.9,virginica,2.026
147,6.5,3.0,5.2,2.0,virginica,2.026
148,6.2,3.4,5.4,2.3,virginica,2.026


# Pandas DataFrame helper functions

A few helper functions for common patterns to extract data from dataframes.

## `df.as_list()`

In [8]:
out = iris.pdq.sql("""
select
    species
from
    tbl
group by
    1
""")

In [9]:
out

Unnamed: 0,species
0,setosa
1,versicolor
2,virginica


In [10]:
out.pdq.as_list()

['setosa', 'versicolor', 'virginica']

In [11]:
iris.pdq.sql("""
select
    species
from
    tbl
group by
    1
""").pdq.as_list()

['setosa', 'versicolor', 'virginica']

## `df.as_dict()`

In [12]:
out = iris.sql("""
select
    avg(sepal_length),
    avg(sepal_width),
    avg(petal_length),
    avg(petal_width),
from
    tbl
""")

AttributeError: 'DataFrame' object has no attribute 'sql'

In [None]:
out

In [None]:
out.as_dict()

In [None]:
iris.sql("""
select
    avg(sepal_length),
    avg(sepal_width),
    avg(petal_length),
    avg(petal_width),
from
    tbl
""").as_dict()

## `df.as_item()`

In [13]:
out = iris.sql("""
select
    count(*),
from
    tbl
""")

AttributeError: 'DataFrame' object has no attribute 'sql'

In [14]:
out

Unnamed: 0,species
0,setosa
1,versicolor
2,virginica


In [15]:
out.as_item()

AttributeError: 'DataFrame' object has no attribute 'as_item'

In [16]:
iris.sql("""
select
    count(*),
from
    tbl
""").as_item()

AttributeError: 'DataFrame' object has no attribute 'sql'

# PRQL

You can also query with PRQL:

- https://github.com/prql/prql
- https://prql-lang.org/

In [17]:
iris.pdq.prql('')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [21]:
iris.pdq.prql("""
filter sepal_length > 6
take 5
""")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,7.0,3.2,4.7,1.4,versicolor
1,6.4,3.2,4.5,1.5,versicolor
2,6.9,3.1,4.9,1.5,versicolor
3,6.5,2.8,4.6,1.5,versicolor
4,6.3,3.3,4.7,1.6,versicolor


In [22]:
iris.pdq.prql("""
filter sepal_length > 5 and sepal_length < 6
filter sepal_length < 6
derive new_col = sepal_length + sepal_width

group [species] (
  aggregate [
    the_average = average sepal_length,
    the_sum = sum new_col,
  ]
)
""")

Unnamed: 0,species,the_average,the_sum
0,setosa,5.313636,198.6
1,versicolor,5.604348,191.7
2,virginica,5.766667,51.1
