In [1]:
import polars as pl
import sqlite3

## Indexing Example

In [11]:
# Open the Chinook SQLite database. `conn` stays open because a later cell
# reuses it to read another table.
conn = sqlite3.connect('chinook.db')

# Load every column of the invoices table into a Polars DataFrame.
invoices_query = 'SELECT * FROM invoices'
df = pl.read_database(invoices_query, conn)

# Preview the first rows.
df.head()

InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
i64,i64,str,str,str,str,str,str,f64
1,2,"""2009-01-01 00:00:00""","""Theodor-Heuss-Straße 34""","""Stuttgart""",,"""Germany""","""70174""",1.98
2,4,"""2009-01-02 00:00:00""","""Ullevålsveien 14""","""Oslo""",,"""Norway""","""0171""",3.96
3,8,"""2009-01-03 00:00:00""","""Grétrystraat 63""","""Brussels""",,"""Belgium""","""1000""",5.94
4,14,"""2009-01-06 00:00:00""","""8210 111 ST NW""","""Edmonton""","""AB""","""Canada""","""T6G 2C7""",8.91
5,23,"""2009-01-11 00:00:00""","""69 Salem Street""","""Boston""","""MA""","""USA""","""2113""",13.86


In [14]:
# Per-country invoice summary: transaction count, total revenue and
# average invoice amount, ranked by total revenue (highest first).
invoice_total = pl.col('Total')

country_summary = df.group_by('BillingCountry').agg(
    pl.len().alias('Number_Transactions'),
    invoice_total.sum().alias('Total_Revenue'),
    invoice_total.mean().alias('Avg_Transaction_Amt'),
)

# Rows tied on Total_Revenue have no guaranteed relative order.
country_summary.sort('Total_Revenue', descending=True)

BillingCountry,Number_Transactions,Total_Revenue,Avg_Transaction_Amt
str,u32,f64,f64
"""USA""",91,523.06,5.747912
"""Canada""",56,303.96,5.427857
"""France""",35,195.1,5.574286
"""Brazil""",35,190.1,5.431429
"""Germany""",28,156.48,5.588571
…,…,…,…
"""Denmark""",7,37.62,5.374286
"""Italy""",7,37.62,5.374286
"""Australia""",7,37.62,5.374286
"""Spain""",7,37.62,5.374286


## Declarative Example

In [7]:
# Load the Employees table through the connection opened earlier.
# NOTE: this rebinds `df`, which previously held the invoices table, so the
# invoice cells above must be re-run before they can be used again.
employees_query = 'SELECT * FROM Employees'
df = pl.read_database(employees_query, conn)

# Show column names and dtypes.
df.schema

Schema([('EmployeeId', Int64),
        ('LastName', String),
        ('FirstName', String),
        ('Title', String),
        ('ReportsTo', Int64),
        ('BirthDate', String),
        ('HireDate', String),
        ('Address', String),
        ('City', String),
        ('State', String),
        ('Country', String),
        ('PostalCode', String),
        ('Phone', String),
        ('Fax', String),
        ('Email', String)])

In [10]:
# Keep only employees whose title mentions "Sales" and show each one's
# full name next to the title.
is_sales_role = pl.col('Title').str.contains('Sales')
full_name = (pl.col('FirstName') + ' ' + pl.col('LastName')).alias('Full_Name')

df.filter(is_sales_role).select(full_name, 'Title')

Full_Name,Title
str,str
"""Nancy Edwards""","""Sales Manager"""
"""Jane Peacock""","""Sales Support Agent"""
"""Margaret Park""","""Sales Support Agent"""
"""Steve Johnson""","""Sales Support Agent"""


## Speed Test
Polars can run in parallel on all CPU cores, speeding things up compared to pandas.

In [4]:
# Build a lazy query over the CSV: the mean of `data` per `label`.
# Nothing is read from disk until `.collect()` runs the plan.
label_means = (
    pl.scan_csv('./big_file_test.csv')
    .group_by('label')
    .agg(pl.col('data').mean().alias('AVG'))
)

label_means.collect()

label,AVG
str,f64
"""B""",-0.000198
"""C""",-0.000318
"""A""",4.4e-05


## Query Optimizer

In [None]:
# Same aggregation as the speed test, restricted to the first 1000 rows.
# Because the query is lazy, the optimizer sees the row limit before any
# data is read and can apply it at the scan instead of loading the full file.
first_rows = pl.scan_csv('./big_file_test.csv').head(1000)

label_means_head = first_rows.group_by('label').agg(
    pl.col('data').mean().alias('AVG'),
)

label_means_head.collect()

label,AVG
str,f64
"""B""",-0.000198
"""C""",-0.000318
"""A""",4.4e-05
