# Getting Started with Polars
[polars](https://www.youtube.com/watch?v=CJ0f45evuME&t=11s)

In [1]:
import numpy as np
import polars as pl

In [2]:
pl.__version__

'0.20.3'

In [10]:
print(dir(pl))



## Loading data

In [3]:
ls

mypl.ipynb       pl2023_2024.csv  polars-1.ipynb


In [4]:
ls -l

total 136
-rw-r--r--@ 1 birusod  staff  59063 Jan  3 22:19 mypl.ipynb
-rw-r--r--@ 1 birusod  staff    773 Jan  1 23:23 pl2023_2024.csv
-rw-r--r--@ 1 birusod  staff    582 Jan  3 23:06 polars-1.ipynb


pl.Categorical, pl.Float32, pl.Utf8,  pl.Int64, pl.Date, pl.String

In [22]:
url = 'https://raw.githubusercontent.com/cengel/R-data-wrangling/master/data/MS_trafficstops_bw_age.csv'
df = pl.read_csv(
    url,
    dtypes=[pl.String, pl.Date, pl.String, pl.String, pl.String, pl.String, pl.Date, pl.String, pl.String, pl.Int64, pl.String],
    infer_schema_length=10000,
    ignore_errors = True,
    null_values = list('NA'))

In [23]:
df.head(4)

id,stop_date,county_name,county_fips,police_department,driver_gender,driver_birthdate,driver_race,officer_id,driver_age,violation
str,date,str,str,str,str,date,str,str,i64,str
"""MS-2013-00001""",2013-01-01,"""Jones""","""28067""","""Mississippi Hi…","""male""",1950-06-14,"""Black""","""J042""",63,"""Seat belt"""
"""MS-2013-00002""",2013-01-01,"""Lauderdale""","""28075""","""Mississippi Hi…","""male""",1967-04-06,"""Black""","""B026""",46,"""Careless drivi…"
"""MS-2013-00003""",2013-01-01,"""Pike""","""28113""","""Mississippi Hi…","""male""",1974-04-15,"""Black""","""M009""",39,"""Speeding"""
"""MS-2013-00004""",2013-01-01,"""Hancock""","""28045""","""Mississippi Hi…","""male""",1981-03-23,"""White""","""K035""",32,"""Speeding"""


In [24]:
df.shape

(211211, 11)

In [25]:
df.columns

['id',
 'stop_date',
 'county_name',
 'county_fips',
 'police_department',
 'driver_gender',
 'driver_birthdate',
 'driver_race',
 'officer_id',
 'driver_age',
 'violation']

## EDA

In [26]:
df.sample(5).transpose()

column_0,column_1,column_2,column_3,column_4
str,str,str,str,str
"""MS-2015-41113""","""MS-2015-32025""","""MS-2015-08128""","""MS-2014-17788""","""MS-2015-14894"""
"""2015-09-05""","""2015-07-29""","""2015-03-07""","""2014-04-09""","""2015-04-22"""
"""Hinds""","""Bolivar""","""Humphreys""","""Perry""","""Jasper"""
"""28049""","""28011""","""28053""","""28111""","""28061"""
"""Mississippi Hi…","""Mississippi Hi…","""Mississippi Hi…","""Mississippi Hi…","""Mississippi Hi…"
"""female""","""male""","""male""","""female""","""male"""
"""1985-02-08""","""1972-05-02""","""1981-01-08""","""1994-05-23""","""1993-04-26"""
"""Black""","""Black""","""Black""","""White""","""Black"""
"""C056""","""D025""","""B033""","""J042""","""H018"""
"""31""","""43""","""34""","""20""","""22"""


In [27]:
df.dtypes

[String,
 Date,
 String,
 String,
 String,
 String,
 Date,
 String,
 String,
 Int64,
 String]

In [28]:
df.describe()

describe,id,stop_date,county_name,county_fips,police_department,driver_gender,driver_birthdate,driver_race,officer_id,driver_age,violation
str,str,str,str,str,str,str,str,str,str,f64,str
"""count""","""211211""","""211211""","""211211""","""211211""","""211211""","""211211""","""211102""","""211211""","""211211""",211102.0,"""211211"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""109""","""0""","""0""",109.0,"""0"""
"""mean""",,,,,,,,,,35.295516,
"""std""",,,,,,,,,,13.580219,
"""min""","""MS-2013-00001""","""2013-01-01""","""Adams""","""28001""","""Mississippi Hi…","""NA""","""1930-01-11""","""Black""","""A003""",7.0,"""Breaks-Lights-…"
"""25%""",,,,,,,,,,24.0,
"""50%""",,,,,,,,,,32.0,
"""75%""",,,,,,,,,,44.0,
"""max""","""MS-2016-24297""","""2016-07-14""","""Yazoo""","""28163""","""Mississippi Hi…","""male""","""2007-10-21""","""White""","""W300""",86.0,"""Speeding"""


### Selecting columns

In [35]:
df.select(pl.col(pl.Int64)).describe()

describe,driver_age
str,f64
"""count""",211102.0
"""null_count""",109.0
"""mean""",35.295516
"""std""",13.580219
"""min""",7.0
"""25%""",24.0
"""50%""",32.0
"""75%""",44.0
"""max""",86.0


In [40]:
df.select('id', pl.col(pl.Date), 'driver_race', 'driver_age').head(3)

id,stop_date,driver_birthdate,driver_race,driver_age
str,date,date,str,i64
"""MS-2013-00001""",2013-01-01,1950-06-14,"""Black""",63
"""MS-2013-00002""",2013-01-01,1967-04-06,"""Black""",46
"""MS-2013-00003""",2013-01-01,1974-04-15,"""Black""",39


In [43]:
df.select(pl.selectors.date()).head(3)

stop_date,driver_birthdate
date,date
2013-01-01,1950-06-14
2013-01-01,1967-04-06
2013-01-01,1974-04-15


In [58]:
df.select(pl.selectors.matches('date'), (pl.selectors.contains('age') | pl.selectors.contains('race'))).head(3)

stop_date,driver_birthdate,driver_age,driver_race
date,date,i64,str
2013-01-01,1950-06-14,63,"""Black"""
2013-01-01,1967-04-06,46,"""Black"""
2013-01-01,1974-04-15,39,"""Black"""


In [60]:
df.select(pl.selectors.starts_with('driver')).head(3)

driver_gender,driver_birthdate,driver_race,driver_age
str,date,str,i64
"""male""",1950-06-14,"""Black""",63
"""male""",1967-04-06,"""Black""",46
"""male""",1974-04-15,"""Black""",39


### Converting Dtypes

In [66]:
drivers_df = df.select(pl.selectors.starts_with('driver'), 'county_fips')
drivers_df.head(2)

driver_gender,driver_birthdate,driver_race,driver_age,county_fips
str,date,str,i64,str
"""male""",1950-06-14,"""Black""",63,"""28067"""
"""male""",1967-04-06,"""Black""",46,"""28075"""


In [64]:
drivers_df.select(pl.col('driver_age').cast(pl.Float64)).head(3)

driver_age
f64
63.0
46.0
39.0


In [71]:
(drivers_df
    .with_columns(pl.col('county_fips')
    .cast(pl.Int64)
    )
.head(3)
)

driver_gender,driver_birthdate,driver_race,driver_age,county_fips
str,date,str,i64,i64
"""male""",1950-06-14,"""Black""",63,28067
"""male""",1967-04-06,"""Black""",46,28075
"""male""",1974-04-15,"""Black""",39,28113


In [73]:
(drivers_df
    .with_columns(pl.col(['county_fips', 'driver_age'])
    .cast(pl.Float64)
    )
.head(3)
)

driver_gender,driver_birthdate,driver_race,driver_age,county_fips
str,date,str,f64,f64
"""male""",1950-06-14,"""Black""",63.0,28067.0
"""male""",1967-04-06,"""Black""",46.0,28075.0
"""male""",1974-04-15,"""Black""",39.0,28113.0


In [74]:
(drivers_df
    .with_columns([
        pl.col('county_fips').cast(pl.Int64),
        pl.col('driver_age').cast(pl.Float64)
    ])
.head(3)
)

driver_gender,driver_birthdate,driver_race,driver_age,county_fips
str,date,str,f64,i64
"""male""",1950-06-14,"""Black""",63.0,28067
"""male""",1967-04-06,"""Black""",46.0,28075
"""male""",1974-04-15,"""Black""",39.0,28113


### Dropping columns

In [77]:
(drivers_df
    .with_columns(pl.col('driver_age').cast(pl.Float64))
    .drop('county_fips')
.head(3)
)

driver_gender,driver_birthdate,driver_race,driver_age
str,date,str,f64
"""male""",1950-06-14,"""Black""",63.0
"""male""",1967-04-06,"""Black""",46.0
"""male""",1974-04-15,"""Black""",39.0


In [78]:
(drivers_df
    .with_columns(pl.col('driver_age').cast(pl.Float64))
    .drop(['county_fips', 'driver_race'])
.head(3)
)

driver_gender,driver_birthdate,driver_age
str,date,f64
"""male""",1950-06-14,63.0
"""male""",1967-04-06,46.0
"""male""",1974-04-15,39.0


### Renaming columns

In [85]:
drivers_df.rename({"driver_age": "age"}).head(3)

driver_gender,driver_birthdate,driver_race,age,county_fips
str,date,str,i64,str
"""male""",1950-06-14,"""Black""",63,"""28067"""
"""male""",1967-04-06,"""Black""",46,"""28075"""
"""male""",1974-04-15,"""Black""",39,"""28113"""


In [86]:
(
    drivers_df
        .rename({
            "driver_age": "age", 
            "county_fips": "fips"})
        .head(3)
)

driver_gender,driver_birthdate,driver_race,age,fips
str,date,str,i64,str
"""male""",1950-06-14,"""Black""",63,"""28067"""
"""male""",1967-04-06,"""Black""",46,"""28075"""
"""male""",1974-04-15,"""Black""",39,"""28113"""


In [84]:
(
    drivers_df
        .select(
            pl.all().name.map(lambda name: name.upper().replace('DRIVER_', ''))
        )
        .head(3)
)

GENDER,BIRTHDATE,RACE,AGE,COUNTY_FIPS
str,date,str,i64,str
"""male""",1950-06-14,"""Black""",63,"""28067"""
"""male""",1967-04-06,"""Black""",46,"""28075"""
"""male""",1974-04-15,"""Black""",39,"""28113"""


In [82]:
(
    drivers_df
        .with_columns(pl.col('driver_age').cast(pl.Float64))
        .pipe(
            lambda d: d.rename({ col: col.replace('driver_', '').upper() for col in d.columns })
            )
)

GENDER,BIRTHDATE,RACE,AGE,COUNTY_FIPS
str,date,str,f64,str
"""male""",1950-06-14,"""Black""",63.0,"""28067"""
"""male""",1967-04-06,"""Black""",46.0,"""28075"""
"""male""",1974-04-15,"""Black""",39.0,"""28113"""
"""male""",1981-03-23,"""White""",32.0,"""28045"""
"""male""",1992-08-03,"""White""",20.0,"""28051"""
"""female""",1960-05-02,"""White""",53.0,"""28059"""
"""female""",1953-03-16,"""White""",60.0,"""28059"""
"""female""",1993-06-14,"""White""",20.0,"""28043"""
"""male""",1947-12-11,"""White""",65.0,"""28051"""
"""male""",1984-07-14,"""White""",28.0,"""28051"""
