# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (Time series data is not supported yet.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Build a 100_000-row demo frame. With n_cols=0 only row_num comes from
# random_data (see the head() output); the remaining columns are added here.
df = pds.random_data(size=100_000, n_cols=0).with_columns(
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    pds.random_int(0, 3).alias("flags"),  # the counts further down show values 0, 1 and 2
    # Fixed category sizes: 30k "A", 30k "B", 40k "C".
    pl.Series(["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000).alias("category"),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,0.101301,0.241236,0.068629,-1.546608,-1820.064986,0,"""A"""
1,7.763778,0.688527,1.564067,-0.219875,2842.128922,0,"""A"""
2,6.692104,0.302039,2.184995,-2.038565,-107.693276,0,"""A"""
3,8.069798,0.589782,0.047991,-1.029594,52.045207,1,"""A"""
4,4.709925,0.289922,0.538721,1.08094,711.807154,1,"""A"""


In [3]:
# Pick 2 random column names; the names listed in `keep` are always included.
sa.random_cols(df, 2, keep=["row_num"])

['row_num', 'uniform_1', 'fat_normal']

In [4]:
# Random sample by ratio: a float second argument is the fraction of rows to keep.
sa.sample(df, 0.6)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
2,6.692104,0.302039,2.184995,-2.038565,-107.693276,0,"""A"""
3,8.069798,0.589782,0.047991,-1.029594,52.045207,1,"""A"""
4,4.709925,0.289922,0.538721,1.08094,711.807154,1,"""A"""
7,9.679737,0.431594,1.131895,-0.739637,1269.613996,1,"""A"""
8,10.803224,0.546317,3.25459,-0.904231,243.572733,2,"""A"""
…,…,…,…,…,…,…,…
99991,10.25385,0.632615,0.263188,-0.388282,273.417495,2,"""C"""
99993,5.213348,0.199494,4.928223,-0.362607,1729.126716,0,"""C"""
99994,9.441603,0.347907,0.318096,-0.112797,242.457284,0,"""C"""
99998,7.374466,0.030731,3.606166,-0.582265,1290.937356,0,"""C"""


In [5]:
# Random sample by count: an integer second argument is the number of rows to keep.
sa.sample(df, 30_000)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
14,3.29052,0.62766,0.684437,-0.114606,1440.324971,0,"""A"""
15,10.033849,0.555832,0.265698,1.894409,-2266.65647,1,"""A"""
18,10.209092,0.411413,1.781309,1.502181,63.521515,1,"""A"""
23,5.461989,0.631279,2.069071,-0.735686,-1463.48885,0,"""A"""
27,8.902278,0.732549,2.877614,-0.049382,-1301.745218,0,"""A"""
…,…,…,…,…,…,…,…
99974,6.541671,0.052039,2.622168,-0.451139,-1270.959364,2,"""C"""
99976,1.147642,0.885223,0.704189,0.61381,344.861659,1,"""C"""
99985,7.321052,0.76491,6.039978,-0.407754,304.340042,1,"""C"""
99991,10.25385,0.632615,0.263188,-0.388282,273.417495,2,"""C"""


In [6]:
# Row count per flag value in the full, unsampled frame.
(
    df.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,33282
1,33654
2,33064


In [7]:
# Downsample a single group: the (condition, ratio) pair keeps half of the
# rows where flags == 0; rows that do not match the condition are untouched.
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
(
    sa1.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16641
1,33654
2,33064


In [8]:
# Downsample several groups at once: pass a list of (condition, ratio) pairs,
# one per group. Here each flag value gets its own keep ratio.
keep_ratios = {0: 0.5, 1: 0.3, 2: 0.4}
sa2 = sa.downsample(
    df,
    [(pl.col("flags") == flag, ratio) for flag, ratio in keep_ratios.items()],
)
(
    sa2.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16641
1,10097
2,13226


In [9]:
# Row count per category in the full, unsampled frame.
(
    df.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sample by category: every category ends up with the same
# number of rows. Without target_volume the largest feasible common size is
# used, i.e. the size of the smallest category (30_000 here).
vn = sa.volume_neutral(df, by=pl.col("category"))
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sample by category with an explicit size: each category is
# sampled down to target_volume (10_000) rows.
vn = sa.volume_neutral(
    df,
    by=pl.col("category"),
    target_volume=10_000,
)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume neutral (10_000) by a more complicated condition. `by` is a boolean
# expression here, so there are only two groups: category == "A" and
# everything else. Each group gets 10_000 rows, which is why the counts for
# B and C in the result add up to 10_000.
vn = sa.volume_neutral(
    df,
    by=(pl.col("category") == "A"),
    target_volume=10_000,
)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",4302
"""C""",5698


In [13]:
# Volume-neutral sample with a control level. Balancing happens separately
# inside each category, so the flag counts match within a category but may
# differ from one category to the next.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
)
group_keys = ["category", "flags"]
vn.group_by(group_keys).len().sort(group_keys)

category,flags,len
str,i32,u32
"""A""",0,9865
"""A""",1,9865
"""A""",2,9865
"""B""",0,9909
"""B""",1,9909
"""B""",2,9909
"""C""",0,13224
"""C""",1,13224
"""C""",2,13224


In [14]:
# With a control level, target_volume may not be reachable in every category.
# In the output shown, A and B have flag groups smaller than 10_000, so they
# stay at their own smallest group size, while C is sampled down to 10_000.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
group_keys = ["category", "flags"]
vn.group_by(group_keys).len().sort(group_keys)

category,flags,len
str,i32,u32
"""A""",0,9865
"""A""",1,9865
"""A""",2,9865
"""B""",0,9909
"""B""",1,9909
"""B""",2,9909
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...