# Sampling and Splitting

## Sampling

Sampling for basic tabular datasets. (Not designed for time series as of now.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Build a 100,000-row demo frame. With n_cols=0, random_data supplies only the
# row_num column seen in the output; all other columns are added explicitly.
# The category sizes are fixed (30k / 30k / 40k) so later group counts are exact.
category_col = pl.Series("category", ["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000)

df = pds.random_data(size=100_000, n_cols=0).with_columns(
    # Random numeric columns; each alias names the distribution it is meant to follow.
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    # Integer flag column; the group counts further down show values 0, 1 and 2.
    pds.random_int(0, 3).alias("flags"),
    category_col,
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,11.277262,0.624632,2.761204,0.69013,-1454.347819,1,"""A"""
1,9.611818,0.039018,0.324864,2.089952,865.90957,1,"""A"""
2,2.614335,0.319069,1.407749,0.345063,-1465.807074,1,"""A"""
3,7.681358,0.126396,0.385254,1.182632,731.339396,2,"""A"""
4,2.363063,0.928773,0.459981,-0.469134,45.54908,2,"""A"""


In [3]:
# Randomly choose 2 column names from df while always keeping "row_num".
# The result is a plain list of column names, as the output shows.
sa.random_cols(df, 2, keep = ["row_num"])

['row_num', 'exp', 'flags']

In [4]:
# Random sample: a fractional second argument is the share of rows to keep (here 60%).
sa.sample(df, 0.6) # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,11.277262,0.624632,2.761204,0.69013,-1454.347819,1,"""A"""
2,2.614335,0.319069,1.407749,0.345063,-1465.807074,1,"""A"""
3,7.681358,0.126396,0.385254,1.182632,731.339396,2,"""A"""
5,10.589018,0.403235,7.328744,0.494549,201.411304,1,"""A"""
6,11.82124,0.180973,1.464906,-1.324346,0.966903,2,"""A"""
…,…,…,…,…,…,…,…
99990,10.030975,0.418153,0.104252,-0.001009,-479.039329,1,"""C"""
99994,9.230081,0.332427,3.23884,1.71897,-881.049847,1,"""C"""
99995,9.389632,0.139031,0.864656,-0.250149,422.365681,1,"""C"""
99997,4.19536,0.376186,0.116952,-1.095385,-1301.074908,1,"""C"""


In [5]:
# Random sample: an integer second argument is the number of rows to draw.
sa.sample(df, 30_000) # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
3,7.681358,0.126396,0.385254,1.182632,731.339396,2,"""A"""
7,8.967688,0.618955,4.015997,-0.239592,-1671.293276,0,"""A"""
9,7.78085,0.662031,0.661247,1.205476,-1881.320156,2,"""A"""
10,1.237564,0.671289,1.156893,0.67844,-852.529121,1,"""A"""
15,3.664119,0.208572,0.446782,0.840965,431.532664,0,"""A"""
…,…,…,…,…,…,…,…
99984,10.732264,0.223313,2.383238,0.616628,-417.195116,2,"""C"""
99987,10.277887,0.225801,0.021146,1.193793,-2154.588823,2,"""C"""
99991,7.849471,0.795797,0.249024,1.466287,-594.551347,0,"""C"""
99996,7.311652,0.404233,6.796084,1.126061,802.388238,1,"""C"""


In [6]:
# Baseline row count per flag value, for comparison with the downsampled frames below.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33143
1,33346
2,33511


In [7]:
# Downsample a single group. A rule is a (condition, ratio) pair: rows matching
# the condition are cut to roughly that fraction. In the output, flags == 0
# keeps about half of its rows while flags 1 and 2 keep their original counts.
flag_zero_rule = (pl.col("flags") == 0, 0.5)
sa1 = sa.downsample(df, flag_zero_rule)
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16572
1,33346
2,33511


In [8]:
# Downsample several groups at once: pass one (condition, ratio) pair per group.
# Each matching group keeps roughly the given fraction of its rows.
flag_rules = [
    (pl.col("flags") == 0, 0.5),
    (pl.col("flags") == 1, 0.3),
    (pl.col("flags") == 2, 0.4),
]
sa2 = sa.downsample(df, *flag_rules)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16572
1,10004
2,13405


In [9]:
# Baseline row count per category (30k / 30k / 40k), for comparison with the
# volume-neutral samples below.
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume neutral by category: with no target_volume given, every category is
# sampled down to the largest size they can all reach. In the output that is
# 30,000 rows each, the size of the smallest categories (A and B).
vn = sa.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume neutral with an explicit target: each category ends up with
# target_volume (10_000) rows.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume neutral (10_000) on a derived condition instead of a raw column.
# The boolean expression splits rows into "A" and "not A", and each side gets
# 10_000 rows. B and C share the "not A" side, so their counts sum to 10_000.
is_category_a = pl.col("category") == "A"
vn = sa.volume_neutral(df, by=is_category_a, target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4298
"""C""",5702


In [13]:
# Volume neutral with a control level: balancing is done separately inside
# each control group. Here every flag value has the same row count within a
# category, but the counts may differ from one category to the next.
group_keys = ["category", "flags"]
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
vn.group_by(group_keys).len().sort(group_keys)

category,flags,len
str,i32,u32
"""A""",0,9792
"""A""",1,9792
"""A""",2,9792
"""B""",0,9900
"""B""",1,9900
"""B""",2,9900
"""C""",0,13281
"""C""",1,13281
"""C""",2,13281


In [14]:
# With a control level, target_volume may not be met in every category.
# In the output, A and B have fewer than 10_000 rows for some flag value, so
# they stay at their own balanced counts, while C is cut to 10_000 per flag.
group_keys = ["category", "flags"]
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
vn.group_by(group_keys).len().sort(group_keys)

category,flags,len
str,i32,u32
"""A""",0,9792
"""A""",1,9792
"""A""",2,9792
"""B""",0,9900
"""B""",1,9900
"""B""",2,9900
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...