# Sampling and Splitting

## Sampling

Sampling for basic tabular datasets. (Not designed for time series as of now.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample_and_split as ss

In [2]:
# Synthetic demo frame with 100_000 rows: several random numeric columns, an integer
# "flags" column, and a fixed "category" column (30k "A", 30k "B", 40k "C").
df = pds.frame(size=100_000).with_columns(
    pds.random(0.0, 12.0).alias("uniform_1"),
    # NOTE(review): this alias looks like a typo for "uniform_2". It is left unchanged
    # here so the column name keeps matching the recorded outputs.
    pds.random(0.0, 1.0).alias("urandom_colsniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    (pds.random_int(0, 3)).alias("flags"),
    pl.Series(["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000).alias("category"),
)
df.head()

row_num,uniform_1,urandom_colsniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,0.062469,0.925184,2.038866,0.497015,-1590.560212,1,"""A"""
1,11.530801,0.154774,1.624819,-0.948169,-108.364886,2,"""A"""
2,3.617283,0.421837,0.728838,-2.047607,-432.34494,2,"""A"""
3,6.818916,0.780322,0.385907,-1.184253,351.643934,1,"""A"""
4,3.180392,0.784782,1.286898,-0.368634,-363.649579,1,"""A"""


In [3]:
# Pick 2 column names at random; in the recorded output the names listed in `keep`
# are returned in addition to those 2.
ss.random_cols(df.columns, 2, keep=["row_num"])

['row_num', 'flags', 'category']

In [4]:
# Random sample: a float argument is used as the ratio of rows to sample.
ss.sample(df, 0.6)  # by ratio

row_num,uniform_1,urandom_colsniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
82702,2.229162,0.638616,3.628657,-0.262958,-1152.329339,2,"""B"""
36410,4.760458,0.231047,3.969878,-0.293172,-122.635358,2,"""C"""
54968,0.065446,0.470443,0.159839,-0.226199,-238.227888,2,"""B"""
84666,0.147541,0.932781,0.390118,-0.192892,172.135128,1,"""B"""
72771,0.862842,0.890919,0.54582,-1.359223,-329.056312,1,"""C"""
…,…,…,…,…,…,…,…
37854,5.094334,0.566698,0.072688,-0.710074,1146.82983,1,"""C"""
29549,8.499257,0.595736,3.374975,0.17636,-421.08727,1,"""A"""
62450,9.764076,0.784551,0.542186,-0.80414,1434.21823,0,"""A"""
57317,5.196103,0.730802,1.057487,-0.122886,1279.089307,0,"""A"""


In [5]:
# Same helper with an integer argument: sample a fixed number of rows.
ss.sample(df, 30_000)  # by count

row_num,uniform_1,urandom_colsniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
99850,5.694136,0.518645,0.098432,-0.30822,713.130432,1,"""B"""
43644,1.957421,0.699121,0.933353,0.164465,802.775833,0,"""C"""
18868,2.486767,0.730802,6.490098,-0.235413,1052.811527,0,"""A"""
91747,3.450563,0.085919,2.052384,-1.881719,-708.1494,0,"""B"""
49021,10.403326,0.308978,0.77882,-0.200172,-986.116556,1,"""C"""
…,…,…,…,…,…,…,…
43535,1.866867,0.512924,0.69225,0.514711,87.594382,2,"""C"""
72369,11.145346,0.195492,0.364522,-0.158083,-783.034044,0,"""B"""
40448,1.924971,0.116269,0.693774,1.688503,178.085544,2,"""C"""
70747,0.168675,0.201661,0.8556,0.93096,1073.166777,0,"""A"""


In [6]:
# Baseline row count per flag value, for comparison with the downsampled frames.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33244
1,33459
2,33297


In [8]:
# Downsample on one group: each (predicate, ratio) pair keeps that fraction of the
# rows matching the predicate. In the recorded output half of the flags == 0 rows
# are kept and the other flag values are left untouched.
sa1 = ss.downsample(df, [(pl.col("flags") == 0, 0.5)])
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16622
1,33459
2,33297


In [9]:
# Downsample on multiple groups: one (predicate, ratio) pair per group, each
# applied to the rows matching its own predicate.
sa2 = ss.downsample(
    df,
    [
        (pl.col("flags") == 0, 0.5),
        (pl.col("flags") == 1, 0.3),
        (pl.col("flags") == 2, 0.4),
    ],
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16622
1,10037
2,13318


In [10]:
# Baseline row count per category, for comparison with the volume-neutral samples.
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [11]:
# Volume neutral by category. With no target_volume, the largest volume that every
# category can reach is used: in the recorded output each category ends up with
# 30_000 rows, the size of the smallest categories.
vn = ss.volume_neutral(
    df,
    by=pl.col("category"),
)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [12]:
# Volume neutral (10_000) by each category: every category is sampled down to
# target_volume rows.
vn = ss.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [13]:
# Volume neutral (10_000) by a more complicated condition.
# `by` is a boolean expression here, so the two groups are "category == A" and
# "category != A". The output makes sense because count for B + count for C = 10_000,
# matching the 10_000 rows kept for A.
vn = ss.volume_neutral(
    df, by=pl.col("category") == "A", target_volume=10_000
)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4273
"""C""",5727


In [14]:
# Volume neutral sample with a control level. Neutrality is enforced within each
# category: inside a given category, every flag value ends up with the same number
# of rows, but that number can differ from one category to the next.
vn = ss.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9873
"""A""",1,9873
"""A""",2,9873
"""B""",0,9914
"""B""",1,9914
"""B""",2,9914
"""C""",0,13086
"""C""",1,13086
"""C""",2,13086


In [15]:
# We may not meet the target volume for all categories: in the recorded output,
# A and B stay below 10_000 rows per flag, and only C is cut down to 10_000.
vn = ss.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"), target_volume=10_000)
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9873
"""A""",1,9873
"""A""",2,9873
"""B""",0,9914
"""B""",1,9914
"""B""",2,9914
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

Split by ratios.

In [16]:
# A single float ratio gives a two-way split; in the recorded output 60% of the
# rows are labelled "train" and the remaining 40% "test" in the `my_splits` column.
df_with_splits = ss.split_by_ratio(df, split_ratio=0.6, split_col="my_splits")
df_with_splits.group_by("my_splits").agg(cnt=pl.len())

my_splits,cnt
str,u32
"""test""",40000
"""train""",60000


In [17]:
# A list of ratios gives one split per entry; the recorded output labels them
# split_0, split_1, ... in list order, each sized by its ratio.
df_with_splits = ss.split_by_ratio(df, split_ratio=[0.25, 0.4, 0.10, 0.25], split_col="my_splits")
df_with_splits.group_by("my_splits").agg(cnt=pl.len()).sort("my_splits")

my_splits,cnt
str,u32
"""split_0""",25000
"""split_1""",40000
"""split_2""",10000
"""split_3""",25000


In [18]:
# A dict maps each split label to its ratio, so the splits get custom names.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio={"train": 0.6, "test": 0.3, "validation1": 0.05, "validation2": 0.05},
    split_col="my_splits",
)

df_with_splits.group_by("my_splits").agg(cnt=pl.len()).sort("my_splits")

my_splits,cnt
str,u32
"""test""",30000
"""train""",60000
"""validation1""",5000
"""validation2""",5000


In [19]:
# The split label is stored in the `my_splits` column, shown first in the output.
df_with_splits.head()

my_splits,row_num,uniform_1,urandom_colsniform_2,exp,normal,fat_normal,flags,category
str,i64,f64,f64,f64,f64,f64,i32,str
"""train""",92545,8.856753,0.141021,1.673329,-0.168888,400.572622,0,"""C"""
"""train""",38426,2.479773,0.696431,0.003894,0.726736,-979.803882,1,"""B"""
"""train""",68590,9.031966,0.33151,0.266588,1.235721,70.120394,2,"""C"""
"""train""",75155,10.696929,0.664846,0.014105,-0.910292,28.54135,2,"""C"""
"""train""",94734,6.577782,0.574089,2.973529,0.281565,772.904546,1,"""C"""


In [20]:
# Stratified split: with by="category" the 60/40 ratio holds inside every category
# (see pct_in_category in the output), not only over the whole frame.
# NOTE(review): default_split_1 / default_split_2 presumably name the two splits
# produced by a single float ratio — confirm against the polars_ds documentation.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio=0.6,
    split_col="my_splits",
    by="category",
    default_split_1="train",
    default_split_2="test",
)
# Share of each split within its category, to verify the stratification.
df_with_splits.group_by(["category", "my_splits"]).agg(cnt=pl.len()).with_columns(
    pct_in_category=pl.col("cnt") / pl.col("cnt").sum().over("category")
).sort(["category", "my_splits"])

category,my_splits,cnt,pct_in_category
str,str,u32,f64
"""A""","""test""",12000,0.4
"""A""","""train""",18000,0.6
"""B""","""test""",12000,0.4
"""B""","""train""",18000,0.6
"""C""","""test""",16000,0.4
"""C""","""train""",24000,0.6


In [21]:
# If you need to work with the splits individually, you can filter on the split column
# or use .partition_by("my_splits") to get separate dataframes.