# Sampling and Splitting

## Sampling

Sampling for basic tabular datasets. (Not designed for time series as of now.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample_and_split as ss

In [2]:
# Build a 100_000-row synthetic frame used by every example below.
# The category labels come in contiguous runs: 30k "A", 30k "B", then 40k "C".
category_values = ["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000

df = pds.frame(size=100_000).with_columns(
    # Random numeric columns; the aliases name the intended distribution
    # (argument meanings are presumably (low, high) / rate / (mean, std) — confirm).
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    # Integer group label; the counts shown further down contain 0, 1 and 2.
    pds.random_int(0, 3).alias("flags"),
    pl.Series(category_values).alias("category"),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,1.42563,0.688449,1.088206,-1.078369,-1.638006,2,"""A"""
1,7.802092,0.426101,2.444241,-0.686859,-1176.657982,2,"""A"""
2,4.831859,0.615212,2.494902,0.862842,-384.930746,2,"""A"""
3,0.066878,0.899334,3.8919,-1.176123,147.074087,1,"""A"""
4,9.138838,0.06889,2.742155,-0.055659,-52.394874,1,"""A"""


In [3]:
# Pick 2 column names at random while always retaining "row_num"; the result
# shown below holds the kept column plus the 2 random picks.
ss.random_cols(df.columns, 2, keep=["row_num"])

['row_num', 'flags', 'category']

In [4]:
# Random sample by ratio: the float 0.6 is used as the fraction of rows to draw.
ss.sample(df, 0.6) # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,1.42563,0.688449,1.088206,-1.078369,-1.638006,2,"""A"""
2,4.831859,0.615212,2.494902,0.862842,-384.930746,2,"""A"""
4,9.138838,0.06889,2.742155,-0.055659,-52.394874,1,"""A"""
5,0.638747,0.120555,1.538502,1.718381,-89.007553,1,"""A"""
9,10.706496,0.841814,3.118728,0.711647,6.35076,2,"""A"""
…,…,…,…,…,…,…,…
99993,2.991675,0.424785,1.207897,-1.353958,1954.324746,1,"""C"""
99994,8.110839,0.10436,0.006254,-0.829466,-831.69489,1,"""C"""
99995,3.62334,0.571694,1.225951,0.053878,472.492566,1,"""C"""
99998,0.242871,0.236259,1.314217,-0.655093,598.864029,0,"""C"""


In [5]:
# Random sample by count: the integer 30_000 is used as the number of rows to draw.
ss.sample(df, 30_000) # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
2,4.831859,0.615212,2.494902,0.862842,-384.930746,2,"""A"""
5,0.638747,0.120555,1.538502,1.718381,-89.007553,1,"""A"""
11,9.847271,0.901699,0.656346,-1.812715,-1296.98553,2,"""A"""
14,9.185885,0.001916,4.715927,-0.615937,224.645881,2,"""A"""
18,3.683518,0.151237,5.535248,0.299322,-1615.920814,1,"""A"""
…,…,…,…,…,…,…,…
99990,8.829486,0.992849,2.151165,-1.396253,-315.051661,0,"""C"""
99991,5.539581,0.891651,1.35365,0.354068,-1455.428714,2,"""C"""
99995,3.62334,0.571694,1.225951,0.053878,472.492566,1,"""C"""
99997,1.328283,0.707218,0.307802,-0.220496,906.190389,2,"""C"""


In [6]:
# Baseline: number of rows per "flags" value before any downsampling.
(
    df.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,33511
1,33100
2,33389


In [7]:
# Downsample a single group: the (condition, ratio) pair keeps about half of
# the rows where flags == 0. The counts below show the other flags unchanged.
sa1 = ss.downsample(df, (pl.col("flags") == 0, 0.5))
(
    sa1.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16755
1,33100
2,33389


In [8]:
# Downsample several groups in one call: one (condition, ratio) pair per flag
# value, each giving the fraction of matching rows to keep.
keep_ratio_by_flag = [(0, 0.5), (1, 0.3), (2, 0.4)]
sa2 = ss.downsample(
    df,
    *[(pl.col("flags") == flag, ratio) for flag, ratio in keep_ratio_by_flag],
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16755
1,9930
2,13355


In [9]:
# Baseline: number of rows per category before any volume-neutral sampling.
(
    df.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sample across categories. With no target volume given, the
# largest value that still yields equal volumes is used, so every category ends
# up with the same number of rows (30_000 here, the size of the smallest ones).
vn = ss.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sample with an explicit target: each category is sampled down
# to 10_000 rows.
vn = ss.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume neutral (10_000) by a more complicated condition: `by` is a boolean
# expression rather than a plain column. The result makes sense because the
# condition separates category "A" from everything else, so "A" gets 10_000
# rows and count for B + count for C = 10_000.
is_category_a = pl.col("category") == "A"
vn = ss.volume_neutral(df, by=is_category_a, target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4301
"""C""",5699


In [13]:
# Volume-neutral sample with a control level. Balancing happens inside each
# category separately: within one category every flag value ends up with the
# same number of rows, while different categories may end up with different sizes.
vn = ss.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))

summary_keys = ["category", "flags"]
vn.group_by(summary_keys).len().sort(summary_keys)

category,flags,len
str,i32,u32
"""A""",0,9839
"""A""",1,9839
"""A""",2,9839
"""B""",0,9959
"""B""",1,9959
"""B""",2,9959
"""C""",0,13302
"""C""",1,13302
"""C""",2,13302


In [14]:
# Control level plus a target volume. The target may not be met in every
# category: in the output below "A" and "B" stay under 10_000 rows per flag
# (same sizes as in the previous cell), while "C" is brought down to 10_000.
vn = ss.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)

summary_keys = ["category", "flags"]
vn.group_by(summary_keys).len().sort(summary_keys)

category,flags,len
str,i32,u32
"""A""",0,9839
"""A""",1,9839
"""A""",2,9839
"""B""",0,9959
"""B""",1,9959
"""B""",2,9959
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

Split by ratios.

In [15]:
# Two-way split: with a single float ratio, 60% of the rows are labelled
# "train" and the remaining 40% "test" in the new "my_splits" column.
df_with_splits = ss.split_by_ratio(df, split_ratio=0.6, split_col="my_splits")

# NOTE(review): unlike the later split summaries this one is not sorted, so the
# order of the "train"/"test" rows is presumably not guaranteed — confirm.
df_with_splits.group_by("my_splits").agg(cnt=pl.len())


my_splits,cnt
str,u32
"""train""",60000
"""test""",40000


In [16]:
# Multi-way split: a list of ratios yields one split per entry; the output
# below shows them labelled "split_0" ... "split_3".
split_ratios = [0.25, 0.4, 0.10, 0.25]
df_with_splits = ss.split_by_ratio(df, split_ratio=split_ratios, split_col="my_splits")
(
    df_with_splits.group_by("my_splits")
    .agg(cnt=pl.len())
    .sort("my_splits")
)

my_splits,cnt
str,u32
"""split_0""",25000
"""split_1""",40000
"""split_2""",10000
"""split_3""",25000


In [17]:
# Named splits: a dict maps each split label to its ratio, and the labels are
# what ends up in the "my_splits" column.
named_ratios = {
    "train": 0.6,
    "test": 0.3,
    "validation1": 0.05,
    "validation2": 0.05,
}
df_with_splits = ss.split_by_ratio(df, split_ratio=named_ratios, split_col="my_splits")

(
    df_with_splits.group_by("my_splits")
    .agg(cnt=pl.len())
    .sort("my_splits")
)

my_splits,cnt
str,u32
"""test""",30000
"""train""",60000
"""validation1""",5000
"""validation2""",5000


In [18]:
# Peek at the first rows of the split frame: the split label is carried in the
# "my_splits" column next to the original columns.
df_with_splits.head()

my_splits,row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
str,i64,f64,f64,f64,f64,f64,i32,str
"""train""",46813,1.744022,0.729093,1.010762,1.274261,-661.762735,0,"""B"""
"""train""",62409,1.906721,0.537013,2.083227,0.652801,-360.7415,0,"""C"""
"""train""",86917,9.07718,0.220533,0.217066,1.681312,-760.02373,1,"""C"""
"""train""",39567,3.006608,0.143211,7.729981,1.055814,-115.14681,0,"""B"""
"""train""",76224,0.189889,0.101562,1.765515,-1.234191,618.427717,0,"""C"""


In [19]:
# If you need to work with the splits individually, you can filter on the
# split column, or use .partition_by("my_splits") to get separate DataFrames.