# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time series data as of now.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample_and_split as ss

In [2]:
# Build a 100,000-row demo frame. The head() output below shows that
# pds.frame supplies the row_num column; the remaining columns are added here.
df = pds.frame(size=100_000)
df = df.with_columns(
    # Random float columns.
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    # Integer label used as the grouping key in the sampling examples.
    pds.random_int(0, 3).alias("flags"),
    # Fixed-size categories: 30k "A", 30k "B", 40k "C"; the lengths add up
    # to the 100,000 rows of the frame.
    pl.Series(["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000).alias("category"),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,2.375698,0.306533,4.211196,0.867607,-739.840101,2,"""A"""
1,10.704956,0.710361,0.061798,-0.090433,-1281.037206,1,"""A"""
2,6.569019,0.424319,0.31042,0.53873,1272.196366,1,"""A"""
3,9.460909,0.178204,0.108187,0.483564,-302.587195,1,"""A"""
4,0.472411,0.797066,0.574556,0.84752,-675.503736,2,"""A"""


In [3]:
# Pick 2 column names at random; the columns listed in `keep` are always
# included in the result (see row_num in the output below).
ss.random_cols(df.columns, 2, keep=["row_num"])

['row_num', 'normal', 'category']

In [4]:
# Random sample; a float value is used as a sampling ratio.
ss.sample(df, 0.6)  # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
1,10.704956,0.710361,0.061798,-0.090433,-1281.037206,1,"""A"""
2,6.569019,0.424319,0.31042,0.53873,1272.196366,1,"""A"""
3,9.460909,0.178204,0.108187,0.483564,-302.587195,1,"""A"""
4,0.472411,0.797066,0.574556,0.84752,-675.503736,2,"""A"""
6,8.959549,0.582041,4.157775,-0.617712,484.235326,2,"""A"""
…,…,…,…,…,…,…,…
99994,2.443397,0.005157,0.629397,-0.696746,1807.499997,0,"""C"""
99995,3.808145,0.033321,3.928965,0.155838,738.475897,1,"""C"""
99996,5.196431,0.567853,2.630441,1.422917,-408.294107,1,"""C"""
99998,3.155122,0.667723,5.467898,1.153467,746.408392,1,"""C"""


In [5]:
# Random sample; an integer value is used as a row count.
ss.sample(df, 30_000)  # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
6,8.959549,0.582041,4.157775,-0.617712,484.235326,2,"""A"""
7,6.399035,0.959055,0.781183,0.983751,-1225.996629,2,"""A"""
11,5.631642,0.437399,0.509954,1.553941,877.710016,2,"""A"""
12,0.713898,0.26459,0.232114,-0.308523,-1690.6116,1,"""A"""
14,5.208092,0.981489,0.387223,-1.128713,575.710573,2,"""A"""
…,…,…,…,…,…,…,…
99976,4.484445,0.819104,0.479851,0.217143,-299.902301,0,"""C"""
99983,5.41788,0.885326,1.96973,-0.039258,-463.50322,2,"""C"""
99988,1.326043,0.862608,0.067416,-0.098538,-775.818205,2,"""C"""
99990,0.392468,0.575246,1.844403,2.281576,-2783.843463,1,"""C"""


In [6]:
# Row count for each value of `flags` in the full frame (before downsampling).
(
    df.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,33597
1,33200
2,33203


In [7]:
# Downsample a single group: rows matching the condition (flags == 0) are
# sampled at a ratio of 0.5; the counts below show the other groups unchanged.
sa1 = ss.downsample(df, (pl.col("flags") == 0, 0.5))
(
    sa1.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16798
1,33200
2,33203


In [8]:
# Downsample several groups at once: each (condition, ratio) pair is applied
# to the rows matching its condition.
sa2 = ss.downsample(
    df,
    (pl.col("flags") == 0, 0.5),
    (pl.col("flags") == 1, 0.3),
    (pl.col("flags") == 2, 0.4),
)
(
    sa2.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16798
1,9960
2,13281


In [9]:
# Row count for each category in the full frame (before volume-neutral sampling).
(
    df.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sampling by category. Without a target volume, the largest
# volume that every category can reach is used, so each category ends up with
# the same number of rows (30,000 here, the size of the smallest categories).
vn = ss.volume_neutral(df, by=pl.col("category"))
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sampling by category with an explicit target of 10,000 rows
# per category.
vn = ss.volume_neutral(
    df,
    by=pl.col("category"),
    target_volume=10_000,
)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sampling (10,000) on a more complicated condition. Here `by`
# is a boolean expression, so the two groups are category == "A" and
# everything else. The result makes sense because count for B + count for C
# = 10,000, matching the 10,000 rows kept for A.
vn = ss.volume_neutral(
    df,
    by=pl.col("category") == "A",
    target_volume=10_000,
)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",4301
"""C""",5699


In [13]:
# Volume-neutral sampling with a control level. Neutrality is enforced inside
# each control group (category), i.e. within a category every flag value ends
# up with the same number of rows; different categories may differ.
vn = ss.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
)
(
    vn.group_by(["category", "flags"])
    .len()
    .sort(["category", "flags"])
)

category,flags,len
str,i32,u32
"""A""",0,9860
"""A""",1,9860
"""A""",2,9860
"""B""",0,9953
"""B""",1,9953
"""B""",2,9953
"""C""",0,13203
"""C""",1,13203
"""C""",2,13203


In [14]:
# The target volume may not be reachable in every control group. In the output
# below, categories A and B stay under 10,000 rows per flag, while category C
# reaches the target.
vn = ss.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
(
    vn.group_by(["category", "flags"])
    .len()
    .sort(["category", "flags"])
)

category,flags,len
str,i32,u32
"""A""",0,9860
"""A""",1,9860
"""A""",2,9860
"""B""",0,9953
"""B""",1,9953
"""B""",2,9953
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

Split a dataframe into labelled subsets according to the given ratio(s).

In [15]:
# A single float gives a two-way split; the output below shows a 60,000-row
# "train" split and a 40,000-row "test" split. The label is written to the
# column named by `split_col`.
df_with_splits = ss.split_by_ratio(df, split_ratio=0.6, split_col="my_splits")
df_with_splits.group_by("my_splits").agg(cnt=pl.len())


my_splits,cnt
str,u32
"""test""",40000
"""train""",60000


In [16]:
# A list of ratios gives one split per entry; the output below shows them
# labelled split_0, split_1, ... in list order.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio=[0.25, 0.4, 0.10, 0.25],
    split_col="my_splits",
)
(
    df_with_splits.group_by("my_splits")
    .agg(cnt=pl.len())
    .sort("my_splits")
)

my_splits,cnt
str,u32
"""split_0""",25000
"""split_1""",40000
"""split_2""",10000
"""split_3""",25000


In [17]:
# A dict maps each split name to its ratio, so the splits carry custom labels.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio={
        "train": 0.6,
        "test": 0.3,
        "validation1": 0.05,
        "validation2": 0.05,
    },
    split_col="my_splits",
)
(
    df_with_splits.group_by("my_splits")
    .agg(cnt=pl.len())
    .sort("my_splits")
)

my_splits,cnt
str,u32
"""test""",30000
"""train""",60000
"""validation1""",5000
"""validation2""",5000


In [18]:
# Peek at the labelled frame: the split column appears before the original columns.
df_with_splits.head()

my_splits,row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
str,i64,f64,f64,f64,f64,f64,i32,str
"""train""",10311,4.147665,0.907098,4.370451,0.130662,-75.323041,1,"""A"""
"""train""",8991,5.71963,0.588056,3.670106,-0.051855,1234.284869,1,"""A"""
"""train""",63140,6.902809,0.885146,2.477001,0.692887,-583.215671,1,"""C"""
"""train""",72456,0.944483,0.984328,0.607009,-0.828894,-2269.310795,1,"""C"""
"""train""",92763,5.747988,0.286253,3.47296,0.078784,-763.826196,2,"""C"""


In [19]:
# Stratified split: with by="category" the 0.6 / 0.4 ratio is applied inside
# each category, as the pct_in_category column in the output confirms.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio=0.6,
    split_col="my_splits",
    by="category",
    default_split_1="train",
    default_split_2="test",
)
# Count rows per (category, split) and express each count as a share of its category.
(
    df_with_splits.group_by(["category", "my_splits"])
    .agg(cnt=pl.len())
    .with_columns(
        pct_in_category=pl.col("cnt") / pl.col("cnt").sum().over("category")
    )
    .sort(["category", "my_splits"])
)



category,my_splits,cnt,pct_in_category
str,str,u32,f64
"""A""","""test""",12000,0.4
"""A""","""train""",18000,0.6
"""B""","""test""",12000,0.4
"""B""","""train""",18000,0.6
"""C""","""test""",16000,0.4
"""C""","""train""",24000,0.6


In [20]:
# If you need to work with the splits individually, you can filter on the
# split column, or use .partition_by("my_splits") to get separate dataframes.