# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time series yet.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample_and_split as ss

In [2]:
# Demo data: pds.frame supplies the `row_num` column; the remaining columns are
# random features plus a fixed 30k / 30k / 40k block of category labels.
category_labels = ["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000
df = pds.frame(size=100_000).with_columns(
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    pds.random_int(0, 3).alias("flags"),  # yields the values 0, 1 and 2
    pl.Series("category", category_labels),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,7.455579,0.646415,3.907644,-1.162823,102.141997,2,"""A"""
1,0.528006,0.859853,0.344567,-0.136473,181.935043,1,"""A"""
2,3.555607,0.972201,0.261143,-0.836478,-720.824478,1,"""A"""
3,5.548706,0.983658,1.640527,0.954563,1448.808151,1,"""A"""
4,2.390781,0.911941,9.608222,1.254891,-267.94636,1,"""A"""


In [3]:
# Pick 2 column names at random; the names passed in `keep` are returned
# in addition to the 2 random picks (see the output below).
ss.random_cols(df.columns, 2, keep = ["row_num"])

['row_num', 'exp', 'normal']

In [4]:
# Random sample by ratio: a float second argument is the fraction of rows to keep (60% here).
ss.sample(df, 0.6) # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,7.455579,0.646415,3.907644,-1.162823,102.141997,2,"""A"""
1,0.528006,0.859853,0.344567,-0.136473,181.935043,1,"""A"""
2,3.555607,0.972201,0.261143,-0.836478,-720.824478,1,"""A"""
6,5.735477,0.840533,2.331571,-0.45213,-1499.985468,1,"""A"""
8,11.755689,0.203643,4.712804,0.755243,-1087.54292,0,"""A"""
…,…,…,…,…,…,…,…
99991,6.748497,0.310911,3.920252,-2.111197,-370.769352,0,"""C"""
99993,7.696526,0.2563,0.978704,0.552231,321.112705,0,"""C"""
99994,0.679278,0.891505,1.31112,-0.737345,1258.803922,0,"""C"""
99997,4.186838,0.977635,1.447623,-0.618252,1181.83129,1,"""C"""


In [5]:
# Random sample by count: an integer second argument is the number of rows to keep.
ss.sample(df, 30_000) # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
2,3.555607,0.972201,0.261143,-0.836478,-720.824478,1,"""A"""
6,5.735477,0.840533,2.331571,-0.45213,-1499.985468,1,"""A"""
8,11.755689,0.203643,4.712804,0.755243,-1087.54292,0,"""A"""
12,1.001083,0.185386,2.589078,-0.753902,-1597.862794,0,"""A"""
13,1.341568,0.532354,1.213042,-0.917556,-346.481171,1,"""A"""
…,…,…,…,…,…,…,…
99984,8.458869,0.187026,0.256183,1.294262,926.261393,1,"""C"""
99985,4.861788,0.679973,1.629607,-0.146087,737.701739,2,"""C"""
99986,11.732498,0.421336,1.395327,-0.639032,-72.43559,2,"""C"""
99988,11.306076,0.766519,0.302939,-0.476744,972.509535,2,"""C"""


In [6]:
# Baseline row count per flag value, for comparison with the downsampled frames.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33223
1,33206
2,33571


In [7]:
# Downsample a single group: keep about half of the rows where flags == 0
# and leave every other row untouched.
is_flag_zero = pl.col("flags") == 0
sa1 = ss.downsample(df, (is_flag_zero, 0.5))
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16611
1,33206
2,33571


In [8]:
# Downsample several groups in one call: each (condition, ratio) pair gives
# the fraction of matching rows to keep.
keep_ratio_by_flag = {0: 0.5, 1: 0.3, 2: 0.4}
sa2 = ss.downsample(
    df,
    *[(pl.col("flags") == flag, ratio) for flag, ratio in keep_ratio_by_flag.items()],
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16611
1,9961
2,13428


In [9]:
# Baseline row count per category, for comparison with the volume-neutral samples.
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sample by category: every category ends up with the same
# number of rows. With no target volume given, the largest common size is
# used, which here is the size of the smallest category (30_000).
vn = ss.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sample by category with an explicit target: each category
# is sampled down to 10_000 rows.
vn = ss.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume neutral over a boolean condition. The condition yields two groups,
# "category is A" and "category is not A", and each is sampled to 10_000 rows.
# That is why the B and C counts below add up to 10_000 together.
is_category_a = pl.col("category") == "A"
vn = ss.volume_neutral(df, by=is_category_a, target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4231
"""C""",5769


In [13]:
# Volume-neutral sample with a control level: balancing happens inside each
# category, so the flag counts are equal within a category but can differ
# from one category to the next.
vn = ss.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
group_keys = ["category", "flags"]
vn.group_by(group_keys).len().sort(group_keys)

category,flags,len
str,i32,u32
"""A""",0,9970
"""A""",1,9970
"""A""",2,9970
"""B""",0,9891
"""B""",1,9891
"""B""",2,9891
"""C""",0,13199
"""C""",1,13199
"""C""",2,13199


In [14]:
# The target volume may not be reachable in every category. Here C is sampled
# down to 10_000 rows per flag, while A and B stay at 9970 and 9891, the same
# sizes as without a target (presumably because their smallest flag group is
# already below 10_000).
vn = ss.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
group_keys = ["category", "flags"]
vn.group_by(group_keys).len().sort(group_keys)

category,flags,len
str,i32,u32
"""A""",0,9970
"""A""",1,9970
"""A""",2,9970
"""B""",0,9891
"""B""",1,9891
"""B""",2,9891
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

Split a dataframe into labelled subsets by ratio.

In [15]:
# Two-way split: a single float ratio produces a "train" split with that
# share of the rows (60% here) and a "test" split with the remainder. The
# label is written to the column named by `split_col`.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio = 0.6,
    split_col = "my_splits"
)
# group_by does not guarantee row order, so sort to keep the summary stable
# across runs and consistent with the other split summaries in this notebook.
df_with_splits.group_by("my_splits").agg(
    cnt = pl.len()
).sort("my_splits")


my_splits,cnt
str,u32
"""test""",40000
"""train""",60000


In [16]:
# A list of ratios yields one split per entry; the splits are labelled
# split_0, split_1, ... in list order.
ratios = [0.25, 0.4, 0.10, 0.25]
df_with_splits = ss.split_by_ratio(df, split_ratio=ratios, split_col="my_splits")
split_counts = df_with_splits.group_by("my_splits").agg(cnt=pl.len())
split_counts.sort("my_splits")

my_splits,cnt
str,u32
"""split_0""",25000
"""split_1""",40000
"""split_2""",10000
"""split_3""",25000


In [17]:
# A dict of ratios lets you choose the split labels yourself: each key is a
# label and each value is that split's share of the rows.
named_ratios = {'train': 0.6, 'test': 0.3, 'validation1': 0.05, 'validation2': 0.05}
df_with_splits = ss.split_by_ratio(df, split_ratio=named_ratios, split_col="my_splits")

split_counts = df_with_splits.group_by("my_splits").agg(cnt=pl.len())
split_counts.sort("my_splits")

my_splits,cnt
str,u32
"""test""",30000
"""train""",60000
"""validation1""",5000
"""validation2""",5000


In [18]:
# The split label is stored in the `my_splits` column; peek at the first rows.
df_with_splits.head()

my_splits,row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
str,i64,f64,f64,f64,f64,f64,i32,str
"""train""",45013,9.169039,0.717273,4.697014,1.204236,-487.318857,0,"""B"""
"""train""",16961,2.238217,0.856904,4.5812,0.482397,1066.509262,2,"""A"""
"""train""",15961,11.691381,0.173182,4.340948,-0.419331,-1188.824524,2,"""A"""
"""train""",70943,7.647667,0.482927,1.704144,1.292678,1796.367936,2,"""C"""
"""train""",78203,3.280742,0.238988,1.896998,-1.068015,-1565.01054,1,"""C"""


In [21]:
# Stratified split: with by="category" the 60/40 ratio is applied inside each
# category, so every category has the same train/test proportions.
df_with_splits = ss.split_by_ratio(
    df,
    split_ratio=0.6,
    split_col="my_splits",
    by="category",
    default_split_1="train",
    default_split_2="test",
)
# Share of each split within its own category.
category_total = pl.col("cnt").sum().over("category")
(
    df_with_splits.group_by(["category", "my_splits"])
    .agg(cnt=pl.len())
    .with_columns(pct_in_category=pl.col("cnt") / category_total)
    .sort(["category", "my_splits"])
)



category,my_splits,cnt,pct_in_category
str,str,u32,f64
"""A""","""test""",12000,0.4
"""A""","""train""",18000,0.6
"""B""","""test""",12000,0.4
"""B""","""train""",18000,0.6
"""C""","""test""",16000,0.4
"""C""","""train""",24000,0.6


In [None]:
# To work with each split on its own, either filter on the split column
# or call .partition_by("my_splits") to get one dataframe per split.