# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time series yet.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Demo frame with 100,000 rows: random numeric columns, an integer "flags"
# column, and a fixed "category" column (30k "A", 30k "B", 40k "C").
df = (
    pds.random_data(size=100_000, n_cols=0)
    .with_columns(
        pds.random(0.0, 12.0).alias("uniform_1"),
        pds.random(0.0, 1.0).alias("uniform_2"),
        pds.random_exp(0.5).alias("exp"),
        pds.random_normal(0.0, 1.0).alias("normal"),
        pds.random_normal(0.0, 1000.0).alias("fat_normal"),
        pds.random_int(0, 3).alias("flags"),
        pl.Series(
            ["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000
        ).alias("category"),
    )
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,11.64451,0.124603,0.758623,2.060566,438.779809,0,"""A"""
1,2.748586,0.014969,7.090475,1.590593,993.017676,1,"""A"""
2,10.260554,0.10973,5.796965,-1.212692,642.298279,2,"""A"""
3,10.889005,0.908854,0.151211,1.197364,-525.448295,0,"""A"""
4,2.62101,0.310746,0.408312,0.394749,-498.125382,0,"""A"""


In [3]:
# Pick 2 random column names; the columns listed in `keep` are always included.
sa.random_cols(df, 2, keep=["row_num"])

['row_num', 'normal', 'category']

In [4]:
# Random sample by ratio: a float value asks for that fraction of the rows.
sa.sample(df, 0.6)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
2,10.260554,0.10973,5.796965,-1.212692,642.298279,2,"""A"""
3,10.889005,0.908854,0.151211,1.197364,-525.448295,0,"""A"""
4,2.62101,0.310746,0.408312,0.394749,-498.125382,0,"""A"""
5,6.417608,0.221039,1.678449,-1.45285,-322.599835,0,"""A"""
6,10.250766,0.140756,1.315467,-1.102321,1457.466441,2,"""A"""
…,…,…,…,…,…,…,…
99993,10.083477,0.830525,5.002596,0.529133,1469.820801,0,"""C"""
99994,5.732657,0.770301,2.194749,-1.622315,416.484199,1,"""C"""
99996,1.337781,0.172623,1.023945,0.086787,400.605295,2,"""C"""
99997,7.303107,0.56098,0.256317,-0.262152,185.775187,2,"""C"""


In [5]:
# Random sample by count: an integer value asks for that many rows.
sa.sample(df, 30_000)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,11.64451,0.124603,0.758623,2.060566,438.779809,0,"""A"""
1,2.748586,0.014969,7.090475,1.590593,993.017676,1,"""A"""
7,10.623024,0.576213,0.95697,-2.556449,496.157876,2,"""A"""
9,9.837957,0.877281,1.481505,-0.959475,1389.179288,0,"""A"""
23,0.030308,0.908837,3.397795,-0.980877,-1279.96811,0,"""A"""
…,…,…,…,…,…,…,…
99976,7.226565,0.815021,0.394262,1.102591,819.351731,0,"""C"""
99979,8.91381,0.801717,5.664321,0.586003,10.872884,1,"""C"""
99982,3.839575,0.726429,1.208244,0.36872,-165.014912,2,"""C"""
99987,7.330535,0.586176,0.418036,-0.905389,503.257395,1,"""C"""


In [6]:
# Baseline: number of rows per "flags" value in the full frame.
(
    df.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,33427
1,33329
2,33244


In [7]:
# Downsample a single group: rows with flags == 0 are sampled at a rate of 0.5,
# all other rows are left as they are.
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
(
    sa1.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16714
1,33329
2,33244


In [8]:
# Downsample several groups at once: pass a list of (condition, rate) pairs,
# one per group. Here flags 0, 1 and 2 are sampled at 0.5, 0.3 and 0.4.
sa2 = sa.downsample(
    df,
    [
        (pl.col("flags") == flag, rate)
        for flag, rate in ((0, 0.5), (1, 0.3), (2, 0.4))
    ],
)
(
    sa2.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16714
1,9999
2,13298


In [9]:
# Baseline: number of rows per "category" value in the full frame.
(
    df.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sample by category: every category ends up with the same
# number of rows. Without a target volume, the largest count that still keeps
# the volumes equal is used.
vn = sa.volume_neutral(df, by=pl.col("category"))
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sample by category with an explicit target of 10_000 rows
# per category.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sample (10_000) on a more complicated condition. The grouping
# is the boolean `category == "A"`, so B and C fall into the same group:
# count for B + count for C = 10_000, matching the count for A.
vn = sa.volume_neutral(
    df,
    by=pl.col("category") == "A",
    target_volume=10_000,
)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",4240
"""C""",5760


In [13]:
# Volume-neutral sample with a control level. Balancing happens inside each
# control value: within every category, each flag gets the same number of rows.
# Different categories may still end up with different volumes.
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
(
    vn.group_by(["category", "flags"])
    .len()
    .sort(["category", "flags"])
)

category,flags,len
str,i32,u32
"""A""",0,9954
"""A""",1,9954
"""A""",2,9954
"""B""",0,9929
"""B""",1,9929
"""B""",2,9929
"""C""",0,13276
"""C""",1,13276
"""C""",2,13276


In [14]:
# Control level plus a target volume. The target may not be reachable in every
# category: a category that cannot supply 10_000 rows per flag keeps a smaller,
# still balanced, volume, while the others are brought down to the target.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
(
    vn.group_by(["category", "flags"])
    .len()
    .sort(["category", "flags"])
)

category,flags,len
str,i32,u32
"""A""",0,9954
"""A""",1,9954
"""A""",2,9954
"""B""",0,9929
"""B""",1,9929
"""B""",2,9929
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...