# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time series yet.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Start from a 100_000-row frame (n_cols=0, so only row_num shows up in the
# output below), then attach the random columns used throughout this notebook.
df = pds.random_data(size=100_000, n_cols=0)
df = df.with_columns(
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    pds.random_int(0, 3).alias("flags"),
    # Deliberately unbalanced categories; the three counts add up to the
    # 100_000 rows requested above.
    pl.Series(["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000).alias("category"),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,1.375634,0.919511,0.067392,-0.309129,-800.323135,2,"""A"""
1,2.659665,0.853205,3.206773,-0.350938,-1299.486949,2,"""A"""
2,4.643957,0.88178,0.819599,2.790163,-1608.589915,0,"""A"""
3,10.834849,0.95228,3.388264,0.117555,1010.71862,1,"""A"""
4,4.563482,0.589468,0.855739,-0.083827,-860.192229,2,"""A"""


In [3]:
# Pick 2 column names at random; row_num is listed in `keep`, so it is always returned too.
sa.random_cols(df, 2, keep=["row_num"])

['row_num', 'fat_normal', 'flags']

In [4]:
# Random sample by ratio: 0.6 is passed as a ratio of the rows.
sa.sample(df, 0.6)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
3,10.834849,0.95228,3.388264,0.117555,1010.71862,1,"""A"""
5,3.602171,0.864772,4.463937,-0.708025,-51.380391,1,"""A"""
8,8.039353,0.067563,2.049014,-1.49984,-307.088697,2,"""A"""
9,1.355068,0.477342,5.740475,0.440276,545.951635,0,"""A"""
10,3.199899,0.979296,6.24005,1.408848,-180.079712,1,"""A"""
…,…,…,…,…,…,…,…
99990,6.376287,0.595959,2.09459,-0.936174,164.523629,1,"""C"""
99994,0.311679,0.736031,1.331186,0.82819,647.814932,1,"""C"""
99996,5.218256,0.806905,0.462257,-0.07332,368.090563,0,"""C"""
99998,5.165351,0.728434,0.756417,-0.206222,231.545379,1,"""C"""


In [5]:
# Random sample by count: 30_000 is passed as a row count instead of a ratio.
sa.sample(df, 30_000)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
10,3.199899,0.979296,6.24005,1.408848,-180.079712,1,"""A"""
17,0.71813,0.472534,5.709439,-0.947008,-843.230748,1,"""A"""
23,10.434815,0.333791,3.876595,-0.359352,-2048.117448,2,"""A"""
27,10.179126,0.368184,1.754156,-0.858292,375.173934,0,"""A"""
28,1.038052,0.724587,8.525754,1.821027,2216.459529,0,"""A"""
…,…,…,…,…,…,…,…
99983,1.339939,0.822143,1.422642,0.128057,-364.864121,0,"""C"""
99986,7.957552,0.051115,2.50942,-0.344548,-129.235448,0,"""C"""
99993,0.407622,0.187425,0.496021,-2.373553,-1084.660588,2,"""C"""
99996,5.218256,0.806905,0.462257,-0.07332,368.090563,0,"""C"""


In [6]:
# Row count per flag value in the full dataset.
(
    df.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,33217
1,33344
2,33439


In [7]:
# Downsample a single group: rows with flags == 0 are sampled at a ratio of 0.5,
# while the other flag values keep all of their rows (see the counts below).
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
(
    sa1.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16609
1,33344
2,33439


In [8]:
# Downsample several groups at once: pass a list of (condition, ratio) pairs,
# one per group. Here each flag value gets its own ratio.
sa2 = sa.downsample(
    df,
    [
        (pl.col("flags") == flag, ratio)
        for flag, ratio in {0: 0.5, 1: 0.3, 2: 0.4}.items()
    ],
)
(
    sa2.group_by("flags")
    .len()
    .sort("flags")
)

flags,len
i32,u32
0,16609
1,10004
2,13376


In [9]:
# Row count per category in the full dataset.
(
    df.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sampling by category: every category ends up with the same
# number of rows. With no target_volume given, the greatest possible common
# volume is used.
vn = sa.volume_neutral(df, by=pl.col("category"))
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sampling by category with an explicit target of 10_000 rows
# per category.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sampling (10_000) by a more complicated condition. `by` is a
# boolean expression here, so the two groups are "A" and "not A". The result
# below makes sense because count for B + count for C = 10_000.
vn = sa.volume_neutral(df, by=pl.col("category") == "A", target_volume=10_000)
(
    vn.group_by("category")
    .len()
    .sort("category")
)

category,len
str,u32
"""A""",10000
"""B""",4247
"""C""",5753


In [13]:
# Volume-neutral sampling with a control level. Neutrality is enforced under
# the category level: within each category, every flag value ends up with the
# same number of rows.
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
(
    vn.group_by(["category", "flags"])
    .len()
    .sort(["category", "flags"])
)

category,flags,len
str,i32,u32
"""A""",0,9860
"""A""",1,9860
"""A""",2,9860
"""B""",0,9938
"""B""",1,9938
"""B""",2,9938
"""C""",0,13149
"""C""",1,13149
"""C""",2,13149


In [14]:
# We may not meet the target volume for all categories: in the output below,
# categories whose flags cannot all reach 10_000 rows stay under the target,
# while larger ones are brought down to 10_000.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
(
    vn.group_by(["category", "flags"])
    .len()
    .sort(["category", "flags"])
)

category,flags,len
str,i32,u32
"""A""",0,9860
"""A""",1,9860
"""A""",2,9860
"""B""",0,9938
"""B""",1,9938
"""B""",2,9938
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...