# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time-series data as of now.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Synthetic 100,000-row frame used by every example in this notebook.
# NOTE(review): no seed is passed to the random generators, so presumably the
# values differ between runs -- confirm before relying on the printed outputs.
df = pds.frame(size=100_000).with_columns(
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    # Takes the values 0, 1 and 2 (see the per-flag counts further down).
    pds.random_int(0, 3).alias("flags"),
    # Fixed category sizes: 30,000 "A", then 30,000 "B", then 40,000 "C".
    pl.Series("category", ["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,10.593157,0.21437,1.054066,-0.489076,-1650.392435,0,"""A"""
1,9.036933,0.392068,7.672009,-0.083204,-315.023785,1,"""A"""
2,2.12332,0.130388,0.382255,-0.668561,434.767472,2,"""A"""
3,1.226607,0.285949,1.200802,0.469201,-1524.582504,1,"""A"""
4,10.694611,0.245202,0.950469,-0.42295,1805.192107,2,"""A"""


In [3]:
# Randomly choose 2 column names; the result also contains the columns listed
# in `keep` (here `row_num`), so 3 names come back.
sa.random_cols(df, 2, keep=["row_num"])

['row_num', 'uniform_2', 'normal']

In [4]:
# Random sample by ratio: a float value asks for that fraction of the rows
# (here 60% of df).
sa.sample(df, 0.6) # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,10.593157,0.21437,1.054066,-0.489076,-1650.392435,0,"""A"""
1,9.036933,0.392068,7.672009,-0.083204,-315.023785,1,"""A"""
2,2.12332,0.130388,0.382255,-0.668561,434.767472,2,"""A"""
3,1.226607,0.285949,1.200802,0.469201,-1524.582504,1,"""A"""
6,0.186466,0.753861,3.178885,1.630469,-1898.897546,0,"""A"""
…,…,…,…,…,…,…,…
99994,1.966948,0.345592,0.676595,1.46892,-389.847966,0,"""C"""
99996,1.091215,0.460275,2.204426,0.556299,-1483.426234,2,"""C"""
99997,9.237426,0.740545,0.360436,-0.35651,-775.448781,2,"""C"""
99998,5.752461,0.476507,1.755452,0.238971,1059.729354,0,"""C"""


In [5]:
# Random sample by count: an integer value asks for that many rows
# (here 30,000 rows of df).
sa.sample(df, 30_000) # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
5,1.161311,0.930557,0.57782,0.444369,963.682717,2,"""A"""
9,2.137823,0.0489,4.298631,-1.234245,1852.670961,1,"""A"""
11,0.07801,0.607029,1.628022,0.78909,677.725992,2,"""A"""
18,4.603517,0.830616,4.532298,0.009164,227.262853,0,"""A"""
22,10.909803,0.185106,0.373381,-0.298333,1281.782869,0,"""A"""
…,…,…,…,…,…,…,…
99981,1.777943,0.838998,3.563427,1.423849,-212.46092,2,"""C"""
99988,8.539298,0.855779,3.489416,1.447965,-60.630336,0,"""C"""
99990,10.8084,0.994461,2.093933,-0.811928,311.047933,2,"""C"""
99994,1.966948,0.345592,0.676595,1.46892,-389.847966,0,"""C"""


In [6]:
# Baseline: number of rows for each value of `flags`, before any downsampling.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33288
1,33335
2,33377


In [7]:
# Downsample a single group: keep about 50% of the rows where flags == 0.
# Rows that do not match the condition are left untouched.
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16644
1,33335
2,33377


In [8]:
# Downsample several groups at once: each (condition, ratio) pair keeps about
# `ratio` of the rows that match `condition`.
sa2 = sa.downsample(
    df,
    (pl.col("flags") == 0, 0.5),  # keep ~50% of flags == 0
    (pl.col("flags") == 1, 0.3),  # keep ~30% of flags == 1
    (pl.col("flags") == 2, 0.4),  # keep ~40% of flags == 2
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16644
1,10001
2,13351


In [9]:
# Baseline: number of rows in each category, before volume-neutral sampling.
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sampling: every category ends up with the same number of rows.
# Without target_volume the largest feasible common size is used, which here is
# the size of the smallest category (30,000).
vn = sa.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sampling with an explicit size: 10,000 rows per category.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sampling on a boolean expression. `by` splits the rows into
# two groups, category == "A" and category != "A", and each group is sampled
# to 10_000 rows. That is why the counts for B and C add up to 10_000.
vn = sa.volume_neutral(
    df,
    by=pl.col("category") == "A",
    target_volume=10_000,
)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4266
"""C""",5734


In [13]:
# Volume-neutral sampling with a control level: balancing is done separately
# inside each category. Within a category every flag value gets the same
# number of rows, while different categories may end up with different sizes.
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9862
"""A""",1,9862
"""A""",2,9862
"""B""",0,9973
"""B""",1,9973
"""B""",2,9973
"""C""",0,13236
"""C""",1,13236
"""C""",2,13236


In [14]:
# The target volume may not be reachable in every category. In the output
# below, A and B have flag groups smaller than 10_000, so they keep their own
# (smaller) neutral volume; only C is sampled down to 10_000 per flag.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9862
"""A""",1,9862
"""A""",2,9862
"""B""",0,9973
"""B""",1,9973
"""B""",2,9973
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...