# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time series yet.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Build the 100,000-row demo frame used by every cell below.
# n_cols=0: no generated feature columns are requested from random_data; each
# feature is added explicitly here so its name and generator are visible.
df = pds.random_data(size=100_000, n_cols=0)
df = df.with_columns(
    # Two uniform columns on different ranges.
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    # Two normal columns that differ only in their second argument (1.0 vs 1000.0).
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    # Small integer label used as the grouping key in the downsampling examples.
    pds.random_int(0, 3).alias("flags"),
    # Deterministic, deliberately unbalanced label: 30k "A", 30k "B", 40k "C",
    # laid out in contiguous blocks.
    pl.Series(["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000).alias("category"),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,4.956064,0.790845,1.201162,-0.671454,-1222.053614,1,"""A"""
1,8.972744,0.89496,1.072396,0.261114,-24.227971,1,"""A"""
2,1.294361,0.962767,0.855414,0.093892,184.029752,2,"""A"""
3,0.759313,0.481386,1.158297,-1.497403,-904.57427,0,"""A"""
4,6.024539,0.528093,3.823704,-0.572181,1700.880363,0,"""A"""


In [3]:
# Pick 2 column names at random while always keeping "row_num".
# The result is a plain list of column names (kept columns plus the random picks).
sa.random_cols(df, 2, keep = ["row_num"])

['row_num', 'fat_normal', 'flags']

In [4]:
# Random sample of rows. A fractional value such as 0.6 is read as a ratio of
# the frame's rows rather than as a row count.
sa.sample(df, 0.6) # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
4,6.024539,0.528093,3.823704,-0.572181,1700.880363,0,"""A"""
5,5.364331,0.238052,4.788557,0.417112,-132.841882,1,"""A"""
6,0.257486,0.720676,1.102975,0.363163,-1175.364442,2,"""A"""
9,8.529999,0.994186,4.313919,-1.360975,674.937081,2,"""A"""
10,9.858746,0.509561,1.453179,0.910365,-79.215763,1,"""A"""
…,…,…,…,…,…,…,…
99990,2.764397,0.942385,0.044195,-0.73278,1569.240084,2,"""C"""
99991,6.603121,0.841659,1.898811,-1.07152,752.934449,2,"""C"""
99995,7.320427,0.420041,1.409902,2.074616,-2174.948455,0,"""C"""
99996,6.34049,0.185967,12.632619,-0.345696,275.279029,2,"""C"""


In [5]:
# Random sample of rows. An integer value such as 30_000 is read as the number
# of rows to draw rather than as a ratio.
sa.sample(df, 30_000) # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,4.956064,0.790845,1.201162,-0.671454,-1222.053614,1,"""A"""
10,9.858746,0.509561,1.453179,0.910365,-79.215763,1,"""A"""
11,1.316456,0.321104,2.755456,-0.987854,-594.445564,2,"""A"""
12,1.033735,0.112864,4.39498,0.497926,-305.278576,0,"""A"""
22,3.103599,0.170745,0.569406,1.256892,-577.878977,2,"""A"""
…,…,…,…,…,…,…,…
99985,4.860036,0.62828,8.338849,-0.596043,40.187283,1,"""C"""
99989,8.524513,0.334461,0.581785,-0.897246,338.725898,0,"""C"""
99991,6.603121,0.841659,1.898811,-1.07152,752.934449,2,"""C"""
99993,4.61854,0.143395,6.73033,0.967123,-179.237561,2,"""C"""


In [6]:
# Baseline: number of rows per `flags` value before any downsampling, for
# comparison with the downsampled counts in the following cells.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33400
1,33305
2,33295


In [7]:
# Downsample a single group: rows with flags == 0 are reduced to 50% of their
# original count; rows with any other flag value are left untouched.
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16700
1,33305
2,33295


In [8]:
# Downsample several groups in one call. Each (condition, ratio) pair shrinks
# the rows matching the condition to that ratio of their original count:
# flags == 0 -> 50%, flags == 1 -> 30%, flags == 2 -> 40%.
sa2 = sa.downsample(
    df,
    [
        (pl.col("flags") == flag, ratio)
        for flag, ratio in ((0, 0.5), (1, 0.3), (2, 0.4))
    ],
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16700
1,9992
2,13318


In [9]:
# Baseline: number of rows per `category` before volume-neutral sampling
# (30k / 30k / 40k by construction of the demo frame).
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sample by category: every category ends up with the same
# number of rows. No target volume is given, so the largest count that all
# categories can reach is used (30,000 here, the size of the smallest category).
vn = sa.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sample by category with an explicit target: each category is
# sampled down to 10,000 rows.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sample (10,000 per group) where the grouping is a boolean
# expression instead of a column. There are two groups: category == "A" and
# everything else. That is why the counts for B and C add up to 10,000 together
# rather than reaching 10,000 each.
vn = sa.volume_neutral(df, by=pl.col("category") == "A", target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4281
"""C""",5719


In [13]:
# Volume-neutral sample with a control column. Balancing on `flags` is done
# separately inside each `category`, so all flag values have the same count
# within a category, while the counts may still differ between categories.
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9917
"""A""",1,9917
"""A""",2,9917
"""B""",0,9907
"""B""",1,9907
"""B""",2,9907
"""C""",0,13230
"""C""",1,13230
"""C""",2,13230


In [14]:
# Control column plus a target volume. The target is not guaranteed for every
# category: a category whose smallest flag group has fewer than 10,000 rows
# stays at that smaller count (A and B in the recorded output), while a category
# with enough rows is brought down to exactly 10,000 per flag (C).
vn = sa.volume_neutral(
    df, by=pl.col("flags"), control=pl.col("category"), target_volume=10_000
)
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9917
"""A""",1,9917
"""A""",2,9917
"""B""",0,9907
"""B""",1,9907
"""B""",2,9907
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...