# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not designed for time series yet.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample as sa

In [2]:
# Build a 100_000-row demo frame and attach random feature columns plus a
# fixed categorical column. (The `row_num` column seen in the output is not
# created here, so it presumably comes from `pds.frame` itself.)
df = pds.frame(size=100_000).with_columns(
    # Two uniform columns drawn with different bounds.
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    # Two normal columns; the second uses a much larger second argument
    # (presumably the standard deviation), hence "fat".
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    # Takes the values 0, 1 and 2 in the outputs shown in this notebook.
    pds.random_int(0, 3).alias("flags"),
    # Deterministic, deliberately unbalanced categories: 30k A, 30k B, 40k C.
    pl.Series("category", ["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,2.050596,0.54114,0.137196,0.709152,164.865927,2,"""A"""
1,0.858691,0.228404,0.81963,-0.021791,-856.746017,0,"""A"""
2,6.792286,0.203664,0.70993,-0.397194,-1689.208398,2,"""A"""
3,8.145338,0.551938,5.37932,-0.948532,1485.885944,0,"""A"""
4,7.182131,0.763336,0.318751,0.206719,-463.325706,1,"""A"""


In [3]:
# Pick 2 column names at random; as the output shows, the names listed in
# `keep` are returned in addition to the 2 randomly chosen ones.
sa.random_cols(df, 2, keep=["row_num"])

['row_num', 'uniform_2', 'fat_normal']

In [4]:
# Random sample by ratio: a float second argument is the fraction of rows to draw (0.6 here).
sa.sample(df, 0.6)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,2.050596,0.54114,0.137196,0.709152,164.865927,2,"""A"""
1,0.858691,0.228404,0.81963,-0.021791,-856.746017,0,"""A"""
2,6.792286,0.203664,0.70993,-0.397194,-1689.208398,2,"""A"""
3,8.145338,0.551938,5.37932,-0.948532,1485.885944,0,"""A"""
4,7.182131,0.763336,0.318751,0.206719,-463.325706,1,"""A"""
…,…,…,…,…,…,…,…
99991,6.317155,0.195067,0.483955,2.445583,-1035.578604,1,"""C"""
99993,2.902461,0.991713,0.04115,-0.360496,-487.118421,0,"""C"""
99994,4.297485,0.437202,3.617975,-0.461013,-1290.06655,1,"""C"""
99995,1.016358,0.761678,1.434325,2.122562,1788.577641,0,"""C"""


In [5]:
# Random sample by count: an integer second argument is the number of rows to draw (30_000 here).
sa.sample(df, 30_000)

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
7,5.26556,0.19236,1.196749,-0.018913,-305.17364,1,"""A"""
9,3.574201,0.763915,2.504222,0.822841,826.240704,0,"""A"""
14,1.991566,0.863184,0.772406,-0.963031,459.468928,1,"""A"""
15,0.727227,0.691103,0.060641,-0.807559,100.655214,0,"""A"""
19,2.048383,0.475994,0.499137,-0.951941,2266.54163,1,"""A"""
…,…,…,…,…,…,…,…
99979,6.588156,0.15447,0.508626,0.058968,-493.155817,0,"""C"""
99980,7.436744,0.265509,4.017184,1.575691,681.375507,0,"""C"""
99985,2.766769,0.856925,1.521899,0.761686,1035.963792,1,"""C"""
99991,6.317155,0.195067,0.483955,2.445583,-1035.578604,1,"""C"""


In [6]:
# Baseline: number of rows per `flags` value in the full frame, for comparison
# with the downsampled frames.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33442
1,33203
2,33355


In [7]:
# Downsample a single group: each (condition, ratio) pair keeps that share of
# the rows matching the condition. Here half of the flags == 0 rows are kept
# and the other rows are left untouched.
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16721
1,33203
2,33355


In [8]:
# Downsample several groups at once: pass one (condition, ratio) pair per group.
# Each group is reduced independently to its own ratio (50%, 30% and 40% here).
sa2 = sa.downsample(
    df,
    (pl.col("flags") == 0, 0.5),
    (pl.col("flags") == 1, 0.3),
    (pl.col("flags") == 2, 0.4),
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16721
1,9961
2,13342


In [9]:
# Baseline: number of rows per category in the full frame (A and B are smaller than C).
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sampling by category: every category ends up with the same
# number of rows. Without a target_volume, the largest volume that all
# categories can supply is used (30_000 here, the size of A and B).
vn = sa.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sampling with an explicit target: draw 10_000 rows from every category.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sampling (10_000) by a more complicated condition.
# A boolean `by` defines two groups: rows where category == "A" and all the
# remaining rows. Each group is sampled down to 10_000, so the counts for B
# and C add up to 10_000 instead of being 10_000 each.
vn = sa.volume_neutral(df, by=pl.col("category") == "A", target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4261
"""C""",5739


In [13]:
# Volume-neutral sampling with a control level. Neutrality is enforced inside
# each category separately: within one category every flag value has the same
# row count, while the count may differ from one category to another.
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9978
"""A""",1,9978
"""A""",2,9978
"""B""",0,9978
"""B""",1,9978
"""B""",2,9978
"""C""",0,13194
"""C""",1,13194
"""C""",2,13194


In [14]:
# With a control level, the target volume may not be reachable in every
# category. In the run shown, A and B cannot supply 10_000 rows per flag and
# stay below the target, while C reaches it.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9978
"""A""",1,9978
"""A""",2,9978
"""B""",0,9978
"""B""",1,9978
"""B""",2,9978
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

To be added...