# Sampling and Splitting

## Sampling

Sampling utilities for basic tabular datasets. (They are not currently designed for time-series data.)

In [1]:
import polars as pl
import polars_ds as pds
import polars_ds.sample_and_split as sa

In [2]:
# Build a 100,000-row demo frame used by every example below. It has a "row_num" index
# column (see the output), several random numeric columns, an integer "flags" column
# (values 0, 1 and 2 in the counts shown later), and a "category" column with a
# fixed 30,000 / 30,000 / 40,000 split between "A", "B" and "C".
df = pds.frame(size=100_000).with_columns(
    pds.random(0.0, 12.0).alias("uniform_1"),
    pds.random(0.0, 1.0).alias("uniform_2"),
    pds.random_exp(0.5).alias("exp"),
    pds.random_normal(0.0, 1.0).alias("normal"),
    pds.random_normal(0.0, 1000.0).alias("fat_normal"),
    (pds.random_int(0, 3)).alias("flags"),
    # Categories are laid out in contiguous blocks (all "A" rows first, then "B", then "C").
    pl.Series(["A"] * 30_000 + ["B"] * 30_000 + ["C"] * 40_000).alias("category"),
)
df.head()

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
0,5.355462,0.227585,0.875413,1.255306,-1534.296075,0,"""A"""
1,3.143742,0.651711,2.12331,-0.27767,544.798771,0,"""A"""
2,9.585138,0.720147,1.04885,0.01982,2388.724441,0,"""A"""
3,11.73043,0.059602,3.624234,-1.177224,442.397518,0,"""A"""
4,1.310415,0.783836,3.70326,1.501242,189.064492,2,"""A"""


In [3]:
# Pick 2 column names at random from df.columns. As the output shows, the names
# passed in `keep` ("row_num" here) are returned in addition to the 2 random picks.
sa.random_cols(df.columns, 2, keep = ["row_num"])

['row_num', 'uniform_2', 'exp']

In [4]:
# Random sample: a float argument is treated as the fraction of rows to keep (60% here).
sa.sample(df, 0.6) # by ratio

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
1,3.143742,0.651711,2.12331,-0.27767,544.798771,0,"""A"""
2,9.585138,0.720147,1.04885,0.01982,2388.724441,0,"""A"""
6,0.189662,0.0651,1.316939,-0.244435,748.995179,0,"""A"""
7,0.661346,0.874092,4.843038,0.31243,-383.659135,0,"""A"""
8,0.053801,0.983342,0.452362,0.312257,-386.689719,0,"""A"""
…,…,…,…,…,…,…,…
99994,7.536122,0.11414,2.847801,-0.916853,-1340.111513,2,"""C"""
99996,10.030577,0.939568,0.987719,0.701578,-768.062655,0,"""C"""
99997,5.118598,0.552395,2.390273,-2.57956,-1076.610099,0,"""C"""
99998,5.701428,0.521572,1.290974,-1.361779,5.278061,1,"""C"""


In [5]:
# Random sample: an integer argument is treated as the number of rows to keep (30,000 here).
sa.sample(df, 30_000) # by count

row_num,uniform_1,uniform_2,exp,normal,fat_normal,flags,category
i64,f64,f64,f64,f64,f64,i32,str
10,9.781623,0.563868,4.488553,0.123101,1628.818496,1,"""A"""
11,4.508328,0.594697,3.877757,0.849688,-1242.37697,1,"""A"""
14,1.702338,0.776305,1.346987,0.481826,-403.30214,2,"""A"""
19,11.897234,0.55035,1.791477,0.861923,641.532776,2,"""A"""
22,4.077515,0.737717,1.093235,1.048444,1269.183071,2,"""A"""
…,…,…,…,…,…,…,…
99989,5.26012,0.479069,0.748342,-0.224175,-84.266224,1,"""C"""
99994,7.536122,0.11414,2.847801,-0.916853,-1340.111513,2,"""C"""
99995,10.490682,0.611692,0.384882,-0.474915,157.011096,2,"""C"""
99996,10.030577,0.939568,0.987719,0.701578,-768.062655,0,"""C"""


In [6]:
# Baseline row count per "flags" value, for comparison with the downsampled frames below.
df.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,33381
1,33169
2,33450


In [7]:
# Downsample a single group: each (condition, ratio) pair keeps that fraction of the
# rows matching the condition. Here about half of the flags == 0 rows are kept,
# while rows with other flag values are left untouched (compare with the counts above).
sa1 = sa.downsample(df, (pl.col("flags") == 0, 0.5))
sa1.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16690
1,33169
2,33450


In [8]:
# Downsample on multiple groups: pass one (condition, ratio) pair per group.
# Each group is reduced independently to the given fraction of its own rows.
sa2 = sa.downsample(
    df, 
    (pl.col("flags") == 0, 0.5),  # keep ~50% of the flags == 0 rows
    (pl.col("flags") == 1, 0.3),  # keep ~30% of the flags == 1 rows
    (pl.col("flags") == 2, 0.4),  # keep ~40% of the flags == 2 rows
)
sa2.group_by("flags").len().sort("flags")

flags,len
i32,u32
0,16690
1,9950
2,13380


In [9]:
# Baseline row count per category, for comparison with the volume-neutral samples below.
df.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",40000


In [10]:
# Volume-neutral sample by category: every category ends up with the same number of rows.
# Without a target volume, the largest volume that all categories can reach is used,
# i.e. the size of the smallest category (30,000 here), so only "C" is reduced.
vn = sa.volume_neutral(df, by=pl.col("category"))
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",30000
"""B""",30000
"""C""",30000


In [11]:
# Volume-neutral sample with an explicit target: each category is reduced to 10_000 rows.
vn = sa.volume_neutral(df, by=pl.col("category"), target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",10000
"""C""",10000


In [12]:
# Volume-neutral sample (10_000) on a more complicated condition. `by` is a boolean
# expression here, so there are only two groups: category == "A" and everything else.
# Each group gets 10_000 rows, which is why the counts for "B" and "C" in the output
# add up to 10_000 instead of being 10_000 each.
vn = sa.volume_neutral(df, by=pl.col("category") == "A", target_volume=10_000)
vn.group_by("category").len().sort("category")

category,len
str,u32
"""A""",10000
"""B""",4220
"""C""",5780


In [13]:
# Volume-neutral sample with a control level. Neutrality is enforced inside each value
# of the control column: within every category, all flag values end up with the same
# row count, but that count may differ from one category to another.
vn = sa.volume_neutral(df, by=pl.col("flags"), control=pl.col("category"))
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9917
"""A""",1,9917
"""A""",2,9917
"""B""",0,9848
"""B""",1,9848
"""B""",2,9848
"""C""",0,13262
"""C""",1,13262
"""C""",2,13262


In [14]:
# With a control level, the target volume may not be reachable in every category.
# In the output below, "A" and "B" do not have 10_000 rows for every flag value, so they
# stay at their own neutral volume, while "C" is capped at 10_000 rows per flag.
vn = sa.volume_neutral(
    df,
    by=pl.col("flags"),
    control=pl.col("category"),
    target_volume=10_000,
)
vn.group_by(["category", "flags"]).len().sort(["category", "flags"])

category,flags,len
str,i32,u32
"""A""",0,9917
"""A""",1,9917
"""A""",2,9917
"""B""",0,9848
"""B""",1,9848
"""B""",2,9848
"""C""",0,10000
"""C""",1,10000
"""C""",2,10000


## Splitting

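Split a frame by ratios: a single float yields two frames (e.g. train and test), while a list of ratios yields one frame per ratio.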

In [15]:
# Two-way split: a single ratio of 0.6 puts 60% of the rows in the first frame
# and the remaining 40% in the second.
print(df.shape)
train, test = sa.split_by_ratio(df, split_ratio=0.6)
print(train.shape)
print(test.shape)


(100000, 8)
(60000, 8)
(40000, 8)


In [16]:
# Multi-way split: a list of ratios yields one frame per ratio, in the same order.
print(df.shape)
for frame in sa.split_by_ratio(df, split_ratio=[0.25, 0.4, 0.10, 0.25]):
    print(frame.shape)

(100000, 8)
(25000, 8)
(40000, 8)
(10000, 8)
(25000, 8)
