## Stratified Sampling Example

In [23]:
import numpy as np

In [34]:
print("Creating an un-balanced dataset: multiples of 111 in first 1000 natural numbers")


# helper functions
def is_multiple(n, d):
    """Return 1 when ``n`` is exactly divisible by ``d``, otherwise 0."""
    remainder = n % d
    return int(remainder == 0)
def class_stats(class_name, a):
    """Print the size of a class and its share of all samples.

    Parameters
    ----------
    class_name : str
        Label printed at the start of the line.
    a : sequence or numpy array
        One entry per sample. The entries are summed to get the class size,
        so they are expected to be 0/1 membership indicators (that is what
        the callers in this notebook pass).

    The line printed is ``"<class_name> class size: <count>  (<share>)"``
    with the share formatted as a whole-number percentage.
    """
    total = len(a)
    members = sum(a)
    # An empty input has no members: report 0% instead of dividing by zero.
    share = members / total if total else 0.0
    print("{} class size: {}  ({:.0%})".format(class_name, members, share))
    
# Build the dataset: the integers 1..1000, labelled 1 when divisible by 111
X = np.arange(1, 1001)
y = np.array([is_multiple(value, 111) for value in X])

# Report how many samples fall into each class
class_stats("positive", y)
class_stats("negative", 1 - y)

Creating an un-balanced dataset: multiples of 111 in first 1000 natural numbers
positive class size: 9  (1%)
negative class size: 991  (99%)


In [35]:
def stratified_sampling(X, y, sample_size, rng=None):
    """Draw a class-balanced random sample from ``(X, y)``.

    Every distinct label in ``y`` is treated as one stratum and contributes
    (as nearly as possible) the same number of observations, regardless of
    how frequent that class is in the full dataset.

    Parameters
    ----------
    X : array-like
        Observations. Must support indexing with an integer index array
        (e.g. a ``numpy.ndarray``).
    y : array-like
        Class label of each observation, aligned with ``X``. Must support
        the same kind of indexing as ``X``.
    sample_size : int
        Total number of observations to return. When it is not a multiple
        of the number of classes, the leftover draws are given, one each,
        to the first classes in sorted label order.
    rng : optional
        Object providing ``choice`` and ``shuffle`` (for example a
        ``numpy.random.Generator`` or ``numpy.random.RandomState``). When
        omitted, the global ``numpy.random`` state is used.

    Returns
    -------
    tuple
        ``(X_sample, y_sample)``, shuffled with the same permutation.

    Raises
    ------
    ValueError
        If ``y`` contains no labels.

    Notes
    -----
    A class with fewer members than its quota is sampled with replacement,
    so its observations can appear more than once; every other class is
    sampled without replacement.
    """
    rng = np.random if rng is None else rng
    labels = np.asarray(y)

    # one stratum per distinct label (np.unique returns them sorted)
    classes = np.unique(labels)
    if len(classes) == 0:
        raise ValueError("cannot draw a sample: y contains no labels")

    # equal quota per class; the remainder is spread over the first classes
    # so that the total matches the requested sample_size
    quota, leftover = divmod(sample_size, len(classes))

    class_samples = []
    for position, c in enumerate(classes):
        idx_class = np.flatnonzero(labels == c)
        class_sample_size = quota + (1 if position < leftover else 0)
        # fall back to sampling with replacement only when the class is
        # too small to fill its quota
        idx_class_sample = rng.choice(idx_class,
                                      size=class_sample_size,
                                      replace=(class_sample_size > len(idx_class)))
        class_samples.append(idx_class_sample)

    # mix all strata together
    idx_balanced_sample = np.concatenate(class_samples)
    rng.shuffle(idx_balanced_sample)

    return (X[idx_balanced_sample], y[idx_balanced_sample])

In [38]:
print("Sampling a balanced training set with stratified sampling.")

# draw 10 observations in total, shared equally between the classes
X_sample, y_sample = stratified_sampling(X, y, sample_size=10)

# report the class make-up of the sample
class_stats("positive", y_sample)
class_stats("negative", 1 - y_sample)

Sampling a balanced training set with stratified sampling.
positive class size: 5  (50%)
negative class size: 5  (50%)


In [40]:
# list every sampled number next to its class label
for x_, y_ in zip(X_sample, y_sample):
    print("{} {}".format(x_, y_))

555 1
119 0
70 0
444 1
111 1
333 1
777 1
964 0
491 0
793 0
