This example comes from here: https://towardsdatascience.com/data-sampling-methods-in-python-a4400628ea1b

Random sampling.

The simplest data sampling technique. 

Every sampled observation has the same probability of getting selected.



In [10]:
import numpy as np

# Synthetic population: N independent draws from a Normal(mu, std) distribution.
N, mu, std = 10000, 10, 2
population_df = np.random.normal(loc=mu, scale=std, size=N)

# simple random sampling: every observation is equally likely to be picked
def random_sampling(df, n):
    """Draw ``n`` distinct observations from ``df`` uniformly at random.

    Args:
        df: One-dimensional array-like holding the population.
        n: Number of observations to draw (sampling is without replacement).

    Returns:
        A NumPy array with the ``n`` sampled values.
    """
    return np.random.choice(df, size=n, replace=False)
# draw 1000 observations from the generated population
randomSample = random_sampling(df=population_df, n=1000)


In [14]:
# the mean is printed; the standard deviation is the cell's displayed value
print(np.mean(randomSample))
np.std(randomSample)

10.070056430286966


2.0141120370695043

Systematic sampling.

Probability sampling approach where the elements from a target population are selected from a random starting point and after a fixed sampling interval.

Extended version of probability sampling techniques.

Each member of the group is selected at regular periods to form a sample.

Sampling interval is calculated by dividing the entire population size by the desired sample size

Systematic sampling usually produces a random sample, but it does not address the bias in the created sample.

In [18]:
import pandas as pd
# Synthetic population: N independent draws from a Normal(mu, std) distribution.
N, mu, std = 10000, 10, 2
population_df = np.random.normal(loc=mu, scale=std, size=N)

# function that creates a sample using Systematic Sampling
def systematic_sampling(df, step):
    """Select every ``step``-th observation of ``df``, starting at position 1.

    The selected positions are printed before the sample is returned.
    NOTE: the start is fixed at position 1, so the observation at position 0
    is never part of the sample.

    Args:
        df: One-dimensional array-like of observations.
        step: Fixed sampling interval between two selected positions.

    Returns:
        A DataFrame with the columns ``id`` (0-based position of the
        observation in ``df``) and ``data`` (its value), one row per
        selected position.
    """
    values = np.asarray(df)
    # One id per observation, equal to its 0-based position: the id of a
    # selected row therefore matches the printed index, and no row is left
    # without an id (which would turn the whole column into floats).
    df_pd = pd.DataFrame({"id": np.arange(len(values)), "data": values})
    # these positions increase by the step amount, not by 1
    selected_index = np.arange(1, len(values), step)
    print(f'selected index is: {selected_index}')
    # iloc picks rows by position
    return df_pd.iloc[selected_index]

# sampling interval = population size / desired sample size
n = 10
step = int(N / n)
sample = systematic_sampling(df=population_df, step=step)

selected index is: [   1 1001 2001 3001 4001 5001 6001 7001 8001 9001]


Cluster Sampling

Is a probability sampling technique

Here population is divided into multiple clusters (groups) based on certain clustering criteria.

Then clusters are randomly selected, by random sampling or systematic sampling.

In [3]:
import numpy as np
import pandas as pd

# Generating population data (N observations)

# price_vb: prices drawn from a Uniform(1, 4) distribution
price_vb = pd.Series(np.random.uniform(1, 4, size=N))

# id: running 0-based identifier, one per price
id = pd.Series(np.arange(len(price_vb)))

# event_type: categorical variable with 3 possible values: type1, type2, type3
event_type = pd.Series(np.random.choice(["type1", "type2", "type3"], size=len(price_vb)))

# click: binary variable, 0 - no click ; 1 - click
click = pd.Series(np.random.choice([0, 1], size=len(price_vb)))

# assemble the four columns into one frame
df = pd.DataFrame(
    {
        "id": id,
        "price": price_vb,
        "event_type": event_type,
        "click": click,
    }
)
df

Unnamed: 0,id,price,event_type,click
0,0,2.038645,type1,1
1,1,1.819013,type1,0
2,2,1.977570,type2,0
3,3,1.269752,type3,1
4,4,3.154428,type1,1
...,...,...,...,...
9995,9995,1.175806,type1,0
9996,9996,2.912970,type2,0
9997,9997,2.977459,type1,1
9998,9998,2.792856,type2,0


Note that cluster sampling usually produces a random sample, but it does not address the bias in the created sample.

In [4]:
def get_clustered_Sample(df, n_per_cluster, num_select_clusters):
    """Randomly partition ``df`` into clusters and return some of them.

    The rows are split at random into ``K = len(df) // n_per_cluster``
    clusters of exactly ``n_per_cluster`` rows each (rows left over after the
    last full cluster are not used). ``num_select_clusters`` distinct clusters
    are then picked at random and returned together.

    Args:
        df: DataFrame holding the population.
        n_per_cluster: Number of rows in every cluster.
        num_select_clusters: Number of distinct clusters to return.

    Returns:
        A copy of the selected rows (original index labels kept) with an
        extra integer column ``cluster`` holding the cluster label, ordered
        by cluster label.

    Raises:
        ValueError: If ``df`` is too small to form a single cluster, or if
            more clusters are requested than exist.
    """
    K = int(len(df) / n_per_cluster)
    if K < 1:
        raise ValueError("df has fewer rows than n_per_cluster; no cluster can be formed")
    if num_select_clusters > K:
        raise ValueError(
            f"cannot select {num_select_clusters} distinct clusters out of {K}"
        )

    # A single shuffle cut into consecutive chunks is the same random
    # partition as repeatedly sampling and dropping rows, without the
    # repeated drop/concat passes over the frame.
    shuffled = df.sample(n=K * n_per_cluster)
    data = shuffled.assign(cluster=np.repeat(np.arange(K), n_per_cluster))

    # replace=False guarantees distinct labels, so exactly
    # num_select_clusters clusters are returned.
    random_chosen_clusters = np.random.choice(K, size=num_select_clusters, replace=False)
    return data[data["cluster"].isin(random_chosen_clusters)]

# keep 2 randomly chosen clusters of 100 rows each
sample = get_clustered_Sample(df, n_per_cluster=100, num_select_clusters=2)
sample

Unnamed: 0,id,price,event_type,click,cluster
9778,9778,1.231980,type3,1,26
1337,1337,2.300081,type3,0,26
5847,5847,1.182630,type1,1,26
5138,5138,1.456414,type2,1,26
3654,3654,2.840846,type3,1,26
...,...,...,...,...,...
1829,1829,1.952703,type2,0,28
5623,5623,1.331676,type1,1,28
4991,4991,1.143653,type2,1,28
1602,1602,2.889670,type3,1,28


Weighted Sampling.

In [5]:
def get_weighted_sample(df, n):
    """Sample each ``event_type`` in proportion to its share of clicks.

    For every ``event_type`` group the number of sampled rows is
    ``round(n * clicks_in_group / clicks_in_df)``, where a click is a row
    whose ``click`` value is non-zero. Because of the rounding, the total
    size can differ slightly from ``n``. The per-type counts of the result
    are printed before it is returned.

    Args:
        df: DataFrame with at least the columns ``event_type`` and ``click``.
        n: Target total sample size.

    Returns:
        A DataFrame with all columns of ``df`` and a two-level index:
        ``event_type`` and the 0-based position of the row inside that
        type's sample.

    Raises:
        ZeroDivisionError: If no row of ``df`` has a non-zero ``click``.
    """
    # identical for every group, so computed once
    total_clicks = len(df[df.click != 0])

    # Iterating over the groups (instead of groupby.apply) keeps the
    # event_type column in every piece regardless of how the installed
    # pandas version treats grouping columns inside apply.
    pieces = {}
    for key, group in df.groupby("event_type"):
        weight = int(np.rint(n * len(group[group.click != 0]) / total_clicks))
        pieces[key] = group.sample(weight).reset_index(drop=True)

    # the dict keys become the outer index level, named after the grouping column
    weighted_sample = pd.concat(pieces, names=["event_type"])
    print(weighted_sample["event_type"].value_counts())
    return weighted_sample

# target a weighted sample of roughly 100 rows
sample = get_weighted_sample(df=df, n=100)
sample

type2    34
type1    33
type3    33
Name: event_type, dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,id,price,event_type,click
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
type1,0,5112,2.216927,type1,1
type1,1,7294,3.894546,type1,0
type1,2,8812,2.460551,type1,0
type1,3,6978,3.062773,type1,0
type1,4,5016,2.859222,type1,0
...,...,...,...,...,...
type3,28,8658,2.916439,type3,1
type3,29,8893,1.946792,type3,0
type3,30,7517,1.214592,type3,1
type3,31,5738,1.188607,type3,0


Stratified Sampling


In [6]:
def get_startified_sample(df, n, num_clusters_needed):
    """Return ``num_clusters_needed`` click-weighted stratified samples of ``df``.

    The cluster size is ``int(len(df) / n)`` rows and cluster labels run from
    0 to ``K - 1`` with ``K = int(len(df) / cluster_size)``. Inside a cluster
    every ``event_type`` contributes
    ``round(cluster_size * clicks_in_type / clicks_in_df)`` rows, where a
    click is a row whose ``click`` value is non-zero.

    NOTE: every cluster is drawn independently from the full ``df``; rows are
    not removed between draws, so the same row can appear in more than one
    cluster.

    Args:
        df: DataFrame with at least the columns ``event_type`` and ``click``.
        n: Divisor that fixes the cluster size (``int(len(df) / n)`` rows).
        num_clusters_needed: Number of distinct cluster labels to return.

    Returns:
        A DataFrame with all columns of ``df`` plus an integer ``cluster``
        column, ordered by cluster label; the index restarts at 0 inside
        every cluster.

    Raises:
        ValueError: If more clusters are requested than the ``K`` available.
        ZeroDivisionError: If ``n`` exceeds ``len(df)`` (cluster size 0) or no
            row of ``df`` has a non-zero ``click``.
    """
    N = len(df)
    num_obs_per_cluster = int(N / n)
    K = int(N / num_obs_per_cluster)
    if num_clusters_needed > K:
        raise ValueError(
            f"cannot select {num_clusters_needed} distinct clusters out of {K}"
        )

    # Per-type quotas depend only on df, so they are computed once instead of
    # once per cluster. Iterating over the groups (instead of groupby.apply)
    # keeps the event_type column on every pandas version.
    total_clicks = len(df[df.click != 0])
    strata = []
    for _, group in df.groupby("event_type"):
        quota = int(np.rint(num_obs_per_cluster * len(group[group.click != 0]) / total_clicks))
        strata.append((group, quota))

    # replace=False guarantees distinct labels; only the selected clusters
    # are materialised, in ascending label order.
    selected_strata_clusters = np.sort(
        np.random.choice(K, size=num_clusters_needed, replace=False)
    )
    clusters = [
        pd.concat(
            [group.sample(quota) for group, quota in strata], ignore_index=True
        ).assign(cluster=label)
        for label in selected_strata_clusters
    ]
    if not clusters:
        # nothing requested: empty frame with the usual columns
        return df.iloc[:0].assign(cluster=np.array([], dtype=np.int64))
    return pd.concat(clusters, axis=0)

# n = 100 fixes the cluster size at len(df) / 100 rows; keep 2 clusters
sample = get_startified_sample(df, n=100, num_clusters_needed=2)
sample

Unnamed: 0,id,price,event_type,click,cluster
0,4120,2.798796,type1,1,37
1,6874,1.374345,type1,0,37
2,7407,1.809010,type1,1,37
3,8576,3.980909,type1,1,37
4,6770,3.971080,type1,0,37
...,...,...,...,...,...
95,4234,1.511571,type3,1,63
96,3075,2.124351,type3,1,63
97,7406,3.998861,type3,1,63
98,234,2.829881,type3,0,63
