# Sampling Methods

We will try to explore the following probabilistic sampling methods:
- Simple Random Sampling
- Stratified Sampling
- Cluster Sampling
- Systematic Sampling

In [1]:
# Import required libraries
import numpy as np
import pandas as pd

In [2]:
# Set random seed
np.random.seed(124)

In [3]:
# Define total number of products
number_of_products = 10

# Create data dictionary
data = {'product_id':np.arange(1, number_of_products+1).tolist(),
       'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)}

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# Store the real mean in a separate variable
real_mean = round(df['measure'].mean(),3)

# View data frame
df

Unnamed: 0,product_id,measure
0,1,10.144
1,2,9.769
2,3,9.331
3,4,11.159
4,5,9.266
5,6,9.626
6,7,9.491
7,8,10.818
8,9,10.461
9,10,9.64


In [4]:
real_mean

9.97

## 1. Simple Random Sampling

In [5]:
# Obtain simple random sample
simple_random_sample = df.sample(n=4).sort_values(by='product_id')

# Save the sample mean in a separate variable
simple_random_mean = round(simple_random_sample['measure'].mean(),3)

# View sampled data frame
simple_random_sample

Unnamed: 0,product_id,measure
3,4,11.159
4,5,9.266
5,6,9.626
6,7,9.491


In [7]:
simple_random_mean

9.886

## 2. Stratified Random Sampling

In [8]:
# Create data dictionary
data = {'product_id':np.arange(1, number_of_products+1).tolist(),
       'product_strata':np.repeat([1,2], number_of_products/2).tolist(),
       'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)}

# Transform dictionary into a data frame
df_stratified = pd.DataFrame(data)

# View data frame
df_stratified

Unnamed: 0,product_id,product_strata,measure
0,1,1,10.37
1,2,1,10.731
2,3,1,10.121
3,4,1,10.397
4,5,1,10.19
5,6,2,9.18
6,7,2,9.471
7,8,2,9.86
8,9,2,10.434
9,10,2,11.272


In [9]:
# Save the sample mean in a separate variable
stratified_random_mean = round(df_stratified['measure'].mean(),3)

stratified_random_mean

10.203

In [10]:
# Import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

# Set the split criteria
split = StratifiedShuffleSplit(n_splits=1, test_size=4)

# Perform data frame split
for x, y in split.split(df_stratified, df_stratified['product_strata']):
    stratified_random_sample = df_stratified.iloc[y].sort_values(by='product_id')

# View sampled data frame
stratified_random_sample

Unnamed: 0,product_id,product_strata,measure
3,4,1,10.397
4,5,1,10.19
7,8,2,9.86
8,9,2,10.434


In [11]:
# Obtain the sample mean for each group
stratified_random_sample.groupby('product_strata').mean().drop(['product_id'],axis=1)

Unnamed: 0_level_0,measure
product_strata,Unnamed: 1_level_1
1,10.2935
2,10.147


## 3. Cluster Sampling

In [12]:
def cluster_sampling(df, number_of_clusters):
    
    try:
        # Divide the units into cluster of equal size
        df['cluster_id'] = np.repeat([range(1,number_of_clusters+1)],len(df)/number_of_clusters)

        # Create an empty list
        indexes = []

        # Append the indexes from the clusters that meet the criteria
        # For this formula, clusters id must be an even number
        for i in range(0,len(df)):
            if df['cluster_id'].iloc[i]%2 == 0:
                indexes.append(i)
        cluster_sample = df.iloc[indexes]
        return(cluster_sample)
    
    except:
        print("The population cannot be divided into clusters of equal size!")
        
# Obtain a cluster sample and save it in a new variable
cluster_sample = cluster_sampling(df,5)

# Save the sample mean in a separate variable
cluster_mean = round(cluster_sample['measure'].mean(),3)

# View sampled data frame
cluster_sample

Unnamed: 0,product_id,measure,cluster_id
2,3,9.331,2
3,4,11.159,2
6,7,9.491,4
7,8,10.818,4


In [13]:
cluster_mean

10.2

## 4. Systematic Random Sampling

In [14]:
# Define systematic sampling function
def systematic_sampling(df, step):
    
    indexes = np.arange(0,len(df),step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample
    
# Obtain a systematic sample and save it in a new variable
systematic_sample = systematic_sampling(df, 3)

# Save the sample mean in a separate variable
systematic_mean = round(systematic_sample['measure'].mean(),3)

# View sampled data frame
systematic_sample

Unnamed: 0,product_id,measure,cluster_id
0,1,10.144,1
3,4,11.159,2
6,7,9.491,4
9,10,9.64,5


In [15]:
systematic_mean

10.108

## Measure Mean Compare - between sampling methods

In [16]:
# Create a dictionary with the mean outcomes for each sampling method and the real mean
outcomes = {'sample_mean':[simple_random_mean,stratified_random_mean,systematic_mean,cluster_mean],
           'real_mean':real_mean}

# Transform dictionary into a data frame
outcomes = pd.DataFrame(outcomes, index=['Simple Random Sampling','Stratified Sampling','Systematic Sampling','Cluster Sampling'])

# Add a value corresponding to the absolute error
outcomes['abs_error'] = abs(outcomes['real_mean'] - outcomes['sample_mean'])

# Sort data frame by absolute error
outcomes.sort_values(by='abs_error')

Unnamed: 0,sample_mean,real_mean,abs_error
Simple Random Sampling,9.886,9.97,0.084
Systematic Sampling,10.108,9.97,0.138
Cluster Sampling,10.2,9.97,0.23
Stratified Sampling,10.203,9.97,0.233
