In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rd # random

# Load the Iris data set from a CSV file hosted in a GitHub repository
# (the file is fetched over the network when this cell runs).
df_iris = pd.read_csv("https://raw.githubusercontent.com/abhijitpaul0212/DataSets/main/Iris.csv")

# Sampling Technique
When you conduct research about a group of people, it's rarely possible to collect data from every person in that group. Instead, you select a **sample**. The sample is the group of individuals who will actually participate in the research.

To draw valid conclusions from your results, you have to carefully decide how you will select a sample that is representative of the group as a whole. This is called a **sampling technique**.

Table of contents
* Simple Random Sampling
* Stratified Sampling
* Systematic Sampling
* Cluster Sampling

### 1. Simple Random Sampling:

It involves randomly selecting subjects (entities) from a population. Each subject has an equal probability of being chosen from the population to form a sample (subpopulation) of the overall population.

In [None]:
# Randomly select a specified number of rows from the dataset

def random_sampling(df, sample_size, state=None):
    """Draw a simple random sample of rows from a dataframe.

    Args:
        df: Dataframe to sample from.
        sample_size: Number of rows to draw.
        state: Value forwarded to pandas as ``random_state``; ``None``
            means no fixed seed is supplied.

    Returns:
        A dataframe holding ``sample_size`` rows of ``df``, drawn without
        replacement.
    """
    # Rows are drawn without replacement, so no row can appear twice.
    sampling_options = {
        "n": sample_size,
        "replace": False,
        "random_state": state,
    }
    return df.sample(**sampling_options)

In [None]:
# Draw 10 rows of the Iris data at random; state=None supplies no fixed seed.
df_random_sample = random_sampling(df=df_iris, sample_size=10, state=None)

# Show the first five rows of the sample.
display(df_random_sample.head(5))

### 2. Stratified Sampling

It is a sampling approach in which the population is separated into groups or strata depending on a particular characteristic (e.g. Categorical attributes). 

Then subjects from each stratum (the singular of strata) are randomly sampled.

In [None]:
def counts(df, variable):
    """Tabulate how often each value of a column occurs.

    Args:
        df: Dataframe containing the column.
        variable: Name of the column to summarise.

    Returns:
        A dataframe indexed by the distinct values of ``variable`` (missing
        values included) with two columns: ``counts`` (absolute frequency)
        and ``%`` (relative frequency as a proportion, rounded to two
        decimals).
    """
    column = df[variable]

    # Missing values are kept as their own category in both tallies.
    absolute = column.value_counts(dropna=False)
    relative = column.value_counts(dropna=False, normalize=True).round(2)

    return pd.concat([absolute, relative], axis=1, keys=['counts', '%'])

def stratified_sampling(df, sample_size, variable, state=None):
    """Draw a proportionally allocated stratified random sample.

    The rows are grouped by ``variable`` and the same fraction
    (``sample_size / len(df)``) is sampled without replacement from every
    group, so each stratum keeps roughly its share of the population.
    Because the per-stratum size is rounded to a whole number of rows, the
    total returned can differ slightly from ``sample_size``.

    Args:
        df: Dataframe to sample from.
        sample_size: Target number of rows in the whole sample.
        variable: Column name (or list of names) defining the strata.
        state: Seed forwarded to pandas as ``random_state`` so the draw can
            be reproduced; ``None`` supplies no fixed seed.

    Returns:
        A dataframe with the sampled rows, ordered stratum by stratum, with
        all original columns (including ``variable``).

    Raises:
        ValueError: If ``df`` is empty, or (raised by pandas) if
            ``sample_size`` exceeds ``len(df)``.
    """
    # defining variables
    print("Defining variables")
    length = len(df)
    print(f"-- length: {length}")

    if length == 0:
        raise ValueError("cannot draw a stratified sample from an empty dataframe")

    # Use the exact fraction for sampling: rounding it first distorts the
    # sample size and turns small fractions (< 0.005) into 0, i.e. an empty
    # sample.  The value is rounded only for display.
    fraction = sample_size / length
    print(f"-- strata fraction: {round(fraction, 2)}")

    # Sample every stratum with the same fraction.  GroupBy.sample keeps the
    # grouping column in the result and draws all strata from a single
    # random stream seeded by ``state``.
    df_stratified_sample = df.groupby(variable, group_keys=False).sample(
        frac=fraction, random_state=state
    )

    return df_stratified_sample

In [None]:
# Distribution of 'Species' in the full data set, for comparison with the sample.
display(counts(df_iris, 'Species'))
print('')

# Draw a sample stratified by 'Species' with a target size of 10 rows.
df_iris_sample = stratified_sampling(df=df_iris, sample_size=10, variable='Species')
display(df_iris_sample.head(5))
print('')

# Distribution of 'Species' in the stratified sample.
display(counts(df_iris_sample, 'Species'))

We grouped the dataframe into different strata by calling the groupby() method on the 'Species' feature, which is categorical.

For each group (stratum) we randomly sampled about 7% (a fraction of 0.07) of its observations.

Now if we look above at the proportion for df_iris_sample and df_iris, we will see that the proportions for both dataframes are the same.

### 3. Systematic Sampling

It is a type of probability sampling in which elements are picked from a larger population at a fixed, regular interval.

The sample is built by selecting a random starting point and then taking every k-th element after it, where k is the sampling interval. In this way a small subset (the sample) is extracted from a large data set.

In [None]:
def systematic_sampling(df, sample_size, state=None):
    """Draw a systematic sample: a random start, then every k-th row.

    The sampling interval is ``k = len(df) // sample_size``.  The first row
    is chosen at random among positions ``0 .. k-1`` and every k-th row
    after it is taken, giving exactly ``sample_size`` rows.

    Args:
        df: Dataframe to sample from (rows are addressed by position).
        sample_size: Number of rows to draw; must satisfy
            ``1 <= sample_size <= len(df)``.
        state: Seed for the random start; ``None`` seeds from system
            entropy.  A private generator is used, so the global ``random``
            state is left untouched.

    Returns:
        A dataframe with exactly ``sample_size`` rows of ``df``.

    Raises:
        ValueError: If ``sample_size`` is smaller than 1 or larger than the
            number of rows in ``df``.
    """
    # defining variables
    print("Defining variables")
    length = len(df)
    print(f"-- length: {length}")

    # A sample larger than the population would give an interval of 0.
    if not 1 <= sample_size <= length:
        raise ValueError(
            f"sample_size must be between 1 and the number of rows "
            f"({length}), got {sample_size}"
        )

    interval = length // sample_size
    print(f"-- systematic interval: {interval}")

    # get first element position
    # randrange excludes its upper bound, so the start lies in
    # [0, interval - 1]; a start equal to `interval` would shift the whole
    # sample and could drop its last element.
    first = rd.Random(state).randrange(interval)
    print(f"-- first element is at: {first} position")

    # get sample indexes systematically
    # Exactly `sample_size` positions; the largest is at most
    # interval * sample_size - 1 <= length - 1, so none is out of range.
    indexes = first + interval * np.arange(sample_size)

    # get sample elements from dataframe
    df_systematic_sample = df.iloc[indexes]
    print(f"-- sample size: {df_systematic_sample.shape[0]}")
    return df_systematic_sample


In [None]:
# Take a systematic sample with a requested size of 5 rows from the Iris data.
df_sample_systematic = systematic_sampling(df=df_iris, sample_size=5, state=None)
print('')
# Show the first five rows of the sample.
display(df_sample_systematic.head(5))

### 4. Cluster Sampling

This is a probabilistic sampling method which is used when natural groups are present in the population.

Researchers divide a large population into smaller groups known as clusters, and then select randomly among the clusters to form a sample. All elements within the chosen clusters are included in the sample.

In [None]:
def cluster_sampling(df, cluster, state=None):
    """Split the rows into contiguous clusters and return one at random.

    The rows are divided, in their current order, into exactly ``cluster``
    contiguous groups whose sizes differ by at most one row.  One group is
    then chosen at random and returned in full.

    Args:
        df: Dataframe to sample from.  It is not modified.
        cluster: Number of clusters to form; must satisfy
            ``1 <= cluster <= len(df)``.
        state: Seed for choosing the cluster; ``None`` seeds from system
            entropy.  A private generator is used, so the global ``random``
            state is left untouched.

    Returns:
        A dataframe with all rows of the selected cluster, plus a
        ``cluster`` column holding the cluster id.

    Raises:
        ValueError: If ``cluster`` is smaller than 1 or larger than the
            number of rows in ``df``.
    """
    # defining variables
    print("Defining variables")
    length = len(df)
    print(f"-- length: {length}")

    # More clusters than rows would leave some clusters empty.
    if not 1 <= cluster <= length:
        raise ValueError(
            f"cluster must be between 1 and the number of rows "
            f"({length}), got {cluster}"
        )

    # Ceiling division: size of the largest cluster.
    max_elements = -(-length // cluster)
    print(f"-- max elements in cluster: {max_elements}")

    # cluster list creation
    # Row i goes to cluster floor(i * cluster / length).  This yields ids
    # 0 .. cluster-1 only, so every row belongs to a selectable cluster even
    # when `length` is not a multiple of `cluster`.
    cluster_ids = np.arange(length) * cluster // length

    # allocating the elements in their respective clusters
    # `assign` returns a new dataframe, leaving the caller's `df` untouched.
    clustered = df.assign(cluster=cluster_ids)
    print("-- cluster list")
    print(clustered['cluster'].value_counts())

    # randomly choosing a cluster
    group_selected = rd.Random(state).randrange(cluster)
    print("-- cluster selected: ", group_selected)

    # defining the clustered sample
    df_cluster_sample = clustered[clustered['cluster'] == group_selected]
    print(f"-- cluster size: {df_cluster_sample.shape[0]}")
    return df_cluster_sample


In [None]:
# Partition the Iris rows into 10 clusters and keep one randomly chosen cluster.
df_cluster_sample = cluster_sampling(df=df_iris, cluster=10, state=None)
print('')
# Show the first five rows of the selected cluster.
display(df_cluster_sample.head(5))