In [30]:

# Lab Exercises: Sampling Techniques
## This notebook demonstrates various probabilistic and non-probabilistic sampling methods using the Iris dataset. Run each section to see how samples differ.

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load Iris dataset into DataFrame
iris = load_iris(as_frame=True)
df = iris.frame
df['target'] = iris.target
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


1. Probabilistic Sampling Methods


In [24]:

# 1.1 Simple Random Sampling
# Select n rows uniformly at random.

simple_rand = df.sample(n=50, random_state=42)
print(simple_rand.shape)

(50, 6)


In [11]:

# 1.2 Systematic Sampling
# Choose every kᵗʰ record after a random start.
def systematic_sampling(data, k, seed=42):
    np.random.seed(seed)
    start = np.random.randint(0, k)
    indices = np.arange(start, len(data), k)
    return data.iloc[indices]

# Every 10th after a random start
sys_sample = systematic_sampling(df, k=100)
print(sys_sample.shape)




(1, 6)


In [25]:


# 1.3 Stratified Random Sampling
# Sample within each class proportional to its size.
# Stratified sample: 30% from each species
strata = df.groupby('target', group_keys=False).apply(
    lambda x: x.sample(frac=0.6, random_state=42)
)
print(strata['target'].value_counts(normalize=True))

target
0    0.333333
1    0.333333
2    0.333333
Name: proportion, dtype: float64


  strata = df.groupby('target', group_keys=False).apply(


In [23]:


# 1.4 Cluster Sampling
# Randomly select entire clusters, then use all items in chosen clusters.

# Define clusters by rounding sepal length
df['cluster'] = (df['sepal length (cm)'] // 1).astype(int)
clusters = df['cluster'].unique()

# Randomly pick 2 clusters
chosen = np.random.choice(clusters, size=3, replace=False)
cluster_sample = df[df['cluster'].isin(chosen)]
print("Clusters chosen:", chosen)
print(cluster_sample['cluster'].value_counts())



Clusters chosen: [6 5 4]
cluster
5    61
6    54
4    22
Name: count, dtype: int64


2. Non-Probabilistic Sampling Methods


In [26]:

# 2.1 Convenience Sampling
# Select the first n rows or any easily accessible subset.
# First 50 rows as a convenience sample
convenience = df.head(500)
print(convenience.shape)



(150, 6)


In [27]:


# 2.2 Judgmental (Selective) Sampling
# Manually pick samples based on domain knowledge.
# E.g., pick all Setosa and first 10 Versicolor
judgmental = pd.concat([
    df[df['target'] == 0],
    df[df['target'] == 1].head(100)
])
print(judgmental['target'].value_counts())


target
0    50
1    50
Name: count, dtype: int64


In [28]:


# 2.3 Snowball Sampling
# Begin with a small seed and expand via similarity (simulated by nearest neighbors here).
from sklearn.neighbors import NearestNeighbors

# Seed: pick 5 random points
seed = df.sample(n=50, random_state=42)
nbrs = NearestNeighbors(n_neighbors=3).fit(df.drop(columns=['target','cluster']))
distances, indices = nbrs.kneighbors(seed.drop(columns=['target','cluster']))

# Snowball: union of seed and their neighbors
snowball_idx = set(seed.index)
for neigh in indices:
    snowball_idx.update(neigh)
snowball = df.loc[list(snowball_idx)]
print(snowball.shape)

(101, 6)


In [29]:
# 2.4 Quota Sampling
# Ensure the sample meets predefined quotas for each stratum.
# Quotas: 20 Setosa, 15 Versicolor, 10 Virginica
quota = pd.concat([
    df[df['target']==0].sample(n=10, random_state=42),
    df[df['target']==1].sample(n=25, random_state=42),
    df[df['target']==2].sample(n=40, random_state=42)
])
print(quota['target'].value_counts())


target
2    40
1    25
0    10
Name: count, dtype: int64
