<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/main/Exercises/day-4/Sampling/Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Lab Exercises: Sampling Techniques
# This notebook demonstrates various probabilistic and non-probabilistic
# sampling methods using the Iris dataset. Run each section to see how
# samples differ.

import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load the Iris dataset as a DataFrame.
# With as_frame=True, `iris.frame` already holds the four feature columns
# plus the integer 'target' column, so no separate target assignment is needed.
iris = load_iris(as_frame=True)

# Work on a copy so that columns added by later cells do not modify the
# frame stored inside the `iris` Bunch.
df = iris.frame.copy()
df.head()


## 1. Probabilistic Sampling Methods

In [None]:
# 1.1 Simple Random Sampling
# Every row has the same chance of being selected.

# Draw 50 observations; the fixed random_state makes the draw repeatable.
simple_rand = df.sample(
    n=50,
    random_state=42,
)
print(simple_rand.shape)


In [None]:
# 1.2 Systematic Sampling
# Choose every kᵗʰ record after a random start.
def systematic_sampling(data, k, seed=42):
    """Return every k-th row of ``data`` after a random start.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to sample from. Rows are selected by position (``.iloc``),
        so the index labels of ``data`` do not matter.
    k : int
        Sampling interval; must be at least 1.
    seed : int or None, optional
        Seed for the random start offset (default 42).

    Returns
    -------
    Same type as ``data``
        The rows at positions ``start, start + k, start + 2k, ...`` where
        ``start`` is drawn uniformly from ``[0, k)``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Use a private generator instead of np.random.seed(): reseeding the
    # global RNG would silently change the results of unrelated code that
    # runs afterwards. RandomState(seed) yields the same start offset that
    # the globally seeded legacy RNG would.
    rng = np.random.RandomState(seed)
    start = rng.randint(0, k)

    indices = np.arange(start, len(data), k)
    return data.iloc[indices]

# Keep one row out of every 10, beginning at the function's seeded random offset
sys_sample = systematic_sampling(data=df, k=10)
print(sys_sample.shape)



In [None]:
# 1.3 Stratified Random Sampling
# Sample within each class proportional to its size.
# Stratified sample: 30% from each species.
#
# GroupBy.sample draws the requested fraction inside every group and returns
# the original columns (including 'target'). It avoids groupby(...).apply(...),
# whose handling of the grouping column is deprecated in recent pandas and
# would leave the result without a 'target' column.
strata = df.groupby('target').sample(frac=0.3, random_state=42)
print(strata['target'].value_counts(normalize=True))


In [None]:
# 1.4 Cluster Sampling
# Randomly select entire clusters, then use all items in chosen clusters.

# Define clusters by flooring sepal length to whole centimetres (e.g. 5.9 -> 5).
# NOTE: this adds a 'cluster' column to df in place; re-running the cell just
# recomputes the same column.
df['cluster'] = (df['sepal length (cm)'] // 1).astype(int)

# Sort the labels so the candidate order does not depend on row order.
clusters = np.sort(df['cluster'].unique())

# Randomly pick 2 clusters. A dedicated seeded generator keeps the choice
# reproducible regardless of whatever state the global NumPy RNG is in.
chosen = np.random.default_rng(42).choice(clusters, size=2, replace=False)
cluster_sample = df[df['cluster'].isin(chosen)]
print("Clusters chosen:", chosen)
print(cluster_sample['cluster'].value_counts())


## 2. Non-Probabilistic Sampling Methods

In [None]:
# 2.1 Convenience Sampling
# Take whatever is easiest to reach -- here simply the rows at the top of the table.
# The leading 50 rows serve as the convenience sample.
convenience = df.iloc[:50]
print(convenience.shape)


In [None]:
# 2.2 Judgmental (Selective) Sampling
# The analyst hand-picks the rows using domain knowledge.
# Here: every row with target 0 (Setosa) plus the first 10 rows with target 1 (Versicolor).
judgmental = pd.concat([
    df.loc[df['target'].eq(0)],
    df.loc[df['target'].eq(1)].iloc[:10],
])
print(judgmental['target'].value_counts())


In [None]:
# 2.3 Snowball Sampling
# Begin with a small seed and expand via similarity (simulated by nearest neighbors here).
from sklearn.neighbors import NearestNeighbors

# Columns that are labels rather than measurements. errors='ignore' below lets
# this cell run whether or not the 'cluster' column has been added to df.
non_feature_cols = ['target', 'cluster']
features = df.drop(columns=non_feature_cols, errors='ignore')

# Seed: pick 5 random points
seed = df.sample(n=5, random_state=42)
nbrs = NearestNeighbors(n_neighbors=3).fit(features)
distances, indices = nbrs.kneighbors(
    seed.drop(columns=non_feature_cols, errors='ignore')
)

# Snowball: union of seed and their neighbors.
# kneighbors returns row *positions* within the fitted data, so translate them
# to index labels before merging with the seed labels and looking up via .loc.
snowball_idx = set(seed.index)
for neigh in indices:
    snowball_idx.update(df.index[neigh])

# Sort the labels so the sample has a stable row order.
snowball = df.loc[sorted(snowball_idx)]
print(snowball.shape)


In [None]:
# 2.4 Quota Sampling
# Fill a fixed, predefined number of rows for each stratum.
# (target label, quota) pairs: 0 = Setosa -> 20, 1 = Versicolor -> 15, 2 = Virginica -> 10
quota = pd.concat([
    df.loc[df['target'].eq(label)].sample(n=size, random_state=42)
    for label, size in ((0, 20), (1, 15), (2, 10))
])
print(quota['target'].value_counts())


# Instructions
- Run each cell and observe the sampled subsets.

- Compare sample sizes and class distributions.

- Modify parameters (n, k, frac, quotas) to explore their effects.

- Reflect on when each sampling method is appropriate in practice.