<a href="https://colab.research.google.com/github/diptidhande/AI-ML/blob/main/Exercises/day-4/Sampling/Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Lab Exercises: Sampling Techniques
## This notebook demonstrates various probabilistic and non-probabilistic sampling methods using the Iris dataset. Run each section to see how samples differ.


import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load Iris dataset into DataFrame
iris = load_iris(as_frame=True)
df = iris.frame
df['target'] = iris.target
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [2]:
iris

{'data':      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                  5.1               3.5                1.4               0.2
 1                  4.9               3.0                1.4               0.2
 2                  4.7               3.2                1.3               0.2
 3                  4.6               3.1                1.5               0.2
 4                  5.0               3.6                1.4               0.2
 ..                 ...               ...                ...               ...
 145                6.7               3.0                5.2               2.3
 146                6.3               2.5                5.0               1.9
 147                6.5               3.0                5.2               2.0
 148                6.2               3.4                5.4               2.3
 149                5.9               3.0                5.1               1.8
 
 [150 rows x 4 columns],
 'target': 0     

In [3]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [4]:
df['target']

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
145,2
146,2
147,2
148,2


In [5]:
df.columns.tolist()

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)',
 'target']

## 1. Probabilistic Sampling Methods

In [7]:
# 1.1 Simple Random Sampling
# Select n rows uniformly at random.

# Simple random sample of 50 observations
simple_rand = df.sample(n=50, random_state=42)
print(simple_rand.shape)


(50, 5)


In [8]:
print(simple_rand.head())

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
73                 6.1               2.8                4.7               1.2   
18                 5.7               3.8                1.7               0.3   
118                7.7               2.6                6.9               2.3   
78                 6.0               2.9                4.5               1.5   
76                 6.8               2.8                4.8               1.4   

     target  
73        1  
18        0  
118       2  
78        1  
76        1  


In [10]:
# 1.2 Systematic Sampling
# Choose every kᵗʰ record after a random start.
def systematic_sampling(data, k, seed=42):
    np.random.seed(seed)
    start = np.random.randint(0, k)
    indices = np.arange(start, len(data), k)
    return data.iloc[indices]

# Every 10th after a random start
sys_sample = systematic_sampling(df, k=10)
print(sys_sample.shape)



(15, 5)


In [11]:
# 1.3 Stratified Random Sampling
# Sample within each class proportional to its size.
# Stratified sample: 30% from each species
strata = df.groupby('target', group_keys=False).apply(
    lambda x: x.sample(frac=0.3, random_state=42)
)
print(strata['target'].value_counts(normalize=True))


target
0    0.333333
1    0.333333
2    0.333333
Name: proportion, dtype: float64


  strata = df.groupby('target', group_keys=False).apply(


In [12]:
# 1.4 Cluster Sampling
# Randomly select entire clusters, then use all items in chosen clusters.

# Define clusters by rounding sepal length
df['cluster'] = (df['sepal length (cm)'] // 1).astype(int)
clusters = df['cluster'].unique()

# Randomly pick 2 clusters
chosen = np.random.choice(clusters, size=2, replace=False)
cluster_sample = df[df['cluster'].isin(chosen)]
print("Clusters chosen:", chosen)
print(cluster_sample['cluster'].value_counts())


Clusters chosen: [4 7]
cluster
4    22
7    13
Name: count, dtype: int64


## 2. Non-Probabilistic Sampling Methods

In [13]:
# 2.1 Convenience Sampling
# Select the first n rows or any easily accessible subset.
# First 50 rows as a convenience sample
convenience = df.head(50)
print(convenience.shape)


(50, 6)


In [14]:
# 2.2 Judgmental (Selective) Sampling
# Manually pick samples based on domain knowledge.
# E.g., pick all Setosa and first 10 Versicolor
judgmental = pd.concat([
    df[df['target'] == 0],
    df[df['target'] == 1].head(10)
])
print(judgmental['target'].value_counts())


target
0    50
1    10
Name: count, dtype: int64


In [16]:
judgmental.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,cluster
0,5.1,3.5,1.4,0.2,0,5
1,4.9,3.0,1.4,0.2,0,4
2,4.7,3.2,1.3,0.2,0,4
3,4.6,3.1,1.5,0.2,0,4
4,5.0,3.6,1.4,0.2,0,5


In [22]:
# 2.3 Snowball Sampling
# Begin with a small seed and expand via similarity (simulated by nearest neighbors here).
from sklearn.neighbors import NearestNeighbors

# Seed: pick 5 random points
seed = df.sample(n=10, random_state=42)
nbrs = NearestNeighbors(n_neighbors=3).fit(df.drop(columns=['target','cluster']))
distances, indices = nbrs.kneighbors(seed.drop(columns=['target','cluster']))

# Snowball: union of seed and their neighbors
snowball_idx = set(seed.index)
for neigh in indices:
    snowball_idx.update(neigh)
snowball = df.loc[list(snowball_idx)]
print(snowball.shape)


(27, 6)


In [23]:
nbrs

In [21]:
# 2.4 Quota Sampling
# Ensure the sample meets predefined quotas for each stratum.
# Quotas: 20 Setosa, 15 Versicolor, 10 Virginica
quota = pd.concat([
    df[df['target']==0].sample(n=20, random_state=42),
    df[df['target']==1].sample(n=15, random_state=42),
    df[df['target']==2].sample(n=10, random_state=42)
])
print(quota['target'].value_counts())


target
0    20
1    15
2    10
Name: count, dtype: int64


# Instructions
- Run each cell and observe the sampled subsets.

- Compare sample sizes and class distributions.

- Modify parameters (n, k, frac, quotas) to explore their effects.

- Reflect on when each sampling method is appropriate in practice.