In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load Iris dataset into DataFrame
iris = load_iris(as_frame=True)
df = iris.frame
df['target'] = iris.target
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [2]:
simple_rand = df.sample(n=50, random_state=42)
print(simple_rand.shape)

(50, 5)


In [3]:
def systematic_sampling(data, k, seed=42):
    np.random.seed(seed)
    start = np.random.randint(0, k)
    indices = np.arange(start, len(data), k)
    return data.iloc[indices]

# Every 10th after a random start
sys_sample = systematic_sampling(df, k=10)
print(sys_sample.shape)

(15, 5)


In [4]:
strata = df.groupby('target', group_keys=False).apply(
    lambda x: x.sample(frac=0.3, random_state=42)
)
print(strata['target'].value_counts(normalize=True))

target
0    0.333333
1    0.333333
2    0.333333
Name: proportion, dtype: float64


  strata = df.groupby('target', group_keys=False).apply(


In [5]:
df['cluster'] = (df['sepal length (cm)'] // 1).astype(int)
clusters = df['cluster'].unique()

# Randomly pick 2 clusters
chosen = np.random.choice(clusters, size=2, replace=False)
cluster_sample = df[df['cluster'].isin(chosen)]
print("Clusters chosen:", chosen)
print(cluster_sample['cluster'].value_counts())

Clusters chosen: [4 7]
cluster
4    22
7    13
Name: count, dtype: int64


In [6]:
convenience = df.head(50)
print(convenience.shape)

(50, 6)


In [7]:
judgmental = pd.concat([
    df[df['target'] == 0],
    df[df['target'] == 1].head(10)
])
print(judgmental['target'].value_counts())

target
0    50
1    10
Name: count, dtype: int64


In [8]:
from sklearn.neighbors import NearestNeighbors

# Seed: pick 5 random points
seed = df.sample(n=5, random_state=42)
nbrs = NearestNeighbors(n_neighbors=3).fit(df.drop(columns=['target','cluster']))
distances, indices = nbrs.kneighbors(seed.drop(columns=['target','cluster']))

# Snowball: union of seed and their neighbors
snowball_idx = set(seed.index)
for neigh in indices:
    snowball_idx.update(neigh)
snowball = df.loc[list(snowball_idx)]
print(snowball.shape)

(13, 6)


In [9]:
quota = pd.concat([
    df[df['target']==0].sample(n=20, random_state=42),
    df[df['target']==1].sample(n=15, random_state=42),
    df[df['target']==2].sample(n=10, random_state=42)
])
print(quota['target'].value_counts())


target
0    20
1    15
2    10
Name: count, dtype: int64
