In [2]:
# Lab Exercises: Sampling Techniques
## This notebook demonstrates various probabilistic and non-probabilistic sampling methods using the Iris dataset. Run each section to see how samples differ.


import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load Iris dataset into DataFrame
iris = load_iris(as_frame=True)
df = iris.frame
df['target'] = iris.target
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
from os import XATTR_CREATE
# Lab Exercises: Sampling Techniques
## This notebook demonstrates various probabilistic and non-probabilistic sampling methods using the Iris dataset. Run each section to see how samples differ.


import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load Iris dataset into DataFrame
iris = load_iris(as_frame=True)
df = iris.frame
df['target'] = iris.target
df.head()
XATTR_CREATE

1

In [6]:
# 1.1 Simple Random Sampling
# Select n rows uniformly at random.

# Simple random sample of 50 observations
simple_rand = df.sample(n=50, random_state=42)
print(simple_rand.shape)

(50, 5)


In [9]:
# 1.1 Simple Random Sampling
# Select n rows uniformly at random.

# Simple random sample of 50 observations
simple_rand = df.sample(n=60, random_state=20)
print(simple_rand.shape)

(60, 5)


In [10]:
# 1.2 Systematic Sampling
# Choose every kᵗʰ record after a random start.
def systematic_sampling(data, k, seed=42):
    np.random.seed(seed)
    start = np.random.randint(0, k)
    indices = np.arange(start, len(data), k)
    return data.iloc[indices]

# Every 10th after a random start
sys_sample = systematic_sampling(df, k=10)
print(sys_sample.shape)


(15, 5)


In [11]:
# 1.2 Systematic Sampling
# Choose every kᵗʰ record after a random start.
def systematic_sampling(data, k, seed=20):
    np.random.seed(seed)
    start = np.random.randint(0, k)
    indices = np.arange(start, len(data), k)
    return data.iloc[indices]

# Every 10th after a random start
sys_sample = systematic_sampling(df, k=2)
print(sys_sample.shape)


(75, 5)


In [12]:
# 1.3 Stratified Random Sampling
# Sample within each class proportional to its size.
# Stratified sample: 30% from each species
strata = df.groupby('target', group_keys=False).apply(
    lambda x: x.sample(frac=0.3, random_state=42)
)
print(strata['target'].value_counts(normalize=True))


target
0    0.333333
1    0.333333
2    0.333333
Name: proportion, dtype: float64


  strata = df.groupby('target', group_keys=False).apply(


In [15]:
# 1.3 Stratified Random Sampling
# Sample within each class proportional to its size.
# Stratified sample: 30% from each species
strata = df.groupby('target', group_keys=False).apply(
    lambda x: x.sample(frac=0.1, random_state=20)
)
print(strata['target'].value_counts(normalize=False))


target
0    5
1    5
2    5
Name: count, dtype: int64


  strata = df.groupby('target', group_keys=False).apply(


In [16]:
# 1.4 Cluster Sampling
# Randomly select entire clusters, then use all items in chosen clusters.

# Define clusters by rounding sepal length
df['cluster'] = (df['sepal length (cm)'] // 1).astype(int)
clusters = df['cluster'].unique()

# Randomly pick 2 clusters
chosen = np.random.choice(clusters, size=2, replace=False)
cluster_sample = df[df['cluster'].isin(chosen)]
print("Clusters chosen:", chosen)
print(cluster_sample['cluster'].value_counts())

Clusters chosen: [4 6]
cluster
6    54
4    22
Name: count, dtype: int64


In [17]:
# 1.4 Cluster Sampling
# Randomly select entire clusters, then use all items in chosen clusters.

# Define clusters by rounding sepal length
df['cluster'] = (df['sepal length (cm)'] // 1).astype(int)
clusters = df['cluster'].unique()

# Randomly pick 2 clusters
chosen = np.random.choice(clusters, size=0, replace=True)
cluster_sample = df[df['cluster'].isin(chosen)]
print("Clusters chosen:", chosen)
print(cluster_sample['cluster'].value_counts())

Clusters chosen: []
Series([], Name: count, dtype: int64)


In [18]:
# 2.1 Convenience Sampling
# Select the first n rows or any easily accessible subset.
# First 50 rows as a convenience sample
convenience = df.head(50)
print(convenience.shape)


(50, 6)


In [19]:
# 2.1 Convenience Sampling
# Select the first n rows or any easily accessible subset.
# First 50 rows as a convenience sample
convenience = df.head(29)
print(convenience.shape)


(29, 6)


In [20]:
# 2.2 Judgmental (Selective) Sampling
# Manually pick samples based on domain knowledge.
# E.g., pick all Setosa and first 10 Versicolor
judgmental = pd.concat([
    df[df['target'] == 0],
    df[df['target'] == 1].head(10)
])
print(judgmental['target'].value_counts())


target
0    50
1    10
Name: count, dtype: int64


In [21]:
# 2.2 Judgmental (Selective) Sampling
# Manually pick samples based on domain knowledge.
# E.g., pick all Setosa and first 10 Versicolor
judgmental = pd.concat([
    df[df['target'] == 1],
    df[df['target'] == 0].head(20)
])
print(judgmental['target'].value_counts())


target
1    50
0    20
Name: count, dtype: int64


In [22]:
# 2.3 Snowball Sampling
# Begin with a small seed and expand via similarity (simulated by nearest neighbors here).
from sklearn.neighbors import NearestNeighbors

# Seed: pick 5 random points
seed = df.sample(n=5, random_state=42)
nbrs = NearestNeighbors(n_neighbors=3).fit(df.drop(columns=['target','cluster']))
distances, indices = nbrs.kneighbors(seed.drop(columns=['target','cluster']))

# Snowball: union of seed and their neighbors
snowball_idx = set(seed.index)
for neigh in indices:
    snowball_idx.update(neigh)
snowball = df.loc[list(snowball_idx)]
print(snowball.shape)


(13, 6)


In [23]:
# 2.3 Snowball Sampling
# Begin with a small seed and expand via similarity (simulated by nearest neighbors here).
from sklearn.neighbors import NearestNeighbors

# Seed: pick 5 random points
seed = df.sample(n=2, random_state=20)
nbrs = NearestNeighbors(n_neighbors=2).fit(df.drop(columns=['target','cluster']))
distances, indices = nbrs.kneighbors(seed.drop(columns=['target','cluster']))

# Snowball: union of seed and their neighbors
snowball_idx = set(seed.index)
for neigh in indices:
    snowball_idx.update(neigh)
snowball = df.loc[list(snowball_idx)]
print(snowball.shape)


(4, 6)


In [24]:

# 2.4 Quota Sampling
# Ensure the sample meets predefined quotas for each stratum.
# Quotas: 20 Setosa, 15 Versicolor, 10 Virginica
quota = pd.concat([
    df[df['target']==0].sample(n=20, random_state=42),
    df[df['target']==1].sample(n=15, random_state=42),
    df[df['target']==2].sample(n=10, random_state=42)
])
print(quota['target'].value_counts())


target
0    20
1    15
2    10
Name: count, dtype: int64


In [25]:

# 2.4 Quota Sampling
# Ensure the sample meets predefined quotas for each stratum.
# Quotas: 20 Setosa, 15 Versicolor, 10 Virginica
quota = pd.concat([
    df[df['target']==0].sample(n=5, random_state=20),
    df[df['target']==1].sample(n=10, random_state=20),
    df[df['target']==2].sample(n=15, random_state=20)
])
print(quota['target'].value_counts())


target
2    15
1    10
0     5
Name: count, dtype: int64


In [1]:
# Setup: Import libraries and load datasets
# Common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# For supervised and unsupervised tasks
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, adjusted_rand_score
from sklearn.cluster import KMeans

# For reinforcement learning task
!pip install gymnasium --quiet
import gymnasium as gym

Machine Learning Lab

In [3]:
# Part 1: Supervised Learning — Iris Dataset Classification
# Task:
# Train a Logistic Regression model to classify Iris flower species.
# Load data
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# Split data (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train logistic regression
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

Test Accuracy: 1.0


In [6]:
# Part 1: Supervised Learning — Iris Dataset Classification
# Task:
# Train a Logistic Regression model to classify Iris flower species.
# Load data
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=20)

# Train logistic regression
model = LogisticRegression(max_iter=50)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       0.83      1.00      0.91         5
           2       1.00      0.80      0.89         5

    accuracy                           0.93        15
   macro avg       0.94      0.93      0.93        15
weighted avg       0.94      0.93      0.93        15

Test Accuracy: 0.9333333333333333


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [4]:
# Part 2: Unsupervised Learning — K-Means Clustering on Digits Dataset
#Task:
 #Cluster digit images into groups without label information.

# Load digits data
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

# K-means clustering (10 clusters for 10 digits)
kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_digits)

# Evaluate clustering quality against true labels (adjusted rand index)
ari_score = adjusted_rand_score(y_digits, clusters)
print(f"Adjusted Rand Index (measures cluster-label agreement): {ari_score:.4f}")


Adjusted Rand Index (measures cluster-label agreement): 0.6669


In [5]:
# Part 2: Unsupervised Learning — K-Means Clustering on Digits Dataset
#Task:
 #Cluster digit images into groups without label information.

# Load digits data
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

# K-means clustering (10 clusters for 10 digits)
kmeans = KMeans(n_clusters=10, random_state=20, n_init=20)
clusters = kmeans.fit_predict(X_digits)

# Evaluate clustering quality against true labels (adjusted rand index)
ari_score = adjusted_rand_score(y_digits, clusters)
print(f"Adjusted Rand Index (measures cluster-label agreement): {ari_score:.4f}")


Adjusted Rand Index (measures cluster-label agreement): 0.6703
