# Tutorial 09: Dataset Construction for ML

## Module 4: Model Development

---

## Learning Objectives

By the end of this tutorial, you will be able to:

1. **Apply effective data collection strategies** including active learning and augmentation
2. **Implement various labeling approaches** from hand labeling to weak supervision
3. **Use appropriate sampling strategies** for different scenarios
4. **Create proper train/validation/test splits** including time-based splitting
5. **Implement cross-validation techniques** for robust model evaluation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
from typing import Tuple, List, Dict

from sklearn.datasets import make_classification, make_regression, load_iris, load_digits
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, KFold, TimeSeriesSplit,
    cross_val_score, LeaveOneOut, GroupKFold
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans

# Seed NumPy's global RNG so the np.random.* calls in later cells are repeatable
# when the notebook is run top to bottom.
np.random.seed(42)
# Silence every warning for cleaner output (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
print("Libraries imported successfully!")

## 1. Data Collection Strategies

Effective data collection is crucial for ML success.

### Collection Methods

| Method | Description | When to Use |
|--------|-------------|-------------|
| **Direct Collection** | Collect from primary sources | Have access to data source |
| **Active Learning** | Selectively label most informative samples | Limited labeling budget |
| **Data Augmentation** | Create variations of existing data | Limited training data |
| **Synthetic Generation** | Generate artificial data | Need more samples |

### 1.1 Active Learning

Active learning selects the most informative samples for labeling.

In [None]:
class ActiveLearner:
    """Pool-based active learner around a probabilistic classifier.

    The wrapped ``model`` must provide ``fit(X, y)`` and ``predict_proba(X)``.
    ``strategy`` selects how pool samples are ranked:

    - ``'uncertainty'``: lowest top-class probability first
    - ``'margin'``: smallest gap between the two most likely classes first
    - ``'entropy'``: highest predictive entropy first
    - anything else (e.g. ``'random'``): uniform random baseline
    """

    def __init__(self, model, strategy='uncertainty'):
        self.model = model
        self.strategy = strategy

    def fit(self, X, y):
        """Fit the wrapped model on the labeled data and return ``self``."""
        self.model.fit(X, y)
        return self

    def query(self, X_pool, n_samples=10):
        """Return indices into ``X_pool`` of the samples to label next.

        ``n_samples`` is clamped to the pool size so every strategy behaves
        the same when more samples are requested than are available (random
        sampling without replacement would otherwise raise). An empty request
        returns an empty index array without calling the model.
        """
        n_samples = max(0, min(n_samples, len(X_pool)))
        if n_samples == 0:
            return np.array([], dtype=int)

        samplers = {
            'uncertainty': self._uncertainty_sampling,
            'margin': self._margin_sampling,
            'entropy': self._entropy_sampling,
        }
        # Unknown strategy names fall back to the random baseline.
        sampler = samplers.get(self.strategy, self._random_sampling)
        return sampler(X_pool, n_samples)

    def _uncertainty_sampling(self, X_pool, n_samples):
        """Select samples with highest uncertainty (lowest confidence)."""
        proba = self.model.predict_proba(X_pool)
        confidence = np.max(proba, axis=1)
        return np.argsort(confidence)[:n_samples]

    def _margin_sampling(self, X_pool, n_samples):
        """Select samples with smallest margin between top two predictions."""
        proba = self.model.predict_proba(X_pool)
        # Sort each row descending so columns 0 and 1 are the top two classes.
        sorted_proba = np.sort(proba, axis=1)[:, ::-1]
        margin = sorted_proba[:, 0] - sorted_proba[:, 1]
        return np.argsort(margin)[:n_samples]

    def _entropy_sampling(self, X_pool, n_samples):
        """Select samples with highest entropy."""
        proba = self.model.predict_proba(X_pool)
        # The small constant keeps log() finite for zero probabilities.
        entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
        return np.argsort(entropy)[::-1][:n_samples]

    def _random_sampling(self, X_pool, n_samples):
        """Random sampling baseline (without replacement)."""
        return np.random.choice(len(X_pool), n_samples, replace=False)

print("ActiveLearner class defined!")

In [None]:
# Demonstrate active learning
# Synthetic 3-class problem: 1000 samples, 20 features (10 informative).
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_classes=3, random_state=42)

# Split into initial labeled set, pool, and test set
# 50 stratified samples seed the labeled set; of the remainder, 200 are held out
# for testing and the rest form the unlabeled pool.
X_initial, X_rest, y_initial, y_rest = train_test_split(X, y, train_size=50, random_state=42, stratify=y)
X_pool, X_test, y_pool, y_test = train_test_split(X_rest, y_rest, test_size=200, random_state=42)

# Initialize model and active learner
model = LogisticRegression(max_iter=1000, random_state=42)
al = ActiveLearner(model, strategy='uncertainty')

# Active learning loop
X_train, y_train = X_initial.copy(), y_initial.copy()
n_queries = 10
samples_per_query = 20

# (labeled-set size, test accuracy) recorded at the start of every round.
accuracy_history = []

for i in range(n_queries):
    # Train on current labeled data
    al.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, al.model.predict(X_test))
    accuracy_history.append((len(y_train), accuracy))

    # Stop once the pool is exhausted.
    if len(y_pool) == 0:
        break

    # Query most informative samples
    query_indices = al.query(X_pool, n_samples=min(samples_per_query, len(y_pool)))

    # Add to training set
    # The pool's true labels stand in for a human annotator here.
    X_train = np.vstack([X_train, X_pool[query_indices]])
    y_train = np.concatenate([y_train, y_pool[query_indices]])

    # Remove from pool
    # X_pool / y_pool shrink in place, so they no longer hold the full pool
    # after this cell has run.
    X_pool = np.delete(X_pool, query_indices, axis=0)
    y_pool = np.delete(y_pool, query_indices)

print("Active Learning Progress:")
for n_samples, acc in accuracy_history:
    print(f"  {n_samples} samples: {acc:.4f} accuracy")

In [None]:
# Compare active learning strategies
strategies = ['uncertainty', 'margin', 'entropy', 'random']
results = {}

# Rebuild the untouched pool by repeating the exact pool/test split used above
# (same inputs, same random_state). train_test_split shuffles, so slicing
# X_rest[:-200] would NOT give the pool: it would contain rows that are in
# X_test and leak test samples into training.
X_pool_full, _X_holdout, y_pool_full, _y_holdout = train_test_split(
    X_rest, y_rest, test_size=200, random_state=42
)

for strategy in strategies:
    # Reset data so every strategy starts from the same labeled set and pool
    X_train, y_train = X_initial.copy(), y_initial.copy()
    X_pool_copy, y_pool_copy = X_pool_full.copy(), y_pool_full.copy()

    al = ActiveLearner(LogisticRegression(max_iter=1000, random_state=42), strategy=strategy)
    accuracy_hist = []

    for _ in range(n_queries):
        # Evaluate before querying so the first point is the initial labeled set
        al.fit(X_train, y_train)
        accuracy_hist.append((len(y_train), accuracy_score(y_test, al.model.predict(X_test))))

        if len(y_pool_copy) == 0:
            break

        # Move the queried samples from the pool into the training set
        query_idx = al.query(X_pool_copy, min(samples_per_query, len(y_pool_copy)))
        X_train = np.vstack([X_train, X_pool_copy[query_idx]])
        y_train = np.concatenate([y_train, y_pool_copy[query_idx]])
        X_pool_copy = np.delete(X_pool_copy, query_idx, axis=0)
        y_pool_copy = np.delete(y_pool_copy, query_idx)

    results[strategy] = accuracy_hist

# Plot comparison: one learning curve per strategy
plt.figure(figsize=(10, 6))
for strategy, hist in results.items():
    samples, accs = zip(*hist)
    plt.plot(samples, accs, marker='o', label=strategy.capitalize())

plt.xlabel('Number of Training Samples')
plt.ylabel('Test Accuracy')
plt.title('Active Learning Strategy Comparison')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

### 1.2 Data Augmentation

Data augmentation creates variations of existing data to increase dataset size.

In [None]:
class DataAugmenter:
    """Simple augmentation helpers for numeric (tabular) feature arrays.

    All methods draw from NumPy's global random state, so results are
    reproducible only if ``np.random.seed`` was called beforehand.
    """

    @staticmethod
    def add_noise(X, noise_level=0.1):
        """Add Gaussian noise to features.

        The noise for each column is scaled by that column's standard
        deviation, so ``noise_level`` is relative to the feature's spread.
        """
        noise = np.random.normal(0, noise_level, X.shape)
        return X + noise * np.std(X, axis=0)

    @staticmethod
    def feature_dropout(X, dropout_rate=0.1):
        """Randomly set individual feature values to zero.

        Each entry is zeroed independently with probability ``dropout_rate``.
        """
        mask = np.random.random(X.shape) > dropout_rate
        return X * mask

    @staticmethod
    def mixup(X1, X2, y1, y2, alpha=0.2):
        """Mixup augmentation: blend two samples (and their labels).

        A single mixing coefficient is drawn from Beta(alpha, alpha) and
        applied to both features and labels. Returns ``(X_mix, y_mix)``.
        """
        lam = np.random.beta(alpha, alpha)
        X_mix = lam * X1 + (1 - lam) * X2
        y_mix = lam * y1 + (1 - lam) * y2
        return X_mix, y_mix

    @staticmethod
    def smote_single(X, y, target_class, n_samples=10, k=5):
        """Simple SMOTE-like oversampling for a single class.

        For each synthetic point, a random member of ``target_class`` is
        interpolated towards one of its (up to) ``k`` nearest same-class
        neighbours.

        Returns an array of shape ``(n_samples, n_features)``.

        Raises:
            ValueError: if ``target_class`` has no samples in ``y``.
        """
        X_class = X[y == target_class]
        n_existing = len(X_class)
        if n_existing == 0:
            raise ValueError(f"No samples found for class {target_class!r}")
        if n_samples <= 0:
            return np.empty((0, X_class.shape[1]))

        synthetic_samples = []
        for _ in range(n_samples):
            idx = np.random.randint(0, n_existing)
            sample = X_class[idx]

            if n_existing == 1:
                # No neighbour to interpolate towards: the only possible
                # "interpolation" is the sample itself.
                synthetic_samples.append(sample.copy())
                continue

            # Find k nearest neighbors, excluding the sample itself by index
            # (not by sort position, which is unreliable with duplicates).
            distances = np.linalg.norm(X_class - sample, axis=1)
            order = np.argsort(distances)
            neighbor_idx = order[order != idx][:k]
            neighbor = X_class[np.random.choice(neighbor_idx)]

            # Interpolate a random fraction of the way towards the neighbour
            diff = neighbor - sample
            synthetic = sample + np.random.random() * diff
            synthetic_samples.append(synthetic)

        return np.array(synthetic_samples)

print("DataAugmenter class defined!")

In [None]:
# Demonstrate augmentation on a small synthetic dataset
X_small, y_small = make_classification(n_samples=100, n_features=10, random_state=42)

# Build two augmented views of the same samples
X_noisy = DataAugmenter.add_noise(X_small, noise_level=0.1)
X_dropout = DataAugmenter.feature_dropout(X_small, dropout_rate=0.2)

# Stack original and augmented views; every view keeps the original labels
augmented_views = [X_small, X_noisy, X_dropout]
X_augmented = np.vstack(augmented_views)
y_augmented = np.tile(y_small, len(augmented_views))

print(f"Original size: {len(X_small)}")
print(f"Augmented size: {len(X_augmented)}")

In [None]:
# Visualize augmentation effects: first two features of each view, side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

panels = [
    (X_small, 'Original Data'),
    (X_noisy, 'With Noise Added'),
    (X_dropout, 'With Feature Dropout'),
]

for panel_ax, (points, panel_title) in zip(axes, panels):
    # All views share the original labels for colouring
    panel_ax.scatter(points[:, 0], points[:, 1], c=y_small, cmap='coolwarm', alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Feature 1')
    panel_ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()

### 1.3 Synthetic Data Generation

In [None]:
class SyntheticDataGenerator:
    """Generate synthetic data based on existing data patterns.

    All methods draw from NumPy's global random state.
    """

    @staticmethod
    def from_distribution(X, n_samples=100):
        """Sample from a multivariate normal fitted to ``X``.

        ``X`` is a 2-D array of shape ``(n_rows, n_features)``; the result has
        shape ``(n_samples, n_features)``.
        """
        mean = np.mean(X, axis=0)
        # np.cov returns a 0-d array for a single feature; force a square 2-D
        # matrix so single-column data works too.
        cov = np.asarray(np.cov(X, rowvar=False)).reshape(mean.size, mean.size)
        return np.random.multivariate_normal(mean, cov, n_samples)

    @staticmethod
    def from_clusters(X, y, n_samples_per_class=50):
        """Generate samples around class centroids.

        For every class in ``y``, draws ``n_samples_per_class`` points from
        independent per-feature normals using that class's mean and standard
        deviation (feature correlations are ignored).

        Returns ``(X_synthetic, y_synthetic)``, ordered class by class.
        """
        synthetic_X, synthetic_y = [], []

        for c in np.unique(y):
            X_class = X[y == c]
            mean = np.mean(X_class, axis=0)
            std = np.std(X_class, axis=0)

            # Generate around centroid
            samples = mean + np.random.randn(n_samples_per_class, len(mean)) * std
            synthetic_X.append(samples)
            synthetic_y.extend([c] * n_samples_per_class)

        return np.vstack(synthetic_X), np.array(synthetic_y)

    @staticmethod
    def bootstrap_sample(X, y, n_samples=None):
        """Draw rows with replacement; defaults to as many rows as ``X`` has."""
        if n_samples is None:
            n_samples = len(X)
        indices = np.random.choice(len(X), n_samples, replace=True)
        return X[indices], y[indices]

print("SyntheticDataGenerator class defined!")

In [None]:
# Generate synthetic data from a 2-feature, 2-class reference dataset
X_orig, y_orig = make_classification(
    n_samples=200,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)

X_synthetic, y_synthetic = SyntheticDataGenerator.from_clusters(
    X_orig, y_orig, n_samples_per_class=100
)

# Visualize: reference data on the left, generated data on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
left_ax, right_ax = axes

left_ax.scatter(X_orig[:, 0], X_orig[:, 1], c=y_orig, cmap='coolwarm', alpha=0.7)
left_ax.set_title(f'Original Data (n={len(X_orig)})')

right_ax.scatter(X_synthetic[:, 0], X_synthetic[:, 1], c=y_synthetic, cmap='coolwarm', alpha=0.7)
right_ax.set_title(f'Synthetic Data (n={len(X_synthetic)})')

plt.tight_layout()
plt.show()

## 2. Labeling Approaches

| Approach | Description | Quality | Cost |
|----------|-------------|---------|------|
| **Hand Labeling** | Human annotators | High | High |
| **Natural Labels** | Labels from user behavior | Medium | Low |
| **Weak Supervision** | Programmatic labeling | Medium | Low |
| **Semi-supervised** | Use unlabeled data | Medium | Medium |

### 2.1 Natural Label Extraction

In [None]:
def extract_natural_labels(interactions_df):
    """
    Extract natural labels from user interactions.

    Reads the ``'action'`` column and maps it to an integer label:
    - 'purchase' -> 2 (strong positive)
    - 'click'    -> 1 (positive)
    - 'skip'     -> 0 (negative)
    - anything else (including missing values) -> -1 (unknown)

    Returns a list of ints, one per row, in row order.
    """
    if len(interactions_df) == 0:
        return []

    action_to_label = {'purchase': 2, 'click': 1, 'skip': 0}
    # Vectorised lookup instead of a Python-level row loop; actions missing
    # from the mapping come back as NaN and are turned into -1.
    labels = interactions_df['action'].map(action_to_label).fillna(-1).astype(int)
    return labels.tolist()

# Simulate user interaction data
np.random.seed(42)
n_interactions = 1000

interactions = pd.DataFrame({
    'user_id': np.random.randint(1, 100, n_interactions),
    'item_id': np.random.randint(1, 500, n_interactions),
    # 'view' has no label rule, so those rows end up with the unknown label (-1)
    'action': np.random.choice(['click', 'skip', 'purchase', 'view'], n_interactions,
                               p=[0.3, 0.4, 0.1, 0.2]),
    # One interaction per minute. 'min' is the supported minute alias; the old
    # 'T' alias is deprecated in recent pandas releases.
    'timestamp': pd.date_range('2024-01-01', periods=n_interactions, freq='min')
})

interactions['label'] = extract_natural_labels(interactions)

print("Sample Interactions with Natural Labels:")
print(interactions.head(10))
print(f"\nLabel Distribution:")
print(interactions['label'].value_counts())

### 2.2 Weak Supervision with Labeling Functions

In [None]:
class WeakSupervisionLabeler:
    """Run several weak labeling functions over data and merge their votes.

    Each labeling function takes one sample and returns a non-negative class
    label, or a negative value (-1) to abstain.
    """

    def __init__(self, labeling_functions):
        self.labeling_functions = labeling_functions

    def apply(self, X):
        """Return the vote matrix: one row per sample, one column per function."""
        per_function = [[lf(x) for x in X] for lf in self.labeling_functions]
        return np.transpose(np.array(per_function))

    @staticmethod
    def _resolve(votes):
        """Majority label among non-abstaining votes, or -1 if all abstained."""
        cast = votes[votes >= 0]  # drop abstentions (-1)
        if len(cast) == 0:
            return -1
        # argmax picks the smallest label when counts are tied
        tally = np.bincount(cast.astype(int))
        return np.argmax(tally)

    def majority_vote(self, X):
        """Combine the labeling functions' outputs by majority vote per sample."""
        vote_matrix = self.apply(X)
        return np.array([self._resolve(votes) for votes in vote_matrix])

# Example: Text classification with labeling functions
def lf_contains_urgent(text):
    """Vote positive (1) if 'urgent' occurs in the text, ignoring case; else abstain (-1)."""
    return 1 if 'urgent' in text.lower() else -1

def lf_contains_free(text):
    """Vote positive (1) if 'free' occurs in the text, ignoring case; else abstain (-1)."""
    return 1 if 'free' in text.lower() else -1

def lf_contains_meeting(text):
    """Vote negative (0) if 'meeting' occurs in the text, ignoring case; else abstain (-1)."""
    return 0 if 'meeting' in text.lower() else -1

def lf_short_text(text):
    """Vote positive (1) for texts shorter than 20 characters; else abstain (-1)."""
    return -1 if len(text) >= 20 else 1

# Sample texts
texts = [
    "URGENT: You've won a free prize!",
    "Meeting scheduled for tomorrow at 3pm",
    "Free money waiting for you",
    "Can we reschedule our meeting?",
    "Act now! Limited time offer!",
    "Quarterly report attached for review"
]

# Column order of the labels matrix follows the order of this list.
labeler = WeakSupervisionLabeler([lf_contains_urgent, lf_contains_free,
                                   lf_contains_meeting, lf_short_text])

# Raw votes (one row per text), then the per-text majority decision;
# -1 in the final labels means every labeling function abstained.
labels_matrix = labeler.apply(texts)
final_labels = labeler.majority_vote(texts)

print("Weak Supervision Results:")
print(f"Labeling Functions: {len(labeler.labeling_functions)}")
print("\nLabels Matrix (rows=samples, cols=LFs):")
print(labels_matrix)
print(f"\nFinal Labels (majority vote): {final_labels}")

## 3. Sampling Strategies

| Strategy | Description | Use Case |
|----------|-------------|----------|
| **Random** | Uniform random selection | General purpose |
| **Stratified** | Preserve class proportions | Imbalanced classes |
| **Importance** | Sample with probability proportional to per-item weights | Rare events |
| **Reservoir** | Fixed-size sample from stream | Streaming data |

In [None]:
class Sampler:
    """Various sampling strategies.

    ``X`` and ``y`` are NumPy arrays indexed along their first axis. All
    methods draw from NumPy's global random state.
    """

    @staticmethod
    def random_sample(X, y, n_samples):
        """Simple random sampling without replacement."""
        indices = np.random.choice(len(X), n_samples, replace=False)
        return X[indices], y[indices]

    @staticmethod
    def stratified_sample(X, y, n_samples):
        """Stratified sampling preserving class proportions.

        Per-class quotas are computed with exact integer arithmetic and the
        rounding shortfall is handed to the classes with the largest
        remainders, so the result has exactly ``n_samples`` rows whenever
        ``n_samples <= len(y)`` and no class quota rounds down to zero. Every
        class always contributes at least one sample, and never more than it
        has, so the total can differ from ``n_samples`` in those edge cases.

        Returns ``(X_sampled, y_sampled)`` grouped class by class.

        Raises:
            ValueError: if ``y`` is empty.
        """
        classes, counts = np.unique(y, return_counts=True)
        if len(classes) == 0:
            raise ValueError("Cannot draw a stratified sample from empty labels")

        n_samples = int(n_samples)
        total = len(y)
        # Integer floor/remainder of n_samples * count / total avoids the float
        # truncation errors of int(n_samples * proportion).
        scaled = n_samples * counts
        quotas = scaled // total
        remainders = scaled % total
        shortfall = max(0, n_samples - int(quotas.sum()))
        top_up = np.argsort(-remainders, kind='stable')[:shortfall]
        quotas[top_up] += 1

        sampled_X, sampled_y = [], []
        for c, quota, available in zip(classes, quotas, counts):
            n_class = min(max(1, int(quota)), int(available))
            class_indices = np.where(y == c)[0]
            selected = np.random.choice(class_indices, n_class, replace=False)
            sampled_X.append(X[selected])
            sampled_y.extend([c] * len(selected))

        return np.vstack(sampled_X), np.array(sampled_y)

    @staticmethod
    def importance_sample(X, y, n_samples, weights):
        """Sample without replacement with probability proportional to ``weights``.

        ``weights`` may be any array-like of non-negative numbers, one per row.

        Raises:
            ValueError: if the weights do not sum to a positive value.
        """
        weights = np.asarray(weights, dtype=float)
        weight_sum = weights.sum()
        if not weight_sum > 0:
            raise ValueError("weights must sum to a positive value")
        probabilities = weights / weight_sum
        indices = np.random.choice(len(X), n_samples, replace=False, p=probabilities)
        return X[indices], y[indices]

    @staticmethod
    def reservoir_sample(stream, k):
        """Reservoir sampling: uniform sample of ``k`` items from an iterable.

        Makes a single pass and holds at most ``k`` items in memory. If the
        stream has fewer than ``k`` items, all of them are returned.
        """
        reservoir = []
        for i, item in enumerate(stream):
            if i < k:
                reservoir.append(item)
            else:
                # randint's upper bound is exclusive, so j is uniform on [0, i]
                # and item i replaces a slot with probability k / (i + 1).
                j = np.random.randint(0, i + 1)
                if j < k:
                    reservoir[j] = item
        return reservoir

print("Sampler class defined!")

In [None]:
# Create imbalanced dataset
# Three classes with target proportions of roughly 70% / 20% / 10%.
X_imb, y_imb = make_classification(n_samples=1000, n_features=10, n_classes=3,
                                    weights=[0.7, 0.2, 0.1], random_state=42)

print("Original Class Distribution:")
print(pd.Series(y_imb).value_counts())

# Compare sampling methods
n_samples = 200

X_rand, y_rand = Sampler.random_sample(X_imb, y_imb, n_samples)
X_strat, y_strat = Sampler.stratified_sample(X_imb, y_imb, n_samples)

# Create importance weights (favor minority classes)
# Each sample gets its class's weight, so rarer classes are drawn more often
# than their share of the data.
class_weights = {0: 1, 1: 3, 2: 5}
weights = np.array([class_weights[c] for c in y_imb]).astype(float)
X_imp, y_imp = Sampler.importance_sample(X_imb, y_imb, n_samples, weights)

print(f"\nRandom Sampling: {pd.Series(y_rand).value_counts().to_dict()}")
print(f"Stratified Sampling: {pd.Series(y_strat).value_counts().to_dict()}")
print(f"Importance Sampling: {pd.Series(y_imp).value_counts().to_dict()}")

In [None]:
# Visualize sampling results
# One bar chart of class counts per sampling method, on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for ax, (name, y_sampled) in zip(axes.flat, [
    ('Original', y_imb),
    ('Random', y_rand),
    ('Stratified', y_strat),
    ('Importance', y_imp)
]):
    # sort_index() keeps the bars in class order so colours line up across panels.
    counts = pd.Series(y_sampled).value_counts().sort_index()
    ax.bar(counts.index, counts.values, color=['steelblue', 'coral', 'green'])
    ax.set_title(f'{name} Sampling (n={len(y_sampled)})')
    ax.set_xlabel('Class')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Demonstrate reservoir sampling for streaming
# The "stream" is the integers 0..9999, consumed in a single pass.
stream = range(10000)
sample_size = 100

reservoir = Sampler.reservoir_sample(stream, sample_size)

# NOTE: the stream size and expected mean printed below are hard-coded for
# range(10000); update them if `stream` changes.
print(f"Reservoir Sampling Demo:")
print(f"Stream size: 10000")
print(f"Sample size: {len(reservoir)}")
print(f"Sample mean: {np.mean(reservoir):.1f} (expected: ~5000)")
print(f"Sample std: {np.std(reservoir):.1f}")

## 4. Data Splitting

### Splitting Strategies

| Strategy | Description | When to Use |
|----------|-------------|-------------|
| **Random** | Random assignment | IID data |
| **Stratified** | Preserve class ratios | Classification |
| **Time-based** | Chronological split | Time series |
| **Group-based** | Keep groups together | Grouped data |

In [None]:
# Standard train/validation/test split
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# First split: train+val vs test
# stratify=y keeps the class ratio the same on both sides of the split.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: train vs val
# test_size here is a fraction of the remaining 80%, not of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)  # 0.25 of 0.8 = 0.2

print("Standard Split (60/20/20):")
print(f"Train: {len(X_train)} samples")
print(f"Validation: {len(X_val)} samples")
print(f"Test: {len(X_test)} samples")
# np.bincount prints the per-class counts in each split.
print(f"\nClass distribution maintained:")
print(f"Train: {np.bincount(y_train)}")
print(f"Val: {np.bincount(y_val)}")
print(f"Test: {np.bincount(y_test)}")

### 4.1 Time-Based Splitting

In [None]:
def time_based_split(X, y, timestamps, train_ratio=0.6, val_ratio=0.2):
    """
    Split data chronologically.
    Training data comes before validation, which comes before test.

    Args:
        X: Array-like of samples, indexed along the first axis.
        y: Array-like of targets, same length as ``X``.
        timestamps: Sortable values, one per sample.
        train_ratio: Fraction of samples (earliest) used for training.
        val_ratio: Fraction of samples (next earliest) used for validation;
            whatever remains is the test set.

    Returns:
        Dict with keys 'X_train', 'y_train', 'X_val', 'y_val', 'X_test',
        'y_test'.

    Raises:
        ValueError: if a ratio is negative or the ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    X = np.asarray(X)
    y = np.asarray(y)

    # Stable sort keeps the input order for samples sharing a timestamp.
    sorted_indices = np.argsort(timestamps, kind='stable')
    X_sorted = X[sorted_indices]
    y_sorted = y[sorted_indices]

    n = len(X)
    # Round away float noise before truncating: 1000 * (0.7 + 0.1) evaluates
    # to 799.9999999999999, which a bare int() would turn into 799.
    train_end = int(round(n * train_ratio, 6))
    val_end = min(n, int(round(n * (train_ratio + val_ratio), 6)))

    return {
        'X_train': X_sorted[:train_end],
        'y_train': y_sorted[:train_end],
        'X_val': X_sorted[train_end:val_end],
        'y_val': y_sorted[train_end:val_end],
        'X_test': X_sorted[val_end:],
        'y_test': y_sorted[val_end:]
    }

# Create time-series data
# One timestamp per sample, spaced one hour apart starting 2024-01-01.
n_samples = 1000
timestamps = np.array([datetime(2024, 1, 1) + timedelta(hours=i) for i in range(n_samples)])
X_ts, y_ts = make_classification(n_samples=n_samples, n_features=10, random_state=42)

# Uses the function's default ratios (60% train, 20% validation, rest test).
splits = time_based_split(X_ts, y_ts, timestamps)

print("Time-Based Split:")
print(f"Train: {len(splits['X_train'])} samples (earliest data)")
print(f"Val: {len(splits['X_val'])} samples")
print(f"Test: {len(splits['X_test'])} samples (most recent data)")

In [None]:
# Visualize time-based split as shaded bands over the sample timeline
fig, ax = plt.subplots(figsize=(12, 4))

train_end = int(n_samples * 0.6)
val_end = int(n_samples * 0.8)

# (start, stop, colour, legend label) for each chronological segment
segments = [
    (0, train_end, 'blue', 'Train'),
    (train_end, val_end, 'orange', 'Validation'),
    (val_end, n_samples, 'green', 'Test'),
]
for seg_start, seg_stop, seg_color, seg_label in segments:
    ax.axvspan(seg_start, seg_stop, alpha=0.3, color=seg_color, label=seg_label)

# Overlay each sample's label at its position in time
ax.scatter(range(n_samples), y_ts, c=y_ts, cmap='coolwarm', s=10, alpha=0.5)

ax.set_xlabel('Time Index')
ax.set_ylabel('Label')
ax.set_title('Time-Based Data Split')
ax.legend()
plt.tight_layout()
plt.show()

### 4.2 Group-Based Splitting

In [None]:
# Create data with groups (e.g., user IDs)
n_samples = 500
n_groups = 50

# Each sample is assigned a random group id in [0, n_groups)
groups = np.random.randint(0, n_groups, n_samples)
X_grouped, y_grouped = make_classification(n_samples=n_samples, n_features=10, random_state=42)

# Split by groups: shuffle the group ids, then cut the shuffled list 60/20/20
unique_groups = np.unique(groups)
np.random.shuffle(unique_groups)

n_train_groups = int(len(unique_groups) * 0.6)
n_val_groups = int(len(unique_groups) * 0.2)

train_groups = set(unique_groups[:n_train_groups])
val_groups = set(unique_groups[n_train_groups:n_train_groups + n_val_groups])
test_groups = set(unique_groups[n_train_groups + n_val_groups:])

# Per-sample boolean masks: a sample follows its group into exactly one split
train_mask = np.array([g in train_groups for g in groups])
val_mask = np.array([g in val_groups for g in groups])
test_mask = np.array([g in test_groups for g in groups])

# Check all three pairs of splits, including train vs. test
groups_disjoint = (
    train_groups.isdisjoint(val_groups)
    and val_groups.isdisjoint(test_groups)
    and train_groups.isdisjoint(test_groups)
)

print("Group-Based Split:")
print(f"Train: {train_mask.sum()} samples from {len(train_groups)} groups")
print(f"Val: {val_mask.sum()} samples from {len(val_groups)} groups")
print(f"Test: {test_mask.sum()} samples from {len(test_groups)} groups")
print(f"\nNo group appears in multiple splits: {groups_disjoint}")

## 5. Cross-Validation

Cross-validation provides robust model evaluation by using multiple train/test splits.

In [None]:
# Load dataset
X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1000, random_state=42)

# K-Fold CV
# Plain K-Fold ignores the labels when forming folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
kfold_scores = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

# Stratified K-Fold CV
# Stratified K-Fold uses y to keep class ratios similar in every fold.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skfold_scores = cross_val_score(model, X, y, cv=skfold, scoring='accuracy')

# The "+/-" figure printed below is two standard deviations of the fold scores.
print("Cross-Validation Results:")
print(f"\nK-Fold (5 folds):")
print(f"  Scores: {kfold_scores}")
print(f"  Mean: {kfold_scores.mean():.4f} (+/- {kfold_scores.std()*2:.4f})")

print(f"\nStratified K-Fold (5 folds):")
print(f"  Scores: {skfold_scores}")
print(f"  Mean: {skfold_scores.mean():.4f} (+/- {skfold_scores.std()*2:.4f})")

In [None]:
# Time Series Cross-Validation
# Random features; the binary label is the sign of feature 0 plus a little noise.
# Rows are treated as already being in chronological order.
X_ts = np.random.randn(200, 5)
y_ts = (X_ts[:, 0] + np.random.randn(200) * 0.1 > 0).astype(int)

tscv = TimeSeriesSplit(n_splits=5)

# Print the index range of each fold's training and test window.
print("Time Series Cross-Validation:")
for fold, (train_idx, test_idx) in enumerate(tscv.split(X_ts)):
    print(f"Fold {fold + 1}: Train[0-{train_idx[-1]}], Test[{test_idx[0]}-{test_idx[-1]}]")

In [None]:
# Visualize CV splits
# One panel per splitter; each row of a panel is a fold, each marker a sample.
fig, axes = plt.subplots(3, 1, figsize=(12, 8))

cv_methods = [
    ('K-Fold', KFold(n_splits=5, shuffle=True, random_state=42)),
    ('Stratified K-Fold', StratifiedKFold(n_splits=5, shuffle=True, random_state=42)),
    ('Time Series', TimeSeriesSplit(n_splits=5))
]

# Toy data: 100 samples, first half class 0 and second half class 1.
X_viz = np.arange(100).reshape(-1, 1)
y_viz = np.array([0]*50 + [1]*50)

for ax, (name, cv) in zip(axes, cv_methods):
    for fold, (train, test) in enumerate(cv.split(X_viz, y_viz)):
        # Blue = sample used for training in this fold, red = used for testing.
        ax.scatter(train, [fold]*len(train), c='blue', marker='s', s=20, alpha=0.5)
        ax.scatter(test, [fold]*len(test), c='red', marker='s', s=20, alpha=0.5)
    ax.set_xlabel('Sample Index')
    ax.set_ylabel('Fold')
    ax.set_title(f'{name} Cross-Validation')
    ax.set_yticks(range(5))

# Empty scatter calls exist only to create one legend entry per colour.
axes[0].scatter([], [], c='blue', marker='s', label='Train')
axes[0].scatter([], [], c='red', marker='s', label='Test')
axes[0].legend()

plt.tight_layout()
plt.show()

In [None]:
# Group K-Fold
# Random features and random binary labels; only the grouping matters here.
X_groups = np.random.randn(100, 5)
y_groups = np.random.randint(0, 2, 100)
groups = np.repeat(range(20), 5)  # 20 groups, 5 samples each

group_kfold = GroupKFold(n_splits=5)

# For every fold, count the distinct groups on each side and report how many
# group ids appear in both the training and the test indices.
print("Group K-Fold Cross-Validation:")
for fold, (train_idx, test_idx) in enumerate(group_kfold.split(X_groups, y_groups, groups)):
    train_groups = np.unique(groups[train_idx])
    test_groups = np.unique(groups[test_idx])
    print(f"Fold {fold + 1}: {len(train_groups)} train groups, {len(test_groups)} test groups")
    print(f"         Overlap check: {len(set(train_groups) & set(test_groups))} groups in common")

## 6. Hands-on Exercise

In [None]:
# Exercise: Build a Complete Dataset Construction Pipeline

print("Exercise: Complete Dataset Construction Pipeline")
print("="*60)

# Step 1: Create initial dataset
# Imbalanced 3-class problem (target proportions roughly 50% / 30% / 20%).
# NOTE: this rebinds X_initial / y_initial, which the active-learning cells
# also use.
X_initial, y_initial = make_classification(
    n_samples=500, n_features=15, n_informative=10,
    n_classes=3, weights=[0.5, 0.3, 0.2], random_state=42
)

print(f"Step 1: Initial dataset - {len(X_initial)} samples")
print(f"Class distribution: {dict(zip(*np.unique(y_initial, return_counts=True)))}")

In [None]:
# Step 2: Apply augmentation
# Doubles the dataset: every original row plus one noisy copy with the same label.
# NOTE(review): the train/val/test split in Step 4 is made on data derived from
# X_augmented, so a row and its noisy copy can land on opposite sides of the
# split, which inflates validation/test scores. In a real pipeline, split first
# and augment only the training portion.
X_aug1 = DataAugmenter.add_noise(X_initial, noise_level=0.05)
X_augmented = np.vstack([X_initial, X_aug1])
y_augmented = np.concatenate([y_initial, y_initial])

print(f"Step 2: After augmentation - {len(X_augmented)} samples")

In [None]:
# Step 3: Stratified sampling
# Draw a subset of about 600 rows that keeps the augmented data's class proportions.
X_sampled, y_sampled = Sampler.stratified_sample(X_augmented, y_augmented, n_samples=600)

print(f"Step 3: After stratified sampling - {len(X_sampled)} samples")
print(f"Class distribution: {dict(zip(*np.unique(y_sampled, return_counts=True)))}")

In [None]:
# Step 4: Create train/val/test splits
# Two stratified splits: 20% held out for test, then 25% of the remainder for
# validation (0.25 of 0.8 = 0.2), giving roughly 60/20/20.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval
)

print(f"Step 4: Final splits")
print(f"  Train: {len(X_train)} samples")
print(f"  Val: {len(X_val)} samples")
print(f"  Test: {len(X_test)} samples")

In [None]:
# Step 5: Validate with cross-validation
from sklearn.pipeline import make_pipeline

# Fit the scaler on the training split only, then apply it to val/test.
# These scaled arrays are used for the final evaluation in Step 6.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation on training set
# Scaling happens inside the pipeline, so each fold's scaler is fit only on
# that fold's training part. Cross-validating pre-scaled data would leak the
# held-out fold's statistics into the scaler. cross_val_score clones the
# pipeline, so `model` itself stays unfitted here.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')

print(f"Step 5: Cross-validation results")
print(f"  CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

In [None]:
# Step 6: Final evaluation
# Train once on the full (scaled) training split, then score the two held-out sets.
model.fit(X_train_scaled, y_train)

val_acc = accuracy_score(y_val, model.predict(X_val_scaled))
test_acc = accuracy_score(y_test, model.predict(X_test_scaled))

print(f"Step 6: Final evaluation")
print(f"  Validation Accuracy: {val_acc:.4f}")
print(f"  Test Accuracy: {test_acc:.4f}")
print(f"\nPipeline complete!")

## 7. Summary

### Key Takeaways

1. **Data Collection**: Use active learning for efficient labeling, augmentation for more data
2. **Labeling**: Combine multiple approaches - natural labels, weak supervision, hand labeling
3. **Sampling**: Choose strategy based on data characteristics and goals
4. **Splitting**: Match the split to the structure of your data (time-based for temporal data, group-based for grouped data, stratified for classification)
5. **Cross-Validation**: Essential for robust model evaluation

### Best Practices

| Task | Recommendation |
|------|----------------|
| Limited budget | Active learning with uncertainty sampling |
| Limited data | Data augmentation + synthetic generation |
| Imbalanced classes | Stratified sampling + oversampling |
| Time series | Time-based splitting + TimeSeriesSplit CV |
| Grouped data | Group-based splitting + GroupKFold CV |