In [28]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron

In [29]:
###############################################################################
# 1. DATA PROCESSING FUNCTIONS
###############################################################################

def convert_labels_to_pm1(y):
    """Convert labels {0,1} to {-1,+1}."""
    return np.where(y == 0, -1, 1)

def create_pool_and_test_sets(X, y, test_size=0.3):
    """Split X,y into pool and test sets."""
    return train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

def create_stream(X_pool, seed=1):
    """Shuffle pool to create i.i.d. stream."""
    rng = np.random.RandomState(seed)
    indices = np.arange(len(X_pool))
    rng.shuffle(indices)
    return X_pool[indices]

def build_oracle_mapping(X_pool, y_pool_pm1):
    """Map each sample to its true label ±1."""
    return {tuple(x): label for x, label in zip(X_pool, y_pool_pm1)}

def make_oracle_function(pool_map):
    """Return oracle function for querying labels."""
    def oracle(x):
        return pool_map.get(tuple(x), +1)
    return oracle

In [None]:
###############################################################################
# 2. A² ACTIVE LEARNING IMPLEMENTATION
###############################################################################

class A2ActiveLearner:
    """
    Simplified Python implementation of A² algorithm with safety checks.
    """

    def __init__(self, learn_H_fn, delta_schedule):
        self.learn_H_fn = learn_H_fn
        self.delta_schedule = delta_schedule
        self.H_current = None

    def disagreement(self, X, H):
        """Return indices where hypotheses in H disagree (simplified)."""
        n = len(X)
        mask = np.random.rand(n) < 0.5
        return np.where(mask)[0]

    def process_stream(self, X_stream, oracle):
        """Main loop of A² over stream with safety checks for single-class."""
        n = len(X_stream)
        queried_labels = []
        queried_X = []

        # Step 0: find a point that adds a second class
        for idx in range(n):
            x0 = X_stream[idx].reshape(1, -1)
            y0 = np.array([oracle(X_stream[idx])])
            if len(np.unique([y0[0]] + queried_labels)) > 1:
                self.H_current = self.learn_H_fn(x0, y0)
                queried_X.append(X_stream[idx])
                queried_labels.append(y0[0])
                start_idx = idx + 1
                break
        else:
            # If no second class, train on first point
            x0 = X_stream[0].reshape(1, -1)
            y0 = np.array([oracle(X_stream[0])])
            self.H_current = self.learn_H_fn(x0, y0)
            queried_X.append(X_stream[0])
            queried_labels.append(y0[0])
            start_idx = 1

        # Process the rest of the stream
        for t in range(start_idx, n):
            x_t = X_stream[t].reshape(1, -1)
            disagree_idx = self.disagreement(x_t, [self.H_current])

            if len(disagree_idx) > 0:
                y_t = oracle(X_stream[t])
                # Update only if at least two classes present
                if len(np.unique(queried_labels + [y_t])) > 1:
                    queried_X.append(X_stream[t])
                    queried_labels.append(y_t)
                    X_update = np.array(queried_X)
                    y_update = np.array(queried_labels)
                    self.H_current = self.learn_H_fn(X_update, y_update)

        query_count = len(queried_labels)
        return self.H_current, query_count


In [31]:
###############################################################################
# 3. PASSIVE BASELINE
###############################################################################

def run_passive_baseline(X_pool, y_pool, n_labels, learn_H_fn):
    """Randomly select n_labels points and ensure at least 2 classes."""
    max_attempts = 10
    for _ in range(max_attempts):
        indices = np.random.choice(len(X_pool), size=n_labels, replace=False)
        y_sample = y_pool[indices]
        if len(np.unique(y_sample)) > 1:
            X_sample = X_pool[indices]
            return learn_H_fn(X_sample, y_sample)
    # Si impossible, utiliser tout le pool
    return learn_H_fn(X_pool, y_pool)


In [32]:
###############################################################################
# 4. EVALUATION
###############################################################################

def evaluate_models(h_active, h_passive, X_test, y_test):
    """Compute accuracy for active and passive models."""
    y_test_pm1 = convert_labels_to_pm1(y_test)
    preds_active = h_active.predict(X_test)
    preds_passive = h_passive.predict(X_test)
    acc_active = accuracy_score(y_test_pm1, preds_active)
    acc_passive = accuracy_score(y_test_pm1, preds_passive)
    return acc_active, acc_passive

In [33]:
###############################################################################
# 5. EXPERIMENT PIPELINE
###############################################################################

def run_experiment(
    X, y, test_size=0.3, stream_seed=1,
    learn_H_fn=None, delta_schedule=None
):
    """Full experiment: data, A², baseline, evaluation."""

    # Step 1: Pool/test split
    X_pool, X_test, y_pool, y_test = create_pool_and_test_sets(X, y, test_size)

    # Step 2: Convert labels
    y_pool_pm1 = convert_labels_to_pm1(y_pool)

    # Step 3: Create stream
    X_stream = create_stream(X_pool, stream_seed)

    # Step 4: Build oracle
    pool_map = build_oracle_mapping(X_pool, y_pool_pm1)
    oracle = make_oracle_function(pool_map)

    # Step 5: Run A² active learning
    a2_learner = A2ActiveLearner(learn_H_fn, delta_schedule)
    h_active, query_count = a2_learner.process_stream(X_stream, oracle)

    # Step 6: Run passive baseline
    h_passive = run_passive_baseline(np.array(X_pool), np.array(y_pool_pm1), query_count, learn_H_fn)

    # Step 7: Evaluate
    acc_active, acc_passive = evaluate_models(h_active, h_passive, X_test, y_test)

    return {
        "acc_active": acc_active,
        "acc_passive": acc_passive,
        "queries": query_count,
        "n_pool": len(X_pool),
        "n_test": len(X_test)
    }

In [34]:
###############################################################################
# 6. EXAMPLE USAGE
###############################################################################

if __name__ == "__main__":
    from sklearn.datasets import make_classification

    # Synthetic dataset
    X_syn, y_syn = make_classification(
        n_samples=500, n_features=10, n_informative=5, n_redundant=2, n_classes=2, random_state=42
    )

    # Simple learning function using Perceptron
    def learn_H_fn(X, y):
        model = Perceptron(max_iter=1000, tol=1e-3, random_state=42)
        model.fit(X, y)
        return model

    # Simple delta schedule placeholder
    delta_schedule = lambda t: 0.05

    # Run experiment
    results = run_experiment(
        X_syn, y_syn,
        test_size=0.3,
        stream_seed=1,
        learn_H_fn=learn_H_fn,
        delta_schedule=delta_schedule
    )

    print("Résultats A² sur données synthétiques :", results)

ValueError: The number of classes has to be greater than one; got 1 class