In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import random
import math
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pin both global random generators (stdlib and NumPy) to the same seed so
# that re-running the notebook from a fresh kernel draws the same numbers.
random.seed(0)
np.random.seed(0)

In [None]:
class A2ActiveLearner:
    """Stream-based active learner that infers labels when it can and queries otherwise.

    For every incoming point two candidate hypotheses are trained, one with the
    point forced to +1 and one with it forced to -1. Their empirical errors on
    everything labelled so far are compared; when the gap exceeds the current
    threshold the winning label is inferred for free, otherwise the oracle is
    asked for the true label.

    Parameters
    ----------
    learn_H : callable
        ``learn_H(S, T)`` returning an object with a ``predict(X)`` method.
    delta_schedule : callable
        ``delta_schedule(n)`` returning the error-gap threshold for step ``n``.
    """

    def __init__(self, learn_H, delta_schedule):
        self.learn_H = learn_H
        self.delta_schedule = delta_schedule
        self.S = []  # (x, y_hat) pairs whose label was inferred
        self.T = []  # (x, y_true) pairs whose label was bought from the oracle
        self.query_count = 0

    @staticmethod
    def empirical_error(h, dataset):
        """Return the fraction of ``(x, y)`` pairs in ``dataset`` that ``h`` mislabels.

        An empty dataset yields ``0.0``.
        """
        if len(dataset) == 0:
            return 0.0
        features, labels = zip(*dataset)
        inputs = np.array(features)
        targets = np.array(labels)
        return np.mean(h.predict(inputs) != targets)

    def process_stream(self, X_stream, oracle):
        """Consume ``X_stream`` once and return ``(final_hypothesis, query_count)``.

        All state from a previous run (``S``, ``T``, ``query_count``) is reset
        first. ``oracle(x)`` is called only for points whose label could not be
        inferred.
        """
        self.S, self.T = [], []
        self.query_count = 0

        for position, point in enumerate(X_stream, start=1):
            signs = (+1, -1)

            # Fit both candidates first, then score them, on the data seen so far.
            candidates = {
                sign: self.learn_H(self.S + [(point, sign)], self.T)
                for sign in signs
            }
            seen = self.S + self.T
            errors = {
                sign: self.empirical_error(candidates[sign], seen)
                for sign in signs
            }

            # The threshold is indexed by the number of points already processed.
            threshold = self.delta_schedule(max(1, position - 1))

            # Positive gap means forcing -1 costs more error than forcing +1.
            gap = errors[-1] - errors[+1]
            if gap > threshold:
                self.S.append((point, +1))
            elif -gap > threshold:
                self.S.append((point, -1))
            else:
                # Neither label is clearly better: pay for the true one.
                self.T.append((point, oracle(point)))
                self.query_count += 1

        # Final hypothesis trained on everything collected.
        return self.learn_H(self.S, self.T), self.query_count



In [None]:
def learn_H_fn(S, T):
    """Fit a classifier on the union of ``S`` and ``T`` and return a predictor.

    ``S`` and ``T`` are lists of ``(x, y)`` pairs with labels in {-1, +1}.
    The returned object exposes ``predict(X)`` and answers in {-1, +1}:

    * no data at all      -> always predicts +1;
    * a single label seen -> always predicts that label;
    * both labels seen    -> logistic regression (liblinear solver).
    """
    samples = S + T

    if len(samples) == 0:
        class ConstantPred:
            """Predicts +1 for every row."""
            def predict(self, X):
                return np.ones(len(np.asarray(X)), dtype=int)
        return ConstantPred()

    features = np.array([point for point, _ in samples])
    signs = np.array([label for _, label in samples])
    # scikit-learn is trained on {0, 1}; -1 becomes 0, everything else 1.
    binary = np.where(signs == -1, 0, 1)
    present = np.unique(binary)

    if len(present) == 1:
        # A two-class model cannot be fitted on one class, so answer with it.
        class ConstantPredClass:
            """Predicts one fixed label in {-1, +1} for every row."""
            def __init__(self, mapped_output):
                self.mapped_output = mapped_output
            def predict(self, X):
                n_rows = len(np.asarray(X))
                return np.full(n_rows, self.mapped_output, dtype=int)
        return ConstantPredClass(1 if present[0] != 0 else -1)

    class SklearnWrapper:
        """Translates the fitted model's {0, 1} output back to {-1, +1}."""
        def __init__(self, clf):
            self.clf = clf
        def predict(self, X):
            raw = self.clf.predict(np.asarray(X))
            return np.where(raw == 0, -1, 1)

    model = LogisticRegression(max_iter=100, solver='liblinear')
    model.fit(features, binary)
    return SklearnWrapper(model)

def delta_schedule(n):
    """Return the error-gap threshold ``0.1 / sqrt(n)`` for step count ``n``.

    ``n`` must be positive; ``n == 0`` raises ``ZeroDivisionError``.
    """
    root_n = math.sqrt(n)
    return 0.1 / root_n

def run_passive_baseline(X_pool, y_pool, n_queries, learn_H):
    """Train ``learn_H`` on ``n_queries`` pool points drawn uniformly at random.

    Labels in ``y_pool`` equal to 0 are mapped to -1, all others to +1.
    Sampling is without replacement and uses NumPy's global random state.
    With ``n_queries == 0`` a predictor that always answers +1 is returned.
    """
    if n_queries == 0:
        class ConstantPred:
            """Predicts +1 for every row."""
            def predict(self, X):
                return np.ones(len(np.asarray(X)), dtype=int)
        return ConstantPred()

    chosen = np.random.choice(len(X_pool), size=n_queries, replace=False)
    signed_labels = np.where(y_pool[chosen] == 0, -1, 1)
    labelled = list(zip(X_pool[chosen], signed_labels))
    # Everything goes in as the first argument; the second list stays empty.
    return learn_H(labelled, [])


def prepare_data_for_A2(X, y, test_size=0.3, stream_seed=1, split_seed=42):
    """Split a dataset into a labelled pool and a test set, and build a label oracle.

    Parameters
    ----------
    X : array-like
        Feature matrix. Converted to an ndarray, so lists and DataFrames work.
    y : array-like
        Binary labels; 0 is treated as the negative class, any other value as positive.
    test_size : float
        Forwarded to ``train_test_split``.
    stream_seed : int
        Seed for the shuffle that fixes the order of the unlabelled stream.
    split_seed : int
        ``random_state`` of the stratified pool/test split (default 42).

    Returns
    -------
    dict with keys
        ``X_stream``   pool features in shuffled stream order,
        ``X_test`` / ``y_test``   held-out data, labels in the original encoding,
        ``X_pool_arr`` / ``y_pool_arr``   copies of the pool, original encoding,
        ``oracle``     callable mapping a pool feature vector to its label in {-1, +1}.
    """
    # Coerce both inputs up front. train_test_split hands back the container
    # type it was given, and the integer-array indexing and row iteration
    # below need plain ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Stratified pool/test split.
    X_pool, X_test, y_pool, y_test = train_test_split(
        X, y, test_size=test_size, random_state=split_seed, stratify=y
    )

    # Build the stream: the pool in a seeded random order.
    rng = np.random.RandomState(stream_seed)
    indices = np.arange(len(X_pool))
    rng.shuffle(indices)
    X_stream = X_pool[indices]

    # Convert labels to {-1, +1}.
    y_pool_mapped = np.where(y_pool == 0, -1, 1)

    # Independent copies handed back to the caller.
    X_pool_arr = np.array(X_pool)
    y_pool_arr = np.array(y_pool)

    # Oracle lookup table keyed by the exact feature values of each pool row.
    # If two rows share identical features, the later row's label wins.
    pool_map = {tuple(x): label for x, label in zip(X_pool, y_pool_mapped)}

    def oracle(x):
        """Return the {-1, +1} label of pool point ``x`` (+1 if ``x`` is unknown)."""
        # NOTE(review): an unknown point silently gets +1. Every stream point
        # comes from the pool, so this should only trigger for foreign inputs.
        return pool_map.get(tuple(x), +1)

    return {
        "X_stream": X_stream,
        "X_test": X_test,
        "y_test": y_test,
        "X_pool_arr": X_pool_arr,
        "y_pool_arr": y_pool_arr,
        "oracle": oracle
    }


def run_experiment(X, y, test_size=0.3, stream_seed=1):
    """Compare the active learner with a passive baseline on one dataset.

    The active learner processes the shuffled pool as a stream; the passive
    baseline is then trained on as many randomly chosen pool labels as the
    active learner queried. Both are scored on the held-out test set.

    Returns a dict with ``acc_active``, ``acc_passive``, ``queries``,
    ``n_pool`` and ``n_test``.
    """
    prepared = prepare_data_for_A2(X, y, test_size, stream_seed)
    pool_X, pool_y = prepared["X_pool_arr"], prepared["y_pool_arr"]
    held_out_X = prepared["X_test"]

    # Active run: returns the final model and how many labels it paid for.
    learner = A2ActiveLearner(learn_H_fn, delta_schedule)
    active_model, n_labels = learner.process_stream(
        prepared["X_stream"], prepared["oracle"]
    )

    # Passive run with the same label budget.
    passive_model = run_passive_baseline(pool_X, pool_y, n_labels, learn_H_fn)

    # Both models predict in {-1, +1}, so bring the test labels to that encoding.
    truth = np.where(prepared["y_test"] == 0, -1, 1)
    active_preds = active_model.predict(held_out_X)
    passive_preds = passive_model.predict(held_out_X)

    return {
        "acc_active": accuracy_score(truth, active_preds),
        "acc_passive": accuracy_score(truth, passive_preds),
        "queries": n_labels,
        "n_pool": len(pool_X),
        "n_test": len(held_out_X)
    }

## Création des données synthétiques

In [None]:
# Generate a deliberately small synthetic classification problem (600 samples,
# 20 features) so that the experiments below run quickly.
X_syn, y_syn = make_classification(
    n_samples=600,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=2,
    flip_y=0.05,
    random_state=0,
)


## Importation du jeu de données : Breast Cancer Wisconsin (Diagnostic)

*(https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic)*

In [None]:
# Load the breast-cancer dataset from sklearn.datasets and pull out its
# feature matrix and label vector.
bc = load_breast_cancer()
X_bc, y_bc = bc.data, bc.target

## Exécution des algorithmes : A² et apprentissage passif

In [None]:
# Run both learners on each dataset, then gather the headline numbers
# (accuracies and number of label queries) into a single table.
res_syn = run_experiment(X_syn, y_syn, test_size=0.3, stream_seed=2)
res_bc = run_experiment(X_bc, y_bc, test_size=0.3, stream_seed=3)

results_df = pd.DataFrame(
    [
        {
            "dataset": name,
            **{key: outcome[key] for key in ("acc_active", "acc_passive", "queries")},
        }
        for name, outcome in (("Synthetic", res_syn), ("BreastCancer", res_bc))
    ]
)

print("Results summary:")
print(results_df.to_string(index=False))



In [None]:
# One small bar chart per dataset: active vs passive test accuracy.
for row in results_df.to_dict("records"):
    accuracies = [row['acc_active'], row['acc_passive']]
    plt.figure(figsize=(5,3))
    plt.bar([0,1], accuracies)
    plt.xticks([0,1], ['A2 Active', 'Passive'])
    plt.ylabel("Accuracy")
    plt.title(f"{row['dataset']} — Test accuracy (Active vs Passive)")
    plt.tight_layout()
    plt.show()

# Best-effort rich display of the results table through the optional
# `caas_jupyter_tools` helper. The import lives inside the try block so that
# the helper being unavailable does not stop the notebook.
try:
    from caas_jupyter_tools import display_dataframe_to_user
    display_dataframe_to_user("A2 vs Passive results", results_df)
except Exception:
    # Any failure here (import error or display error) is silently ignored;
    # the table has already been printed as plain text in an earlier cell.
    pass

# Persist the summary table. The absolute /mnt/data directory is not
# guaranteed to exist (or be writable) on every machine, and to_csv raises
# OSError in that case. Fall back to the current working directory so the
# notebook still runs end to end, and report the path that was actually used.
csv_path = "/mnt/data/a2_vs_passive_results.csv"
try:
    results_df.to_csv(csv_path, index=False)
except OSError:
    csv_path = "a2_vs_passive_results.csv"
    results_df.to_csv(csv_path, index=False)
print(f"\nA CSV with the results was saved at {csv_path}")