In [30]:
# Change into the project's working directory before anything else runs.
# load_data() opens its dataset through the relative path '../data/...', so the
# current directory must be this folder for that path to resolve.
# NOTE(review): the path looks like a Google Drive mount inside Colab — adjust it
# when running in another environment.
%cd "/content/drive/MyDrive/UNSW - Foundation of Cybersecurity/pcp/code/Federated"

/content/drive/MyDrive/UNSW - Foundation of Cybersecurity/pcp/code/Federated


In [130]:
import json
import numpy as np
import os
import random

from dataclasses import dataclass
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC



@dataclass
class Device:
    """A simulated client holding a private train/test split and a local RBF-SVM.

    Typical use is ``train()`` followed by ``send()``; the tuple returned by
    ``send()`` matches the positional parameters of ``Server.receive``.

    ``nsamples`` is whatever count the creator passes in (``load_data`` passes
    the size of the training split). ``params`` and ``metrics`` stay ``None``
    until ``train()`` has run.
    """
    id: int
    nsamples: int
    X: np.ndarray # data (train)
    y: np.ndarray # target (train)
    Xt: np.ndarray # data (test)
    yt: np.ndarray # target (test)
    params: dict = None
    metrics: float = None

    def train(self):
        """Grid-search ``C`` and ``gamma`` of an RBF SVC with 3-fold CV on the training data.

        Stores the winning hyper-parameters in ``self.params`` and the
        corresponding mean cross-validation score in ``self.metrics``.
        """
        estimator = SVC(kernel="rbf")
        param_grid = {"C": [1e-1, 1, 1e1, 1e2, 1e3],
                "gamma": [1e-2, 1e-1, 1, 1e1]}
        clf = GridSearchCV(estimator, param_grid, cv=3)
        clf.fit(self.X, self.y)
        self.metrics = clf.best_score_
        self.params = clf.best_params_

    def send(self):
        """Fit the final local model and package everything the server receives.

        Returns:
            tuple: ``(estimator, metrics, nsamples, Xt, yt, test_score)`` where
            ``estimator`` is an SVC fitted on the full training split with the
            stored hyper-parameters and ``test_score`` is its accuracy on the
            device's own test split.

        Raises:
            RuntimeError: if no hyper-parameters are available, i.e. ``train()``
                has not been called and ``params`` was not supplied.
        """
        # Fail early with an actionable message instead of the opaque
        # "'NoneType' object is not subscriptable" raised by self.params['gamma'].
        if self.params is None:
            raise RuntimeError(
                "Device.send() requires hyper-parameters; call Device.train() first"
            )
        estimator = SVC(kernel="rbf", gamma=self.params['gamma'], C=self.params['C'])
        estimator.fit(self.X, self.y)
        test_score = accuracy_score(self.yt, estimator.predict(self.Xt))
        return estimator, self.metrics, self.nsamples, self.Xt, self.yt, test_score


class Ensemble:
    """Averages the predictions of several fitted models.

    ``estimators`` is a sequence of dict records; each record must expose the
    fitted model under the ``"estimator"`` key, and that model must provide a
    ``predict(X)`` method.
    """

    def __init__(self, estimators):
        self.estimators = estimators
        self.n_models = len(estimators)

    def predict(self, X):
        """Return, for every row of ``X``, the mean of the members' predictions.

        Args:
            X: array-like with a ``shape`` attribute; ``X.shape[0]`` rows are scored.

        Returns:
            np.ndarray: float array of length ``X.shape[0]``.

        Raises:
            ValueError: if the ensemble has no members. Without this check the
                division below would yield an all-NaN vector, which a
                ``> 0.5`` threshold silently maps to "all negative".
        """
        if self.n_models == 0:
            raise ValueError("Cannot predict with an empty ensemble")
        scores = np.zeros(X.shape[0])
        for i in range(self.n_models):
            scores += self.estimators[i]["estimator"].predict(X)
        scores /= self.n_models
        return scores


class Server:
    """Collects the models uploaded by devices and assembles ensembles from them."""

    def __init__(self):
        self.models = []
        self.ensemble = None

    def build_ensemble(self, k, ensemble_type):
        """Reorder ``self.models`` in place and wrap the first ``k`` in an ``Ensemble``.

        ``ensemble_type`` chooses the ordering:
          * ``"random"`` - shuffle the records;
          * ``"cv"``     - descending by the ``"metrics"`` entry;
          * ``"data"``   - descending by the ``"nsamples"`` entry.

        Any other value raises ``ValueError`` and leaves the server untouched.
        """
        if ensemble_type not in ("random", "cv", "data"):
            raise ValueError("Incorrect `ensemble_type`")

        if ensemble_type == "random":
            random.shuffle(self.models)
        else:
            rank_field = "metrics" if ensemble_type == "cv" else "nsamples"
            # Negated key => largest value first, ties keep their current order.
            self.models.sort(key=lambda record: -record[rank_field])

        self.ensemble = Ensemble(self.models[:k])

    def receive(self, estimator, metrics, nsamples, Xt, yt, test_score):
        """Store one device upload as a dict record appended to ``self.models``."""
        record = dict(
            estimator=estimator,
            metrics=metrics,
            nsamples=nsamples,
            X_test=Xt,
            y_test=yt,
            test_score=test_score,
        )
        self.models.append(record)


def load_data(digit, n_devices, min_sample=100, max_sample=500):
    """Build ``n_devices`` simulated devices from the sampled MNIST archive.

    The task is binary: the target is 1 where the stored label is ``< digit``
    and 0 otherwise. Each device draws its own subset of rows (without
    replacement within a device; different devices are drawn independently)
    and splits it 70/30 into train/test with a fixed ``random_state``.

    Args:
        digit: threshold applied to the labels to form the binary target.
        n_devices: number of ``Device`` objects to create.
        min_sample: offset added to every drawn sample count.
        max_sample: upper cap on the per-device sample count.

    Returns:
        list[Device]: devices whose ``nsamples`` is the size of their training split.

    Note:
        Draws from both ``np.random`` and ``random``; seed both for repeatable output.
    """
    # np.load on an .npz returns a lazily-read NpzFile that keeps the archive
    # open; the context manager closes it once the arrays are materialised.
    with np.load(r'../data/mnist-sampled-N10000.npz') as data_mnist:
        X = data_mnist['arr_0'] / 255.0  # presumably 0-255 pixel values scaled to [0, 1] — confirm
        y = np.int64(data_mnist['arr_1'] < digit)

    devices = []
    # Heavy-tailed per-device sample counts, shifted by min_sample and capped at max_sample.
    num_nsamples = np.random.lognormal(3, 2, (n_devices)).astype(int)
    num_nsamples = [min(s + min_sample, max_sample) for s in num_nsamples]
    for c_id, c_nsamples in enumerate(num_nsamples):
        idx = np.array(random.sample(range(X.shape[0]), c_nsamples))
        X_train, X_test, y_train, y_test = train_test_split(X[idx], y[idx], test_size=0.3, random_state=0)
        devices.append(Device(c_id, y_train.shape[0], X_train, y_train, X_test, y_test))
    return devices


def protocol(digit, n_devices):
    """Run one round: create devices, train the eligible ones, upload to a new server.

    A device takes part only when the sum of its training targets lies strictly
    between 10% and 90% of ``device.nsamples``; the others are neither trained
    nor uploaded.

    Returns:
        tuple: ``(server, devices)`` - the populated ``Server`` and the full
        device list, including devices that were skipped.
    """
    devices = load_data(digit, n_devices)
    server = Server()
    for device in devices:
        balanced = 0.1*device.nsamples < np.sum(device.y) < 0.9*device.nsamples
        if not balanced:
            continue
        device.train()
        server.receive(*device.send())
    return server, devices

In [131]:
# load_data() draws from both `np.random` and `random`, and
# Server.build_ensemble("random") shuffles with `random`. Seed both global
# generators so a Restart & Run All reproduces the same devices and scores.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

server, devices = protocol(digit=6, n_devices=300)

In [133]:
def _mean_ensemble_accuracy(server, k, ensemble_type):
    """Rebuild the server's ensemble, then average its accuracy over every stored test split.

    The averaged ensemble score is thresholded at 0.5 to obtain a class label.
    """
    server.build_ensemble(k=k, ensemble_type=ensemble_type)
    per_split = [
        accuracy_score(record['y_test'], server.ensemble.predict(record['X_test']) > 0.5)
        for record in server.models
    ]
    return np.mean(per_split)


# Baseline: each local model scored on its own device's test split.
test_local_model = np.mean([record['test_score'] for record in server.models])

# Ensembles of 30 models, selected at random, by CV score, and by sample count.
test_ensemble_random = _mean_ensemble_accuracy(server, 30, "random")
test_ensemble_cv = _mean_ensemble_accuracy(server, 30, "cv")
test_ensemble_data = _mean_ensemble_accuracy(server, 30, "data")

In [134]:
# Mean test accuracy, in order: local models, random ensemble,
# CV-ranked ensemble, sample-count-ranked ensemble.
[
    test_local_model,
    test_ensemble_random,
    test_ensemble_cv,
    test_ensemble_data,
]

[0.808736776363779, 0.882190872722728, 0.9084644669913462, 0.925573049832254]

In [140]:
from sklearn.neural_network import MLPRegressor

def distill(X, y, server):
    """Fit a small MLP regressor to reproduce the server ensemble's scores on ``X``.

    The regression targets are ``server.ensemble.predict(X)``; the ``y``
    argument is accepted but its value is never used.

    Returns:
        The fitted ``MLPRegressor`` (the distilled student model).
    """
    soft_targets = server.ensemble.predict(X)
    student = MLPRegressor(
        hidden_layer_sizes=(5, 2),
        activation="logistic",
        solver="lbfgs",
        alpha=2e-5,
    )
    student.fit(X, soft_targets)
    return student