I encountered an error while running a simulation and don't know where the problem lies #3712

Open · MicSif opened this issue Jul 4, 2024 · 0 comments
Labels: question (Further information is requested)

MicSif commented Jul 4, 2024

What is your question?

My code looks like this:
```python
import numpy as np
import pandas as pd
import random
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from typing import Dict, List, Tuple
# from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

import tensorflow as tf
from tensorflow.python import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K

import flwr as fl
from flwr.common import Metrics
from flwr.simulation.ray_transport.utils import enable_tf_gpu_growth

import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"
import matplotlib.pyplot as plt
import logging


def getdata(data_path):
    # Load the .npz dataset from disk
    dataset = np.load(data_path, allow_pickle=True)
    return dataset


def process_data(data):
    features = np.array(data['X_train'])
    multi_labels = data['y_train']
    # Binarize the labels: class 0 stays 0, every other class becomes 1
    y = np.array([0 if int(num) == 0 else 1 for num in multi_labels])
    X = preprocessing.StandardScaler().fit(features).transform(features)
    X_train_full, global_X_test, y_train_full, global_y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train_full, global_X_test, y_train_full, global_y_test


def create_data_splits(X, y, num_clients):
    # Each client gets its own train/validation split of the full data
    client_data = []
    for i in range(num_clients):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=i)
        client_data.append((X_train, y_train, X_val, y_val))
    return client_data


def model():
    model = Sequential()
    model.add(Dense(200, input_shape=(1159,)))
    model.add(Activation("relu"))
    model.add(Dense(100))
    model.add(Activation("relu"))
    model.add(Dense(50))
    model.add(Activation("relu"))
    model.add(Dense(1))
    model.add(Activation("sigmoid"))
    lr = 0.05
    optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=lr, decay=lr / 10, momentum=0.9)  # stochastic gradient descent
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=["accuracy"])
    return model


class FlowerClient(fl.client.NumPyClient):
    def __init__(self, model, train_data, test_data):
        self.model = model
        self.train_data = train_data
        self.test_data = test_data

    def get_parameters(self, config):
        return self.model.get_weights()

    def fit(self, parameters, config):
        self.model.set_weights(parameters)
        self.model.fit(self.train_data[0], self.train_data[1], epochs=1, batch_size=32)
        return self.model.get_weights(), len(self.train_data[0]), {}

    def evaluate(self, parameters, config):
        self.model.set_weights(parameters)
        loss, accuracy = self.model.evaluate(self.test_data[0], self.test_data[1])
        return loss, len(self.test_data[0]), {"accuracy": accuracy}


def client_fn(cid: str) -> fl.client.Client:
    train_data = (client_data_splits[int(cid)][0], client_data_splits[int(cid)][1])
    test_data = (client_data_splits[int(cid)][2], client_data_splits[int(cid)][3])
    return FlowerClient(model(), train_data, test_data).to_client()


def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics:
    """Aggregation function for (federated) evaluation metrics, i.e. those returned by
    the client's evaluate() method."""
    # Multiply accuracy of each client by number of examples used
    accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics]
    examples = [num_examples for num_examples, _ in metrics]
    # Aggregate and return custom metric (weighted average)
    return {"accuracy": sum(accuracies) / sum(examples)}


def evaluate_fn(server_round, parameters, config):
    # Centralized evaluation of the global model on the held-out test set
    cnn_model = model()
    cnn_model.set_weights(parameters)
    loss, accuracy = cnn_model.evaluate(global_X_test, global_y_test)
    return loss, {"accuracy": accuracy}


def draw_result(history):
    global_accuracy_centralised = history.metrics_centralized["accuracy"]
    rounds = [data[0] for data in global_accuracy_centralised]
    acc = [100.0 * data[1] for data in global_accuracy_centralised]
    plt.plot(rounds, acc)
    plt.grid()
    plt.ylabel("Accuracy (%)")
    plt.xlabel("Round")
    plt.title("10 clients with 10 clients per round")
    plt.show()


if __name__ == "__main__":
    VERBOSE = 0
    NUM_CLIENTS = 10
    path = "D:/Users/Administrator/PycharmProjects/dataset/data/gen_apigraph_drebin/2012-01to2012-12_selected.npz"
    apigraph_data = getdata(path)
    X_train_full, global_X_test, y_train_full, global_y_test = process_data(apigraph_data)
    client_data_splits = create_data_splits(X_train_full, y_train_full, num_clients=10)
    strategy = fl.server.strategy.FedAvg(
        fraction_fit=0.1,  # Sample 10% of available clients for training
        fraction_evaluate=0.05,  # Sample 5% of available clients for evaluation
        min_fit_clients=2,  # Never sample fewer than 2 clients for training
        min_evaluate_clients=3,  # Never sample fewer than 3 clients for evaluation
        min_available_clients=int(
            NUM_CLIENTS * 0.75
        ),  # Wait until at least 75% of clients are available
        evaluate_metrics_aggregation_fn=weighted_average,  # aggregates federated metrics
        evaluate_fn=evaluate_fn,  # global evaluation function
    )
    client_resources = {"num_cpus": 1, "num_gpus": 0.0}
    # Start simulation
    history = fl.simulation.start_simulation(
        client_fn=client_fn,
        num_clients=NUM_CLIENTS,
        config=fl.server.ServerConfig(num_rounds=3),
        strategy=strategy,
        client_resources=client_resources,
    )
    draw_result(history)
```

When I run this code, the output is as follows:

```
INFO flwr 2024-07-04 21:20:39,045 | app.py:175 | Starting Flower simulation, config: ServerConfig(num_rounds=3, round_timeout=None)
2024-07-04 21:20:43,493 ERROR services.py:1207 -- Failed to start the dashboard , return code 1
2024-07-04 21:20:43,493 ERROR services.py:1232 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2024-07-04 21:20:43,505 ERROR services.py:1276 -- The last 20 lines of C:\Users\Administrator\AppData\Local\Temp\ray\session_2024-07-04_21-20-40_370729_13952\logs\dashboard.log (it contains the error message from the dashboard):
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\ray\dashboard\modules\reporter\reporter_agent.py", line 27, in <module>
    import ray._private.prometheus_exporter as prometheus_exporter
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\ray\_private\prometheus_exporter.py", line 17, in <module>
    from opencensus.common.transports import sync
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\opencensus\common\transports\sync.py", line 16, in <module>
    from opencensus.trace import execution_context
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\opencensus\trace\__init__.py", line 15, in <module>
    from opencensus.trace.span import Span
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\opencensus\trace\span.py", line 32, in <module>
    from opencensus.trace import status as status_module
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\opencensus\trace\status.py", line 15, in <module>
    from google.rpc import code_pb2
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\google\rpc\code_pb2.py", line 23, in <module>
    from google.protobuf.internal import builder as _builder
ImportError: cannot import name 'builder' from 'google.protobuf.internal' (D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\google\protobuf\internal\__init__.py)
2024-07-04 21:20:44,785 INFO worker.py:1621 -- Started a local Ray instance.
INFO flwr 2024-07-04 21:20:47,703 | app.py:210 | Flower VCE: Ray initialized with resources: {'object_store_memory': 615149568.0, 'memory': 1230299136.0, 'node:127.0.0.1': 1.0, 'GPU': 1.0, 'CPU': 12.0, 'node:__internal_head__': 1.0}
INFO flwr 2024-07-04 21:20:47,703 | app.py:224 | Flower VCE: Resources for each Virtual Client: {'num_cpus': 1, 'num_gpus': 0.0}
INFO flwr 2024-07-04 21:20:47,716 | app.py:270 | Flower VCE: Creating VirtualClientEngineActorPool with 12 actors
INFO flwr 2024-07-04 21:20:47,717 | server.py:89 | Initializing global parameters
INFO flwr 2024-07-04 21:20:47,717 | server.py:276 | Requesting initial parameters from one random client
(raylet) [2024-07-04 21:20:48,994 E 15016 11476] (raylet.exe) agent_manager.cc:135: The raylet exited immediately because the Ray agent failed. The raylet fate shares with the agent. This can happen because the Ray agent was unexpectedly killed or failed. Agent can fail when
(raylet) - The version of `grpcio` doesn't follow Ray's requirement. Agent can segfault with the incorrect `grpcio` version. Check the grpcio version `pip freeze | grep grpcio`.
(raylet) - The agent failed to start because of unexpected error or port conflict. Read the log `cat /tmp/ray/session_latest/logs/dashboard_agent.log`. You can find the log file structure here https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure.
(raylet) - The agent is killed by the OS (e.g., out of memory).
(raylet) *** SIGTERM received at time=1720099248 ***
(raylet) @ 00007FF78507F6CC (unknown) (unknown)
(raylet) @ 00007FF78507EB3E (unknown) (unknown)
(raylet) @ 00007FF991FC1BB2 (unknown) configthreadlocale
(raylet) @ 00007FF993407344 (unknown) BaseThreadInitThunk
(raylet) @ 00007FF99475CC91 (unknown) RtlUserThreadStart
(raylet) [2024-07-04 21:20:49,011 E 15016 11476] (raylet.exe) logging.cc:361: *** SIGTERM received at time=1720099249 ***
(raylet) [2024-07-04 21:20:49,011 E 15016 11476] (raylet.exe) logging.cc:361: @ 00007FF78507F6CC (unknown) (unknown)
(raylet) [2024-07-04 21:20:49,011 E 15016 11476] (raylet.exe) logging.cc:361: @ 00007FF78507EB3E (unknown) (unknown)
(raylet) [2024-07-04 21:20:49,011 E 15016 11476] (raylet.exe) logging.cc:361: @ 00007FF991FC1BB2 (unknown) configthreadlocale
(raylet) [2024-07-04 21:20:49,012 E 15016 11476] (raylet.exe) logging.cc:361: @ 00007FF993407344 (unknown) BaseThreadInitThunk
(raylet) [2024-07-04 21:20:49,012 E 15016 11476] (raylet.exe) logging.cc:361: @ 00007FF99475CC91 (unknown) RtlUserThreadStart
2024-07-04 21:21:03,562 WARNING worker.py:2037 -- The node with node id: 0a54d72dcaeb8857fb9185613f7d05c36e7ef756ebc7c664f79ed7eb and address: 127.0.0.1 and node name: 127.0.0.1 has been marked dead because the detector has missed too many heartbeats from it. This can happen when a (1) raylet crashes unexpectedly (OOM, preempted node, etc.) (2) raylet has lagging heartbeats due to slow network or busy workload.
[2024-07-04 21:21:03,638 E 13952 4496] core_worker.cc:587: :info_message: Attempting to recover 1 lost objects by resubmitting their tasks. To disable object reconstruction, set @ray.remote(max_retries=0).
Exception in thread Thread-2:
Traceback (most recent call last):
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\threading.py", line 980, in _bootstrap_inner
    self.run()
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\threading.py", line 1306, in run
    self.function(*self.args, **self.kwargs)
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\flwr\simulation\app.py", line 258, in update_resources
    num_max_actors = pool_size_from_resources(client_resources)
  File "D:\Users\Administrator\anaconda3\envs\tf\lib\site-packages\flwr\simulation\ray_transport\ray_actor.py", line 137, in pool_size_from_resources
    num_cpus,
UnboundLocalError: local variable 'num_cpus' referenced before assignment
```
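The dashboard log ends in `ImportError: cannot import name 'builder' from 'google.protobuf.internal'`, and the raylet message suggests checking the installed `grpcio` version. A quick way to inspect the packages implicated in the traceback (a minimal sketch using only the standard library; run it in the same conda environment, `tf` here):

```python
# Print the versions of the packages implicated in the dashboard traceback.
from importlib.metadata import version, PackageNotFoundError

for pkg in ("protobuf", "grpcio", "ray", "flwr", "opencensus"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```

For context, `google.protobuf.internal.builder` only exists in protobuf 3.20 and later, so an older protobuf pin (for example, one required by the installed TensorFlow) would explain that particular ImportError. This is an inference from the traceback, not something the log states directly.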

I am a student who is new to federated learning and deep learning. Could you help me with this question? Thank you.
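
If the dashboard import failure is the trigger, one possible workaround is to keep Ray from starting the dashboard at all. This is a sketch, not a confirmed fix: it assumes the installed Flower version's `start_simulation` accepts `ray_init_args` (recent releases do) and that the failing import chain is only reached via the dashboard.

```python
# Hypothetical workaround: disable the Ray dashboard so its
# protobuf/opencensus import chain is never executed.
history = fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=3),
    strategy=strategy,
    client_resources=client_resources,
    ray_init_args={"include_dashboard": False},  # forwarded to ray.init()
)
```

Upgrading `protobuf` and `grpcio` to versions compatible with both Ray and TensorFlow in that environment would address the root cause more directly, but which versions work together depends on the exact Ray and TensorFlow releases installed.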
