# Server

## Setting variables

In [1]:
# Experiment configuration read by the cells below.
rounds = 100  # communication rounds run by the server loop; also sent to each client
local_epoch = 1  # sent to each client in the per-round message
num_users = 10 # number of clients
target_test_acc = 0.99  # NOTE(review): not referenced by any cell visible in this notebook
lrs = [0.1]  # values iterated by the server loop and used as keys of result_per_lr

# NOTE(review): C, E and B are only used to build the results file name here.
# Presumably they mirror the client-side FedAvg settings (client fraction,
# local epochs, batch size) -- confirm against the client code.
C = 1
E = 5
B = 10 # 'all' for a single minibatch

In [2]:
import os

import socket
import struct
import pickle
import sys

from threading import Thread
from threading import Lock

import copy

import logging
import math
import random
import re
import time
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from datetime import timedelta
from keras import backend as K
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.optimizers import SGD
from tqdm import tqdm
tf.get_logger().setLevel(logging.ERROR)



## Device

In [3]:
# A falsy GPU device name is treated as "no GPU available".
device = "gpu" if tf.test.gpu_device_name() else "cpu"

print(device)

cpu


2023-05-01 17:38:42.054735: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


## Model

A CNN with two 5x5 convolution layers (the first with 32 channels, the second with 64, each followed by 2x2 max pooling), a fully connected layer with 512 units and ReLU activation, and a final softmax output layer (1,663,370 total parameters).

In [4]:
class CNN:
    """Builder for the convolutional classifier used as the global model."""

    @staticmethod
    def build(input_shape):
        """Assemble the (uncompiled) Sequential network.

        Args:
            input_shape: shape of one input sample, forwarded to the first
                convolution layer.

        Returns:
            A Sequential model: two 5x5 'same'-padded ReLU convolutions
            (32 then 64 filters), each followed by 2x2 max pooling, then a
            flatten, a 512-unit ReLU dense layer and a 10-way softmax layer.
        """
        stack = (
            Conv2D(filters=32, kernel_size=(5,5), padding='same', activation='relu', input_shape=input_shape),
            MaxPooling2D(pool_size=(2, 2)),
            Conv2D(filters=64, padding='same', kernel_size=(5,5), activation='relu'),
            MaxPooling2D(pool_size=(2, 2)),
            Flatten(),
            Dense(512, activation='relu'),
            Dense(10, activation='softmax'),
        )
        network = Sequential()
        for layer in stack:
            network.add(layer)
        return network

Initialize the global model

In [5]:
# Build the global model for 28x28 single-channel inputs and store the weights
# it was initialised with.
model = CNN()

global_model = model.build((28,28,1))
initial_weights = global_model.get_weights()

# NOTE(review): the block below is disabled. As written it needs an `optimizer`
# that no visible cell defines, and it uses `K` as a client count although the
# imports bind `K` to the Keras backend module.
# client_models = [model.build((28,28,1)) for _ in range(K)]

# for i in range(len(client_models)):
#   client_models[i].compile(loss=loss, 
#                       optimizer=optimizer, 
#                       metrics=metrics)
#   client_models[i].set_weights(global_model.get_weights())

# Print the layer-by-layer summary of the global model.
global_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 28, 28, 32)        832       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 14, 14, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 14, 14, 64)        51264     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 7, 7, 64)         0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 3136)              0         
                                                                 
 dense (Dense)               (None, 512)               1

# Load data

In [6]:
# Load MNIST through the Keras dataset helper as (train, test) pairs of
# (images, labels).
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()

In [7]:
# Convert the images to float32, divide by 255 and append a trailing axis.
X_train = np.expand_dims(X_train.astype("float32") / 255, -1)
X_test = np.expand_dims(X_test.astype("float32") / 255, -1)

# One-hot encode the labels over 10 classes.
y_train, y_test = (
    keras.utils.to_categorical(labels, 10) for labels in (y_train, y_test)
)

print("x_train shape:", X_train.shape)
print(X_train.shape[0], "train samples")
print(X_test.shape[0], "test samples")

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [8]:
# Wrap each split in a tf.data pipeline whose batch size equals the number of
# samples in the split, so iterating it yields the whole split as one batch.
train_batched = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(len(y_train)) # for testing on train set
test_batched = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(len(y_test))

In [9]:
# for (X, y) in test_batched:
#     print(y)
# print(y_test)

## variables

In [10]:
# Shared server state, read and updated by the per-client worker threads.

# Serialises the updates that worker threads make to the aggregation state.
lock = Lock()

# One placeholder slot per expected client socket.
clientsoclist = [0 for _ in range(num_users)]

# Round timer and count of client updates received so far.
start_time, weight_count = 0, 0

# Weights received from each client, keyed by client id.
weights_list = dict()

# The weights sent to clients start out as the model's initial weights.
global_weights = initial_weights

## Socket initialization
### Set host address and port number

### Required socket functions

In [11]:
def send_msg(sock, msg):
    """Pickle ``msg`` and send it over ``sock`` as one length-prefixed frame.

    The frame is a 4-byte big-endian unsigned length followed by the pickled
    payload.

    Args:
        sock: object exposing ``sendall(bytes)``.
        msg: any picklable object.

    Returns:
        Size of the pickled payload in bytes (the 4-byte header is not counted).
    """
    payload = pickle.dumps(msg)
    size = len(payload)
    header = struct.pack('>I', size)
    sock.sendall(header + payload)
    return size

def recv_msg(sock):
    """Receive one length-prefixed pickled message from ``sock``.

    Expects the framing produced by ``send_msg``: a 4-byte big-endian unsigned
    length followed by that many bytes of pickled payload.

    Args:
        sock: connected socket to read from.

    Returns:
        The unpickled object, or ``None`` if the peer closed the connection
        before a complete frame (header and payload) was received.
    """
    # read message length and unpack it into an integer
    raw_msglen = recvall(sock, 4)
    if not raw_msglen:
        return None
    msglen = struct.unpack('>I', raw_msglen)[0]
    # read the message data
    payload = recvall(sock, msglen)
    if payload is None:
        # The connection dropped mid-message. Report it like a missing header
        # instead of handing None to pickle.loads (which raises TypeError).
        return None
    # SECURITY: unpickling can execute arbitrary code supplied by the sender;
    # only accept connections from trusted clients.
    return pickle.loads(payload)

def recvall(sock, n):
    """Read exactly ``n`` bytes from ``sock``.

    Args:
        sock: object exposing ``recv(max_bytes)``.
        n: number of bytes to read.

    Returns:
        A ``bytes`` object of length ``n``, or ``None`` if ``recv`` reports
        end of stream (an empty result) before ``n`` bytes have arrived.
    """
    # Accumulate into a bytearray: `bytes += packet` copies everything received
    # so far on every chunk, which is quadratic for multi-megabyte payloads.
    buf = bytearray()
    while len(buf) < n:
        packet = sock.recv(n - len(buf))
        if not packet:
            return None
        buf += packet
    return bytes(buf)

In [12]:
def average_weights(weights_list):
    """Average the clients' weights entry by entry.

    For every index of the module-level ``global_weights`` list, the entries at
    that index from clients ``0 .. num_users - 1`` are gathered and reduced
    with an element-wise mean over the client axis.

    Args:
        weights_list: mapping (or sequence) indexable by client id, each value
            being that client's list of weight arrays.

    Returns:
        A list with one averaged tensor per entry of ``global_weights``.
    """
    return [
        tf.math.reduce_mean(
            [weights_list[client][layer] for client in range(num_users)],
            axis=0,
        )
        for layer in range(len(global_weights))
    ]

## Receive users for aggregation

In [13]:
def receive(userid, num_users, conn): #thread for receive clients
    """Serve one client for one round: send the global weights, collect its update.

    Runs as a worker thread. The last worker of the round (the one that brings
    ``weight_count`` up to ``num_users``) replaces ``global_weights`` with the
    average of all collected client weights and resets the counter.

    Args:
        userid: index of this client; used as the key into ``weights_list``.
        num_users: number of client updates that completes a round.
        conn: connected socket for this client.

    Raises:
        ConnectionError: if the client closes the connection before a complete
            reply is received.
    """
    global weight_count
    global global_weights
    global weights_list

    msg = {
        'rounds': rounds,
        'client_id': userid,
        'local_epoch': local_epoch,
        'weight': global_weights
    }
    send_msg(conn, msg)    #send global weight
    r = recv_msg(conn)    # get weights from clients
    if r is None:
        # recv_msg signals a closed connection with None; fail with a clear
        # error before touching the shared aggregation state.
        raise ConnectionError(
            'client {} disconnected before sending its weights'.format(userid)
        )
    with lock:
        weights_list[userid] = r['weight']
        weight_count += 1

        if weight_count == num_users:
            global_weights = average_weights(weights_list)
            weight_count = 0

## Thread definition

In [14]:
def run_thread(func, num_user):
    """Accept ``num_user`` client connections and run ``func`` on each in a thread.

    Each accepted connection is recorded in ``clientsoclist`` and handed to its
    own worker thread as ``func(index, num_user, conn)``. The call returns once
    every worker has finished, then prints the time elapsed since just before
    the first ``accept`` (so waiting for clients to connect is included).

    Args:
        func: worker callable taking ``(index, num_user, conn)``.
        num_user: number of connections to accept on the listening socket ``s``.
    """
    global clientsoclist
    global start_time

    thrs = []
    accepted = []
    print("timer start!")
    start_time = time.time()    # store start time
    try:
        for i in range(num_user):
            conn, addr = s.accept()
            accepted.append(conn)
            print('Conntected with', addr)
            # append client socket on list
            clientsoclist[i] = conn
            args = (i, num_user, conn)
            thread = Thread(target=func, args=args)
            thrs.append(thread)
            thread.start()
        for thread in thrs:
            thread.join()
    finally:
        # Clients reconnect on a fresh socket every round, so release this
        # round's sockets; otherwise num_user descriptors leak per round.
        for conn in accepted:
            conn.close()
    end_time = time.time()  # store end time
    print("TrainingTime: {} sec".format(end_time - start_time))

In [15]:
# Resolve this machine's hostname to the address the server socket binds to.
host_name = socket.gethostbyname(socket.gethostname())
# host_name = '13.55.165.227'
port_number = 12345  # port the server socket binds to
print(host_name)

172.31.2.147


In [16]:
# Prints host_name again; same value as the cell above when run in order.
print(host_name)

172.31.2.147


### Open the server socket

In [17]:
# Listening socket that run_thread() accepts client connections on.
s = socket.socket()
# Allow re-binding the port right after a restart; without this, bind() can
# fail with "Address already in use" while old connections linger.
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind((host_name, port_number))
# Size the backlog by the number of clients expected to connect per round.
s.listen(num_users)

## Communication overhead

In [None]:
# Server-side driver: for each learning rate, run `rounds` communication rounds
# and, after each round, evaluate the aggregated global model on the full
# training and test sets. Metric histories are collected in result_per_lr.
# NOTE(review): `loss` and `metrics` are not used by this cell.
loss='categorical_crossentropy'
metrics = ['accuracy']
cce = tf.keras.losses.CategoricalCrossentropy()

result_per_lr = {}
# NOTE(review): this value is overwritten inside the round loop before it is read.
start = time.time()
# NOTE(review): `lr` is only printed and used as a result key here; it is not
# part of the message sent to clients, and global_weights is not reset between
# learning rates -- confirm this is intended before adding more values to lrs.
for lr in lrs:
    # Per-round metric histories for this learning rate.
    train_losses = []
    train_accs = []
    test_losses = []
    test_accs = []

    print('\nlearning rate: {}'.format(lr))

    for r in range(rounds):
        train_loss = 0
        train_acc = 0
        test_loss = 0
        test_acc = 0

        # Blocks until num_users clients have connected and every worker thread
        # has finished; the workers update global_weights with the new average.
        run_thread(receive, num_users)

        # evaluate current round
        start = time.time()
        print('\nlearning rate: {}'.format(lr))
        global_model.set_weights(global_weights)

        # test global model on full training set
        # (train_batched holds the whole split as one batch, so this body runs once)
        for (X,y) in train_batched:
            preds = global_model.predict(X)
            train_loss = cce(y, preds)
            # Predictions are passed first and labels second; accuracy is the
            # fraction of matching positions, so the order does not change it.
            train_acc = accuracy_score(tf.argmax(preds, axis=1), tf.argmax(y, axis=1))
            train_losses.append(train_loss.numpy())
            train_accs.append(train_acc)

        # test global model on testing set
        for(X, y) in test_batched:
            preds = global_model.predict(X)
            test_loss = cce(y, preds)
            test_acc = accuracy_score(tf.argmax(preds, axis=1), tf.argmax(y, axis=1))
            test_losses.append(test_loss.numpy())
            test_accs.append(test_acc)

        # Evaluation time only; the round's communication time is printed by run_thread.
        elapsed = (time.time() - start)

        print('comm_round: {}/{} | test_acc: {:.3%} | test_loss: {:.3} | train_acc: {:.3%} | train_loss: {:.3} | elapsed: {}'.format(r+1, rounds, test_acc, test_loss, train_acc, train_loss, timedelta(seconds=elapsed)))
        print('\n')

    # Store the histories for this learning rate.
    result_per_lr[lr] = {
        'train_accs' : train_accs,
        'test_accs' : test_accs,
        'train_losses' : train_losses,
        'test_losses' : test_losses
                          }
    


learning rate: 0.1
timer start!
Conntected with ('172.31.2.147', 46508)
Conntected with ('172.31.2.147', 46524)
Conntected with ('172.31.2.147', 46526)
Conntected with ('172.31.2.147', 46538)
Conntected with ('172.31.27.186', 37170)
Conntected with ('172.31.27.186', 37172)
Conntected with ('172.31.27.186', 37174)
Conntected with ('172.31.27.171', 41814)
Conntected with ('172.31.27.171', 36766)
Conntected with ('172.31.27.171', 36778)
TrainingTime: 529.6763823032379 sec

learning rate: 0.1
comm_round: 1/100 | test_acc: 49.230% | test_loss: 1.79 | train_acc: 48.318% | train_loss: 1.8 | elapsed: 0:00:26.623260


timer start!
Conntected with ('172.31.27.186', 55126)
Conntected with ('172.31.27.186', 55128)
Conntected with ('172.31.27.186', 55130)
Conntected with ('172.31.27.171', 51848)
Conntected with ('172.31.27.171', 60362)
Conntected with ('172.31.27.171', 48646)
Conntected with ('172.31.2.147', 32884)
Conntected with ('172.31.2.147', 32888)
Conntected with ('172.31.2.147', 51064)
Con

TrainingTime: 334.167857170105 sec

learning rate: 0.1
comm_round: 12/100 | test_acc: 97.190% | test_loss: 0.083 | train_acc: 97.225% | train_loss: 0.0858 | elapsed: 0:00:25.939102


timer start!
Conntected with ('172.31.27.171', 50308)
Conntected with ('172.31.27.171', 35736)
Conntected with ('172.31.27.171', 48262)
Conntected with ('172.31.2.147', 47580)
Conntected with ('172.31.2.147', 42160)
Conntected with ('172.31.2.147', 42170)
Conntected with ('172.31.2.147', 42186)
Conntected with ('172.31.27.186', 53852)
Conntected with ('172.31.27.186', 36782)
Conntected with ('172.31.27.186', 46146)
TrainingTime: 308.61340618133545 sec

learning rate: 0.1
comm_round: 13/100 | test_acc: 97.790% | test_loss: 0.0729 | train_acc: 97.738% | train_loss: 0.0724 | elapsed: 0:00:25.904476


timer start!
Conntected with ('172.31.27.171', 58854)
Conntected with ('172.31.27.171', 58866)
Conntected with ('172.31.2.147', 57000)
Conntected with ('172.31.27.171', 54420)
Conntected with ('172.31.2.147', 504

Conntected with ('172.31.27.171', 49948)
Conntected with ('172.31.27.171', 46022)
Conntected with ('172.31.2.147', 45200)
Conntected with ('172.31.2.147', 45204)
Conntected with ('172.31.2.147', 41250)
Conntected with ('172.31.2.147', 44948)
Conntected with ('172.31.27.186', 54458)
Conntected with ('172.31.27.186', 54460)
Conntected with ('172.31.27.186', 57976)
TrainingTime: 303.6315155029297 sec

learning rate: 0.1
comm_round: 24/100 | test_acc: 98.150% | test_loss: 0.0567 | train_acc: 98.287% | train_loss: 0.0512 | elapsed: 0:00:25.891184


timer start!
Conntected with ('172.31.27.171', 49324)
Conntected with ('172.31.27.186', 55828)
Conntected with ('172.31.27.171', 39896)
Conntected with ('172.31.27.171', 47448)
Conntected with ('172.31.2.147', 36750)
Conntected with ('172.31.2.147', 58058)
Conntected with ('172.31.2.147', 58064)
Conntected with ('172.31.2.147', 41160)
Conntected with ('172.31.27.186', 35298)
Conntected with ('172.31.27.186', 35300)
TrainingTime: 311.3410725593567

TrainingTime: 307.53256011009216 sec

learning rate: 0.1
comm_round: 35/100 | test_acc: 98.430% | test_loss: 0.0489 | train_acc: 98.698% | train_loss: 0.0372 | elapsed: 0:00:25.969854


timer start!
Conntected with ('172.31.27.171', 51412)
Conntected with ('172.31.27.171', 35194)
Conntected with ('172.31.27.171', 54972)
Conntected with ('172.31.2.147', 43202)
Conntected with ('172.31.2.147', 43218)
Conntected with ('172.31.2.147', 56248)
Conntected with ('172.31.2.147', 56254)
Conntected with ('172.31.27.186', 53244)
Conntected with ('172.31.27.186', 39806)
Conntected with ('172.31.27.186', 39808)
TrainingTime: 312.8946578502655 sec

learning rate: 0.1
comm_round: 36/100 | test_acc: 98.090% | test_loss: 0.0592 | train_acc: 98.333% | train_loss: 0.0478 | elapsed: 0:00:26.210171


timer start!
Conntected with ('172.31.27.171', 56054)
Conntected with ('172.31.27.171', 56070)
Conntected with ('172.31.27.171', 54448)
Conntected with ('172.31.2.147', 59494)
Conntected with ('172.31.2.147', 5

Conntected with ('172.31.27.171', 60296)
Conntected with ('172.31.2.147', 55506)
Conntected with ('172.31.2.147', 38106)
Conntected with ('172.31.2.147', 38114)
Conntected with ('172.31.2.147', 38120)
Conntected with ('172.31.27.186', 47036)
Conntected with ('172.31.27.186', 34616)
Conntected with ('172.31.27.186', 57314)
TrainingTime: 267.51111602783203 sec

learning rate: 0.1
comm_round: 47/100 | test_acc: 98.360% | test_loss: 0.0477 | train_acc: 98.750% | train_loss: 0.0347 | elapsed: 0:00:26.000643


timer start!
Conntected with ('172.31.27.171', 48858)
Conntected with ('172.31.27.171', 60006)
Conntected with ('172.31.27.171', 52984)
Conntected with ('172.31.2.147', 52868)
Conntected with ('172.31.2.147', 56080)
Conntected with ('172.31.2.147', 38050)
Conntected with ('172.31.2.147', 56974)
Conntected with ('172.31.27.186', 55208)
Conntected with ('172.31.27.186', 55210)
Conntected with ('172.31.27.186', 47002)
TrainingTime: 325.1666946411133 sec

learning rate: 0.1
comm_round: 48/

In [None]:
# Persist the per-learning-rate metric histories as a pickle file.
# `output_dir` replaces the former `dir` variable, which shadowed the builtin
# dir(); an empty string writes into the current working directory.
output_dir = ''
# NOTE: `lr` is the loop variable left over from the training cell, i.e. the
# last learning rate that was run.
result_file = 'result_per_lr_{}_{}_{}_{}.pickle'.format(B,C,E, lr)
# os.path.join inserts the path separator when output_dir is non-empty.
with open(os.path.join(output_dir, result_file), 'wb') as handle:
    pickle.dump(result_per_lr, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# end_time = time.time()  # store end time
# print("TrainingTime: {} sec".format(end_time - start_time))

In [None]:
# NOTE(review): start_time is reassigned by run_thread() at the beginning of
# every round, so this reports the time since the most recent round started,
# not the duration of the whole run -- confirm which one is intended.
end_time = time.time()  # store end time
print("WorkingTime: {} sec".format(end_time - start_time))