# Import libs

In [1]:
import sys
import os
import shutil
utils_dir = '../../src/utils'
sys.path.insert(0, utils_dir) # add utils dir to path
import testbed_utils as tu
import time

# Step-by-step

## Debug

In [30]:
# Debug helper: presumably unmounts whichever image testbed_utils currently has mounted (confirm in tu).
tu.umount_image()

In [29]:
# Debug helper: mount the image whose path is configured under CONF['global']['path']['orig_img'].
tu.mount_image(tu.CONF['global']['path']['orig_img'])

In [7]:
# Debug helper: unmount first, then hand both the original and the base image paths to mount_new_image.
tu.umount_image()
# mount base image
tu.mount_new_image(tu.CONF['global']['path']['orig_img'], tu.CONF['global']['path']['base_img'])

In [81]:
# Build (and only print, not execute) the shell commands that attach the base
# image through qemu-nbd and mount its first partition on tu.LOOP_DIR.
img = tu.CONF['global']['path']['base_img']
cmd = (
    "sudo modprobe nbd max_part=8;"
    "    sudo qemu-nbd --connect=/dev/nbd0 {};"
    "    sudo mount -o loop /dev/nbd0p1 {};"
    "    \n"
).format(img, tu.LOOP_DIR)
print(cmd)

sudo modprobe nbd max_part=8;    sudo qemu-nbd --connect=/dev/nbd0 /home/yzy/Load-Balancer/data/img/lb-vpp-base.img;    sudo mount -o loop /dev/nbd0p1 /mnt/loop;    



In [84]:
# Build (and only print, not execute) the shell commands that fix ownership of
# the files under home/cisco, unmount tu.LOOP_DIR and detach the nbd device.
cmd = (
    "sudo chown 1000:1000 {0}/home/cisco/*;"
    "        sudo umount {0}/;"
    "        sudo qemu-nbd --disconnect /dev/nbd0;\n"
).format(tu.LOOP_DIR)
print(cmd)

sudo chown 1000:1000 /mnt/loop/home/cisco/*;        sudo umount /mnt/loop/;        sudo qemu-nbd --disconnect /dev/nbd0;



# Unit test

In [3]:
def setup_env(config, remote=True, local=True):
    '''
    @brief:
        bring the testbed up
        - remote: for every id in config['remote_servers'], launch
          rlb_setup_env.py --start over ssh on that physical machine
          (the Popen handle is not waited on)
        - local: initialise node info from the local config file, prepare the
          images and run all local nodes
    @param:
        - config: configuration dictionary (reads 'config_file_prefix',
          'remote_servers', 'local_server_id', 'lb_method', 'from_orig')
        - remote: whether to start the remote physical machines
        - local: whether to start the nodes of the local machine
    '''
    if remote:
        remote_ids = config['remote_servers']

        # config file name (without the .json extension) for each remote machine
        remote_conf_names = {}
        for sid in remote_ids:
            remote_conf_names[sid] = config['config_file_prefix'] + '-{}'.format(sid)

        server_ips = tu.COMMON_CONF['net']['physical_server_ip']
        start_tpl = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py --start -m {} -f {}.json"'

        for sid in remote_ids:
            # the remote side is always started with the 'ecmp' method
            start_cmd = start_tpl.format(server_ips[sid], 'ecmp', remote_conf_names[sid])
            tu.subprocess.Popen(start_cmd, shell=True)

    if local:
        local_conf = ''.join([config['config_file_prefix'], '-', str(config['local_server_id']), '.json'])
        tu.init_nodes_info(local_conf, config['lb_method'])
        tu.prepare_img(lb_method=config['lb_method'], from_orig=config['from_orig'], debug_node=False)
        tu.runall()

def prepare_episode(config, ep, prepare_fn=None, args_dict=None, reboot_agents=False):
    '''
    @brief:
        - get episode number
        - prepare task name and directory
        - copy sample
        - check network connection (management)
        - prepare extra files if necessary (rlb)
    @param:
        - config: configuration dictionary
        - ep: episode number
        - sample: directory of the sample, by default append after CONF['global']['path']['trace']
        - prepare_fn: a callback function that prepare specific files for certain nodes
        - reboot_agetns: whether we should reboot load balancers
    '''
    # initialise task info
    task_name, task_dir, nodes = tu.init_task_info(
        experiment=config['experiment'],
        lb_method=config['lb_method'],
        trace=config['trace'],
        sample=config['sample'],
        filename=config['config_file_prefix']+'-'+str(config['local_server_id'])+'.json',
        colocate=config['colocate'],
        colocate_freq=float(config['colocate_freq']),
        alias=config['alias'])
    
    print(">> run task {} -- episode {}".format(task_name, ep))

    # copy sample
    client_server_id = 0
    config_file_remote = config['config_file_prefix']+'-{}'.format(client_server_id)
    cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m {} -f {}.json --tr {} --sample {} -n {}"'.format(
        tu.COMMON_CONF['net']['physical_server_ip'][client_server_id], 'ecmp', config_file_remote, config['trace'], config['sample'], config['clip_n'])
    tu.subprocess_cmd(cmd, debug=True)

    if reboot_agents:
        reboot_lb(config)
    
    # check shared memory file exists
    while True:
        shm_ok = True
        lb2reboot = []
        for lb in tu.NODES['lb']:
            files = !ssh -t -t -i ~/.ssh/lb_rsa cisco@localhost -p {lb.ssh_port} ls /dev/shm
            if 'shm_vip_1' not in files[0]:
                print("shm for {} doesn't exist ({})".format(lb.id, files))
                shm_ok = False
                lb2reboot.append(lb)
        if shm_ok:
            print("[shm check ok]")
            break
        else:
            reboot_lb(config, nodes=lb2reboot)
            
    # check network connection
    net_ok = False
    while not net_ok:
        try:
            tu.gt_socket_check()
            net_ok = True
        except:
            print('network error')
            time.sleep(1)
    print('[net check ok]')
            
    if prepare_fn:
        prepare_fn(config, task_dir, args_dict)
        
def prepare_rlb_qmix(config, task_dir, extra_args=None):
    '''
    @brief:
        prepare RLB QMIX files
        - copy the folder '<task_dir>/0_rl_ep<ep>' to ~/rl on every LB (skipped
          when the folder does not exist locally, e.g. before any model exists)
        - write '<lb id>/<#lb>/<#as>' to the 'topo' file of every LB
    @param:
        - config: configuration dictionary (not used here, kept so that all
          prepare callbacks share the same signature)
        - task_dir: default directory in which the rl folder is looked up
        - extra_args: dictionary with a mandatory 'ep' entry and an optional
          'task_dir' entry that overrides the task_dir argument
    @raise:
        - KeyError: if extra_args is missing or has no 'ep' entry
    '''
    print("[Prepare QMIX files]")
    extra_args = extra_args or {}
    if 'ep' not in extra_args:
        raise KeyError("extra_args must contain 'ep' to locate the rl folder")
    task_dir = extra_args.get('task_dir', task_dir)

    local_folder = '{}/0_rl_ep{}'.format(task_dir, extra_args['ep'])
    folder_found = os.path.exists(local_folder)
    if not folder_found:
        print("no corresponding folder found: {}".format(local_folder))

    n_lb, n_as = len(tu.NODES['lb']), len(tu.NODES['as'])
    for lb in tu.NODES['lb']:
        if folder_found:
            # only copy when there is something to copy; scp on a missing
            # source path can only fail
            cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1} cisco@{2}:~/rl;'.format(
                lb.ssh_port, local_folder, lb.physical_server_ip)
            tu.subprocess_cmd(cmd)
        # the topology info is needed regardless of whether a model was copied
        lb.update_file_ssh('{}/{}/{}'.format(lb.id, n_lb, n_as), 'topo')


def run_ep(config, client_server_id=0):
    '''
    @brief:
        run episode
        - call run_init_bg() on every LB, last LB first
        - ask the client machine to run rlb_setup_env.py --run and report how
          long that command took
        - mark the episode as done by touching /home/cisco/done on every LB
    @param:
        - config: configuration dictionary (reads 'config_file_prefix' and 'lb_method')
        - client_server_id: id of the physical machine that generates the traffic
    '''
    print("[Run episode and generate traffic]")

    remote_conf = config['config_file_prefix'] + '-{}'.format(client_server_id)

    for agent in reversed(tu.NODES['lb']):
        agent.run_init_bg()

    started_at = time.time()
    client_ip = tu.COMMON_CONF['net']['physical_server_ip'][client_server_id]
    replay_cmd = 'ssh -t yzy@{} "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m {} -f {}.json --run"'.format(
        client_ip, config['lb_method'], remote_conf)
    tu.subprocess_cmd(replay_cmd, debug=True)
    elapsed = time.time() - started_at
    print("Trace replay over w/ total time: {:.3f}s".format(elapsed))

    # mark episode done
    for agent in tu.NODES['lb']:
        agent.execute_cmd_ssh("touch /home/cisco/done")


def finalize_ep(config, ep, check_train=None, client_server_id=0):
    '''
    @brief:
        - fetch result from the client
        - make sure the RLB is done training
        - fetch result from the LB agents
    '''
    print("[Fetch results]")

    task_kwargs = {
        'experiment': config['experiment'],
        'lb_method': config['lb_method'],
        'trace': config['trace'],
        'sample': config['sample'],
        'colocate_freq': float(config['colocate_freq']),
    }


    config_file_remote = config['config_file_prefix']+'-{}'.format(client_server_id)
    # fetch first result from the client
    cmd = 'ssh -t yzy@{} "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m {} -f {}.json --client --tr {} --sample {} --experiment {} --colocate-freq {} --episode {}'.format(
        tu.COMMON_CONF['net']['physical_server_ip'][client_server_id], config['lb_method'], config_file_remote, config['trace'], config['sample'], config['experiment'], config['colocate_freq'], ep)
    if config['alias']:
        task_kwargs.update({'alias': config['alias']})
        cmd += ' --alias {}'.format(config['alias'])
    if config['colocate']:
        task_kwargs.update({'colocate': config['colocate']})
        cmd += ' --colocate {}'.format(config['colocate'])
    cmd +='"'
    tu.subprocess_cmd(cmd, debug=True)
    
    if check_train:
        # only need to check the master LB agent, whose training proces
        while True:
            res = !ssh -t -i ~/.ssh/lb_rsa cisco@localhost -p {tu.NODES['lb'][0].ssh_port} ps aux | grep "python3.6"
            terminate_process = True
            for line in res:
                # check_train contains the keyword to check in the process running for the given method
                # e.g. for RLB-QMIX, check_train = sac_qmix.py
                if check_train in line: 
                    terminate_process = False
                    break
            if terminate_process: break
    
    # fetch LB agent results
        
    task_name, task_dir = tu.get_task_name_dir(**task_kwargs)
    
    for lb in tu.NODES['lb'][::-1]:
        lb.fetch_result(task_dir, ep)
        if 'rlb' in config['lb_method']:
            rl_model_root = '/'.join(task_dir.split('/')[:-1])
            rl_model_path = os.path.join(rl_model_root, '{}_rl_ep-1'.format(lb.id))
            if os.path.exists(rl_model_path):
                shutil.rmtree(rl_model_path)
                print("remove previous rl model path: {}".format(rl_model_path))
            lb.fetch_result(rl_model_root, -1, filename='rl')
            
    return task_name, task_dir
            
def reboot_lb(config, nodes=None):
    '''
    @brief:
        reboot load balancer nodes: shut them down, prepare the images again,
        run the nodes and bring the host bridges up for the LBs
    @param:
        - config: configuration dictionary (reads 'lb_method')
        - nodes: LB nodes to reboot; all nodes in tu.NODES['lb'] when None
    '''
    # resolve the default first so that the log shows what is actually rebooted
    if nodes is None:
        nodes = tu.NODES['lb']
    # log node ids when available instead of raw object representations
    print("[Reboot LB nodes {}]".format([getattr(lb, 'id', lb) for lb in nodes]))
    for lb in nodes:
        lb.shutdown()
    tu.prepare_img(
        lb_method=config['lb_method'],
        from_orig=None,
        debug_node=False
    )
    for lb in nodes:
        lb.run()
    tu.host_br_up(only_lb=True)
            
def cleanup_env(config, remote=True, local=True):
    '''
    @brief:
        tear the testbed down
        - remote: for every id in config['remote_servers'], launch
          rlb_setup_env.py --shutdown over ssh (the Popen handle is not waited on)
        - local: shut down all local nodes
    @param:
        - config: configuration dictionary (reads 'config_file_prefix' and 'remote_servers')
        - remote: whether to shut down the remote physical machines
        - local: whether to shut down the nodes of the local machine
    '''
    print("[Cleanup environment]")
    if remote:
        remote_ids = config['remote_servers']

        # config file name (without the .json extension) for each remote machine
        remote_conf_names = {}
        for sid in remote_ids:
            remote_conf_names[sid] = config['config_file_prefix'] + '-{}'.format(sid)

        server_ips = tu.COMMON_CONF['net']['physical_server_ip']
        stop_tpl = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py --shutdown -m {} -f {}.json"'

        for sid in remote_ids:
            stop_cmd = stop_tpl.format(server_ips[sid], 'ecmp', remote_conf_names[sid])
            tu.subprocess.Popen(stop_cmd, shell=True)

    if local:
        # for local machine
        tu.shutall()

In [14]:
# Configuration of a single episode, consumed by the unit-test cells below.
config = dict(
    lb_method='rlb-qmix',
    trace='wiki_600',
    sample='hour0.csv',
    experiment='nips-lb',
    from_orig=False,
    colocate=None,
    colocate_freq=0.0001,
    config_file_prefix='conf01',
    twist=False,
    clip_n=30000,
    local_server_id=1,
    remote_servers=[0],
    alias='conf01',
)

In [27]:
# Start both the remote and the local part of the testbed (remote/local default to True).
setup_env(config)

Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost


In [28]:
ep = 0
args_dict = {'ep': ep}

# Only RLB-QMIX variants need extra files. Always (re)bind prepare_fn so that
# other methods neither hit a NameError nor reuse a stale callback left over
# from a previous cell; the substring test matches the one used in pipeline().
prepare_fn = prepare_rlb_qmix if 'rlb-qmix' in config['lb_method'] else None

prepare_episode(config, ep, prepare_fn=prepare_fn, args_dict=args_dict, reboot_agents=False)

>> run task wiki_600-rlb-qmix-hour0-conf01 -- episode 0
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour0.csv -n 30000"
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Prepare QMIX files]
no corresponding folder found: /home/yzy/Load-Balancer/data/results/nips-lb/wiki_600/rlb-qmix/hour0-conf01/0_rl_ep0


In [23]:
# Manual variant of the first half of run_ep: gather usage and launch init.sh
# over ssh on every LB (last LB first) without waiting for the processes.
config_file_remote = config['config_file_prefix'] + '-{}'.format(0)

for lb in reversed(tu.NODES['lb']):
    lb.gather_usage()
    cmd = 'ssh -t -p {} cisco@localhost "bash init.sh"'.format(lb.ssh_port)
    tu.subprocess.Popen(cmd, shell=True, stdout=tu.subprocess.PIPE)

In [12]:
# Manual variant of the second half of run_ep: replay the trace from the client
# machine (server id 0), report the elapsed time, then flag the episode as done.
t0 = time.time()
cmd = 'ssh -t yzy@{} "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m {} -f {}.json --run"'.format(
    tu.COMMON_CONF['net']['physical_server_ip'][0],
    config['lb_method'],
    config_file_remote,
)
print("run cmd: {}".format(cmd))
tu.subprocess_cmd(cmd)
print("Trace replay over w/ total time: {:.3f}s".format(time.time() - t0))

# mark episode done
for lb in tu.NODES['lb']:
    lb.execute_cmd_ssh("touch /home/cisco/done")

run cmd: ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --run"
Trace replay over w/ total time: 76.237s


In [29]:
# Run one episode with the default client machine (client_server_id=0).
run_ep(config)

[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m rlb-qmix -f conf01-0.json --run"
Trace replay over w/ total time: 42.752s


In [27]:
# Only RLB methods have a training process to wait for; its keyword is the
# first entry of the method's 'files' list. Other methods pass check_train=None.
finalize_ep(
    config,
    ep,
    check_train=(
        tu.LB_METHODS[config['lb_method']]['files'][0]
        if 'rlb' in config['lb_method']
        else None
    ),
)

[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m rlb-qmix -f conf01-0.json --client --tr wiki_600 --sample hour0.csv --experiment nips-lb --colocate-freq 0.0001 --episode 0"
alias=None


In [22]:
# Shut down both the remote and the local part of the testbed (remote/local default to True).
cleanup_env(config)

[Cleanup environment]


# Pipeline

In [5]:
def pipeline(eps, configs, model_dir=None, model_ep=None, train=True, warmup=1, setup=True, cleanup=True):
    '''
    @brief:
        run a sequence of episodes end to end
        - optionally set up the environment (using the first configuration)
        - optionally run `warmup` warm-up episodes with the first configuration
          (their results are not fetched)
        - for each (episode, configuration) pair: prepare, run and finalize
        - optionally clean up the environment
    @param:
        - eps: non-empty sequence of episode numbers
        - configs: non-empty sequence of configuration dictionaries, paired with eps
        - model_dir: if given, forwarded to the prepare callback as 'task_dir'
        - model_ep: if given, forwarded to the prepare callback as 'ep'
          (defaults to the first episode number)
        - train: currently unused, kept for interface compatibility
        - warmup: number of warm-up episodes
        - setup: whether to call setup_env first
        - cleanup: whether to call cleanup_env at the end
    @return:
        (task_name, task_dir, ep) of the last episode that was run
    '''
    # local variables
    prepare_fn, args_dict = None, {'ep': eps[0]}
    reboot_agents = [True]*len(eps)
    config = configs[0]

    if setup:
        setup_env(config)

    if 'rlb-qmix' in config['lb_method']:
        prepare_fn = prepare_rlb_qmix

    # compare against None so that episode 0 / an empty string are not ignored
    if model_ep is not None:
        args_dict['ep'] = model_ep
    if model_dir is not None:
        args_dict['task_dir'] = model_dir

    if warmup > 0:
        # warmup several episodes
        print("=== WARM UP for {} eps ===".format(warmup))
        for warm_ep in range(warmup):
            # the very first episode runs on freshly prepared agents, so only
            # the following warm-up episodes need a reboot
            prepare_episode(config, warm_ep, prepare_fn=prepare_fn, args_dict=args_dict,
                            reboot_agents=warm_ep > 0)
            run_ep(config)
            time.sleep(3)
    else:
        # if we don't warm up the system, there's no need to reboot LB agents
        reboot_agents[0] = False

    for i, (ep, config) in enumerate(zip(eps, configs)):
        print("=== Run episode {} ===".format(ep))

        prepare_episode(config, ep, prepare_fn=prepare_fn, args_dict=args_dict, reboot_agents=reboot_agents[i])

        run_ep(config)

        # only RLB methods have a training process to wait for before fetching
        check_train = None
        if 'rlb' in config['lb_method']:
            check_train = tu.LB_METHODS[config['lb_method']]['files'][0]
        task_name, task_dir = finalize_ep(config, ep, check_train=check_train)

    if cleanup:
        cleanup_env(config)
    return task_name, task_dir, ep

In [6]:
def get_eps_configs_wiki(method, hours, clip_ns, config_base, ep_start=0):
    '''
    @brief:
        derive one configuration per (clip_n, hour) pair from config_base
    @param:
        - method: value stored under 'lb_method' in every configuration
        - hours: hours of the trace, each mapped to the sample 'hour<h>.csv'
        - clip_ns: values stored under 'clip_n' (outer loop)
        - config_base: template configuration; shallow-copied, never modified
        - ep_start: number of the first episode
    @return:
        (eps, configs) where eps is the range of consecutive episode numbers
        starting at ep_start, one per configuration
    '''
    def _derive(clip_n, hour):
        cfg = config_base.copy()
        cfg.update(lb_method=method, clip_n=clip_n, sample='hour{}.csv'.format(hour))
        return cfg

    configs = [_derive(clip_n, hour) for clip_n in clip_ns for hour in hours]
    return range(ep_start, ep_start + len(configs)), configs

### Choose samples for the 2-LB setup

In [16]:
# Template configuration; per-episode configurations are derived from copies of it.
config_base = dict(
    lb_method='rlb-qmix',
    trace='wiki_600',
    sample='hour0.csv',
    experiment='nips-22',
    from_orig=False,
    colocate=None,
    colocate_freq=0.0001,
    config_file_prefix='conf01',
    twist=False,
    clip_n=30000,
    local_server_id=1,
    remote_servers=[0],
    alias='conf01',
)
# hours with more than 500 queries/s
hours = list(range(0, 7)) + list(range(18, 24))
# odd hours are used for training, even hours for testing
hours2train = [h for h in hours if h % 2]
hours2test = [h for h in hours if not h % 2]
clip_ns = range(30000, 50000, 1000)
ep_start = 0

In [None]:
# Train every method in turn on the training hours and report the overall time.
t0 = time.time()
methods = [
    'rlb-qmix-var-flow-duration',
    'rlb-qmix-var-log-flow-duration',
    'rlb-qmix-var-exp-flow-duration',
    'rlb-qmix-max-flow-duration',
    'rlb-qmix-max-log-flow-duration',
]
ep_start = 0
total_eps = 0  # episodes run over all methods, so the summary matches the measured time
for method in methods:
    # generate a list of configurations
    eps, configs_train = get_eps_configs_wiki(method, hours2train, clip_ns, config_base)
    # ep_start allows resuming: only the episodes from ep_start on are run
    n_run = len(eps[ep_start:])
    print("total {} episodes 2 run".format(n_run))
    last_task_name, last_task_dir, last_ep = pipeline(
      eps[ep_start:],
      configs_train[ep_start:],
      model_dir='/home/yzy/Load-Balancer/data/results/nips-22/wiki_600/{}'.format(method),
      model_ep=-1,
      train=True,
      setup=True
    )
    total_eps += n_run
print('>> total time for {} episodes is: {:.3f}s'.format(total_eps, time.time()-t0))

total 120 episodes 2 run
Create LB node image...
Create LB node image...


In [116]:
# run tests
methods = ['wcmp', 'lsq', 'sed', 'rlb-qmix']
ep_start = 0
for method in methods:
    # generate a list of configurations
    configs_train = []
    for clip_n in [100000]*5:
        for hour in hours2test:
            configs_train.append(config_base.copy())
            configs_train[-1]['lb_method'] = method
            configs_train[-1]['clip_n'] = clip_n
            configs_train[-1]['sample'] = 'hour{}.csv'.format(hour)
    eps = range(ep_start, ep_start + len(configs_train))
    print("total {} episodes 2 run".format(len(eps)))
    last_task_name, last_task_dir, last_ep = pipeline(eps, configs_train, continue_from_dir='/home/yzy/Load-Balancer/data/results/nips-22/wiki_600/rlb-qmix/hour3-conf01', continue_from_ep=115, train=False)

total 45 episodes 2 run
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost
alias=conf01
>> run task wiki_600-wcmp-hour0-conf01 -- episode 0
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour0.csv -n 100000"
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --run"
Trace replay o

alias=conf01
>> run task wiki_600-wcmp-hour22-conf01 -- episode 8
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour22.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --run"
Trace replay over w/ total time: 147.166s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --client --tr wiki_600 --sample hour22.csv --experiment nips-22 --colocate-freq 0.0001 --episode 8 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-w

alias=conf01
>> run task wiki_600-wcmp-hour20-conf01 -- episode 16
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour20.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --run"
Trace replay over w/ total time: 135.217s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --client --tr wiki_600 --sample hour20.csv --experiment nips-22 --colocate-freq 0.0001 --episode 16 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --run"
Trace replay over w/ total time: 141.580s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --client --tr wiki_600 --sample hour18.csv --experiment nips-22 --colocate-freq 0.0001 --episode 24 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-wcmp-hour20-conf01 -- episode 25
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour20.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t y

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour18.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --run"
Trace replay over w/ total time: 141.509s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --client --tr wiki_600 --sample hour18.csv --experiment nips-22 --colocate-freq 0.0001 --episode 33 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-wcmp-hour20-conf01 -- episode 34
@subprocess_cmd: execute ssh -t y

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour8.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --run"
Trace replay over w/ total time: 194.352s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m wcmp -f conf01-0.json --client --tr wiki_600 --sample hour8.csv --experiment nips-22 --colocate-freq 0.0001 --episode 40 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-wcmp-hour16-conf01 -- episode 41
@subprocess_cmd: execute ssh -t yzy

Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --run"
Trace replay over w/ total time: 148.239s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --client --tr wiki_600 --sample hour6.csv --experiment nips-22 --colocate-freq 0.0001 --episode 3 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-lsq-hour8-conf01 -- episode 4
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour8.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node im

[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --client --tr wiki_600 --sample hour4.csv --experiment nips-22 --colocate-freq 0.0001 --episode 11 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-lsq-hour6-conf01 -- episode 12
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour6.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --run"
Trace replay over w/ total time: 148.025s
[Fetch results]
@subprocess_cmd: execut

Create LB node image...
Create LB node image...
node_lb_1 ready: ssh -p 8901 cisco@localhost
shm for 1 doesn't exist (['ssh_exchange_identification: read: Connection reset by peer'])
[Reboot LB nodes [<testbed_utils.lbNode object at 0x7f46f727ea90>]]
Create LB node image...
Create LB node image...
node_lb_1 ready: ssh -p 8901 cisco@localhost
shm for 1 doesn't exist (['ssh_exchange_identification: read: Connection reset by peer'])
[Reboot LB nodes [<testbed_utils.lbNode object at 0x7f46f727ea90>]]
Create LB node image...
Create LB node image...
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --run"
Trace replay over w/ total time: 149.802s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py

alias=conf01
>> run task wiki_600-lsq-hour0-conf01 -- episode 27
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour0.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --run"
Trace replay over w/ total time: 130.314s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --client --tr wiki_600 --sample hour0.csv --experiment nips-22 --colocate-freq 0.0001 --episode 27 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-lsq-h

alias=conf01
>> run task wiki_600-lsq-hour22-conf01 -- episode 35
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour22.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --run"
Trace replay over w/ total time: 131.192s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --client --tr wiki_600 --sample hour22.csv --experiment nips-22 --colocate-freq 0.0001 --episode 35 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-ls

alias=conf01
>> run task wiki_600-lsq-hour20-conf01 -- episode 43
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour20.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --run"
Trace replay over w/ total time: 135.160s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m lsq -f conf01-0.json --client --tr wiki_600 --sample hour20.csv --experiment nips-22 --colocate-freq 0.0001 --episode 43 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-ls

[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --client --tr wiki_600 --sample hour16.csv --experiment nips-22 --colocate-freq 0.0001 --episode 5 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-sed-hour18-conf01 -- episode 6
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour18.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --run"
Trace replay over w/ total time: 141.427s
[Fetch results]
@subprocess_cmd: execu

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --client --tr wiki_600 --sample hour8.csv --experiment nips-22 --colocate-freq 0.0001 --episode 13 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-sed-hour16-conf01 -- episode 14
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour16.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --run"
Trace replay over w/ total time: 198.654s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@1

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --run"
Trace replay over w/ total time: 148.109s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --client --tr wiki_600 --sample hour6.csv --experiment nips-22 --colocate-freq 0.0001 --episode 21 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-sed-hour8-conf01 -- episode 22
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour8.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --run"
Trace replay over w/ total time: 148.639s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --client --tr wiki_600 --sample hour4.csv --experiment nips-22 --colocate-freq 0.0001 --episode 29 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-sed-hour6-conf01 -- episode 30
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour6.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.

@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --run"
Trace replay over w/ total time: 149.793s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m sed -f conf01-0.json --client --tr wiki_600 --sample hour2.csv --experiment nips-22 --colocate-freq 0.0001 --episode 37 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-sed-hour4-conf01 -- episode 38
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour4.csv -n 100000"
[Reboot LB nodes None]
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.

alias=conf01
>> run task wiki_600-rlb-qmix-hour0-conf01 -- episode 0
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m ecmp -f conf01-0.json --tr wiki_600 --sample hour0.csv -n 100000"
[shm check ok]
LB Node 0: pass
LB Node 1: pass
[net check ok]
[Prepare QMIX files]
no corresponding folder found: /home/yzy/Load-Balancer/data/results/nips-22/wiki_600/rlb-qmix/hour3-conf01/0_rl_ep115
[Run episode and generate traffic]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m rlb-qmix -f conf01-0.json --run"
Trace replay over w/ total time: 129.998s
[Fetch results]
@subprocess_cmd: execute ssh -t yzy@10.60.16.12 "python3.6 /home/yzy/Load-Balancer/src/utils/rlb_setup_env.py -m rlb-qmix -f conf01-0.json --client --tr wiki_600 --sample hour0.csv --experiment nips-22 --colocate-freq 0.0001 --episode 0 --alias conf01"
alias=conf01
alias=conf01
>> run task wiki_600-rlb-qmix-hour2

IndexError: list index out of range

In [None]:
# Evaluate (train=False) each trained flow-duration variant on the test configurations.
n_test = 5
test_clip_n = 100000
t0 = time.time()
methods = [
    'rlb-qmix-max-flow-duration',
    'rlb-qmix-var-flow-duration',
]
ep_start = 0
for method in methods:
    # generate a list of configurations
    eps, configs_test = get_eps_configs_wiki(method+'-test', hours2test[:3], [test_clip_n]*n_test, config_base)
    print("total {} episodes 2 run".format(len(eps)))
    # Run the configurations generated just above: the previous version passed
    # `configs_train`, a leftover from another cell, so `configs_test` was never used.
    last_task_name, last_task_dir, last_ep = pipeline(
        eps[ep_start:],
        configs_test[ep_start:],
        model_dir='/home/yzy/Load-Balancer/data/results/nips-22/wiki_600/{}'.format(method),
        model_ep=-1,  # NOTE(review): presumably "latest saved model" -- confirm against pipeline()
        train=False,
        setup=True
    )
print('>> total time for {} episodes is: {:.3f}s'.format(len(eps), time.time()-t0))

In [None]:
# Run tests for every trained reward variant, timing the whole sweep.
t0 = time.time()
methods = [
    'rlb-qmix-var-flow-duration',
    'rlb-qmix-var-log-flow-duration',
    'rlb-qmix-var-exp-flow-duration',
    'rlb-qmix-max-flow-duration',
    'rlb-qmix-max-log-flow-duration',
]
ep_start = 20
for method in methods:
    # Build one configuration per (repetition, hour) pair:
    # 5 repetitions at clip_n=50000 over the first 4 test hours.
    configs_train = []
    for clip_n in [50000]*5:
        for hour in hours2test[:4]:
            cfg = config_base.copy()
            cfg['lb_method'] = method
            cfg['clip_n'] = clip_n
            cfg['sample'] = 'hour{}.csv'.format(hour)
            configs_train.append(cfg)
    # episode ids continue from ep_start, one per configuration
    eps = range(ep_start, ep_start + len(configs_train))
    print("total {} episodes 2 run".format(len(eps)))
#     last_task_name, last_task_dir, last_ep = pipeline(eps, configs_train, continue_from_dir='/home/yzy/Load-Balancer/data/results/nips-22/wiki_600/rlb-qmix/hour23-conf01', continue_from_ep=79, train=False)
    model_dir = '/home/yzy/Load-Balancer/data/results/nips-22/wiki_600/{}'.format(method)
    last_task_name, last_task_dir, last_ep = pipeline(
        eps,
        configs_train,
        model_dir=model_dir,
        model_ep=-1,
        train=False)
elapsed = time.time()-t0
print('>> total time for {} episodes is: {:.3f}s'.format(len(eps), elapsed))

In [None]:
# training process
# NOTE(review): this cell is cut off after the inner loop header below -- the loop
# body is missing, so the cell cannot run as shown.
samples = ['hour{}.csv'.format(i) for i in hours2train]
# range(30000, 60000, 20) yields 1500 clip sizes -- NOTE(review): confirm the step of 20 is intended
clip_ns = range(30000, 60000, 20)
ep = 0
for clip_n in clip_ns:
    for method in methods:


In [67]:
# Default parameters for a single manual run.

# -- task identity
experiment = 'nips-22'
trace = 'wiki_600'
sample = 'hour0.csv'
lb_method = 'rlb-qmix'
config_file_prefix = 'conf01'

# -- image preparation / colocation options (both unset)
from_orig = None
colocate = None
colocate_freq = 0.0001
twist = False

# -- run size
clip_n = 30000
remote_servers = [0]
eps = range(4)

In [91]:
# Sweep definition: which LB methods, trace samples and config prefixes to run.
# Earlier method selections are kept below for reference.
# methods = ['aqualight', 'wcmp', 'active-wcmp', 'active-wcmp-50', 'active-wcmp-150', 'active-wcmp-100', 'active-wcmp-500', 'active-wcmp-1000', 'aqualight-50', 'aqualight-150', 'aqualight-100', 'aqualight-500', 'aqualight-1000']
# methods = ['wcmp', 'active-wcmp-50', 'active-wcmp-150', 'active-wcmp-100', 'active-wcmp-500']
# methods = ['active-wcmp-1000', 'aqualight-50', 'aqualight-150', 'aqualight-100', 'aqualight-500']
# methods = ['ecmp', 'active-wcmp', 'aqualight', 'wcmp', 'rlb-sac-new']
methods = ['lsq', 'sed', 'rlb-qmix']
# one 'hour<N>.csv' sample file per entry of hours2train
samples = list(map('hour{}.csv'.format, hours2train))
config_prefix_lists = ['conf01']
eps = range(1)
from_orig = False

## Run methods

In [40]:
# Run every (method, config prefix, sample) combination through run_pipeline.
for method in methods:
    for config_prefix in config_prefix_lists:
        for i, sample in enumerate(samples):
            run_kwargs = {
                'experiment': 'ijcai-lb',
                'trace': 'wiki_600',
                'lb_method': method,
                'sample': sample,
                'config_file_prefix': config_prefix,
                'from_orig': from_orig,
                'colocate': None,
                'colocate_freq': 0.0001,
                'twist': False,
                'clip_n': 30000,
                'remote_servers': [0],
                'eps': range(3),
            }
            run_pipeline(**run_kwargs)

NameError: name 'run_pipeline' is not defined

In [89]:
# Train every method under every config prefix; the whole sample list is
# handed to run_pipeline_train in a single call.
for method in methods:
    for config_prefix in config_prefix_lists:
        train_kwargs = {
            'experiment': 'ijcai-lb',
            'trace': 'wiki_600',
            'lb_method': method,
            'sample': sample,
            'samples': samples,
            'config_file_prefix': config_prefix,
            'from_orig': from_orig,
            'colocate': None,
            'colocate_freq': 0.0001,
            'twist': False,
            'clip_n': 30000,
            'remote_servers': [0],
            'eps': eps,
        }
        run_pipeline_train(**train_kwargs)

init_task_info: alias=conf01
alias=conf01
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-rlb-sac-gru-discrete-hour2-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 49.093s
training on lb 0 is finished
training on lb 1 is finished
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-rlb-sac-gru-discrete-hour4-conf0

TODO:

- check overall page load time result
- check rlb reward evolution
- check weight assignments
- if the result is good, test different capacity ratios -> scale up to larger and different traces (transfer-learning comparison -- with or without pretrained models)
- if result is bad, try simplest sac model and reduce server number to 2 (increase action space to 2)

## Extension

In [60]:
def _remove_suffix(text, suffix):
    '''
    @brief:
        return `text` without the trailing `suffix`; `text` is returned unchanged
        when it does not end with `suffix`
    '''
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def run_pipeline_test(
    lb_method = 'wcmp',
    trace='wiki_600',
    experiment='atc-lb',
    sample='hour0.csv',
    from_orig=True,
    config_file_prefix='1lb-conf01',
    colocate=None,
    colocate_freq=0.0001,
    twist=False,
    clip_n=20000,
    remote_servers = [0],
    eps = range(3),
    samples=None
    ):
    '''
    @brief:
        - start the remote side (run2server.py over ssh) and the local nodes (tu.prepare_img + tu.runall)
        - for every sample and every episode: wait for the network check, replay the trace
          (run_traffic.py over ssh), fetch the results, then re-create and restart the LB nodes
        - shut all local nodes down at the end (tu.shutall)
    @params:
        lb_method: load-balancing method name; if it contains both 'rlb' and 'test', the
            model folder '<lb.id>_rl_ep29' is copied to every LB node before each episode
        trace, experiment, colocate, colocate_freq: forwarded to tu.init_task_info and to the remote scripts
        sample: sample file to replay; ignored when `samples` is given
        from_orig: forwarded to tu.prepare_img for the initial image preparation
        config_file_prefix: config file prefix ('-twist' is appended when `twist` is set); also used as alias
        twist: append '-twist' to config_file_prefix
        clip_n: if truthy, '-n <clip_n>' is passed to run2server.py for remote server 0
        remote_servers: ids of the remote physical servers (keys into
            tu.COMMON_CONF['net']['physical_server_ip']); only read, never mutated
        eps: sequence of episode ids to run for each sample (must support eps[-1])
        samples: optional sequence of sample files; defaults to [sample]
    '''
    # NOTE(review): presumably the id of the local physical server -- it only selects the local config file
    local_server_id = 1
    if twist:
        config_file_prefix += '-twist'
    config_file = config_file_prefix+'-'+str(local_server_id)
    # config_file_remote = {i: config_file_prefix+'-{}'.format(i) for i in [0, 2, 3]}
    config_file_remote = {i: config_file_prefix+'-{}'.format(i) for i in remote_servers}

    ip_remote = tu.COMMON_CONF['net']['physical_server_ip']

    if samples is None:
        samples = [sample]
    else:
        sample = samples[0]

    task_name, task_dir, nodes = tu.init_task_info(
        experiment=experiment,
        lb_method=lb_method,
        trace=trace,
        sample=sample,
        filename=config_file+'.json',
        colocate=colocate,
        colocate_freq=colocate_freq,
        alias=config_file_prefix
    )

    # start the remote side (always with 'ecmp'); launched in the background, not waited for
    for server_id in remote_servers:
        clip_str = '-n {}'.format(clip_n) if clip_n and server_id == 0 else ''

        cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
            ip_remote[server_id], experiment, colocate_freq, 'ecmp', trace, sample, config_file_remote[server_id], clip_str)

        tu.subprocess.Popen(cmd, shell=True)

    tu.prepare_img(lb_method=lb_method, from_orig=from_orig, debug_node=False)

    tu.runall()
    time.sleep(5)

    for sample in samples:
        task_name, task_dir, nodes = tu.init_task_info(
            experiment=experiment,
            lb_method=lb_method,
            trace=trace,
            sample=sample,
            filename=config_file+'.json',
            colocate=colocate,
            colocate_freq=colocate_freq,
            alias=config_file_prefix
        )

        print(">> run task {}".format(task_name))

        # same remote command as above, for the current sample and with --skip
        for server_id in remote_servers:
            clip_str = '-n {}'.format(clip_n) if clip_n and server_id == 0 else ''

            cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment {} --skip --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
                ip_remote[server_id], experiment, colocate_freq, 'ecmp', trace, sample, config_file_remote[server_id], clip_str)
            tu.subprocess.Popen(cmd, shell=True)

        # run different episodes
        for ep in eps:
            print("== episode {} ==".format(ep))
            # retry the network check once per second until it stops raising
            net_ok = False
            while not net_ok:
                try:
                    tu.gt_socket_check()
                    net_ok = True
                except Exception:
                    # `except Exception` (not a bare except) keeps Ctrl-C able to break this loop
                    print('error')
                    time.sleep(1)

            if 'rlb' in lb_method and 'test' in lb_method:
                # Drop the '-test' suffix to get the folder of the trained method.
                # str.strip('-test') strips a *set of characters* from both ends, which
                # mangles e.g. 'rlb-sac-gru-discrete-test' into 'rlb-sac-gru-discr'.
                model_dir = '/home/yzy/Load-Balancer/data/results/atc-lb/wiki_600/{}/hour0-{}'.format(
                    _remove_suffix(lb_method, '-test'), config_file_prefix)
                for lb in tu.NODES['lb']:
                    cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep29 cisco@{4}:~/{2};'.format(
                        lb.ssh_port, model_dir, 'rl', lb.id, lb.physical_server_ip)
                    tu.subprocess_cmd(cmd)
            # start gathering at LB node
            for lb in tu.NODES['lb']:
                lb.run_init_bg()

            # run traffic
            # NOTE(review): the client host and remote config index 0 are hard-coded here
            t0 = time.time()
            cmd = 'ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run_traffic.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json"'.format(
                experiment, colocate_freq, lb_method, trace, sample, config_file_remote[0])
            if colocate:
                cmd += ' --colocate {}'.format(colocate)

            tu.subprocess_cmd(cmd)
            print("Trace replay over w/ total time: {:.3f}s".format(time.time()-t0))
            # time.sleep(5)

            # mark episode done
            for lb in tu.NODES['lb']:
                lb.execute_cmd_ssh("touch /home/cisco/done")

            # fetch results from nodes
            # Ask the remote side to shut down only after the very last episode of the
            # last sample (the previous check compared the whole `samples` list with its
            # last element, so '--shutdown' was never sent).
            is_last_run = ep == eps[-1] and sample == samples[-1]
            for server_id in remote_servers:
                cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/shutdown2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json --episode {}"'.format(
                    ip_remote[server_id], experiment, colocate_freq, lb_method, trace, sample, config_file_remote[server_id], ep)
                if is_last_run:
                    cmd += ' --shutdown'
                tu.subprocess.Popen(cmd, shell=True)

            for lb in tu.NODES['lb']:
                lb.fetch_result(task_dir, ep)
                if 'rlb' in lb_method:
                    lb.fetch_result(task_dir, ep, filename='rl')
                lb.shutdown()

            # re-create the LB node images and bring the LB nodes back up for the next episode
            tu.prepare_img(lb_method=lb_method, from_orig=None, debug_node=False)
            for lb in tu.NODES['lb']:
                lb.run()
            tu.host_br_up()
    tu.shutall()

In [61]:
# Test sweep definition.
# The first list is immediately superseded by the second assignment below.
methods = ['ecmp', 'active-wcmp', 'aqualight', 'wcmp', 'rlb-sac-new-test', 'rlb-sac-gru-discrete-test', 'sed']
methods = [
    'lsq',
    'ecmp',
    'active-wcmp',
    'aqualight',
    'wcmp',
    'rlb-sac-new-test',
]
# even hours 2..10 -> 'hour2.csv' ... 'hour10.csv'
samples = list(map('hour{}.csv'.format, range(2, 12, 2)))
config_prefix_list = ['conf01']
from_orig = False

In [62]:
# Run the test pipeline for every method and config prefix.
for method in methods:
    # 'lsq' starts at hour4; every other method starts at hour2 (even hours up to hour10)
    first_hour = 4 if method == 'lsq' else 2
    samples = ['hour{}.csv'.format(i) for i in range(first_hour, 12, 2)]
    for config_prefix in config_prefix_list:
        test_kwargs = {
            'experiment': 'atc-lb',
            'trace': 'wiki_600',
            'lb_method': method,
            'sample': sample,
            'samples': samples,
            'config_file_prefix': config_prefix,
            'from_orig': from_orig,
            'colocate': None,
            'colocate_freq': 0.0001,
            'twist': False,
            'clip_n': 30000,
            'remote_servers': [0],
            'eps': range(5),
        }
        run_pipeline_test(**test_kwargs)

init_task_info: alias=conf01
alias=conf01
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost
Trace replay over w/ total time: 47.580s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.616s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ to

== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 80.112s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 81.650s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 4 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 83.881s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-ecmp-hour8-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 66.312s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 

node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-active-wcmp-hour10-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 85.572s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 85.564s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 85.553s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 85.540s
Create LB node image...
Create LB node image...
nod

node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-wcmp-hour2-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 84.166s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 84.543s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 82.872s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@loc

LB Node 1: pass
Trace replay over w/ total time: 47.718s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.585s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.592s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.592s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 4 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.621s
Create LB node image...
Create LB n