# Import libs

In [1]:
import sys
import os
utils_dir = '../../src/utils'
sys.path.insert(0, utils_dir) # add utils dir to path
import testbed_utils as tu
import time

# Pipeline

In [289]:
def run_pipeline_train(
    lb_method = 'wcmp',
    trace='wiki_600',
    experiment='atc-lb',
    sample='hour0.csv',
    from_orig=True,
    config_file_prefix='1lb-conf01',
    colocate=None,
    colocate_freq=0.0001,
    twist=False,
    clip_n=20000,
    remote_servers = [0],
    eps = range(3),
    samples=None,
    continue_from=None
    ):
    """Run the training pipeline: replay every sample for every episode in `eps`.

    The function starts `run2server.py` on each remote server over ssh, boots
    the local testbed through `tu`, and then, for each (sample, episode) pair:
    waits until `tu.gt_socket_check()` passes, starts `run_init_bg()` on the LB
    nodes, replays the trace with `run_traffic.py`, touches `/home/cisco/done`
    on every LB node, calls `shutdown2server.py` on the remote servers,
    fetches the results into the task directory and restarts the LB nodes.
    For methods whose name contains 'rlb' it additionally waits for a
    `train_done` file from every LB node and copies the episode's `rl`
    directory back onto the restarted LB nodes.

    Args:
        lb_method: method name handed to `tu` and to the remote scripts (`-m`).
            'rlb' in the name enables the RL model handling; 'qmix' in the
            name makes every LB node use model id 0.
        trace: trace name (`--tr`).
        experiment: experiment name (`--experiment`).
        sample: sample file used when `samples` is None.
        from_orig: forwarded to the first `tu.prepare_img` call.
        config_file_prefix: prefix of the per-server config files
            (`<prefix>-<server id>.json`); also used as the task alias.
        colocate: forwarded to `tu.init_task_info`; when truthy it is also
            appended to the traffic command as `--colocate`.
        colocate_freq: value of `--colocate-freq`.
        twist: when true, '-twist' is appended to `config_file_prefix`.
        clip_n: when truthy, `-n <clip_n>` is passed to remote server 0.
        remote_servers: ids of the remote servers; used as keys into
            `tu.COMMON_CONF['net']['physical_server_ip']`.
        eps: episodes to run per sample (must support `len`).
        samples: list of sample files; overrides `sample` when given.
        continue_from: optional dict with keys 'ep' and 'task_dir'. The
            absolute episode counter resumes at `ep + 1` and, for 'rlb'
            methods, the model of episode `ep` is copied from `task_dir`.
    """
    import glob  # function-scope import: only the train_done probe needs it

    local_server_id = 1
    if twist:
        config_file_prefix += '-twist'
    config_file = config_file_prefix+'-'+str(local_server_id)
    config_file_remote = {i: config_file_prefix+'-{}'.format(i) for i in remote_servers}

    ip_remote = tu.COMMON_CONF['net']['physical_server_ip']

    if samples is None:
        samples = [sample]
    else:
        sample = samples[0]

    is_rlb = 'rlb' in lb_method

    def init_task(cur_sample):
        # (Re)initialise the task bookkeeping in `tu` for one sample.
        return tu.init_task_info(
            experiment=experiment,
            lb_method=lb_method,
            trace=trace,
            sample=cur_sample,
            filename=config_file+'.json',
            colocate=colocate,
            colocate_freq=colocate_freq,
            alias=config_file_prefix
        )

    def start_remote_servers(cur_sample, log):
        # Launch run2server.py on every remote server without waiting for it.
        for sid in remote_servers:
            clip_str = '-n {}'.format(clip_n) if clip_n and sid == 0 else ''
            cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
                ip_remote[sid], experiment, colocate_freq, 'ecmp', trace, cur_sample, config_file_remote[sid], clip_str)
            tu.subprocess.Popen(cmd, shell=True)
            if log:
                print("run cmd: {}".format(cmd))

    def push_rl_model(src_dir, episode):
        # Copy the RL model saved for `episode` from `src_dir` to every LB node.
        for lb in tu.NODES['lb']:
            lbid = 0 if 'qmix' in lb_method else lb.id
            cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep{5} cisco@{4}:~/{2};'.format(
                lb.ssh_port, src_dir, 'rl', lbid, lb.physical_server_ip, episode)
            tu.subprocess_cmd(cmd)
            print("run cmd: {}".format(cmd))

    def wait_for_network():
        # Retry until the socket check passes. `except Exception` (not a bare
        # `except`) so that Ctrl-C can still interrupt the retry loop.
        while True:
            try:
                tu.gt_socket_check()
                return
            except Exception:
                print('error')
                time.sleep(1)

    def wait_for_training(episode):
        # Poll every LB node until its train_done file can be fetched.
        for lb in tu.NODES['lb']:
            while True:
                lb.fetch_result('/home/yzy/dev', episode, filename='train_done', isfolder=False)
                # glob instead of `!ls`: works outside IPython and does not
                # depend on the wording of the shell's error message.
                markers = glob.glob('/home/yzy/dev/*train_done*')
                if markers:
                    print("training on lb {} is finished".format(lb.id))
                    for marker in markers:
                        os.remove(marker)
                    break
                time.sleep(5)

    init_task(sample)
    start_remote_servers(sample, log=False)

    tu.prepare_img(lb_method=lb_method, from_orig=from_orig, debug_node=False)

    tu.runall()
    time.sleep(5)

    ep_abs = 0

    if continue_from:
        ep_abs = continue_from['ep']
        # copy rl model
        if is_rlb:
            push_rl_model(continue_from['task_dir'], ep_abs)
        ep_abs += 1

    # Absolute index of the final episode of this run. Computed from the
    # starting value so that resuming via `continue_from` still sends
    # --shutdown on the real last episode rather than on an earlier one.
    last_ep_abs = ep_abs + len(eps) * len(samples) - 1

    for sample in samples:
        task_name, task_dir, _ = init_task(sample)

        print(">> run task {}".format(task_name))

        # A previous revision of this call passed --skip to run2server.py.
        start_remote_servers(sample, log=True)

        # run different episodes
        for ep in eps:
            print("== episode {} (abs: {}) ==".format(ep, ep_abs))
            wait_for_network()

            # start gathering at LB node
            for lb in tu.NODES['lb'][::-1]:
                lb.run_init_bg()

            # run traffic
            t0 = time.time()
            cmd = 'ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run_traffic.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json"'.format(
                experiment, colocate_freq, lb_method, trace, sample, config_file_remote[0])
            if colocate:
                cmd += ' --colocate {}'.format(colocate)
            print("run cmd: {}".format(cmd))
            tu.subprocess_cmd(cmd)
            print("Trace replay over w/ total time: {:.3f}s".format(time.time()-t0))

            # mark episode done
            for lb in tu.NODES['lb']:
                lb.execute_cmd_ssh("touch /home/cisco/done")

            # fetch results from nodes
            for sid in remote_servers:
                cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/shutdown2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json --episode {}"'.format(
                    ip_remote[sid], experiment, colocate_freq, lb_method, trace, sample, config_file_remote[sid], ep_abs)
                if ep_abs == last_ep_abs:
                    cmd += ' --shutdown'
                tu.subprocess.Popen(cmd, shell=True)
                print("run cmd: {}".format(cmd))

            time.sleep(5) # let model train a bit
            if is_rlb:
                # check if training is done by probing the train_done file
                wait_for_training(ep_abs)

            for lb in tu.NODES['lb'][::-1]:
                lb.fetch_result(task_dir, ep_abs)
                if is_rlb:
                    lb.fetch_result(task_dir, ep_abs, filename='rl')
                lb.shutdown()

            # rebuild the LB images and bring the LB nodes back up
            tu.prepare_img(lb_method=lb_method, from_orig=None, debug_node=False)
            for lb in tu.NODES['lb']:
                lb.run()
            tu.host_br_up()

            # copy rl model
            if is_rlb:
                push_rl_model(task_dir, ep_abs)

            ep_abs += 1
    tu.shutall()

In [259]:
def run_pipeline_train(
    lb_method = 'wcmp',
    trace='wiki_600',
    experiment='atc-lb',
    from_orig=True,
    config_file_prefix='1lb-conf01',
    colocate=None,
    colocate_freq=0.0001,
    twist=False,
    clip_n=20000,
    remote_servers = [0],
    samples=None,
    continue_from=None
    ):
    """Run the training pipeline with one sample per episode.

    Starts `run2server-samples.py` on each remote server over ssh, boots the
    local testbed through `tu`, and then treats each entry of `samples` as one
    episode: waits until `tu.gt_socket_check()` passes, starts `run_init_bg()`
    on the LB nodes, replays the trace with `run_traffic.py --use-sample`,
    touches `/home/cisco/done` on every LB node, calls `shutdown2server.py` on
    the remote servers, fetches the results and restarts the LB nodes. For
    methods whose name contains 'rlb' it also waits for a `train_done` file
    from every LB node and copies the episode's `rl` directory back onto the
    restarted LB nodes.

    NOTE: this definition shares its name with the multi-episode variant in
    this notebook but has no `sample`/`eps` parameters; whichever cell was
    executed last is the one that gets called.

    Args:
        lb_method: method name handed to `tu` and to the remote scripts (`-m`).
            'rlb' in the name enables the RL model handling; 'qmix' in the
            name makes every LB node use model id 0.
        trace: trace name (`--tr`).
        experiment: experiment name (`--experiment`).
        from_orig: forwarded to the first `tu.prepare_img` call.
        config_file_prefix: prefix of the per-server config files
            (`<prefix>-<server id>.json`); also used as the task alias.
        colocate: forwarded to `tu.init_task_info` only.
        colocate_freq: value of `--colocate-freq` for the remote server
            scripts (the traffic command does not receive it).
        twist: when true, '-twist' is appended to `config_file_prefix`.
        clip_n: when truthy, `-n <clip_n>` is passed to remote server 0.
        remote_servers: ids of the remote servers; used as keys into
            `tu.COMMON_CONF['net']['physical_server_ip']`.
        samples: list of sample files, one per episode. When empty or None
            the function prints a message and returns.
        continue_from: optional dict with keys 'ep' and 'task_dir'. Episodes
            resume at index `ep + 1` of `samples` and, for 'rlb' methods, the
            model of episode `ep` is copied from `task_dir`.
    """
    import glob  # function-scope import: only the train_done probe needs it

    local_server_id = 1
    if twist:
        config_file_prefix += '-twist'
    config_file = config_file_prefix+'-'+str(local_server_id)
    config_file_remote = {i: config_file_prefix+'-{}'.format(i) for i in remote_servers}

    ip_remote = tu.COMMON_CONF['net']['physical_server_ip']

    if not samples:
        print('no samples, return')
        return
    sample = samples[0]

    is_rlb = 'rlb' in lb_method

    def init_task(cur_sample):
        # (Re)initialise the task bookkeeping in `tu` for one sample.
        return tu.init_task_info(
            experiment=experiment,
            lb_method=lb_method,
            trace=trace,
            sample=cur_sample,
            filename=config_file+'.json',
            colocate=colocate,
            colocate_freq=colocate_freq,
            alias=config_file_prefix
        )

    def push_rl_model(src_dir, episode):
        # Copy the RL model saved for `episode` from `src_dir` to every LB node.
        for lb in tu.NODES['lb']:
            lbid = 0 if 'qmix' in lb_method else lb.id
            cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep{5} cisco@{4}:~/{2};'.format(
                lb.ssh_port, src_dir, 'rl', lbid, lb.physical_server_ip, episode)
            tu.subprocess_cmd(cmd)
            print("run cmd: {}".format(cmd))

    def wait_for_network():
        # Retry until the socket check passes. `except Exception` (not a bare
        # `except`) so that Ctrl-C can still interrupt the retry loop.
        while True:
            try:
                tu.gt_socket_check()
                return
            except Exception:
                print('error')
                time.sleep(1)

    def wait_for_training(episode):
        # Poll every LB node until its train_done file can be fetched.
        for lb in tu.NODES['lb']:
            while True:
                lb.fetch_result('/home/yzy/dev', episode, filename='train_done', isfolder=False)
                # glob instead of `!ls`: works outside IPython and does not
                # depend on the wording of the shell's error message.
                markers = glob.glob('/home/yzy/dev/*train_done*')
                if markers:
                    print("training on lb {} is finished".format(lb.id))
                    for marker in markers:
                        os.remove(marker)
                    break
                time.sleep(5)

    init_task(sample)

    # launch the remote servers once, for the first sample
    for sid in remote_servers:
        clip_str = '-n {}'.format(clip_n) if clip_n and sid == 0 else ''
        cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server-samples.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
            ip_remote[sid], experiment, colocate_freq, 'ecmp', trace, sample, config_file_remote[sid], clip_str)
        tu.subprocess.Popen(cmd, shell=True)
        print("run cmd: {}".format(cmd))

    tu.prepare_img(lb_method=lb_method, from_orig=from_orig, debug_node=False)

    tu.runall()
    time.sleep(5)

    ep_abs = 0

    if continue_from:
        ep_abs = continue_from['ep']
        # copy rl model
        if is_rlb:
            push_rl_model(continue_from['task_dir'], ep_abs)
        ep_abs += 1

    # 1 sample 1 episode: the episode index doubles as the index into samples
    for sample in samples[ep_abs:]:
        task_name, task_dir, _ = init_task(sample)

        print(">> run task {} -- episode {}".format(task_name, ep_abs))

        wait_for_network()

        # start gathering at LB node
        for lb in tu.NODES['lb'][::-1]:
            lb.run_init_bg()

        # run traffic
        t0 = time.time()
        cmd = 'ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run_traffic.py --experiment {} --use-sample -m {} --tr {} --sample {} -f {}.json"'.format(
            experiment, lb_method, trace, sample, config_file_remote[0])
        print("run cmd: {}".format(cmd))
        tu.subprocess_cmd(cmd)
        print("Trace replay over w/ total time: {:.3f}s".format(time.time()-t0))

        # mark episode done
        for lb in tu.NODES['lb']:
            lb.execute_cmd_ssh("touch /home/cisco/done")

        # fetch results from nodes; the last sample also shuts the servers down
        for sid in remote_servers:
            cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/shutdown2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json --episode {}"'.format(
                ip_remote[sid], experiment, colocate_freq, lb_method, trace, sample, config_file_remote[sid], ep_abs)
            if ep_abs == len(samples) -1:
                cmd += ' --shutdown'
            tu.subprocess.Popen(cmd, shell=True)
            print("run cmd: {}".format(cmd))

        time.sleep(5) # let model train a bit
        if is_rlb:
            # check if training is done by probing the train_done file
            wait_for_training(ep_abs)

        for lb in tu.NODES['lb'][::-1]:
            lb.fetch_result(task_dir, ep_abs)
            if is_rlb:
                lb.fetch_result(task_dir, ep_abs, filename='rl')
            lb.shutdown()

        # rebuild the LB images and bring the LB nodes back up
        tu.prepare_img(lb_method=lb_method, from_orig=None, debug_node=False)
        for lb in tu.NODES['lb']:
            lb.run()
        tu.host_br_up()

        # copy rl model
        if is_rlb:
            push_rl_model(task_dir, ep_abs)

        ep_abs += 1
    tu.shutall()

### Choose samples for 2lb

In [301]:
# hours with more than 500 queries/s
# Hours 0-8 and 16-23 (the hours with more than 500 queries/s).
hours = list(range(0, 9)) + list(range(16, 24))
# Odd hours are used for training, even hours for testing.
hours2train = [hour for hour in hours if hour % 2 == 1]
hours2test = [hour for hour in hours if hour % 2 == 0]

In [311]:
# Default parameters for the training run (grouped by topic).
lb_method, trace, experiment = 'rlb-qmix', 'wiki_600', 'ijcai-lb'
sample = 'hour0.csv'
config_file_prefix = 'conf01'
from_orig, twist = None, False
colocate, colocate_freq = None, 0.0001
clip_n = 30000
remote_servers = [0]
eps = range(4)

In [312]:
# Settings of the training sweep.
methods = ['rlb-qmix']
config_prefix_lists = ['conf01']
# the list of training hours, repeated three times back to back
samples = ['hour{}.csv'.format(hour) for hour in hours2train] * 3
eps = range(3)
clip_n = 30000
from_orig = False

In [313]:
# Repeat every sample three times in a row (a, a, a, b, b, b, ...).
# `sample` stays a module-level loop variable because later cells read it.
train_samples = []
for sample in samples:
    train_samples.extend([sample] * 3)

## Run methods

In [305]:
# Train every method with every config prefix.
for method in methods:
    for config_prefix in config_prefix_lists:
        run_pipeline_train(
            lb_method=method,
            trace='wiki_600',
            experiment='ijcai-lb',
            sample=sample,
            from_orig=from_orig,
            config_file_prefix=config_prefix,
            colocate=None,
            colocate_freq=0.0001,
            twist=False,
            clip_n=clip_n,
            remote_servers = [0],
            eps = eps,
            samples = samples,
            # The previous value `{'ep': 2, 'task_dir': }` was a syntax error
            # (no value for 'task_dir'). To resume a run, pass a dict such as
            # {'ep': <last finished episode>, 'task_dir': '<its results dir>'}.
            continue_from=None
        )

init_task_info: alias=conf01
alias=conf01
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-rlb-qmix-hour1-conf01
run cmd: ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment ijcai-lb --colocate-freq 0.0001 -m ecmp --tr wiki_600 --sample hour1.csv -f conf01-0.json -n 30000"
== episode 0 (abs: 0) ==
LB Node 0: pass
LB Node 1: pass
run cmd: ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run_traffic.py --experiment ijcai-lb --colocate-freq 0.000

KeyboardInterrupt: 

## Extension

In [281]:
# unit test
# unit test: manual walk-through of the pipeline set-up with fixed parameters
lb_method, trace, experiment = 'wcmp', 'wiki_600', 'atc-lb'
sample, samples = 'hour0.csv', None
from_orig, twist = False, False
config_file_prefix = 'conf01'
colocate, colocate_freq = None, 0.0001
clip_n = 20000
remote_servers = [0]
eps = range(3)
server_id = 1

# config file names: one for this host, one per remote server
config_file = '{}-{}'.format(config_file_prefix, server_id)
config_file_remote = {i: '{}-{}'.format(config_file_prefix, i) for i in remote_servers}

ip_remote = tu.COMMON_CONF['net']['physical_server_ip']

# use the single `sample` when no list of samples is given
if samples is not None:
    sample = samples[0]
else:
    samples = [sample]

task_name, task_dir, nodes = tu.init_task_info(
    experiment=experiment, lb_method=lb_method, trace=trace, sample=sample,
    filename=config_file+'.json',
    colocate=colocate, colocate_freq=colocate_freq,
    alias=config_file_prefix,
)

# launch run2server.py on every remote server; only server 0 gets `-n`
for server_id in remote_servers:
    clip_str = '-n {}'.format(clip_n) if (clip_n and server_id == 0) else ''
    cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
        ip_remote[server_id],
        experiment,
        colocate_freq,
        'ecmp',
        trace,
        sample,
        config_file_remote[server_id],
        clip_str,
    )
    tu.subprocess.Popen(cmd, shell=True)

# build the images, boot all nodes and give them a moment to come up
tu.prepare_img(lb_method=lb_method, from_orig=from_orig, debug_node=False)
tu.runall()
time.sleep(5)


init_task_info: alias=conf01
alias=conf01
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost


In [None]:
# Use the first entry of `samples` as the current sample.
sample = samples[0]

In [282]:
# (re)initialise the task info for the current sample
task_name, task_dir, nodes = tu.init_task_info(
    experiment=experiment, lb_method=lb_method, trace=trace, sample=sample,
    filename=config_file+'.json',
    colocate=colocate, colocate_freq=colocate_freq,
    alias=config_file_prefix,
)
print(">> run task {}".format(task_name))

# launch run2server.py with --skip on every remote server; only server 0 gets `-n`
for server_id in remote_servers:
    clip_str = '-n {}'.format(clip_n) if (clip_n and server_id == 0) else ''
    cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment {} --skip --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
        ip_remote[server_id],
        experiment,
        colocate_freq,
        'ecmp',
        trace,
        sample,
        config_file_remote[server_id],
        clip_str,
    )
    tu.subprocess.Popen(cmd, shell=True)

init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-wcmp-hour0-conf01


In [283]:
# run different episodes
# run a single episode by hand
ep = 0
print("== episode {} ==".format(ep))

# Wait until the socket check passes. `except Exception` (not a bare `except`)
# so that Ctrl-C can still interrupt the retry loop.
net_ok = False
while not net_ok:
    try:
        tu.gt_socket_check()
        net_ok = True
    except Exception:
        print('error')
        time.sleep(1)

# For '...-test' RL methods, copy a previously trained model to each LB node.
if 'rlb' in lb_method and 'test' in lb_method:
    for lb in tu.NODES['lb']:
        if 'discrete' in lb_method:
            if 'qmix' in lb_method:
                lbid = 0
            else:
                lbid = lb.id
            cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep71 cisco@{4}:~/{2};'.format(
                lb.ssh_port, '/home/yzy/Load-Balancer/data/results/ijcai-lb/wiki_600/{}/hour23-{}'.format(lb_method.replace('-test', ''), config_file_prefix), 'rl', lbid, lb.physical_server_ip)
        else:
            # replace(), not strip('-test'): strip() removes any of the
            # characters '-', 't', 'e', 's' from both ends of the name rather
            # than the '-test' suffix, which can mangle other method names.
            cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep29 cisco@{4}:~/{2};'.format(
                lb.ssh_port, '/home/yzy/Load-Balancer/data/results/atc-lb/wiki_600/{}/hour0-{}'.format(lb_method.replace('-test', ''), config_file_prefix), 'rl', lb.id, lb.physical_server_ip)
        tu.subprocess_cmd(cmd)

== episode 0 ==
LB Node 0: pass
LB Node 1: pass


In [284]:
# start gathering at LB node
for lb in tu.NODES['lb'][::-1]:
    lb.run_init_bg()

# run traffic
t0 = time.time()
cmd = 'ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run_traffic.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json"'.format(
    experiment, colocate_freq, lb_method, trace, sample, config_file_remote[0])
if colocate:
    cmd += ' --colocate {}'.format(colocate)

tu.subprocess_cmd(cmd)
print("Trace replay over w/ total time: {:.3f}s".format(time.time()-t0))
# time.sleep(5)

# mark episode done
for lb in tu.NODES['lb']:
    lb.execute_cmd_ssh("touch /home/cisco/done")

Trace replay over w/ total time: 31.626s


In [285]:
# fetch results from nodes
for server_id in remote_servers:
    cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/shutdown2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json --episode {}"'.format(
        ip_remote[server_id], experiment, colocate_freq, lb_method, trace, sample, config_file_remote[server_id], ep)
    cmd += ' --shutdown'
    tu.subprocess.Popen(cmd, shell=True)

tu.shutall()

## Pipeline

In [286]:
def run_pipeline_test(
    lb_method = 'wcmp',
    trace='wiki_600',
    experiment='atc-lb',
    sample='hour0.csv',
    from_orig=True,
    config_file_prefix='1lb-conf01',
    colocate=None,
    colocate_freq=0.0001,
    twist=False,
    clip_n=20000,
    remote_servers = [0],
    eps = range(3),
    samples=None
    ):
    """Run the evaluation pipeline: replay every sample for every episode in `eps`.

    Starts `run2server.py` on each remote server over ssh, boots the local
    testbed through `tu`, and then, for each (sample, episode) pair: waits
    until `tu.gt_socket_check()` passes, copies a previously trained model to
    the LB nodes when the method name contains both 'rlb' and 'test', starts
    `run_init_bg()` on the LB nodes, replays the trace with `run_traffic.py`,
    touches `/home/cisco/done` on every LB node, calls `shutdown2server.py` on
    the remote servers, fetches the results and restarts the LB nodes.

    Args:
        lb_method: method name handed to `tu` and to the remote scripts (`-m`).
        trace: trace name (`--tr`).
        experiment: experiment name (`--experiment`).
        sample: sample file used when `samples` is None.
        from_orig: forwarded to the first `tu.prepare_img` call.
        config_file_prefix: prefix of the per-server config files
            (`<prefix>-<server id>.json`); also used as the task alias.
        colocate: forwarded to `tu.init_task_info`; when truthy it is also
            appended to the traffic command as `--colocate`.
        colocate_freq: value of `--colocate-freq`.
        twist: when true, '-twist' is appended to `config_file_prefix`.
        clip_n: when truthy, `-n <clip_n>` is passed to remote server 0.
        remote_servers: ids of the remote servers; used as keys into
            `tu.COMMON_CONF['net']['physical_server_ip']`.
        eps: episodes to run per sample (must support `len`).
        samples: list of sample files; overrides `sample` when given.
    """
    local_server_id = 1
    if twist:
        config_file_prefix += '-twist'
    config_file = config_file_prefix+'-'+str(local_server_id)
    config_file_remote = {i: config_file_prefix+'-{}'.format(i) for i in remote_servers}

    ip_remote = tu.COMMON_CONF['net']['physical_server_ip']

    if samples is None:
        samples = [sample]
    else:
        sample = samples[0]

    is_rlb = 'rlb' in lb_method
    # Name of the training run the model is loaded from. replace(), not
    # strip('-test'): strip() removes any of the characters '-', 't', 'e', 's'
    # from both ends rather than the '-test' suffix.
    train_method = lb_method.replace('-test', '')

    def init_task(cur_sample):
        # (Re)initialise the task bookkeeping in `tu` for one sample.
        return tu.init_task_info(
            experiment=experiment,
            lb_method=lb_method,
            trace=trace,
            sample=cur_sample,
            filename=config_file+'.json',
            colocate=colocate,
            colocate_freq=colocate_freq,
            alias=config_file_prefix
        )

    def start_remote_servers(cur_sample, skip):
        # Launch run2server.py on every remote server without waiting for it.
        skip_flag = ' --skip' if skip else ''
        for sid in remote_servers:
            clip_str = '-n {}'.format(clip_n) if clip_n and sid == 0 else ''
            cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/run2server.py --experiment {}{} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json {}"'.format(
                ip_remote[sid], experiment, skip_flag, colocate_freq, 'ecmp', trace, cur_sample, config_file_remote[sid], clip_str)
            tu.subprocess.Popen(cmd, shell=True)

    def wait_for_network():
        # Retry until the socket check passes. `except Exception` (not a bare
        # `except`) so that Ctrl-C can still interrupt the retry loop.
        while True:
            try:
                tu.gt_socket_check()
                return
            except Exception:
                print('error')
                time.sleep(1)

    def push_pretrained_model():
        # Copy a fixed, previously trained model checkpoint to every LB node.
        for lb in tu.NODES['lb']:
            if 'discrete' in lb_method:
                lbid = 0 if 'qmix' in lb_method else lb.id
                cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep71 cisco@{4}:~/{2};'.format(
                    lb.ssh_port, '/home/yzy/Load-Balancer/data/results/ijcai-lb/wiki_600/{}/hour23-{}'.format(train_method, config_file_prefix), 'rl', lbid, lb.physical_server_ip)
            else:
                cmd = 'scp -i ~/.ssh/lb_rsa -oStrictHostKeyChecking=no -P {0} -r {1}/{3}_{2}_ep29 cisco@{4}:~/{2};'.format(
                    lb.ssh_port, '/home/yzy/Load-Balancer/data/results/atc-lb/wiki_600/{}/hour0-{}'.format(train_method, config_file_prefix), 'rl', lb.id, lb.physical_server_ip)
            tu.subprocess_cmd(cmd)

    init_task(sample)
    start_remote_servers(sample, skip=False)

    tu.prepare_img(lb_method=lb_method, from_orig=from_orig, debug_node=False)

    tu.runall()
    time.sleep(5)

    last_sample_idx = len(samples) - 1
    last_ep_idx = len(eps) - 1

    for sample_idx, sample in enumerate(samples):
        task_name, task_dir, _ = init_task(sample)

        print(">> run task {}".format(task_name))

        start_remote_servers(sample, skip=True)

        # run different episodes
        for ep_idx, ep in enumerate(eps):
            print("== episode {} ==".format(ep))
            wait_for_network()

            if is_rlb and 'test' in lb_method:
                push_pretrained_model()

            # start gathering at LB node
            for lb in tu.NODES['lb'][::-1]:
                lb.run_init_bg()

            # run traffic
            t0 = time.time()
            cmd = 'ssh -t yzy@10.60.16.12 "python3 /home/yzy/Load-Balancer/src/utils/run_traffic.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json"'.format(
                experiment, colocate_freq, lb_method, trace, sample, config_file_remote[0])
            if colocate:
                cmd += ' --colocate {}'.format(colocate)

            tu.subprocess_cmd(cmd)
            print("Trace replay over w/ total time: {:.3f}s".format(time.time()-t0))

            # mark episode done
            for lb in tu.NODES['lb']:
                lb.execute_cmd_ssh("touch /home/cisco/done")

            # fetch results from nodes. The final iteration is detected by
            # position, not by value, so repeated samples or episode ids
            # cannot trigger --shutdown early.
            is_last = sample_idx == last_sample_idx and ep_idx == last_ep_idx
            for sid in remote_servers:
                cmd = 'ssh -t yzy@{} "python3 /home/yzy/Load-Balancer/src/utils/shutdown2server.py --experiment {} --colocate-freq {} -m {} --tr {} --sample {} -f {}.json --episode {}"'.format(
                    ip_remote[sid], experiment, colocate_freq, lb_method, trace, sample, config_file_remote[sid], ep)
                if is_last:
                    cmd += ' --shutdown'
                tu.subprocess.Popen(cmd, shell=True)

            for lb in tu.NODES['lb'][::-1]:
                lb.fetch_result(task_dir, ep)
                if is_rlb:
                    lb.fetch_result(task_dir, ep, filename='rl')
                lb.shutdown()

            # rebuild the LB images and bring the LB nodes back up
            tu.prepare_img(lb_method=lb_method, from_orig=None, debug_node=False)
            for lb in tu.NODES['lb']:
                lb.run()
            tu.host_br_up()
    tu.shutall()

In [287]:
# Settings of the evaluation sweep.
methods = [
    'ecmp',
    'active-wcmp',
    'aqualight',
    'wcmp',
    'rlb-sac-new-test',
    'rlb-sac-gru-discrete-test',
    'sed',
]
# the even hours 0, 2, ..., 8
samples = ['hour{}.csv'.format(hour) for hour in range(0, 10, 2)]
config_prefix_list = ['conf01']
from_orig = False

In [288]:
# Evaluate every (method, config prefix) combination, methods in the outer order.
for method, config_prefix in [(m, c) for m in methods for c in config_prefix_list]:
    run_pipeline_test(
        lb_method=method,
        trace='wiki_600',
        experiment='atc-lb',
        sample=sample,
        samples=samples,
        eps=range(5),
        from_orig=from_orig,
        config_file_prefix=config_prefix,
        colocate=None,
        colocate_freq=0.0001,
        twist=False,
        clip_n=30000,
        remote_servers=[0],
    )

init_task_info: alias=conf01
alias=conf01
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
node_server_0 ready: ssh -p 9000 cisco@localhost
node_server_1 ready: ssh -p 9001 cisco@localhost
node_server_2 ready: ssh -p 9002 cisco@localhost
node_server_3 ready: ssh -p 9003 cisco@localhost
node_server_4 ready: ssh -p 9004 cisco@localhost
node_server_5 ready: ssh -p 9005 cisco@localhost
node_server_6 ready: ssh -p 9006 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-sed-hour0-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 42.726s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 42.635s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisc

Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-active-wcmp-hour2-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 48.834s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 49.197s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 49.105s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 48.770s
Crea

node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 48.178s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 2 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.756s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.615s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 4 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.667s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
in

Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 3 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.414s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 4 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 47.468s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
init_task_info: alias=conf01
alias=conf01
>> run task wiki_600-lsq-hour8-conf01
== episode 0 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 62.240s
Create LB node image...
Create LB node image...
node_lb_0 ready: ssh -p 8900 cisco@localhost
node_lb_1 ready: ssh -p 8901 cisco@localhost
== episode 1 ==
LB Node 0: pass
LB Node 1: pass
Trace replay over w/ total time: 62.252s
Create LB no