In [2]:
import pickle
import glob
import os

In [4]:
def percent(value, percent):
    min_value = -1
    return (value - min_value) * percent + min_value

In [3]:
data = {os.path.split(q)[1]: {os.path.split(p)[1]: pickle.load(open(p, 'rb')) for p in glob.glob(q+'/*.pkl')} for q in glob.glob('runs/Q*')}

In [39]:
def find_T_train(m_opts, m_rands):
    # We define the training time T_train as the number of games an algorithm needs to play in order to reach
    # 80% of its final performance according to both m_opt and m_rand.
    assert m_opts.keys() == m_rands.keys()

    m_opt_max = max(max(v) for v in m_opts.values())
    m_rand_max = max(max(v) for v in m_rands.values())
    print(f"m_opt_max: {m_opt_max}")
    print(f"m_rand_max: {m_rand_max}")

    m_opt_percent = percent(m_opt_max, 0.8)
    m_rand_percent = percent(m_rand_max, 0.8)
    print(r"80% of M_opt:", m_opt_percent)
    print(r"80% of M_rand:", m_rand_percent)

    T_train = 0
    for i, num_games in enumerate(range(0, 20_000, 250)):
        for key in m_opts.keys():
            if (m_opts[key][i] >= m_opt_percent) and (m_rands[key][i] >= m_rand_percent):
                T_train = num_games
                print(key)
            if T_train:
                break
        if T_train:
            break
    print(f"T_train: {T_train}")
    return T_train

### Q-learning with learning from experts

In [40]:
m_opts = {**{f"n_star={k}": v for k, v in data['Q2']['m_opt_dict.pkl'].items()}, 
          **{f"epsilon={k}": v for k, v in data['Q4']['m_opt_per_epsilon.pkl'].items()}}
m_rands = {**{f"n_star={k}": v for k, v in data['Q2']['m_rand_dict.pkl'].items()}, 
           **{f"epsilon={k}": v for k, v in data['Q4']['m_rand_per_epsilon.pkl'].items()}}
T_train = find_T_train(m_opts, m_rands)

m_opt_max: 0.0
m_rand_max: 0.902
80% of M_opt: -0.19999999999999996
80% of M_rand: 0.5216000000000003
n_star=1
T_train: 3000


### Q-learning with learning by self-practice

In [41]:
m_opts = {**{f"epsilon={k}": v for k, v in data['Q7']['m_opt_against_itself.pkl'].items()}, 
          **{f"n_star={k}": v for k, v in data['Q8']['m_opt_dict.pkl'].items()}}
m_rands = {**{f"epsilon={k}": v for k, v in data['Q7']['m_opt_against_itself.pkl'].items()}, 
           **{f"n_star={k}": v for k, v in data['Q8']['m_rand_dict.pkl'].items()}}
T_train = find_T_train(m_opts, m_rands)

m_opt_max: 0.0
m_rand_max: 0.942
80% of M_opt: -0.19999999999999996
80% of M_rand: 0.5536000000000001
n_star=20000
T_train: 6750


### Deep Q-learning with learning from experts

In [48]:
m_opts = {**{f"n_star={k}": v for k, v in data['Q13']['m_opt_dict.pkl'].items()}, 
          **{f"epsilon={k}": v for k, v in data['Q14']['m_opt_dict.pkl'].items()}}
m_rands = {**{f"n_star={k}": v for k, v in data['Q13']['m_rand_dict.pkl'].items()}, 
           **{f"epsilon={k}": v for k, v in data['Q14']['m_rand_dict.pkl'].items()}}
T_train = find_T_train(m_opts, m_rands)

m_opt_max: 0.0
m_rand_max: 0.964
80% of M_opt: -0.19999999999999996
80% of M_rand: 0.5712000000000002
n_star=10000
T_train: 1000


### Deep Q-learning with learning by self-practice

In [52]:
m_opts = {**{f"epsilon={k}": v for k, v in data['Q16']['m_opt_against_itself.pkl'].items()}, 
          **{f"n_star={k}": v for k, v in data['Q17']['m_opt_against_itself.pkl'].items()}}
m_rands = {**{f"epsilon={k}": v for k, v in data['Q16']['m_rand_against_itself.pkl'].items()}, 
           **{f"n_star={k}": v for k, v in data['Q17']['m_rand_against_itself.pkl'].items()}}
T_train = find_T_train(m_opts, m_rands)

m_opt_max: 0.0
m_rand_max: 0.906
80% of M_opt: -0.19999999999999996
80% of M_rand: 0.5248000000000002
epsilon=0.25
T_train: 5250
