In [45]:
from plot_utils import data_dict, data_dict_icml, max_episodes_dict
from numpy import mean
import pandas as pd
def _format_three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Applies to every float pandas renders when displaying or printing a frame.
pd.set_option('display.float_format', _format_three_decimals)

In [46]:
# Random seeds: one list for the tabular-method runs, a shorter one for the
# runs loaded through data_dict_icml.
seeds = [
    224, 389, 405, 432, 521,
    580, 639, 673, 803, 869,
]
seeds_icml = [237, 379, 482, 672, 886]

# Environment ids that every results table iterates over.
envs = [
    "Acrobot-v1",
    "CartPole-v1",
    "MountainCar-v0",
    "MountainCarContinuous-v0",
    "Pendulum-v1",
    "LunarLander-v2",
]

# Method names used as the first-level key of the loaded results.
methods = [
    "binQ",
    "CAT-RL",
    "TileCoding",
]


## Results of the fixed-episodes experiment

In [47]:
from numpy import std


# Training-run summary for the methods in `methods`.
# For every (env, method) pair, each seed contributes the mean of its "success"
# series and the sum of its "accumulated reward" series; those per-seed values
# are then reduced to a mean and numpy's default (population, ddof=0) standard
# deviation across seeds.
# NOTE(review): "accumulated reward" looks like a running total. If so, summing
# it yields an area-under-curve figure, and dividing by max_episodes_dict[env]
# is not a plain per-episode average — confirm against plot_utils.data_dict.
data = data_dict(methods, envs, seeds)

l = []  # one summary record per (env, method) pair
for env in envs:
    for method in methods:
        runs = data[method][env]
        success_by_seed = [mean(runs[seed]["success"]) for seed in seeds]
        reward_by_seed = [sum(runs[seed]["accumulated reward"]) for seed in seeds]

        reward_mean = mean(reward_by_seed)

        record = {
            "method": method,
            "env": env,
            "success rate": mean(success_by_seed),
            "success rate sts": std(success_by_seed),
            "accumulated reward": reward_mean,
            "accumulated reward std": std(reward_by_seed),
            "avg reward per episode": reward_mean / max_episodes_dict[env],
        }
        l.append(record)

In [48]:
from numpy import std

# Summary of the runs loaded through data_dict_icml, added to the same record
# list `l` as the rows already collected, so that one DataFrame can be built
# from all of them.
algos = ["ppo"]
data = data_dict_icml(algos, envs, seeds_icml)

# Make the cell safe to re-run: discard rows this cell produced on a previous
# execution, otherwise every re-run would append a duplicate set of rows.
l = [row for row in l if row["method"] not in algos]

for env in envs:
    for algo in algos:
        # Per-seed statistics: mean of the "success" series and sum of the
        # "accumulated reward" series.
        success_rate = []
        accumulated_reward = []
        for seed in seeds_icml:
            success_rate.append(mean(data[algo][env][seed]["success"]))
            accumulated_reward.append(sum(data[algo][env][seed]["accumulated reward"]))

        # Aggregate across seeds (numpy's std defaults to ddof=0).
        avg_success_rate = mean(success_rate)
        std_success_rate = std(success_rate)
        avg_accumulated_reward = mean(accumulated_reward)
        std_accumulated_reward = std(accumulated_reward)
        avg_reward_pr_episodes = avg_accumulated_reward / max_episodes_dict[env]

        # Keys are identical to those of the rows already in `l`, so the
        # resulting DataFrame has a single set of columns.
        l.append({"method": algo,
                  "env": env,
                  "success rate": avg_success_rate,
                  "success rate sts": std_success_rate,
                  "accumulated reward": avg_accumulated_reward,
                  "accumulated reward std": std_accumulated_reward,
                  "avg reward per episode": avg_reward_pr_episodes})

In [49]:
# Group the rows by environment. A stable sort keeps the rows of one
# environment in the order they were appended to `l`; the default algorithm
# makes no such guarantee, so the within-environment order could be arbitrary.
df = pd.DataFrame(l).sort_values(by=["env"], kind="stable")
df

Unnamed: 0,method,env,success rate,success rate sts,accumulated reward,accumulated reward std,avg reward per episode
0,binQ,Acrobot-v1,0.513,0.017,-133942787.8,51462217.431,-66971.394
1,CAT-RL,Acrobot-v1,0.38,0.045,-347618363.5,79685500.774,-173809.182
2,TileCoding,Acrobot-v1,0.86,0.021,1020687281.9,92993773.39,510343.641
18,ppo,Acrobot-v1,0.014,0.005,-155358900.2,1921178.741,-77679.45
3,binQ,CartPole-v1,0.0,0.0,205773254.0,5570689.014,34295.542
4,CAT-RL,CartPole-v1,0.0,0.0,251650810.2,57390804.337,41941.802
5,TileCoding,CartPole-v1,0.0,0.0,185847254.6,1584575.768,30974.542
19,ppo,CartPole-v1,0.0,0.0,60623793.8,891367.382,10103.966
17,TileCoding,LunarLander-v2,0.017,0.001,-1761775790.133,37050527.71,-293629.298
16,CAT-RL,LunarLander-v2,0.0,0.0,-4025443733.564,71502043.625,-670907.289


## Evaluation

In [50]:
# Same per-(env, method) summary as for the training runs, but computed on the
# results stored under the "results-after-train" folder.
data = data_dict(methods, envs, seeds, folder="results-after-train")

l = []  # one summary record per (env, method) pair
for env in envs:
    for method in methods:
        runs = data[method][env]
        # One value per seed: mean of "success", sum of "accumulated reward".
        success_by_seed = [mean(runs[seed]["success"]) for seed in seeds]
        reward_by_seed = [sum(runs[seed]["accumulated reward"]) for seed in seeds]

        # Mean and numpy-default (ddof=0) standard deviation across seeds.
        l.append({
            "method": method,
            "env": env,
            "success rate": mean(success_by_seed),
            "success rate sts": std(success_by_seed),
            "accumulated reward": mean(reward_by_seed),
            "accumulated reward std": std(reward_by_seed),
        })


df = pd.DataFrame(l).sort_values(by=["method", "env"])
df

Unnamed: 0,method,env,success rate,success rate sts,accumulated reward,accumulated reward std
1,CAT-RL,Acrobot-v1,0.575,0.431,103374391.7,271870019.555
4,CAT-RL,CartPole-v1,0.0,0.0,6523779.4,3119390.011
16,CAT-RL,LunarLander-v2,0.0,0.0,-80072688.99,18531447.583
7,CAT-RL,MountainCar-v0,0.968,0.038,585005797.476,33622048.253
10,CAT-RL,MountainCarContinuous-v0,0.8,0.4,342994334.344,203403990.949
13,CAT-RL,Pendulum-v1,0.124,0.145,-174570782.72,381581271.57
2,TileCoding,Acrobot-v1,0.996,0.006,408798172.8,25910993.136
5,TileCoding,CartPole-v1,0.0,0.0,4851050.2,18006.067
17,TileCoding,LunarLander-v2,0.012,0.014,-74566564.325,57839713.236
8,TileCoding,MountainCar-v0,0.199,0.397,86852149.094,260718879.096


In [51]:
# Average reward per episode: take the mean of each seed's "reward" series, then
# report the mean and numpy-default (ddof=0) standard deviation across seeds.
# NOTE(review): the "sum reward" column holds a mean of per-seed means, not a
# sum; the key is kept as is because it is the emitted column header.
# NOTE(review): `data` is whatever an earlier cell assigned last — presumably the
# "results-after-train" load; confirm the execution order.
l = []
for env in envs:
    for method in methods:
        seed_means = [mean(data[method][env][seed]["reward"]) for seed in seeds]
        l.append({
            "method": method,
            "env": env,
            "sum reward": mean(seed_means),
            "std reward": std(seed_means),
        })

df = pd.DataFrame(l).sort_values(by=["method", "env"])
print(df)
# Last expression of the cell: the LaTeX source of the table becomes the output.
df.to_latex(index=False)

        method                       env  sum reward  std reward
1       CAT-RL                Acrobot-v1     207.259     544.512
4       CAT-RL               CartPole-v1      13.020       6.146
16      CAT-RL            LunarLander-v2    -160.227      36.629
7       CAT-RL            MountainCar-v0    1168.740      66.648
10      CAT-RL  MountainCarContinuous-v0     685.359     406.610
13      CAT-RL               Pendulum-v1    -348.050     759.771
2   TileCoding                Acrobot-v1     816.423      53.131
5   TileCoding               CartPole-v1       9.687       0.031
17  TileCoding            LunarLander-v2    -147.956     114.543
8   TileCoding            MountainCar-v0     173.689     520.997
11  TileCoding  MountainCarContinuous-v0    1473.572    1280.430
14  TileCoding               Pendulum-v1    -977.944     406.586
0         binQ                Acrobot-v1     783.665      50.402
3         binQ               CartPole-v1       9.174       0.094
15        binQ           

'\\begin{tabular}{llrr}\n\\toprule\nmethod & env & sum reward & std reward \\\\\n\\midrule\nCAT-RL & Acrobot-v1 & 207.258600 & 544.511990 \\\\\nCAT-RL & CartPole-v1 & 13.019600 & 6.146334 \\\\\nCAT-RL & LunarLander-v2 & -160.226937 & 36.628955 \\\\\nCAT-RL & MountainCar-v0 & 1168.740146 & 66.648395 \\\\\nCAT-RL & MountainCarContinuous-v0 & 685.358859 & 406.610391 \\\\\nCAT-RL & Pendulum-v1 & -348.050003 & 759.771148 \\\\\nTileCoding & Acrobot-v1 & 816.422900 & 53.130532 \\\\\nTileCoding & CartPole-v1 & 9.687000 & 0.031279 \\\\\nTileCoding & LunarLander-v2 & -147.956194 & 114.543336 \\\\\nTileCoding & MountainCar-v0 & 173.689267 & 520.996633 \\\\\nTileCoding & MountainCarContinuous-v0 & 1473.571815 & 1280.430432 \\\\\nTileCoding & Pendulum-v1 & -977.943718 & 406.585534 \\\\\nbinQ & Acrobot-v1 & 783.665400 & 50.401798 \\\\\nbinQ & CartPole-v1 & 9.173900 & 0.093914 \\\\\nbinQ & LunarLander-v2 & -92.421568 & 26.760235 \\\\\nbinQ & MountainCar-v0 & 420.811312 & 563.605493 \\\\\nbinQ & Mount