In [11]:
from plot_utils import data_dict, data_dict_icml, max_episodes_dict, MOUNTAINCARCONTINUOUS_K, PENDULUM_K
from run_icml import split_max_episodes
from numpy import mean
import pandas as pd
def _three_decimals(x):
    """Render a number with exactly three digits after the decimal point."""
    return '%.3f' % x


# Every float shown in a DataFrame repr goes through the formatter above.
pd.set_option('display.float_format', _three_decimals)

In [2]:
# Seeds indexing the per-seed results of `methods`.
seeds = [
    224, 389, 405, 432, 521,
    580, 639, 673, 803, 869,
]
# Seeds indexing the per-seed results loaded through the ICML helpers.
seeds_icml = [237, 379, 482, 672, 886]

# Environment ids summarised in every table below.
envs = [
    "Acrobot-v1",
    "CartPole-v1",
    "MountainCar-v0",
    "MountainCarContinuous-v0",
    "Pendulum-v1",
    "LunarLander-v2",
]

# Method names used as the first-level key of the loaded results.
methods = ["binQ", "CAT-RL", "TileCoding"]


## Results of the fixed-episodes experiment

In [3]:
from numpy import std

# Results keyed as data[method][env][seed][<metric name>].
data = data_dict(methods, envs, seeds)

# One summary row per (env, method); later cells append to and tabulate `l`.
l = []
for env in envs:
    for method in methods:
        runs = [data[method][env][seed] for seed in seeds]

        # Collapse each seed's log to one scalar per metric ...
        success_per_seed = [mean(run["success"]) for run in runs]
        total_per_seed = [sum(run["accumulated reward"]) for run in runs]
        reward_per_seed = [mean(run["reward"]) for run in runs]

        # ... then report mean and spread across seeds.
        # numpy's std defaults to ddof=0 (population standard deviation).
        row = {"method": method, "env": env}
        row["success rate"] = mean(success_per_seed)
        row["success rate sts"] = std(success_per_seed)
        row["accumulated reward"] = mean(total_per_seed)
        row["accumulated reward std"] = std(total_per_seed)
        row["avg reward per episode"] = mean(reward_per_seed)
        row["avg reward per episode std"] = std(reward_per_seed)
        l.append(row)

In [4]:
from numpy import std

algos = ["ppo"]
# NOTE: this rebinds `data`; from here on it holds the baseline results,
# keyed as data[algo][env][seed][<metric name>].
data = data_dict_icml(algos, envs, seeds_icml)

# This cell extends the list `l` built by the previous cell. Drop any rows a
# previous execution of *this* cell appended, so re-running the cell does not
# duplicate the baseline rows in the summary table.
l = [row for row in l if row["method"] not in algos]

for env in envs:
    for algo in algos:
        # Per-seed scalars for each metric.
        success_rate = []
        accumulated_reward = []
        avg_reward_ep = []
        for seed in seeds_icml:
            run = data[algo][env][seed]
            success_rate.append(mean(run["success"]))
            accumulated_reward.append(sum(run["accumulated reward"]))
            # The baseline logs use the key "rewards" (plural), whereas the
            # results loaded via data_dict use "reward".
            avg_reward_ep.append(mean(run["rewards"]))

        # Mean and spread across seeds (numpy's std defaults to ddof=0).
        # Keys match the rows already in `l` so the columns line up.
        l.append({"method": algo,
                  "env": env,
                  "success rate": mean(success_rate),
                  "success rate sts": std(success_rate),
                  "accumulated reward": mean(accumulated_reward),
                  "accumulated reward std": std(accumulated_reward),
                  "avg reward per episode": mean(avg_reward_ep),
                  "avg reward per episode std": std(avg_reward_ep)})

In [5]:
# Tabulate all collected summary rows, ordered by method and then environment.
df = pd.DataFrame(l)
df = df.sort_values(by=["method", "env"])
df

Unnamed: 0,method,env,success rate,success rate sts,accumulated reward,accumulated reward std,avg reward per episode,avg reward per episode std
1,CAT-RL,Acrobot-v1,0.38,0.045,-347618363.5,79685500.774,-55.285,53.239
4,CAT-RL,CartPole-v1,0.0,0.0,251650810.2,57390804.337,13.599,3.942
16,CAT-RL,LunarLander-v2,0.0,0.0,-4025443733.564,71502043.625,-196.053,6.175
7,CAT-RL,MountainCar-v0,0.965,0.02,14464492011.118,580858094.467,1161.639,48.126
10,CAT-RL,MountainCarContinuous-v0,0.706,0.423,188131599.149,42681942.763,369.512,89.419
13,CAT-RL,Pendulum-v1,0.019,0.015,-19360056614.88,1090619204.989,-924.669,139.056
2,TileCoding,Acrobot-v1,0.86,0.021,1020687281.9,92993773.39,628.536,25.84
5,TileCoding,CartPole-v1,0.0,0.0,185847254.6,1584575.768,10.062,0.064
17,TileCoding,LunarLander-v2,0.017,0.001,-1761775790.133,37050527.71,-97.726,2.077
8,TileCoding,MountainCar-v0,0.138,0.008,1355634141.843,154421772.818,107.614,8.868


## Evaluation

In [52]:
# Load the results stored under "results-after-train" (rebinds `data`).
data = data_dict(methods, envs, seeds, folder="results-after-train")

# Fresh row list: this table is independent of the earlier summary.
l = []
for env in envs:
    for method in methods:
        runs = [data[method][env][seed] for seed in seeds]

        # One scalar per seed for each metric.
        success_per_seed = [mean(run["success"]) for run in runs]
        total_per_seed = [sum(run["accumulated reward"]) for run in runs]

        # Mean and spread across seeds.
        l.append({
            "method": method,
            "env": env,
            "success rate": mean(success_per_seed),
            "success rate sts": std(success_per_seed),
            "accumulated reward": mean(total_per_seed),
            "accumulated reward std": std(total_per_seed),
        })


df = pd.DataFrame(l)
df = df.sort_values(by=["method", "env"])
df

Unnamed: 0,method,env,success rate,success rate sts,accumulated reward,accumulated reward std
1,CAT-RL,Acrobot-v1,0.575,0.431,103374391.7,271870019.555
4,CAT-RL,CartPole-v1,0.0,0.0,6523779.4,3119390.011
16,CAT-RL,LunarLander-v2,0.0,0.0,-80072688.99,18531447.583
7,CAT-RL,MountainCar-v0,0.968,0.038,585005797.476,33622048.253
10,CAT-RL,MountainCarContinuous-v0,0.8,0.4,342994334.344,203403990.949
13,CAT-RL,Pendulum-v1,0.124,0.145,-174570782.72,381581271.57
2,TileCoding,Acrobot-v1,0.996,0.006,408798172.8,25910993.136
5,TileCoding,CartPole-v1,0.0,0.0,4851050.2,18006.067
17,TileCoding,LunarLander-v2,0.012,0.014,-74566564.325,57839713.236
8,TileCoding,MountainCar-v0,0.199,0.397,86852149.094,260718879.096


In [16]:
folder = "results"
# One timing row per (env, method); a later cell appends baseline rows.
info_l = []
for env in envs:
    for method in methods:
        # Last entry of the "time" column of each seed's info file.
        final_times = [
            pd.read_csv(f"{folder}/{method}/{env}/{method}_{seed}_info.csv")["time"].values[-1]
            for seed in seeds
        ]
        info_l.append({
            "method": method,
            "env": env,
            "time": mean(final_times),
            "std time": std(final_times),
        })


In [17]:
folder = "results"

# Environments whose model file name carries an extra K component.
k_by_env = {
    "MountainCarContinuous-v0": MOUNTAINCARCONTINUOUS_K,
    "Pendulum-v1": PENDULUM_K,
}

# This cell extends `info_l` from the previous cell. Drop rows appended by an
# earlier execution of *this* cell so re-running it does not duplicate them.
info_l = [row for row in info_l if row["method"] not in algos]

for env in envs:
    max_episodes = max_episodes_dict[env]
    policy_episodes, experiment_episodes = split_max_episodes(max_episodes)
    for algo in algos:
        # The model name does not depend on the seed, so build it once.
        if env in k_by_env:
            model = f"icml_{policy_episodes}_{k_by_env[env]}_{algo}_{experiment_episodes}"
        else:
            model = f"icml_{policy_episodes}_{algo}_{experiment_episodes}"

        time_l = []
        for seed in seeds_icml:
            path = f"{folder}/icml/{env}/{model}_{seed}_info.csv"
            temp = pd.read_csv(path)
            # Last entry of the "total_train_time" column of this seed's file.
            time_l.append(temp["total_train_time"].values[-1])

        # Mean and spread across seeds (numpy's std defaults to ddof=0).
        mean_time = mean(time_l)
        std_time = std(time_l)
        info_l.append({"method": algo, "env": env, "time": mean_time, "std time": std_time})

# Tabulate every timing row, ordered by method and then environment.
info_df = pd.DataFrame(info_l).sort_values(by=["method", "env"])

In [18]:
# Display the timing table (mean and std of the "time" values per method/env).
info_df

Unnamed: 0,method,env,time,std time
0,ppo,Acrobot-v1,1204.923,23.624
1,ppo,CartPole-v1,2106.327,92.317
5,ppo,LunarLander-v2,3879.346,183.861
2,ppo,MountainCar-v0,1267.065,39.017
3,ppo,MountainCarContinuous-v0,1148.086,35.347
4,ppo,Pendulum-v1,1508.879,70.89
