In [1]:
import torch
import pandas as pd 
import numpy as np 
import altair as alt 
print(torch.__version__)

import os
from gym_runner.gym_runner import GymRunner
from gym_runner.q_func_approx import QFuncSmallThreelayer, QFuncLargeTwoLayer, QFuncMedThreelayer
from gym_runner.agents.q_learning_agent import  QLearningAgentExperienceReplay
from gym_runner.grid_search import GridSearch

1.10.0


In [2]:
# Full hyper-parameter sweep: 5 * 5 * 3 * 2 * 2 = 300 combinations.
param_grid = {
    # Learning rates, log-spaced from 1e-6 to 1e-2.
    "alpha": np.logspace(start=-6, stop=-2, num=5),
    # Discount factors, evenly spaced over [0.9, 0.99].
    "gamma": np.linspace(start=0.9, stop=0.99, num=5),
    # "epsilon_decay": np.linspace(.997, .999, 5),  # currently left out of the sweep
    # Candidate Q-function approximator classes.
    "q_func_approx": [QFuncSmallThreelayer, QFuncLargeTwoLayer, QFuncMedThreelayer],
    "optimizer": ["SGD", "Adam"],
    "loss_func": ["mse", "l1"],
}

# Reduced sweep (3 learning rates x 2 discount factors) over the same ranges
# as the full grid.
param_grid_small = {
    "alpha": np.logspace(start=-6, stop=-2, num=3),
    "gamma": np.linspace(start=0.9, stop=0.99, num=2),
}

In [3]:
# Set up the grid search: experience-replay Q-learning agents on CartPole-v1,
# swept over every combination in `param_grid`.
# NOTE(review): `param_grid` also sweeps `q_func_approx`; confirm how GridSearch
# reconciles that with the explicit `Q` argument passed here.
gs_Q_exp = GridSearch(
    agent=QLearningAgentExperienceReplay,
    Q=QFuncMedThreelayer,
    runner=GymRunner,
    env_id="CartPole-v1",
    param_grid=param_grid,
)

In [4]:
# Run the whole sweep in parallel. This is the expensive cell: the recorded run
# took about 11 minutes on 24 processes.
# os.cpu_count() returns None when the core count cannot be determined, so fall
# back to a single process instead of passing None through.
results_Q = gs_Q_exp.fit(num_procs=os.cpu_count() or 1)

Running 300 seperate agents through CartPole-v1 5 times each.
Using 24 processes.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [11:25<00:00,  2.28s/it]


In [5]:
# Ten configurations with the highest mean score, in ascending order (best last).
results_Q.sort_values(by="mean_score", ascending=True).iloc[-10:]

Unnamed: 0,alpha,gamma,loss_func,optimizer,q_func_approx,mean_score,max_mean_score
79,1e-05,0.9225,l1,SGD,<class 'q_func_approx.QFuncLargeTwoLayer'>,47.15,168.8
241,0.01,0.9,mse,SGD,<class 'q_func_approx.QFuncLargeTwoLayer'>,48.546,73.92
279,0.01,0.9675,mse,Adam,<class 'q_func_approx.QFuncSmallThreelayer'>,49.182,125.27
245,0.01,0.9,mse,Adam,<class 'q_func_approx.QFuncMedThreelayer'>,53.328,129.27
74,1e-05,0.9225,mse,SGD,<class 'q_func_approx.QFuncMedThreelayer'>,56.7,107.07
292,0.01,0.99,mse,Adam,<class 'q_func_approx.QFuncLargeTwoLayer'>,60.372,158.07
50,1e-06,0.99,mse,SGD,<class 'q_func_approx.QFuncMedThreelayer'>,77.774,266.06
127,0.0001,0.9,l1,SGD,<class 'q_func_approx.QFuncLargeTwoLayer'>,84.686,348.48
280,0.01,0.9675,mse,Adam,<class 'q_func_approx.QFuncLargeTwoLayer'>,115.532,228.56
293,0.01,0.99,mse,Adam,<class 'q_func_approx.QFuncMedThreelayer'>,147.0,413.46


In [19]:
# Look up the top configuration from the results table instead of hard-coding
# its row label (it was 293 in the recorded run), so this cell stays correct
# when the stochastic sweep is re-run.
# NOTE(review): assumes the index labels of `results_Q` address the matching
# entries of `gs_Q_exp._results`; the original hard-coded lookup relied on the
# same correspondence.
best_config_id = int(results_Q["mean_score"].idxmax())
config_rewards = gs_Q_exp._results[best_config_id]

# Within that configuration, pick the run whose mean along axis 1 is largest.
best = config_rewards.mean(axis=1).argmax()
print(best)
rewards = config_rewards[best]

3


In [20]:
# Turn the selected run's rewards into a two-column frame for plotting.
# This cell rebinds `rewards`, so the conversion is guarded: re-running the cell
# on its own would otherwise wrap the existing frame again and fail when the
# two column names are assigned to a three-column frame.
if not isinstance(rewards, pd.DataFrame):
    rewards = pd.DataFrame(rewards).reset_index()
    rewards.columns = ["episode", "reward"]


In [21]:
# Scatter plot of reward against episode for the selected grid-search run.
alt.Chart(rewards).mark_point().encode(
    x=alt.X("episode"),
    y=alt.Y("reward"),
)

In [25]:
# Rebuild the top-ranked grid-search configuration by hand
# (alpha=0.01, gamma=0.99, mse loss, Adam, QFuncMedThreelayer).
runner = GymRunner("CartPole-v1", display_metrics=True)

# Environment dimensions needed by both the Q-function and the agent.
state_dim = runner.env.observation_space.shape[0]
num_actions = runner.env.action_space.n

best_q_func = QFuncMedThreelayer(
    alpha=0.01,
    loss_func="mse",
    optimizer="Adam",
    num_actions=num_actions,
    state_dim=state_dim,
)
best_agent = QLearningAgentExperienceReplay(
    best_q_func,
    gamma=0.99,
    num_actions=num_actions,
    state_dim=state_dim,
)

In [27]:
# Train the hand-built agent with the runner; the return value is later
# tabulated as one reward per episode and plotted.
train_rewards = runner.train(best_agent)

Epsilon:  0.1375201748252333
Current Reward:  16.0
Episode:  990


In [29]:
# Run the trained agent for 100 episodes and keep the rewards for comparison
# with the training curve.
# NOTE(review): presumably `attempt` runs the agent without further learning —
# confirm against GymRunner.
test_rewards = runner.attempt(best_agent, num_episodes = 100)

In [30]:
def _to_reward_frame(rewards):
    """Return ``rewards`` as a DataFrame with columns ``episode`` and ``reward``.

    The conversion is idempotent: a value that is already a DataFrame is
    returned unchanged, so re-running the cell does not wrap a frame a second
    time and fail on the column assignment.

    Args:
        rewards: The rewards to tabulate (anything ``pd.DataFrame`` accepts
            that yields a single column), or a frame already produced by this
            helper.

    Returns:
        A DataFrame whose ``episode`` column is the positional index of each
        entry and whose ``reward`` column holds the entry itself.
    """
    if isinstance(rewards, pd.DataFrame):
        return rewards
    frame = pd.DataFrame(rewards).reset_index()
    frame.columns = ["episode", "reward"]
    return frame


train_rewards = _to_reward_frame(train_rewards)
test_rewards = _to_reward_frame(test_rewards)

In [32]:
def _reward_scatter(frame, title):
    """Build a titled scatter chart of ``reward`` against ``episode``.

    Args:
        frame: DataFrame with ``episode`` and ``reward`` columns.
        title: Chart title, so each panel can be identified when the two are
            shown side by side.

    Returns:
        The configured Altair chart.
    """
    return (
        alt.Chart(frame)
        .mark_point()
        .encode(x="episode", y="reward")
        .properties(title=title)
    )


# Training curve on the left, post-training attempt on the right. Both panels
# share the same axes, so the titles are what tell them apart.
_reward_scatter(train_rewards, "Training: reward per episode") | _reward_scatter(
    test_rewards, "Test: reward per episode"
)
