In [1]:
from gym_runner import GymRunner
from q_func_approx import QualityFuncApprox
from agents.sarsa_agent import SarsaAgent
from agents.q_learning_agent import QLearningAgent
import pandas as pd
import altair as alt



In [2]:
# Create the runner for the "CartPole-v1" environment with display_metrics enabled.
runner = GymRunner(
    "CartPole-v1",
    display_metrics=True,
)



In [3]:
# Number of actions, read from the `n` attribute of the environment's action space.
num_actions = runner.env.action_space.n


In [4]:
# Length of the first axis of the environment's observation-space shape.
# NOTE(review): despite the name, this is the observation dimension rather than
# a count of distinct states — confirm how the agents interpret it.
num_states = runner.env.observation_space.shape[0]
# Echo the value as the cell output.
num_states


4

In [5]:
# Q-function approximator `q_func`: sized from the environment dimensions,
# configured with the "sgd" optimizer, "l1" loss and alpha=0.01.
q_func = QualityFuncApprox(
    num_states=num_states, num_actions=num_actions,
    optimizer="sgd", loss_func="l1", alpha=0.01,
)


In [6]:
# Second, separately constructed approximator `q_func_0` with the same settings
# ("sgd" optimizer, "l1" loss, alpha=0.01).
q_func_0 = QualityFuncApprox(
    num_states=num_states, num_actions=num_actions,
    optimizer="sgd", loss_func="l1", alpha=0.01,
)


In [7]:
# SARSA agent built on `q_func`; epsilon starts at 1.0 with a decay factor of 0.9975.
agent = SarsaAgent(
    q_func_approx=q_func,
    num_states=num_states, num_actions=num_actions,
    gamma=0.9225,
    epsilon=1.0, epsilon_decay=0.9975,
)


In [8]:
# Q-learning agent built on `q_func_0`, using the same gamma / epsilon settings
# as the SARSA agent so the two runs are configured alike.
agent_0 = QLearningAgent(
    q_func_approx=q_func_0,
    num_states=num_states, num_actions=num_actions,
    gamma=0.9225,
    epsilon=1.0, epsilon_decay=0.9975,
)


In [9]:
# Train the SARSA agent for 1000 episodes and keep what runner.train returns.
rewards = runner.train(
    agent=agent,
    num_episodes=1000,
    plot_state=False,
)



Epsilon:  0.1
Current Reward:  179.0
Episode:  990


In [10]:
# Train the Q-learning agent for 1000 episodes on the same runner and keep
# what runner.train returns.
rewards_0 = runner.train(
    agent=agent_0,
    num_episodes=1000,
    plot_state=False,
)



Epsilon:  0.1
Current Reward:  10.0
Episode:  990


In [11]:
# Echo the value runner.train returned for `agent` (the SarsaAgent).
# NOTE(review): this prints the whole sequence; a slice or summary would keep
# the notebook output short — confirm before changing.
rewards


[22.0,
 22.0,
 52.0,
 44.0,
 22.0,
 35.0,
 19.0,
 34.0,
 37.0,
 42.0,
 17.0,
 32.0,
 29.0,
 22.0,
 14.0,
 28.0,
 20.0,
 26.0,
 17.0,
 11.0,
 11.0,
 12.0,
 12.0,
 23.0,
 68.0,
 15.0,
 20.0,
 22.0,
 20.0,
 27.0,
 29.0,
 11.0,
 24.0,
 23.0,
 24.0,
 23.0,
 13.0,
 18.0,
 11.0,
 87.0,
 18.0,
 53.0,
 9.0,
 17.0,
 15.0,
 26.0,
 26.0,
 26.0,
 43.0,
 26.0,
 24.0,
 11.0,
 19.0,
 19.0,
 22.0,
 12.0,
 14.0,
 24.0,
 15.0,
 36.0,
 24.0,
 23.0,
 16.0,
 35.0,
 24.0,
 19.0,
 32.0,
 19.0,
 31.0,
 39.0,
 25.0,
 28.0,
 65.0,
 44.0,
 14.0,
 38.0,
 13.0,
 19.0,
 28.0,
 27.0,
 14.0,
 21.0,
 17.0,
 50.0,
 30.0,
 15.0,
 16.0,
 39.0,
 80.0,
 62.0,
 25.0,
 22.0,
 77.0,
 31.0,
 13.0,
 13.0,
 16.0,
 45.0,
 23.0,
 12.0,
 30.0,
 32.0,
 31.0,
 22.0,
 72.0,
 21.0,
 61.0,
 27.0,
 14.0,
 28.0,
 32.0,
 53.0,
 32.0,
 26.0,
 30.0,
 50.0,
 25.0,
 12.0,
 13.0,
 26.0,
 26.0,
 32.0,
 34.0,
 26.0,
 24.0,
 17.0,
 72.0,
 94.0,
 61.0,
 18.0,
 17.0,
 17.0,
 15.0,
 40.0,
 40.0,
 47.0,
 27.0,
 38.0,
 11.0,
 37.0,
 47.0,
 36.0,
 27.0,


In [12]:
# Run the SARSA agent through runner.attempt for 100 episodes and keep the result.
test_rewards = runner.attempt(
    agent,
    num_episodes=100,
)


In [13]:
# Run the Q-learning agent through runner.attempt for 100 episodes and keep the result.
test_rewards_0 = runner.attempt(
    agent_0,
    num_episodes=100,
)


In [14]:
# Echo the value runner.attempt returned for `agent` (the SarsaAgent).
test_rewards


array([191., 186., 155., 171., 175., 170., 201., 179., 164., 194., 182.,
       183., 183., 173., 156., 157., 157., 284., 198., 174., 173., 158.,
       500., 187., 174., 195., 192., 191., 171., 173., 179., 176., 162.,
       180., 500., 216., 500., 209., 157., 248., 176., 178., 176., 236.,
       286., 147., 198., 191., 180., 241., 157., 181., 155., 152., 188.,
       222., 170., 141., 166., 197., 162., 195., 162., 159., 164., 186.,
       173., 208., 164., 164., 194., 198., 184., 158., 179., 168., 154.,
       172., 159., 178., 179., 196., 184., 174., 176., 183., 169., 500.,
       154., 146., 166., 164., 250., 170., 192., 165., 500., 174., 187.,
       175.])

In [15]:
# Echo the value runner.attempt returned for `agent_0` (the QLearningAgent).
test_rewards_0


array([10., 12., 10., 12., 10., 12., 12., 12., 11., 12.,  9., 10., 11.,
       10., 10.,  9., 10.,  9., 10.,  9., 10.,  9., 11., 10., 10., 10.,
       11.,  9., 11., 11.,  9.,  9., 12., 11., 10., 11., 11., 11., 12.,
        9.,  9., 10.,  9., 10., 10., 11., 10., 11., 10., 11., 10.,  9.,
       11., 11., 10., 12., 10., 11., 11., 10., 10., 10., 11., 10., 11.,
        9., 12., 11., 10., 13., 11., 11., 12., 11., 10.,  9.,  9.,  9.,
       10., 10., 10., 12.,  9., 11., 11., 12.,  9., 10., 11., 12., 10.,
       11.,  9., 12.,  9., 10., 10., 12., 11., 12.])

In [23]:
def _as_episode_frame(values):
    """Return `values` as a DataFrame with columns ``["episode", "reward"]``.

    Parameters
    ----------
    values
        Either a raw reward sequence (anything ``pd.DataFrame`` accepts as a
        single column) or a frame previously produced by this helper.

    Returns
    -------
    pd.DataFrame
        One row per entry; ``episode`` is the positional index taken from
        ``reset_index()`` and ``reward`` is the original value.
    """
    # The cell rebinds the same names it reads, so on a re-run the inputs are
    # already frames. Converting them again would add a third column via
    # reset_index() and the two-name column assignment would raise ValueError.
    # Pass already-converted frames through unchanged instead.
    if isinstance(values, pd.DataFrame) and list(values.columns) == ["episode", "reward"]:
        return values

    frame = pd.DataFrame(values).reset_index()
    frame.columns = ["episode", "reward"]
    return frame


# SARSA agent: training and attempt rewards.
rewards = _as_episode_frame(rewards)
test_rewards = _as_episode_frame(test_rewards)

# Q-learning agent: training and attempt rewards.
rewards_0 = _as_episode_frame(rewards_0)
test_rewards_0 = _as_episode_frame(test_rewards_0)


In [17]:
# Side-by-side scatter plots for the SARSA agent: rewards from runner.train on
# the left, rewards from runner.attempt on the right. Each panel gets a title
# so the two can be told apart when the figure is viewed on its own.
(
    alt.Chart(rewards)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="SARSA: training reward per episode")
    | alt.Chart(test_rewards)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="SARSA: attempt reward per episode")
)


In [24]:
# Echo the `rewards_0` frame (columns: episode, reward) for the Q-learning training run.
rewards_0

Unnamed: 0,episode,reward
0,0,38.0
1,1,22.0
2,2,19.0
3,3,15.0
4,4,17.0
...,...,...
995,995,11.0
996,996,9.0
997,997,8.0
998,998,9.0


In [25]:
# Echo the `test_rewards_0` frame (columns: episode, reward) for the Q-learning attempt run.
test_rewards_0

Unnamed: 0,episode,reward
0,0,10.0
1,1,12.0
2,2,10.0
3,3,12.0
4,4,10.0
...,...,...
95,95,10.0
96,96,10.0
97,97,12.0
98,98,11.0


In [26]:
# Side-by-side scatter plots for the Q-learning agent: rewards from
# runner.train on the left, rewards from runner.attempt on the right. Each
# panel gets a title so the two can be told apart when viewed on its own.
(
    alt.Chart(rewards_0)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="Q-learning: training reward per episode")
    | alt.Chart(test_rewards_0)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="Q-learning: attempt reward per episode")
)
