In [1]:
from gym_runner import GymRunner
from q_func_approx import QualityFuncApprox
from agents.sarsa_agent import SarsaAgent
from agents.q_learning_agent import QLearningAgent, QLearningAgentExperienceReplay
import pandas as pd
import altair as alt



In [2]:
# Shared runner for the "CartPole-v1" environment; the train/attempt cells
# below all go through this one instance.
runner = GymRunner(
    "CartPole-v1",
    display_metrics=True,
)



In [3]:
# Number of actions, read from the environment's `action_space.n`.
num_actions = runner.env.action_space.n


In [4]:
# First entry of the observation space's shape; passed as `num_states` to the
# function approximators and agents created later in the notebook.
# NOTE(review): presumably the length of the observation vector rather than a
# count of distinct states — confirm against QualityFuncApprox.
num_states = runner.env.observation_space.shape[0]
num_states  # bare expression so the notebook displays the value


4

In [5]:
# Approximator handed to the SARSA agent.
# NOTE(review): alpha here is 0.01 while the two Q-learning approximators use
# 0.001 — confirm the difference is intentional.
q_func = QualityFuncApprox(
    num_actions=num_actions,
    num_states=num_states,
    alpha=0.01,
    loss_func="l1",
    optimizer="sgd",
)


In [6]:
# Approximator handed to the plain Q-learning agent.
q_func_Q = QualityFuncApprox(
    num_actions=num_actions,
    num_states=num_states,
    alpha=0.001,
    loss_func="l1",
    optimizer="sgd",
)


In [7]:
# Approximator handed to the Q-learning agent with experience replay; same
# settings as the plain Q-learning approximator.
q_func_Q_exp = QualityFuncApprox(
    num_actions=num_actions,
    num_states=num_states,
    alpha=0.001,
    loss_func="l1",
    optimizer="sgd",
)

In [8]:
# SARSA agent backed by `q_func`.
# NOTE(review): gamma / epsilon / epsilon_decay are presumably the discount
# factor and exploration schedule — semantics live in agents.sarsa_agent.
agent = SarsaAgent(
    num_states=num_states,
    num_actions=num_actions,
    q_func_approx=q_func,
    epsilon=1.0,
    epsilon_decay=0.9975,
    gamma=0.9225,
)


In [9]:
# Q-learning agent backed by `q_func_Q`; gamma/epsilon settings match the
# SARSA agent's.
agent_Q = QLearningAgent(
    num_states=num_states,
    num_actions=num_actions,
    q_func_approx=q_func_Q,
    epsilon=1.0,
    epsilon_decay=0.9975,
    gamma=0.9225,
)


In [10]:
# Q-learning agent with experience replay, backed by `q_func_Q_exp`;
# gamma/epsilon settings match the other two agents'.
agent_Q_exp = QLearningAgentExperienceReplay(
    num_states=num_states,
    num_actions=num_actions,
    q_func_approx=q_func_Q_exp,
    epsilon=1.0,
    epsilon_decay=0.9975,
    gamma=0.9225,
)


In [11]:
# Train the SARSA agent for 1000 episodes and keep what `train` returns
# (turned into a per-episode reward table further down).
rewards = runner.train(
    agent=agent,
    num_episodes=1000,
    plot_state=False,
)



Epsilon:  0.1
Current Reward:  16.0
Episode:  990


In [12]:
# Train the Q-learning agent for 1000 episodes on the same runner.
rewards_Q = runner.train(
    agent=agent_Q,
    num_episodes=1000,
    plot_state=False,
)



Epsilon:  0.1
Current Reward:  10.0
Episode:  990


In [13]:
# Train the experience-replay Q-learning agent for 1000 episodes on the same
# runner.
rewards_Q_exp = runner.train(
    agent=agent_Q_exp,
    num_episodes=1000,
    plot_state=False,
)



Epsilon:  0.1
Current Reward:  10.0
Episode:  990


In [14]:
# Run the trained SARSA agent for 100 episodes via `runner.attempt`.
test_rewards = runner.attempt(
    agent,
    num_episodes=100,
)


In [15]:
# Run the trained Q-learning agent for 100 episodes via `runner.attempt`.
test_rewards_Q = runner.attempt(
    agent_Q,
    num_episodes=100,
)


In [16]:
# Run the trained experience-replay Q-learning agent for 100 episodes via
# `runner.attempt`.
test_rewards_Q_exp = runner.attempt(
    agent_Q_exp,
    num_episodes=100,
)


In [17]:
def _to_reward_frame(episode_rewards):
    """Return `episode_rewards` as a DataFrame with columns ["episode", "reward"].

    The input is wrapped in a DataFrame, its index is moved into a column via
    `reset_index()`, and the two resulting columns are named "episode" and
    "reward".

    If the input is already a DataFrame with exactly those two columns it is
    returned unchanged. This makes the cell safe to re-run: converting an
    already-converted frame again would yield three columns and the column
    rename would raise `ValueError`.
    """
    expected_columns = ["episode", "reward"]
    if isinstance(episode_rewards, pd.DataFrame) and (
        list(episode_rewards.columns) == expected_columns
    ):
        return episode_rewards
    frame = pd.DataFrame(episode_rewards).reset_index()
    frame.columns = expected_columns
    return frame


# The names are deliberately rebound (raw rewards -> DataFrame) because the
# chart cells below read these exact names.
rewards = _to_reward_frame(rewards)
test_rewards = _to_reward_frame(test_rewards)

rewards_Q = _to_reward_frame(rewards_Q)
test_rewards_Q = _to_reward_frame(test_rewards_Q)

rewards_Q_exp = _to_reward_frame(rewards_Q_exp)
test_rewards_Q_exp = _to_reward_frame(test_rewards_Q_exp)


In [18]:
# SARSA agent: reward per episode from `runner.train` (left) next to
# `runner.attempt` (right). Each panel is titled so this figure can be told
# apart from the other reward figures in the notebook.
(
    alt.Chart(rewards)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="SARSA: training rewards")
    | alt.Chart(test_rewards)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="SARSA: test rewards")
)


In [19]:
# Q-learning agent: reward per episode from `runner.train` (left) next to
# `runner.attempt` (right). Each panel is titled so this figure can be told
# apart from the other reward figures in the notebook.
(
    alt.Chart(rewards_Q)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="Q-learning: training rewards")
    | alt.Chart(test_rewards_Q)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="Q-learning: test rewards")
)


In [20]:
# Q-learning agent with experience replay: reward per episode from
# `runner.train` (left) next to `runner.attempt` (right). Each panel is titled
# so this figure can be told apart from the other reward figures in the
# notebook.
(
    alt.Chart(rewards_Q_exp)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="Q-learning + experience replay: training rewards")
    | alt.Chart(test_rewards_Q_exp)
    .mark_point()
    .encode(x="episode", y="reward")
    .properties(title="Q-learning + experience replay: test rewards")
)


In [21]:
# Display the concrete class of `agent` (the recorded output is SarsaAgent).
type(agent)

agents.sarsa_agent.SarsaAgent

In [22]:
# Display the concrete class of `agent_Q` (the recorded output is
# QLearningAgent).
type(agent_Q)

agents.q_learning_agent.QLearningAgent

In [23]:
# Display the concrete class of `agent_Q_exp` (the recorded output is
# QLearningAgentExperienceReplay).
type(agent_Q_exp)

agents.q_learning_agent.QLearningAgentExperienceReplay