In [None]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np

from game import Game
from teacher import Teacher
from agent import Qlearner, SARSAlearner

# SARSA algorithm

Initialize the learning agent. We will use SARSA for now. All Q-values are initialized to 0.

In [None]:
# Build the SARSA learner. The keyword names suggest the usual RL meanings
# (alpha: learning rate, gamma: discount factor, eps: exploration probability,
# eps_decay: per-step decay of eps) -- TODO confirm against agent.py.
agent = SARSAlearner(
    alpha=0.5,
    gamma=0.9,
    eps=0.5,
    eps_decay=1e-5,
)

Play one game against the un-trained agent. This should be easy to win.

In [None]:
# No teacher is passed here, so this game is played against the agent
# without the teacher opponent used for training later in the notebook.
game = Game(agent)
game.start()

Initialize the teacher agent. The teacher agent knows the optimal tic-tac-toe strategy. "level" specifies how often
the teacher will make the optimal move vs. a randomly-selected move (i.e., it is the probability of playing the optimal move)

In [None]:
# level=0.9: this notebook describes `level` as the probability that the
# teacher plays the optimal move instead of a random one.
teacher = Teacher(level=0.9)

Teach the SARSA agent via 100000 episodes (games)

In [None]:
# Train the SARSA agent: one fresh Game per episode with the teacher as the
# opponent, printing a progress line after every 5000 completed games.
episodes = 100000
for i in range(episodes):
    game = Game(agent, teacher=teacher)
    game.start()
    games_played = i + 1
    if games_played % 5000 == 0:
        print("Games played: %i" % games_played)

Visualize cumulative reward vs. episode

In [None]:
def plot_agent_reward(rewards):
    """Plot the running total of an agent's rewards.

    Parameters
    ----------
    rewards : sequence of numbers
        Reward history to accumulate. The x-axis is the index into this
        sequence and is labelled 'Time (# of actions)'; whether each entry
        corresponds to one action or one episode depends on how the agent
        records rewards -- TODO confirm against agent.py.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    running_total = np.cumsum(rewards)
    plt.plot(running_total)
    # Axis labels and title are independent of one another; order is irrelevant.
    plt.xlabel('Time (# of actions)')
    plt.ylabel('Cumulative Reward')
    plt.title('Cumulative Reward vs. Time')
    plt.show()

In [None]:
# Cumulative reward of the SARSA agent; agent.rewards is presumably the
# reward history recorded during the games above -- confirm in agent.py.
plot_agent_reward(agent.rewards)

Now try playing the agent again, post-learning

In [None]:
# Turn off exploration before playing so the trained SARSA agent always
# picks its greedy move. Note this mutates the agent in place.
agent.eps = 0. # agent always takes greedy strategy
game = Game(agent)
game.start()

# Q-learning algorithm

Now, repeat with Q-learning agent

In [None]:
# Create a Q-learning agent and train it against the teacher, one fresh Game
# per episode, printing a progress line after every 5000 completed games.
agent_q = Qlearner(
    alpha=0.5,
    gamma=0.9,
    eps=0.5,
    eps_decay=1e-5,
)

episodes = 100000
for i in range(episodes):
    game = Game(agent_q, teacher=teacher)
    game.start()
    games_played = i + 1
    if games_played % 5000 == 0:
        print("Games played: %i" % games_played)

In [None]:
# Cumulative reward of the Q-learning agent; agent_q.rewards is presumably
# the reward history recorded during its training games -- confirm in agent.py.
plot_agent_reward(agent_q.rewards)

Try playing the agent, post-learning

In [None]:
# Play against the trained Q-learning agent. The original cell referenced
# `agent` (the SARSA learner), so the Q-learner was never actually played
# and its exploration rate was never switched off.
agent_q.eps = 0.  # agent always takes greedy strategy
game = Game(agent_q)
game.start()