# First Agent In Python

In [38]:
import gym
import gym_toytext
import numpy as np

In [39]:
env = gym.make("NChain-v0")

In [40]:
def naive_sum_reward_agent(env, num_episodes = 500):
    r_table = np.zeros((5,2))
    for g in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            if np.sum(r_table[s, :]) == 0:
                # make a random selection of actions
                a = np.random.randint(0, 2)
            else:
                # select the action with highest cumulative reward
                a = np.argmax(r_table[s, :])
            new_s, r, done, _ = env.step(a)
            r_table[s, a] += r
            s = new_s
    return r_table

In [41]:
%%time
reward_table = naive_sum_reward_agent(env, 100)
reward_table

CPU times: user 730 ms, sys: 25.7 ms, total: 755 ms
Wall time: 739 ms


array([[     0., 122726.],
       [     0.,  24822.],
       [  1178.,      0.],
       [     0.,   3896.],
       [ 18774.,      0.]])

In [42]:
def q_learning_with_table(env, num_episodes=100):
    q_table = np.zeros((5,2))
    y = 0.95
    lr = 0.8
    for i in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            if np.sum(q_table[s,:]) == 0:
                # make a random selection of actions
                a = np.random.randint(0, 2)
            else:
                # select the action with largest q value in state s
                a = np.argmax(q_table[s, :])
            new_s, r, done, _ = env.step(a)
            q_table[s,a] += r + lr*(y*np.max(q_table[new_s, :]) - q_table[s,a])
            s = new_s
    return q_table

In [44]:
%%time
q_learning_with_table(env, 500)

CPU times: user 5.1 s, sys: 34.5 ms, total: 5.13 s
Wall time: 5.15 s


array([[ 0.        , 28.38378521],
       [ 0.        , 27.67570783],
       [ 0.        , 27.80012031],
       [30.07206428,  0.        ],
       [30.854693  ,  0.        ]])

In [19]:
def eps_greedy_q_learning_with_table(env, num_episodes=500):
    q_table = np.zeros((5, 2))
    y = 0.95
    eps = 0.5
    lr = 0.8
    decay_factor = 0.999
    for i in range(num_episodes):
        s = env.reset()
        eps *= decay_factor
        done = False
        while not done:
            # select the action with highest cummulative reward
            if np.random.random() < eps or np.sum(q_table[s, :]) == 0:
                a = np.random.randint(0, 2)
            else:
                a = np.argmax(q_table[s, :])
            # pdb.set_trace()
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += r + lr * (y * np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
    return q_table

In [32]:
%%time
eps_greedy_q_learning_with_table(env, 500)

CPU times: user 4.94 s, sys: 40.3 ms, total: 4.98 s
Wall time: 4.97 s


array([[60.95085983, 59.67865717],
       [61.38658357, 60.04094886],
       [62.67389025, 61.36423056],
       [66.00436036, 64.91592648],
       [62.56613276, 61.14965081]])

In [23]:
def run_game(table, env):
    s = env.reset()
    tot_reward = 0
    done = False
    while not done:
        a = np.argmax(table[s, :])
        s, r, done, _ = env.step(a)
        tot_reward += r
    return tot_reward

In [35]:
def test_methods(env, num_iterations=100):
    winner = np.zeros((3,))
    for g in range(num_iterations):
        if g % 10 == 0:
            print("Game {} of {}".format(g + 1, num_iterations))
        # Train all models
        m0_table = naive_sum_reward_agent(env, 500)
        m1_table = q_learning_with_table(env, 500)
        m2_table = eps_greedy_q_learning_with_table(env, 500)
        # run a game with each model
        m0_score = run_game(m0_table, env)
        m1_score = run_game(m1_table, env)
        m2_score = run_game(m2_table, env)
        # determine which model got the most points, declare them as winner
        w = np.argmax(np.array([m0_score, m1_score, m2_score]))
        winner[w] += 1
    print("Done")
    return winner
        

In [36]:
%%time
results = test_methods(env, 100)
results

Game 1 of 100
Game 11 of 100
Game 21 of 100
Game 31 of 100
Game 41 of 100
Game 51 of 100
Game 61 of 100
Game 71 of 100
Game 81 of 100
Game 91 of 100
Done
CPU times: user 21min 55s, sys: 3.48 s, total: 21min 58s
Wall time: 21min 57s


array([21., 12., 67.])