In [1]:
import gym
import numpy as np

In [2]:
env = gym.make('NChain-v0')
env.reset()

0

In [43]:
env.step(0)

(2, 0, False, {})

### Naive Reward Table

In [44]:
def naive_sum_reward_agent(env, num_episodes=500, freq=50):
    r_table = np.zeros([env.observation_space.n, env.action_space.n])
    r_sum = 0.0
    for episode in range(num_episodes):
        s = env.reset()
        done = False
        while not done:
            if np.sum(r_table[s, :]) == 0:
                a = env.action_space.sample()
            else:
                a = np.argmax(r_table[s, :])
            new_s, r, done, _ = env.step(a)
            r_table[s, a] += r
            s = new_s
            r_sum += r
            
        if (episode+1) % freq == 0:
            print('Episode {} Total Reward: {}'.format(episode+1, r_sum/freq))
            r_sum = 0.0
    return r_table

In [50]:
naive_sum_reward_agent(env)

Episode 50 Total Reward: 1005.8
Episode 100 Total Reward: 1010.0
Episode 150 Total Reward: 1007.92
Episode 200 Total Reward: 1004.84
Episode 250 Total Reward: 1009.6
Episode 300 Total Reward: 1010.36
Episode 350 Total Reward: 1004.44
Episode 400 Total Reward: 1003.96
Episode 450 Total Reward: 1008.32
Episode 500 Total Reward: 1004.84


array([[ 99670.,      0.],
       [     0., 319296.],
       [     0.,  64472.],
       [     0.,  12584.],
       [     0.,   7482.]])

##### Q: What are the problems with this approach?

## Basic Q-Learning

In [51]:
def eps_greedy_q_learning_with_table(env, num_episodes=500, freq=50):
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    r_sum = 0.0
    
    epsi = 0.5
    epsi_min = 0.01
    epsi_decay_factor = 0.999
    
    gamma = 0.95
    lr = 0.8
    for episode in range(num_episodes):
        s = env.reset()
        
        if epsi > epsi_min:
            epsi *= epsi_decay_factor
            
        done = False
        while not done:
            if np.random.random() < epsi or np.sum(q_table[s, :]) == 0:
                a = env.action_space.sample()
            else:
                a = np.argmax(q_table[s, :])
            new_s, r, done, _ = env.step(a)
            q_table[s, a] += lr * (r + gamma * np.max(q_table[new_s, :]) - q_table[s, a])
            s = new_s
            r_sum += r
        if (episode+1) % freq == 0:
            print('Episode {} Total Reward: {}'.format(episode+1, r_sum/freq))
            r_sum = 0.0
    return q_table

In [52]:
eps_greedy_q_learning_with_table(env)

Episode 50 Total Reward: 1588.76
Episode 100 Total Reward: 1662.12
Episode 150 Total Reward: 1657.32
Episode 200 Total Reward: 1706.12
Episode 250 Total Reward: 1707.68
Episode 300 Total Reward: 1744.32
Episode 350 Total Reward: 1755.88
Episode 400 Total Reward: 1770.72
Episode 450 Total Reward: 1811.84
Episode 500 Total Reward: 1750.76


array([[41.37070639, 41.51471144],
       [41.43445376, 41.79253673],
       [41.04298802, 42.69620204],
       [42.60572151, 42.72748195],
       [43.16423547, 43.66766907]])

## Deep Q-Learning

In [53]:
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


In [54]:
def initialize_nn():
    model = Sequential()
    model.add(Dense(10, input_dim=5, activation='relu'))
    model.add(Dense(2, activation='linear'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model

In [55]:
def eps_greedy_deep_q_learning(env, num_episodes=500, freq=50):
    r_sum = 0.0
    
    gamma = 0.95
    lr = 0.1
    
    epsi = 0.8
    epsi_decay_factor = 0.999
    epsi_min = 0.01
    
    model = initialize_nn()
    
    for episode in range(num_episodes):
        s = env.reset()
        
        if epsi > epsi_min:
            epsi *= epsi_decay_factor
        
        done = False
        while not done:
            tmp1 = np.identity(5)[s : s + 1]
            
            if np.random.random() < epsi:
                a = env.action_space.sample()
            else:
                a = np.argmax(model.predict(tmp1))
            new_s, r, done, _ = env.step(a)

            tmp2 = np.identity(5)[new_s : new_s + 1]
            target = r + gamma * np.max(model.predict(tmp2))
            
            target_vec = model.predict(tmp1)
            target_vec[0][a] = target
            model.fit(tmp1, target_vec, epochs=1, verbose=0, batch_size=1)
            
            s = new_s
            r_sum += r
        if (episode+1) % freq == 0:
            print('Episode {} Total Reward: {}'.format(episode+1, r_sum/freq))
            r_sum = 0.0
    return model

In [56]:
eps_greedy_deep_q_learning(env)

Episode 50 Total Reward: 1401.88
Episode 100 Total Reward: 1472.92
Episode 150 Total Reward: 1503.8
Episode 200 Total Reward: 1574.96
Episode 250 Total Reward: 1603.08
Episode 300 Total Reward: 1682.6
Episode 350 Total Reward: 1706.92
Episode 400 Total Reward: 1764.32
Episode 450 Total Reward: 1804.48
Episode 500 Total Reward: 1901.76


<keras.engine.sequential.Sequential at 0x1a23282860>

## Deep Q-learning with replay memory

In [57]:
def initialize_two_layer_nn():
    model = Sequential()
    model.add(Dense(10, input_dim=5, activation='relu'))
    model.add(Dense(5,  activation='relu'))
    model.add(Dense(2, activation='linear'))
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    return model

In [58]:
from collections import deque
import random

def eps_greedy_deep_q_learning_with_memory(env, num_episodes=500, freq=50):
    r_sum = 0.0
    
    gamma = 0.95
    lr = 0.1
    
    epsi = 0.8
    epsi_decay_factor = 0.999
    epsi_min = 0.01
    
    model = initialize_two_layer_nn()
    memory = deque(maxlen=2000)
    batch_size = 32
    for episode in range(num_episodes):
        s = env.reset()
        
        if epsi > epsi_min:
            epsi *= epsi_decay_factor
            
        done = False
        while not done:
            tmp1 = np.identity(5)[s : s + 1]
            
            if np.random.random() < epsi:
                a = env.action_space.sample()
            else:
                a = np.argmax(model.predict(tmp1))
            new_s, r, done, _ = env.step(a)

            memory.append((s, a, r, new_s, done))
            
            s = new_s
            r_sum += r
            
        if len(memory) < batch_size:
            minibatch = random.sample(memory, len(memory))
        else:
            minibatch = random.sample(memory, batch_size)  
            
        for ss, aa, rr, new_ss, dd in minibatch:
            target = rr
            if not dd:
                tmp2 = np.identity(5)[new_ss : new_ss + 1]
                target = rr + gamma * np.max(model.predict(tmp2))
                    
            tmp1 = np.identity(5)[ss : ss + 1]
            target_vec = model.predict(tmp1)
                
            target_vec[0][aa] = target
            model.fit(tmp1, target_vec, epochs=1, verbose=0, batch_size=1)
            
        if (episode+1) % freq == 0:
            print('Episode {} Total Reward: {}'.format(episode+1, r_sum/freq))
            r_sum = 0.0
    return model

In [59]:
deepQ_model = eps_greedy_deep_q_learning_with_memory(env)

Episode 50 Total Reward: 1298.52
Episode 100 Total Reward: 1283.64
Episode 150 Total Reward: 1413.0
Episode 200 Total Reward: 1556.92
Episode 250 Total Reward: 1622.0
Episode 300 Total Reward: 1687.0
Episode 350 Total Reward: 1719.76
Episode 400 Total Reward: 1766.08
Episode 450 Total Reward: 1810.16
Episode 500 Total Reward: 1891.56


In [60]:
import time

def play_game_with_nn(env, model):
    s = env.reset()
    done = False
    while not done:
        tmp1 = np.identity(5)[s : s + 1]
        a = np.argmax(model.predict(tmp1))
        
        new_s, r, done, _ = env.step(a)
        print(env.step(a))
        s = new_s
        time.sleep(.5) 

In [61]:
play_game_with_nn(env,deepQ_model)

(2, 0, False, {})
(1, 0, False, {})
(0, 2, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, False, {})
(3, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, False, {})
(3, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, False, {})
(3, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, False, {})
(1, 0, False, {})
(3, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, False, {})
(3, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(1, 0, False, {})
(3, 0, False, {})
(1, 0, False, {})
(3, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(1, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, Fal

(1, 0, False, {})
(3, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(1, 0, False, {})
(3, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(0, 2, False, {})
(1, 0, False, {})
(3, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(0, 2, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, False, {})
(4, 10, False, {})
(0, 2, False, {})
(2, 0, False, {})
(1, 0, False, {})
(3, 0, False, {})
(1, 0, False, {})
(1, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(0, 2, False, {})
(2, 0, False, {})
(1, 0, False, {})
(1, 0, False, {})
(3, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(0, 2, False, {})
(2, 0, False, {})
(4, 0, False, {})
(4, 10, False, {})
(4, 10, False, {})
(4, 10, False, {})
(1, 0, True, {})
(3, 0, True, {})
