In [None]:
import numpy as np
import gym
import matplotlib.pyplot as plt

import tensorflow as tf

%matplotlib inline

# Load GYM environment

In [None]:
# Deterministic FrozenLake (slipping disabled); frames are rendered as RGB arrays.
env = gym.make(
    'FrozenLake-v1',
    is_slippery=False,
    render_mode='rgb_array',
)

# Sizes of the discrete observation and action spaces.
NUM_STATES, NUM_ACTIONS = env.observation_space.n, env.action_space.n

print(f'States: {NUM_STATES}')
print(f'Actions: {NUM_ACTIONS}')

# Settings

In [None]:
# Optimizer step size and discount factor used in the Q-learning target.
lr = 0.8
gamma = 0.95

# Training schedule: number of episodes and the per-episode step cap.
NUM_EPISODES = 3000
MAX_STEPS = 100

# Number of most recent episodes averaged when reporting reward.
REWARD_AVERAGE_WINDOW = 20

# DQN Model

In [None]:
# Q-network: one linear layer without bias. It is fed one-hot encoded states
# (see evalQ) and emits one value per action.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Dense(
        NUM_ACTIONS,
        activation=None,
        use_bias=False,
        kernel_initializer=tf.initializers.random_uniform(0, 0.01),
    )
)


def evalQ(s):
    """Return the model's outputs for state index `s` as a 1-D numpy array.

    The state is one-hot encoded with depth NUM_STATES and passed through the
    model as a batch of one; the single batch row is returned.
    """
    state_one_hot = tf.one_hot([s], NUM_STATES)
    q_values = model(state_one_hot)
    return q_values.numpy()[0]

# Prepare model for training

In [None]:
def loss(q1, q2):
    """Sum of squared differences between two tensors of Q-values.

    Used as the Keras loss for the model; the expression is symmetric in its
    two arguments, so their order does not matter.
    """
    # Fixed typo: the function is tf.reduce_sum (was `tf.resuce_sum`).
    return tf.reduce_sum(tf.square(q1 - q2))


# Plain gradient descent with step size `lr`.
# NOTE(review): this is a TF1-compat optimizer; newer Keras releases may
# require tf.keras.optimizers.SGD instead -- confirm against the installed TF.
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=lr)
model.compile(loss=loss, optimizer=optimizer)

# Per-episode training history, filled in by the training loop.
pathLenList = []
totalRewardList = []
# Named to match its uses in the training loop and the reward plot
# (the original definition was spelled `totalRewardAverageList`).
totalRewardAveragesList = []

# DQN training

In [None]:
for i in range(NUM_EPISODES):
    # Exploration rate: starts at 0.1 and decays as the episode index grows.
    eps = 1. / ((i / 50) + 10)

    # Gym >= 0.26 returns (observation, info) from reset(); older versions
    # return the observation alone. Accept both.
    reset_out = env.reset()
    s = reset_out[0] if isinstance(reset_out, tuple) else reset_out

    totalReward = 0
    step = 0

    while step < MAX_STEPS:
        step += 1

        Q_s = evalQ(s)

        # Epsilon-greedy action selection.
        if np.random.rand() < eps:
            a = env.action_space.sample()
        else:
            a = int(np.argmax(Q_s))

        # Gym >= 0.26 returns (obs, reward, terminated, truncated, info);
        # older versions return (obs, reward, done, info).
        step_out = env.step(a)
        if len(step_out) == 5:
            s1, r, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            s1, r, done, _ = step_out

        # Target equals the current prediction except for the taken action,
        # so only that action's output contributes to the loss.
        Q_target = Q_s.copy()
        if done:
            Q_target[a] = r
        else:
            # Bootstrap from the best action value in the next state
            # (the original referenced an undefined name `Q_s1a`).
            Q_s1 = evalQ(s1)
            Q_target[a] = r + gamma * np.max(Q_s1)
        Q_target = tf.constant(Q_target)[None, ...]  # add batch dimension
        model.train_on_batch(tf.one_hot([s], NUM_STATES), Q_target)

        totalReward += r
        s = s1
        if done:
            break
    pathLenList.append(step)
    totalRewardList.append(totalReward)

    # Record the mean reward over the last window once per window.
    if i % REWARD_AVERAGE_WINDOW == 0 and i >= REWARD_AVERAGE_WINDOW:
        totalRewardAverage = np.mean(totalRewardList[-REWARD_AVERAGE_WINDOW:])
        totalRewardAveragesList.append(totalRewardAverage)
        if i % 100 == 0:
            # .format must be applied to the string, not to print()'s None result.
            print('Episode {}: average total reward = {}'.format(i, totalRewardAverage))

# Q-Table

In [None]:
# `Q` was never defined in the notebook. Build the table by evaluating the
# model for every state: one row per state, one column per action.
Q = np.array([evalQ(state) for state in range(NUM_STATES)])
print(Q)

# Plot trajectory length over time

In [None]:
# Number of steps taken in each episode, drawn on the current axes with a grid.
axes = plt.gca()
axes.plot(pathLenList)
axes.grid()

# Plot average total reward over time

In [None]:
# Windowed average of total episode reward, drawn on the current axes with a grid.
axes = plt.gca()
axes.plot(totalRewardAveragesList)
axes.grid()

# Run simulation

In [None]:
# Roll out the greedy policy (always the highest-valued action) for one episode.
# Gym >= 0.26 returns (observation, info) from reset(); older versions return
# the observation alone. Accept both.
reset_out = env.reset()
s = reset_out[0] if isinstance(reset_out, tuple) else reset_out

for _ in range(1000):
    # The value returned by render() is not used here.
    env.render()
    a = int(np.argmax(evalQ(s)))

    # Gym >= 0.26 returns (obs, reward, terminated, truncated, info);
    # older versions return (obs, reward, done, info).
    step_out = env.step(a)
    if len(step_out) == 5:
        s, r, terminated, truncated, _ = step_out
        done = terminated or truncated
    else:
        s, r, done, _ = step_out

    if done:
        print(f'Reward = {r}')
        break
env.close()