In [31]:
import gym
import numpy as np
import random
from gym import envs
import time
import matplotlib.pyplot as plt

In [None]:
# List everything in gym's environment registry; as the last expression in
# the cell, the result is what the notebook displays.
envs.registry.all()

# Broken Example

In [26]:
# Create the Taxi-v3 environment and bind it to the notebook-global `env`.
env = gym.make('Taxi-v3')

In [48]:
class QAlgo():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table has one row per discrete state and one column per discrete
    action, both sized from ``env.observation_space.n`` and
    ``env.action_space.n``.
    """

    def __init__(self, env, learning_rate=0.9, discount_rate=0.9, eps=1.0, decay=0.01):
        """Size the Q-table from ``env`` and store the hyperparameters.

        Args:
            env: object exposing ``observation_space.n`` and ``action_space.n``.
            learning_rate: step size applied to the TD error in ``update_table``.
            discount_rate: weight of the best next-state value in the TD target.
            eps: initial probability of picking a random action.
            decay: rate used by ``reduce_eps`` for the exponential decay.
        """
        self.state_size = env.observation_space.n
        self.action_size = env.action_space.n
        self.q_table = np.zeros((self.state_size, self.action_size))
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.eps = eps
        self.decay = decay

    def reset_eps(self):
        """Set epsilon back to 1 so every action is chosen at random."""
        self.eps = 1.0

    def reduce_eps(self, episode):
        """Set epsilon to ``exp(-decay * episode)``."""
        self.eps = np.exp(-self.decay * episode)

    def get_sample(self, state):
        """Return an epsilon-greedy action index for ``state``.

        With probability ``eps`` a uniformly random action is returned,
        otherwise the argmax of the Q-table row for ``state``.
        """
        if random.uniform(0, 1) < self.eps:
            return random.randrange(self.action_size)
        return np.argmax(self.q_table[state, :])

    def update_table(self, new_state, state, action, reward):
        """Apply one Q-learning update for the transition (state, action) -> new_state.

        Q(s, a) <- Q(s, a) + lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        """
        # The learning rate must scale the whole TD error, not just the reward.
        best_next_value = np.max(self.q_table[new_state, :])
        td_target = reward + self.discount_rate * best_next_value
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

In [49]:
# Build the agent for the current global `env` with QAlgo's default
# hyperparameters, then set epsilon to 1 via reset_eps().
goon = QAlgo(env)
goon.reset_eps()

In [None]:
# Train `goon` on `env` for 1000 episodes, each capped at `max_steps` steps.
# `max_steps` is left in the notebook namespace and reused by later cells.
max_steps = 100
for i in range(1000):
    state = env.reset()
    done = False
    for j in range(max_steps):
        # Epsilon-greedy action for the current state.
        action = goon.get_sample(state)
        new_state, reward, done, info = env.step(action)
        # Update the Q-table entry for (state, action) from this transition.
        goon.update_table(new_state, state, action, reward)
        state = new_state
        if done:
            break
    # Recompute epsilon from the episode index.
    goon.reduce_eps(i)
    # Render the environment once every 100 episodes.
    if i % 100 == 0:
        env.render()
# NOTE(review): `env` is closed here, yet the following cell calls
# env.reset()/env.step() on the same object — confirm close() is harmless
# for this environment.
env.close()

In [None]:
# Replay one episode acting greedily on the learned Q-table and print the
# accumulated reward.
state = env.reset()
done = False
rewards = 0
for s in range(max_steps):
    action = np.argmax(goon.q_table[state,:])
    new_state, reward, done, info = env.step(action)
    rewards += reward
    env.render()
    # Advance to the observed state; otherwise every step would reuse the
    # action chosen for the initial state.
    state = new_state
    if done:
        break
print(rewards)

In [53]:
def main():
    """Train a tabular Q-learning agent on Taxi-v3, then replay one greedy episode.

    Training runs ``num_episodes`` episodes of at most ``max_steps`` steps with
    an epsilon-greedy policy whose epsilon decays exponentially with the
    episode index. After a prompt, one episode is played with the greedy
    policy, rendering every step and printing the running score.
    """

    # create Taxi environment
    env = gym.make('Taxi-v3')

    # try/finally so the environment is closed even if training or playback raises
    try:
        # initialize q-table
        state_size = env.observation_space.n
        action_size = env.action_space.n
        qtable = np.zeros((state_size, action_size))

        # hyperparameters
        learning_rate = 0.9
        discount_rate = 0.8
        epsilon = 1.0
        decay_rate = 0.005

        # training variables
        num_episodes = 1000
        max_steps = 99  # per episode

        # training
        for episode in range(num_episodes):

            # reset the environment
            state = env.reset()
            done = False

            for s in range(max_steps):

                # exploration-exploitation tradeoff
                if random.uniform(0, 1) < epsilon:
                    # explore
                    action = env.action_space.sample()
                else:
                    # exploit
                    action = np.argmax(qtable[state, :])

                # take action and observe reward
                new_state, reward, done, info = env.step(action)

                # Q-learning update: move Q(s, a) toward the TD target
                td_target = reward + discount_rate * np.max(qtable[new_state, :])
                qtable[state, action] = qtable[state, action] + learning_rate * (td_target - qtable[state, action])

                # Update to our new state
                state = new_state

                # if done, finish episode
                if done:
                    break

            # Decrease epsilon
            epsilon = np.exp(-decay_rate * episode)

        print(f"Training completed over {num_episodes} episodes")
        input("Press Enter to watch trained agent...")

        # watch trained agent
        state = env.reset()
        done = False
        rewards = 0

        for s in range(max_steps):

            print("TRAINED AGENT")
            print("Step {}".format(s + 1))

            # always exploit the learned table during playback
            action = np.argmax(qtable[state, :])
            new_state, reward, done, info = env.step(action)
            rewards += reward
            env.render()
            print(f"score: {rewards}")
            state = new_state

            if done:
                break
    finally:
        env.close()

if __name__ == "__main__":
    main()



Training completed over 1000 episodes


Press Enter to watch trained agent... 


TRAINED AGENT
Step 1
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
score: -1
TRAINED AGENT
Step 2
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
score: -2
TRAINED AGENT
Step 3
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
score: -3
TRAINED AGENT
Step 4
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
score: -4
TRAINED AGENT
Step 5
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[42m_[0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
score: -5
TRAINED AGENT
Step 6
+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[42m_[0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (West)
score: -6
TRAINED AGENT
Step 7
+---------+
|[35mR[0m: | : :G|
| :[42m_[0m| : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)

In [54]:
import nle

In [55]:
# Create the NetHackScore-v0 environment.
# NOTE: this rebinds the notebook-global `env`, which earlier cells bound to
# Taxi-v3; re-running those cells after this one would use the wrong env.
env = gym.make('NetHackScore-v0')

In [62]:
# Display the environment's `character` attribute (a string such as
# 'mon-hum-neu-mal' in the recorded output).
env.character

'mon-hum-neu-mal'

In [57]:
# Reset the environment and keep the value it returns as `obs`.
obs = env.reset()

In [61]:
# Display the observation returned by env.reset(); the recorded output is the
# observation dict (glyphs/chars/colors/specials), not the env object.
obs

{'glyphs': array([[2359, 2359, 2359, ..., 2359, 2359, 2359],
        [2359, 2359, 2359, ..., 2359, 2359, 2359],
        [2359, 2359, 2359, ..., 2359, 2359, 2359],
        ...,
        [2359, 2359, 2359, ..., 2359, 2359, 2359],
        [2359, 2359, 2359, ..., 2359, 2359, 2359],
        [2359, 2359, 2359, ..., 2359, 2359, 2359]], dtype=int16),
 'chars': array([[32, 32, 32, ..., 32, 32, 32],
        [32, 32, 32, ..., 32, 32, 32],
        [32, 32, 32, ..., 32, 32, 32],
        ...,
        [32, 32, 32, ..., 32, 32, 32],
        [32, 32, 32, ..., 32, 32, 32],
        [32, 32, 32, ..., 32, 32, 32]], dtype=uint8),
 'colors': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'specials': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0,