In [2]:
from pprint import pprint
import gym
import nle
import numpy as np
import random
from gym import envs
import time
import matplotlib.pyplot as plt

In [3]:
# Instantiate the 'NetHack-v0' environment through gym (max_episode_steps=100000)
# and reset it to obtain the first observation.
# NOTE(review): the environment id is presumably registered by `import nle` — confirm.
env = gym.make('NetHack-v0', max_episode_steps=100000)
obs = env.reset()

In [6]:
# Show the structure of the observations this environment returns.
print(env.observation_space)

Dict(blstats:Box([-2147483648 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648
 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648
 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648
 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648 -2147483648
 -2147483648 -2147483648], [2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647 2147483647 2147483647 2147483647 2147483647
 2147483647 2147483647], (26,), int64), chars:Box([[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], [[255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 ...
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]
 [255 255 255 ... 255 255 255]], (21, 79), uint8), colors:Box([[0

In [68]:
# Random-action baseline: step the environment RUNS times with actions drawn
# from env.action_space.sample(), counting how many steps produced a negative,
# positive, or zero reward and summing the reward over all steps.
value = 0     # running sum of every reward received
neg_acts = 0  # steps with reward < 0
pos_acts = 0  # steps with reward > 0
neu_acts = 0  # steps with reward == 0
RUNS = 1000000
for i in range(RUNS):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    if reward < 0.0:
        neg_acts += 1
    elif reward > 0.0:
        pos_acts += 1
    else:
        neu_acts += 1
    value += reward
    if done:
        # Rebind `obs` to the new episode's first observation; discarding the
        # return value would leave `obs` holding the finished episode's
        # terminal observation for any later cell that reads it.
        obs = env.reset()
# Counts are reported as fractions of RUNS; the reward rate is reward per step.
print(f'Neg Acts: {neg_acts/RUNS:.4f}\tPos Acts: {pos_acts/RUNS:.4f}\tNeu Acts: {neu_acts/RUNS:.4f}\n')
print(f'Total Value: {value}\tReward Rate: {value/RUNS:.4f}')

Neg Acts: 1	Pos Acts: 0.0008	Neu Acts: 0.9992

Total Value: 6112.0	Reward Rate: 0.0061


In [79]:
def test_algo(model, env):
    value = 0
    neg_acts = 0
    pos_acts = 0
    neu_acts = 0
    RUNS = 1000000
    obs = env.reset()
    for i in range(RUNS):
        action = model.sample(obs)
        new_obs, reward, done, info = env.step(action)
        model.update(obs, new_obs, reward, action)
        if reward < 0.0:
            neg_acts += 1
        elif reward > 0.0:
            pos_acts += 1
        else:
            neu_acts += 1
        value += reward
        if done:
            new_obs = env.reset()
        obs = new_obs
    print(f'Neg Acts: {neg_acts/RUNS:.4f}\tPos Acts: {pos_acts/RUNS:.4f}\tNeu Acts: {neu_acts/RUNS:.4f}\n')
    print(f'Total Value: {value}\tReward Rate: {value/RUNS:.4f}')

In [80]:
class SimpModel:
    """Baseline model: every action comes straight from the environment's action space."""

    def __init__(self, env):
        """Remember the environment whose action space is sampled from."""
        self.env = env

    def sample(self, obs):
        """Return ``env.action_space.sample()``; ``obs`` is not consulted."""
        action_space = self.env.action_space
        return action_space.sample()

    def update(self, obs, new_obs, reward, action):
        """Do nothing; this model keeps no state to learn from a transition."""

In [75]:
# Evaluate the SimpModel baseline with the shared test_algo harness.
goon = SimpModel(env)
test_algo(goon, env)

Neg Acts: 0.0000	Pos Acts: 0.0008	Neu Acts: 0.9992

Total Value: 5547.0	Reward Rate: 0.0055


In [78]:
class NextSimp:
    """Model that favours actions in proportion to how often they were rewarded.

    One weight is kept per action, all starting at 1. Actions are drawn with
    probability equal to their weight divided by the sum of all weights, and
    an action's weight grows by 1 each time it yields a positive reward.
    """

    def __init__(self, env):
        """Store ``env`` and create one unit weight per action in its action space."""
        self.env = env
        n_actions = env.action_space.n
        self.p_reward_dist = np.ones(n_actions)

    def sample(self, obs):
        """Draw an action index from the normalised weights; ``obs`` is not consulted."""
        weights = self.p_reward_dist
        candidates = np.arange(weights.shape[0])
        probabilities = weights / weights.sum()
        return np.random.choice(candidates, p=probabilities)

    def update(self, obs, new_obs, reward, action):
        """Add 1 to the weight of ``action`` when ``reward`` is positive; otherwise no change."""
        if not reward > 0:
            return
        self.p_reward_dist[action] += 1

In [81]:
# Evaluate the reward-weighted NextSimp model with the shared test_algo harness.
boon = NextSimp(env)
test_algo(boon, env)

Neg Acts: 0.0000	Pos Acts: 0.0029	Neu Acts: 0.9971

Total Value: 21913.0	Reward Rate: 0.0219
