In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [108]:
from number_guess_environment import NumberGuess3

In [111]:
# Create the guessing-game environment using the constructor's default arguments.
env = NumberGuess3()

## Explore Environment

In [114]:
# --- Spaces: draw one sample from each and report the size of the Q-table ----
# NOTE(review): the first label says "answer number", but the value comes from
# observation_space.sample() (376 in the recorded output), so it looks like a
# state index rather than a hidden answer -- confirm the wording.
print("Random valid answer number:", env.observation_space.sample())
print("Random valid guess number:", env.action_space.sample())

n_state_action_pairs = env.observation_space.n * env.action_space.n
print("\nNumber of entries in observation space X action space", n_state_action_pairs)
print("\nThe observation states:\n", env.observation_states)


# --- One hand-written state row and its index --------------------------------
r = np.array([0, 1, 2, 3, 4,  0,  0,  0,  0,  1])
print("\nA given state in the observation states:")
print(r)
# NOTE(review): the recorded get_dict_from_state output for this row shows
# answer0..answer4 = 0..4 and only guess4 set, which does not seem to match the
# description printed below ("first two hidden numbers ... (1, 3)"). The text
# looks stale relative to `r` -- confirm which of the two is intended.
print(
    "Here, the first two hidden numbers have been guessed (1, 3).\n"
    "And for this third hidden number, only 4 has been guessed,"
    " and it was not correct."
)

print("\nLocate this given state (its index/row) in the whole set of observation states:")
# Boolean mask of rows equal to `r` in every column; take the first matching row.
row_matches = (env.observation_states == r).all(axis=1)
r_idx = np.where(row_matches)[0][0]
print(r_idx)
print("There is a 1-to-1 map between the state and its index/row.")
print("Checking the mapping function:", env.get_state_from_array(r) == r_idx)
# Round trip in the other direction: index -> array must reproduce the table row.
round_trip_ok = np.array_equal(
    env.observation_states[r_idx], env.get_array_from_state(r_idx)
)
print("Checking the mapping function again:", round_trip_ok)

print(env.get_dict_from_state(r_idx))

# --- Start a game and take a single step --------------------------------------
print("\nNew Game")

env.be_verbose = True
env.reset()
print("Current state: (", env.state,")", env.get_dict_from_state())
print("Guessing 3...result is...")
# The first element of step()'s return value is fed back into get_dict_from_state.
next_state = env.step(3)[0]
print(env.get_dict_from_state(next_state))

Random valid answer number: 376
Random valid guess number: 2

Number of entries in observation space X action space 5120

The observation states:
 [[-1 -1 -1 ...  0  0  0]
 [-1 -1 -1 ...  0  0  1]
 [-1 -1 -1 ...  0  1  0]
 ...
 [ 0  1  2 ...  1  0  1]
 [ 0  1  2 ...  1  1  0]
 [ 0  1  2 ...  1  1  1]]

A given state in the observation states:
[0 1 2 3 4 0 0 0 0 1]
Here, the first two hidden numbers have been guessed (1, 3).
And for this third hidden number, only 4 has been guessed, and it was not correct.

Locate this given state (its index/row) in the whole set of observation states:
993
There is a 1-to-1 map between the state and its index/row.
Checking the mapping function: True
Checking the mapping function again: True
{'answer0': 0, 'answer1': 1, 'answer2': 2, 'answer3': 3, 'answer4': 4, 'guess0': 0, 'guess1': 0, 'guess2': 0, 'guess3': 0, 'guess4': 1, 'current_position': 4}

New Game
NumberGuess::reset: hidden answer list: [4, 0, 3, 2, 1]
Current state: ( 0 ) {'answer0': -1, 'answ

In [115]:
# Baseline: play a handful of games where every guess is sampled from the
# action space, and report the total reward and number of guesses per game.
env.be_verbose = False
print("\nA few random games:")
episodes = 10
for episode in range(1, episodes + 1):
    state = env.reset()
    score, n_guesses, done = 0, 0, False
    while not done:
        action = env.action_space.sample()
        # step() yields (next state, reward, done flag, info); only the reward
        # and the done flag are used here.
        n_state, reward, done, info = env.step(action)
        n_guesses += 1
        score += reward
    print(f'Episode:{episode} Score:{score} NGuesses:{n_guesses}')


A few random games:
Episode:1 Score:-9 NGuesses:17
Episode:2 Score:-16 NGuesses:24
Episode:3 Score:-7 NGuesses:15
Episode:4 Score:0 NGuesses:8
Episode:5 Score:-16 NGuesses:24
Episode:6 Score:-20 NGuesses:28
Episode:7 Score:-6 NGuesses:14
Episode:8 Score:-18 NGuesses:26
Episode:9 Score:-12 NGuesses:20
Episode:10 Score:-20 NGuesses:28


## Train-Test

In [118]:
from q_learning_utils import train_test, default_params, update_q_table

In [176]:
# Zero-filled starting Q-table: one row per observation state and one column
# per action (32*32 states x 5 actions = 5120 entries).
q_table_shape = (env.observation_space.n, env.action_space.n)
init_q_table = np.zeros(q_table_shape)
print(init_q_table.shape)

(1024, 5)


In [192]:
# Train a Q-table for 10,000 episodes on a newly constructed environment.
# NOTE(review): the positional False is presumably the verbosity flag (the
# notebook toggles env.be_verbose elsewhere) -- confirm against NumberGuess3.
env = NumberGuess3(False)
# Hand train_test a brand-new all-zero table shaped like init_q_table instead of
# init_q_table itself. If train_test updates its table argument in place (not
# visible from this notebook -- TODO confirm), sharing one array across training
# cells would let other runs leak into this one and would make every returned
# q_table an alias of the same object.
q_table, avg_reward = train_test(
    env, np.zeros_like(init_q_table), n_episodes=10000, do_train=True
)
print(f"average reward: {avg_reward}")

average reward: -15.8929


In [193]:
# Write the current q_table (from the 10k-episode training run) to a comma-separated text file.
np.savetxt("qtable_10k_ep.csv", q_table, delimiter=",")

In [194]:
# Test: run three separate batches of 100 episodes with do_train=False and
# print the average reward reported for each batch.
for eval_round in range(3):
    avg_reward = train_test(env, q_table, n_episodes=100, do_train=False)[1]
    print(f"average reward: {avg_reward}")

average reward: -4.09
average reward: -5.28
average reward: -3.87


```
q_table[832] = [-1,-1,-1,-1,0]
```

In this example, row 832 of the q_table says that, in state 832, guessing 0, 1, 2, or 3 is a bad guess (each has value -1), while guessing 4 is the only action that is not penalised (value 0).

What state is this?
```
env.get_dict_from_state(832)

{'answer0': 0,
 'answer1': 1,
 'answer2': 2,
 'answer3': 3,
 'answer4': -1,
 'guess0': 0,
 'guess1': 0,
 'guess2': 0,
 'guess3': 0,
 'guess4': 0,
 'current_position': 4}
```

It's the state where 0, 1, 2, and 3 have all been correctly guessed in the first 4 spots. Only 4 remains unguessed. Further: you have just identified the fourth number, and this is your first guess for the fifth (and last) position, since none of the guess flags is set yet.

So this one example makes sense at least!

In [195]:
# Train a Q-table for 50,000 episodes on a newly constructed environment.
# NOTE(review): the positional False is presumably the verbosity flag (the
# notebook toggles env.be_verbose elsewhere) -- confirm against NumberGuess3.
env = NumberGuess3(False)
# Start from a brand-new all-zero table shaped like init_q_table instead of
# init_q_table itself. If train_test updates its table argument in place (not
# visible from this notebook -- TODO confirm), init_q_table would already hold
# the result of an earlier training cell, so this run would neither start from
# zeros nor be comparable with the shorter run.
q_table, avg_reward = train_test(
    env, np.zeros_like(init_q_table), n_episodes=50000, do_train=True
)
print(f"average reward: {avg_reward}")

average reward: -15.96158


In [196]:
# Write the current q_table (from the 50k-episode training run) to a comma-separated text file.
np.savetxt("qtable_50k_ep.csv", q_table, delimiter=",")

In [197]:
# Test: run three separate batches of 100 episodes with do_train=False and
# print the average reward reported for each batch.
for eval_round in range(3):
    avg_reward = train_test(env, q_table, n_episodes=100, do_train=False)[1]
    print(f"average reward: {avg_reward}")

average reward: -4.2
average reward: -4.71
average reward: -3.91


50k training might be a bit better than 10k, but -3 or -4 feels like the limit; 4 or 5 wrong guesses for every right guess.

Let's watch it play

In [200]:
# Turn the environment's verbose flag on so the next run logs each reset and guess.
env.be_verbose = True

In [201]:
# Play three episodes with do_train=False; the second element of the result
# (the average reward) is left as the cell's displayed value.
watch_result = train_test(env, q_table, n_episodes=3, do_train=False)
watch_result[1]

NumberGuess::reset: hidden answer list: [1, 4, 2, 0, 3]
current position: 0/4 | guess:2 | correct answer:1
current position: 0/4 | guess:4 | correct answer:1
current position: 0/4 | guess:3 | correct answer:1
current position: 0/4 | guess:1 | correct answer:1
current position: 1/4 | guess:3 | correct answer:4
current position: 1/4 | guess:0 | correct answer:4
current position: 1/4 | guess:2 | correct answer:4
current position: 1/4 | guess:3 | correct answer:4
current position: 1/4 | guess:1 | correct answer:4
current position: 1/4 | guess:1 | correct answer:4
current position: 1/4 | guess:3 | correct answer:4
current position: 1/4 | guess:1 | correct answer:4
current position: 1/4 | guess:4 | correct answer:4
current position: 2/4 | guess:4 | correct answer:2
current position: 2/4 | guess:3 | correct answer:2
current position: 2/4 | guess:2 | correct answer:2
current position: 3/4 | guess:3 | correct answer:0
current position: 3/4 | guess:0 | correct answer:0
current position: 4/4 | gu

-4.0

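OK, never mind, it's horrible: the agent keeps repeating guesses it has already tried at the same position (e.g. 3 and 1 are each guessed three times at position 1 above). I wonder why?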

In [202]:
# Train a Q-table for 1,000,000 episodes on a newly constructed environment.
# NOTE(review): the positional False is presumably the verbosity flag (the
# notebook toggles env.be_verbose elsewhere) -- confirm against NumberGuess3.
env = NumberGuess3(False)
# Start from a brand-new all-zero table shaped like init_q_table instead of
# init_q_table itself. If train_test updates its table argument in place (not
# visible from this notebook -- TODO confirm), init_q_table would already hold
# the result of earlier training cells, so this run would neither start from
# zeros nor be comparable with the shorter runs.
q_table, avg_reward = train_test(
    env, np.zeros_like(init_q_table), n_episodes=1_000_000, do_train=True
)
print(f"average reward: {avg_reward}")

average reward: -15.993973


In [203]:
# Write the current q_table (from the 1M-episode training run) to a comma-separated text file.
np.savetxt("qtable_1M_ep.csv", q_table, delimiter=",")

In [None]:
# Test: run three separate batches of 100 episodes with do_train=False and
# print the average reward reported for each batch.
for eval_round in range(3):
    avg_reward = train_test(env, q_table, n_episodes=100, do_train=False)[1]
    print(f"average reward: {avg_reward}")