In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np

In [4]:
from number_guess_environment import NumberGuess2

In [26]:
# Instantiate the number-guessing environment with its default constructor arguments.
env = NumberGuess2()

## Explore Environment

In [28]:
# Explore the environment: sample from its spaces, look at how states are
# encoded, play a few random games, then step through one game by hand.

# --- Spaces and the full table of observation states ---
print("Random valid answer number:", env.observation_space.sample())
print("Random valid guess number:", env.action_space.sample())
print("\nThe observation states:\n", env.observation_states)

# --- A hand-built example state (the printed text below explains how to read it) ---
r = np.array([3, 1, -1, -1, -1,  0,  0,  0,  0,  1])
print("\nA given state in the observation states:")
print(r)
print("Here, the first two hidden numbers have been guessed (3, 1).\n"
      "And for this third hidden number, only 4 has been guessed,"
      " and it was not correct.")

# --- State <-> index mapping ---
print("\nLocate this given state (its index/row) in the whole set of observation states:")
# Compare r against every row; keep the index of the first row that matches in all columns.
r_idx = np.where((env.observation_states == r).all(axis=1))[0][0]
print(r_idx)
print("There is a 1-to-1 map between the state and its index/row.")
# The environment's own converters must agree with the table lookup in both directions.
assert env.get_state_from_array(r) == r_idx
assert np.array_equal(env.observation_states[4321], env.get_array_from_state(4321))

# --- Baseline: games played with actions sampled from the action space ---
print("\nA few random games:")
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0
    n_guesses = 0
    while not done:
        n_guesses += 1
        action = env.action_space.sample()
        # step() returns a 4-tuple; only reward and done are used in this loop.
        n_state, reward, done, info = env.step(action)
        score+=reward
    print(f'Episode:{episode} Score:{score} NGuesses:{n_guesses}')
    
# --- Manual walk-through of a single game with verbose logging ---
# With be_verbose set, reset() logs the hidden answer list (see the captured output).
env.be_verbose = True
env.reset()
print(env.get_dict_from_state())
print(env.state)
# NOTE(review): j is not used again in this cell.
j = env.get_dict_from_state()["current_position"]
print("guessing 3...result is...")
# presumably update_state applies the guess and returns the resulting state — TODO confirm
print(env.get_dict_from_state(env.update_state(3)))
print("guessing 2...result is...")
print(env.get_dict_from_state(env.update_state(2)))
print("current status, visualized differently:")
print(env.get_array_from_state())

Random valid answer number: 58707
Random valid guess number: 0

The observation states:
 [[-1 -1 -1 ...  0  0  0]
 [-1 -1 -1 ...  0  0  1]
 [-1 -1 -1 ...  0  1  0]
 ...
 [ 4  3  2 ...  1  0  1]
 [ 4  3  2 ...  1  1  0]
 [ 4  3  2 ...  1  1  1]]

A given state in the observation states:
[ 3  1 -1 -1 -1  0  0  0  0  1]
Here, the first two hidden numbers have been guessed (3, 1).
And for this third hidden number, only 4 has been guessed, and it was not correct.

Locate this given state (its index/row) in the whole set of observation states:
6817
There is a 1-to-1 map between the state and its index/row.

A few random games:
Episode:1 Score:-6 NGuesses:16
Episode:2 Score:-49 NGuesses:59
Episode:3 Score:-19 NGuesses:29
Episode:4 Score:-11 NGuesses:21
Episode:5 Score:-7 NGuesses:17
Episode:6 Score:-14 NGuesses:24
Episode:7 Score:-4 NGuesses:14
Episode:8 Score:-10 NGuesses:20
Episode:9 Score:-12 NGuesses:22
Episode:10 Score:-18 NGuesses:28
NumberGuess::reset: hidden answer list: [0, 2, 1, 3, 

## Train-Test

In [19]:
from q_learning_utils import train_test, default_params, update_q_table

In [35]:
# Initial Q-table full of zeros: one row per observation state, one column per action.
# The run recorded here printed 521600 entries (not a round 500000).
init_q_table = np.zeros([env.observation_space.n, env.action_space.n])
print(init_q_table.size)

521600


In [32]:
# Train
# Fresh environment; the positional False is presumably the verbosity flag — TODO confirm.
env = NumberGuess2(False)
# train_test returns a (q_table, avg_reward) pair; do_train = True selects training mode.
# NOTE(review): init_q_table is passed directly. If train_test updates it in place,
# it is no longer all zeros after this cell — confirm before reusing it.
q_table, avg_reward = train_test(env, init_q_table, n_episodes = 100000, do_train = True)
print(f"average reward: {avg_reward}")

average reward: -15.98762


In [33]:
# Peek at the Q-table after 100k training episodes (rows: states, columns: actions).
q_table

array([[1.68067902, 2.57732359, 2.54561801, 1.69140438, 3.94931937],
       [3.4672871 , 2.85705787, 1.75406698, 2.04974363, 1.82334799],
       [1.87329896, 2.69024635, 2.41934679, 1.69024731, 1.62169705],
       ...,
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [34]:
# Test
# Evaluate the trained table three times (100 games each, no learning) to see
# how much the average reward varies from run to run.
for _run in range(3):
    avg_reward = train_test(env, q_table, n_episodes=100, do_train=False)[1]
    print(f"average reward: {avg_reward}")

average reward: -9.46
average reward: -7.16
average reward: -7.94


In [36]:
# Persist the 100k-episode Q-table as comma-separated text in the working directory.
np.savetxt("qtable_100k_ep.csv", q_table, delimiter=",")

With 10,000 training episodes, we don't outperform random guessing, but with 100k, we do!

100,000 training episodes:
average reward: -9.46
average reward: -7.16
average reward: -7.94

Compared with -15-ish for random guessing

In [38]:
# Turn on the environment's verbose flag so every guess is logged in the games below.
env.be_verbose = True

In [45]:
# Play 3 games without training and display the average reward (element [1] of the result).
train_test(env, q_table, n_episodes = 3, do_train = False)[1]

NumberGuess::reset: hidden answer list: [2, 0, 4, 1, 3]
current position: 0/4 | guess:4 | correct answer:2
current position: 0/4 | guess:0 | correct answer:2
current position: 0/4 | guess:3 | correct answer:2
current position: 0/4 | guess:2 | correct answer:2
current position: 1/4 | guess:4 | correct answer:0
current position: 1/4 | guess:1 | correct answer:0
current position: 1/4 | guess:0 | correct answer:0
current position: 2/4 | guess:4 | correct answer:4
current position: 3/4 | guess:3 | correct answer:1
current position: 3/4 | guess:1 | correct answer:1
current position: 4/4 | guess:3 | correct answer:3
NumberGuess::reset: hidden answer list: [3, 0, 2, 1, 4]
current position: 0/4 | guess:4 | correct answer:3
current position: 0/4 | guess:0 | correct answer:3
current position: 0/4 | guess:3 | correct answer:3
current position: 1/4 | guess:4 | correct answer:0
current position: 1/4 | guess:3 | correct answer:0
current position: 1/4 | guess:2 | correct answer:0
current position: 1/4

-13.0

Examining a few games:

Game 1:
```
NumberGuess::reset: hidden answer list: [3, 0, 2, 1, 4]
current position: 0/4 | guess:4 | correct answer:3
current position: 0/4 | guess:0 | correct answer:3
current position: 0/4 | guess:3 | correct answer:3
current position: 1/4 | guess:4 | correct answer:0
current position: 1/4 | guess:3 | correct answer:0 <-- Blunder, 3 already an answer
current position: 1/4 | guess:2 | correct answer:0
current position: 1/4 | guess:1 | correct answer:0
current position: 1/4 | guess:0 | correct answer:0
current position: 2/4 | guess:2 | correct answer:2
current position: 3/4 | guess:4 | correct answer:1
current position: 3/4 | guess:1 | correct answer:1
current position: 4/4 | guess:4 | correct answer:4
```

Game 2:
```
NumberGuess::reset: hidden answer list: [2, 3, 1, 4, 0]
current position: 0/4 | guess:4 | correct answer:2
current position: 0/4 | guess:0 | correct answer:2
current position: 0/4 | guess:3 | correct answer:2
current position: 0/4 | guess:2 | correct answer:2 <-- A fine start.
current position: 1/4 | guess:4 | correct answer:3
current position: 1/4 | guess:1 | correct answer:3 <-- Uh oh. We go off the rails.
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3
current position: 1/4 | guess:0 | correct answer:3 <-- Recovered, probably by random exploration
current position: 1/4 | guess:4 | correct answer:3
current position: 1/4 | guess:3 | correct answer:3
current position: 2/4 | guess:2 | correct answer:1
current position: 2/4 | guess:2 | correct answer:1 <-- A (mere) localized blunder?
current position: 2/4 | guess:3 | correct answer:1 <-- Back on track?
current position: 2/4 | guess:1 | correct answer:1
current position: 3/4 | guess:1 | correct answer:4 <-- Bad from here down, worse than random guessing. 
current position: 3/4 | guess:1 | correct answer:4     Is it avoiding 0?
current position: 3/4 | guess:1 | correct answer:4
current position: 3/4 | guess:3 | correct answer:4
current position: 3/4 | guess:1 | correct answer:4
current position: 3/4 | guess:1 | correct answer:4
current position: 3/4 | guess:1 | correct answer:4
current position: 3/4 | guess:4 | correct answer:4
current position: 4/4 | guess:3 | correct answer:0
current position: 4/4 | guess:3 | correct answer:0
current position: 4/4 | guess:1 | correct answer:0
current position: 4/4 | guess:1 | correct answer:0
current position: 4/4 | guess:1 | correct answer:0
current position: 4/4 | guess:2 | correct answer:0
current position: 4/4 | guess:0 | correct answer:0
```

Clearly we have some rabbit holes in the q-table.

First thought: it seems like more training would fill in these holes. But it already took 5 minutes to train 100k episodes.

Second thought: it's easy to see how we could assist it by restricting the q-table and being "smarter" about training. But that's not the point, is it? The optimization is clear: the …

Let's start by having it switch to random guess earlier.

Switched it to guess randomly after 5 guesses for a single hidden number

In [58]:
# Silence per-guess logging, then report the average reward of five independent
# 100-game evaluation runs (no learning) to gauge run-to-run spread.
env.be_verbose = False
for _run in range(5):
    print(train_test(env, q_table, n_episodes=100, do_train=False)[1])

-5.16
-5.75
-6.51
-5.64
-5.98


Clearly better than only switching to random guesses after the score has gotten very bad.

I've been avoiding it, but no more: we need to train more. At least to learn if training here is the limitation, and I suspect it is. The observation space is just too big.

In [59]:
# Let's go to 1M training episodes!
# Start over with a fresh environment and a fresh all-zero Q-table so this run
# does not build on the 100k-episode table.
env = NumberGuess2(False)
init_q_table = np.zeros([env.observation_space.n, env.action_space.n])
q_table, avg_reward = train_test(env, init_q_table, n_episodes = 1_000_000, do_train = True)
# Save immediately: the notebook records this run as taking about 2 hours.
np.savetxt("qtable_1M_ep.csv", q_table, delimiter=",")

This took about 2 hours

In [60]:
# Peek at the Q-table after 1M training episodes (rows: states, columns: actions).
q_table

array([[1.85611052, 2.66806927, 2.05253676, 2.00784239, 3.68050107],
       [1.90868957, 1.87984946, 3.77498962, 2.56256128, 2.68705413],
       [2.78960193, 2.68353916, 3.62283225, 2.61005441, 2.70097519],
       ...,
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ]])

In [64]:
# Five independent 100-game evaluation runs (no learning) of the 1M-episode table.
for _run in range(5):
    print(train_test(env, q_table, n_episodes=100, do_train=False)[1])

-4.15
-5.1
-4.66
-5.13
-4.02


Hmm, maybe only slightly better than with 100k training episodes.

In [None]:
# Re-enable verbose logging so each guess is printed, then replay 3 games without
# training to inspect how the 1M-episode Q-table actually plays.
# (The original assignment had no right-hand side, which is a SyntaxError; True
# matches the earlier verbose inspection of the 100k-episode table.)
env.be_verbose = True
train_test(env, q_table, n_episodes = 3, do_train = False)[1]