In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
from tqdm import tqdm

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from game_simulation import CoinGameSimulation

### Strategy 5: make use of Q-learning.
This strategy uses Q-learning with an $\epsilon$-greedy policy to learn the best strategy to play.

The following Wikipedia page was used as a reference:
[Q-learning](https://en.wikipedia.org/wiki/Q-learning).

$$
Q^{n e w}\left(s_{t}, a_{t}\right) \leftarrow \underbrace{Q\left(s_{t}, a_{t}\right)}_{\text {old value }}+\underbrace{\alpha}_{\text {learning rate }} \cdot \overbrace{(\underbrace{r_{t}}_{\text {reward }}+\underbrace{\gamma}_{\text {discount factor }} \cdot \underbrace{\max _{a} Q\left(s_{t+1}, a\right)}_{\text {estimate of optimal future value }}-\underbrace{Q\left(s_{t}, a_{t}\right)}_{\text {old value }})}^{\text {temporal difference }}
$$

In [3]:
# Q-learning update hyperparameters.
alpha, gamma = 0.05, 0.99  # (learning rate, discount factor)

# Bounds and decay constant for the exploration rate (epsilon).
# NOTE: `espilon_decay` is misspelled, but the training loop refers to it
# under exactly this name, so it is kept as-is here.
max_epsilon = 1
min_epsilon = 0.01
espilon_decay = 0.001

In [4]:
env = CoinGameSimulation()

# The Q-table is indexed by every observation component except the last one
# (presumably the number of flips left -- see how env.observe() is unpacked
# in the training loop), followed by one axis for the actions.
shape_of_state = tuple(space.n for space in env.observation_space[:-1])
Q_table = np.zeros(shape_of_state + (env.action_space.n,))

# Bookkeeping that the training loop appends to / increments.
scores, rewards, av_rewards = [], [], []
num_trained_episodes = 0

def epslion_greedy(state):
    """Pick an action for ``state`` using an epsilon-greedy policy.

    Reads the module-level ``epsilon``, ``env`` and ``Q_table``.

    Args:
        state: Index into ``Q_table`` selecting the row of action values
            (the training loop passes a ``(n_heads, n_tails)`` tuple).

    Returns:
        A random action sampled from ``env.action_space`` when a uniform
        draw from [0, 1) falls below ``epsilon``; otherwise the index of
        the largest Q-value stored for ``state``.
    """
    explore = np.random.random() < epsilon
    return env.action_space.sample() if explore else np.argmax(Q_table[state])

In [None]:
# Train the Q-table for `num_episodes` episodes, then plot the score curve.
num_episodes = 0  # number of episodes to play in this run of the cell
rewards = []      # rewards collected during the episode currently being played

# `epsilon` has to exist before the first call to epslion_greedy, and also
# when no episode is played at all, so start fully exploratory.
# NOTE(review): the schedule below is driven by `itr`, so it restarts from
# max_epsilon each time this cell is re-run even though
# `num_trained_episodes` keeps counting -- confirm that this is intended.
epsilon = max_epsilon

for itr in tqdm(range(num_episodes)):
    env.reset()
    (n_heads, n_tails, n_flips_left) = env.observe()
    state = (n_heads, n_tails)

    done = False
    while not done:
        move = epslion_greedy(state)
        (n_heads, n_tails, n_flips_left), reward, done, _ = env.step(move)
        next_state = (n_heads, n_tails)
        rewards.append(reward)

        # Q-learning update. Nothing follows a terminal transition, so the
        # bootstrap term is dropped when `done` is set; the table entry of
        # `next_state` may be shared with a non-terminal situation because
        # the number of flips left is not part of the state.
        future_value = 0.0 if done else np.max(Q_table[next_state])
        td_error = reward + gamma * future_value - Q_table[state][move]
        Q_table[state][move] += alpha * td_error

        state = next_state

    # Exponentially decay the exploration rate from max_epsilon to min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-espilon_decay * itr)

    scores.append(env.score)
    av_rewards.append(np.sum(rewards))  # total reward of the finished episode
    rewards = []
    num_trained_episodes += 1
    env.reset()

clear_output(wait=True)

# Learning curve: average the scores over (at most) 1000 bins of *consecutive*
# episodes so the x-axis follows training progress. `scores` is not reset by
# this cell, so the curve covers every episode appended since it was created.
# array_split copes with lengths that are not a multiple of the bin count.
if scores:
    n_bins = min(1000, len(scores))
    score_bins = np.array_split(np.asarray(scores, dtype=float), n_bins)
    plt.plot([chunk.mean() for chunk in score_bins])
    plt.xlabel("bin of consecutive episodes")
    plt.ylabel("mean score")
    plt.show()

In [None]:
# Inspect the current value of the exploration rate.
epsilon

In [None]:
# Greedy (highest-Q) action index for the state n_heads=1, n_tails=0.
np.argmax(Q_table[1,0])