<a href="https://colab.research.google.com/github/YI-CHENG-SHIH645/ML/blob/master/RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import display, HTML
from scipy.stats import norm
from itertools import product
import numpy as np
import pandas as pd

def pretty_print(df):
    """Show ``df`` as a rich HTML table in the notebook output.

    The frame is converted with ``DataFrame.to_html``; every two-character
    sequence "backslash + n" found in the resulting markup is then replaced
    by a ``<br>`` tag before the HTML is displayed (presumably so that labels
    containing a newline wrap onto a second line).

    Returns whatever ``IPython.display.display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))

# Multi-armed Bandit

每台拉霸機的 payoff 都服從高斯分佈（平均值各不相同，標準差皆為 1）

In [2]:
# Seed NumPy's global generator so that "Restart & Run All" reproduces the same
# tables. Both the np.random.* calls below and scipy's frozen distributions
# (when no random_state is passed) draw from this global state.
SEED = 0
np.random.seed(SEED)

# True mean payoff of each lever; every lever pays out from a Gaussian with
# that mean and unit standard deviation.
levers_mu = [1.2, 1.0, 0.8, 1.4]
payoffs = [norm(loc=mu, scale=1.0) for mu in levers_mu]

Q-value: 平均獎勵 \\
$ Q^{new}_k = Q^{old}_k + \frac{1}{n}(R_n - Q^{old}_k) $ \\
其中 $n$ 是第 $k$ 台拉霸機到目前為止被拉的次數，$R_n$ 是第 $n$ 次拉它得到的 payoff

In [3]:
# Trial numbers at which the state of the experiment is recorded (one table row each).
snap_shot_at = [1, 2, 3, 4, 50, 100, 500, 1000, 5000]

# Two-level column header: descriptive columns under an empty top label,
# then a (Q-val, Nobs) pair per lever, then the running average gain.
cols = pd.MultiIndex.from_tuples(
    [('', name) for name in ('Trial', 'Decision', 'Lever\nChosen', 'Payoff')]
)
multi = pd.MultiIndex.from_tuples(
    [(f'Lever {i}(stats)', stat) for i in range(1, 5) for stat in ('Q-val', 'Nobs')]
)
cols = cols.append(multi).append(
    pd.MultiIndex.from_tuples([('', 'Avg Gain\nper trial')])
)

# Empty table with one row per snapshot; preview the first rows.
results = pd.DataFrame(columns=cols, index=range(len(snap_shot_at)))
pretty_print(results.head(3))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Lever 1(stats),Lever 1(stats),Lever 2(stats),Lever 2(stats),Lever 3(stats),Lever 3(stats),Lever 4(stats),Lever 4(stats),Unnamed: 13_level_0
Unnamed: 0_level_1,Trial,Decision,Lever Chosen,Payoff,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Avg Gain per trial
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,


In [4]:
def simulate(eps_scheduler):
    """Run one epsilon-greedy multi-armed-bandit experiment and tabulate snapshots.

    On each trial a lever is drawn uniformly at random with probability
    ``eps_scheduler(trial)`` ("Explore"); otherwise the lever with the highest
    current Q-value is pulled ("Exploit"; ``np.argmax`` resolves ties in favour
    of the lowest-numbered lever). The pulled lever's Q-value is then updated
    incrementally so that it equals the mean of the payoffs observed on it.

    Reads the module-level ``payoffs`` (one frozen distribution per lever) and
    ``snap_shot_at`` (trial numbers to record), and fills the module-level
    ``results`` frame with one row per snapshot. The number of levers and the
    number of trials are derived from those two objects.

    Parameters
    ----------
    eps_scheduler : callable
        Maps the 1-based trial number to the exploration probability.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``results`` after this run, so that a table
        returned by one call is not overwritten by the next call.
    """
    n_levers = len(payoffs)
    n_trials = max(snap_shot_at, default=0)

    Q = np.zeros(n_levers)                 # running mean payoff per lever
    nobs = np.zeros(n_levers, dtype=int)   # number of pulls per lever

    for i in range(1, n_trials + 1):
        if np.random.random() < eps_scheduler(i):
            lever = np.random.randint(1, n_levers + 1)
            decision = 'Explore'
        else:
            lever = np.argmax(Q) + 1
            decision = 'Exploit'

        payoff = payoffs[lever-1].rvs()
        nobs[lever-1] += 1
        # Incremental mean: Q_new = Q_old + (R - Q_old) / n
        Q[lever-1] = Q[lever-1] + 1/nobs[lever-1] * (payoff - Q[lever-1])

        if i in snap_shot_at:
            row = snap_shot_at.index(i)
            results.loc[row, ('', 'Trial')] = i
            results.loc[row, ('', 'Decision')] = decision
            results.loc[row, ('', 'Lever\nChosen')] = lever
            results.loc[row, ('', 'Payoff')] = payoff.round(3)
            for j in range(n_levers):
                results.loc[row, (f'Lever {j+1}(stats)', 'Q-val')] = Q[j].round(3)
                results.loc[row, (f'Lever {j+1}(stats)', 'Nobs')] = nobs[j]
            # Q * nobs is the total payoff per lever, so this is total payoff / trials so far.
            results.loc[row, ('', 'Avg Gain\nper trial')] = ((Q * nobs).sum()/nobs.sum()).round(3)

    return results.copy()

In [5]:
# Exploration schedules to compare: three constant rates and one that decays
# geometrically with the trial number.
eps_schedulers = [
    lambda n: 0.1,
    lambda n: 0.01,
    lambda n: 0.5,
    lambda n: 0.995**(n-1),
]

for eps_scheduler in eps_schedulers:
    snapshot_table = simulate(eps_scheduler)
    pretty_print(snapshot_table)
    print('\n\n')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Lever 1(stats),Lever 1(stats),Lever 2(stats),Lever 2(stats),Lever 3(stats),Lever 3(stats),Lever 4(stats),Lever 4(stats),Unnamed: 13_level_0
Unnamed: 0_level_1,Trial,Decision,Lever Chosen,Payoff,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Avg Gain per trial
0,1,Exploit,1,1.265,1.265,1,0.0,0,0.0,0,0.0,0,1.265
1,2,Exploit,1,2.757,2.011,2,0.0,0,0.0,0,0.0,0,2.011
2,3,Exploit,1,0.271,1.431,3,0.0,0,0.0,0,0.0,0,1.431
3,4,Exploit,1,1.53,1.456,4,0.0,0,0.0,0,0.0,0,1.456
4,50,Exploit,1,2.274,1.205,48,0.419,1,0.0,0,0.734,1,1.18
5,100,Exploit,4,0.365,1.139,86,0.659,2,-0.069,2,1.273,10,1.119
6,500,Explore,2,2.437,1.112,117,1.039,17,0.998,11,1.452,355,1.349
7,1000,Exploit,4,2.128,1.138,125,1.005,24,0.834,25,1.434,826,1.372
8,5000,Exploit,4,1.85,1.205,229,1.082,115,0.958,136,1.383,4520,1.357







Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Lever 1(stats),Lever 1(stats),Lever 2(stats),Lever 2(stats),Lever 3(stats),Lever 3(stats),Lever 4(stats),Lever 4(stats),Unnamed: 13_level_0
Unnamed: 0_level_1,Trial,Decision,Lever Chosen,Payoff,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Avg Gain per trial
0,1,Exploit,1,1.387,1.387,1,0.0,0,0.0,0,0.0,0,1.387
1,2,Exploit,1,0.61,0.998,2,0.0,0,0.0,0,0.0,0,0.998
2,3,Exploit,1,1.564,1.187,3,0.0,0,0.0,0,0.0,0,1.187
3,4,Exploit,1,2.636,1.549,4,0.0,0,0.0,0,0.0,0,1.549
4,50,Exploit,1,1.933,1.172,50,0.0,0,0.0,0,0.0,0,1.172
5,100,Exploit,1,0.585,1.235,100,0.0,0,0.0,0,0.0,0,1.235
6,500,Exploit,1,2.444,1.212,489,-0.243,1,0.153,1,0.909,9,1.201
7,1000,Exploit,1,0.106,1.222,985,1.091,5,0.153,1,0.909,9,1.217
8,5000,Exploit,1,3.171,1.202,4948,1.196,20,0.432,10,1.148,22,1.2







Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Lever 1(stats),Lever 1(stats),Lever 2(stats),Lever 2(stats),Lever 3(stats),Lever 3(stats),Lever 4(stats),Lever 4(stats),Unnamed: 13_level_0
Unnamed: 0_level_1,Trial,Decision,Lever Chosen,Payoff,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Avg Gain per trial
0,1,Exploit,1,1.226,1.226,1,0.0,0,0.0,0,0.0,0,1.226
1,2,Explore,2,1.246,1.226,1,1.246,1,0.0,0,0.0,0,1.236
2,3,Exploit,2,1.625,1.226,1,1.435,2,0.0,0,0.0,0,1.366
3,4,Explore,4,0.237,1.226,1,1.435,2,0.0,0,0.237,1,1.084
4,50,Exploit,1,0.04,1.343,33,1.152,9,0.318,2,0.349,6,1.148
5,100,Explore,1,3.627,1.369,66,1.245,15,0.546,9,0.685,10,1.208
6,500,Exploit,4,1.53,1.201,220,0.901,54,0.872,53,1.403,173,1.204
7,1000,Explore,2,1.18,1.202,286,0.972,103,0.895,115,1.409,496,1.246
8,5000,Explore,1,0.828,1.161,754,0.978,608,0.793,637,1.412,3001,1.242







Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Lever 1(stats),Lever 1(stats),Lever 2(stats),Lever 2(stats),Lever 3(stats),Lever 3(stats),Lever 4(stats),Lever 4(stats),Unnamed: 13_level_0
Unnamed: 0_level_1,Trial,Decision,Lever Chosen,Payoff,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Q-val,Nobs,Avg Gain per trial
0,1,Explore,4,0.518,0.0,0,0.0,0,0.0,0,0.518,1,0.518
1,2,Explore,4,0.811,0.0,0,0.0,0,0.0,0,0.665,2,0.665
2,3,Explore,2,1.703,0.0,0,1.703,1,0.0,0,0.665,2,1.011
3,4,Explore,2,1.71,0.0,0,1.707,2,0.0,0,0.665,2,1.186
4,50,Explore,3,-0.206,1.354,10,1.141,10,0.671,18,1.473,12,1.094
5,100,Explore,2,1.392,1.232,20,1.234,20,0.906,27,1.309,33,1.17
6,500,Exploit,4,2.001,1.18,71,0.843,46,0.89,57,1.424,326,1.275
7,1000,Exploit,4,3.36,1.199,76,0.898,51,0.89,57,1.406,816,1.335
8,5000,Exploit,4,1.597,1.208,77,0.898,51,0.89,58,1.398,4814,1.384







上例中 \\
環境不會有變化，我們也就不需要狀態 \\

若今天環境會隨時間、行為而有所變化，就需要考慮當前狀態 \\
Q 值本來是所有動作都會紀錄一個，變成所有列得出的(狀態, 動作)都會紀錄一個 \\
Q(a) -> Q(s, a)

環境因為動作而有所變化，所處的環境又影響能夠達成的目標 \\
=> 規劃目標 \\
=> 最後獎勵總和最多 \\
=> $ G = \sum_{k=t+1}^{T} R_k $ \\
=> 不會只著眼於當前 \\

$ Q^{new}(s, a) = Q^{old}(s, a) + \frac{1}{n}(G - Q^{old}(s, a)) $ \\
=> trial 越多，Q 收斂到 "future expected reward"

$ Q^{new}(s, a) = Q^{old}(s, a) + \alpha(G - Q^{old}(s, a)), \quad 0 < \alpha < 1 $

# The Game of Nim

In [6]:
class Nim:
    """Single-pile Nim played against an opponent that moves uniformly at random.

    Players alternately remove 1 to 3 matches. A call to :meth:`step` plays the
    caller's move and, if matches remain, the opponent's reply. Whoever removes
    the last match ends the game: reward -1 when it was the caller, +1 when it
    was the opponent.
    """

    def __init__(self, matches: int):
        # Pile size used at the start of every game.
        self.init_matches = matches
        # Matches currently left on the table.
        self.current_matches = matches

    def reset(self):
        """Restore the pile to its initial size and return that size."""
        self.current_matches = self.init_matches
        return self.current_matches

    def max_action(self):
        """Return the current pile size clipped into the range [1, 3]."""
        return np.clip(self.current_matches, 1, 3)

    def step(self, action: int):
        """Play ``action`` for the caller, then a random reply for the opponent.

        Parameters
        ----------
        action : int
            Number of matches the caller removes; must be 1, 2 or 3 and must
            not exceed the matches left (checked with ``assert``).

        Returns
        -------
        tuple
            ``(matches_left, action, reward, done)`` where ``reward`` is -1 if
            the caller removed the last match, 1 if the opponent did, else 0.
        """
        assert action in [1, 2, 3] and action <= self.current_matches

        self.current_matches -= action
        if self.current_matches == 0:
            # The caller emptied the pile.
            return self.current_matches, action, -1, True

        # Opponent removes a uniformly random legal number of matches.
        opponent_take = np.random.randint(1, self.max_action() + 1)
        self.current_matches -= opponent_take
        finished = self.current_matches == 0
        outcome = 1 if finished else 0

        return self.current_matches, action, outcome, finished

class NimSimulator:
    """Tabular epsilon-greedy learner for a Nim-style game.

    Parameters
    ----------
    nim_instance
        Game object providing ``init_matches``, ``reset()``, ``max_action()``
        and ``step(action)`` returning ``(next_state, action, reward, done)``.
    epsilon_scheduler : callable
        Maps the 1-based episode number to the exploration probability.

    Attributes
    ----------
    Q_table : pandas.DataFrame
        Rows are the actions 1..3; under the ``state_col`` top-level header
        there is one column per state (number of matches left).
    sa, r_history : list
        Per-episode buffers of visited (state, action) pairs and rewards,
        used only by the Monte-Carlo update.
    """

    def __init__(self, nim_instance, epsilon_scheduler):
        self.nim_game = nim_instance
        self.epsilon_scheduler = epsilon_scheduler
        self.state_col = 'State (= number of matches left)'
        self.Q_table = self.create_Q_table(self.nim_game.init_matches)
        self.sa, self.r_history = [], []

    def create_Q_table(self, matches):
        """Build the Q-table frame for states 1..``matches`` and actions 1..3.

        The first column lists the action labels; the state columns are left
        empty here and are zeroed at the start of :meth:`simulate`.
        """
        cols = pd.MultiIndex.from_tuples(list(product([''], ['Matches\npicked up'])))
        multi = pd.MultiIndex.from_tuples(list(product([self.state_col], range(1, matches+1))))
        cols = cols.append(multi)
        Q_table = pd.DataFrame(columns=cols, index=range(1, 4))
        Q_table.loc[:, ('', 'Matches\npicked up')] = [1, 2, 3]

        return Q_table

    def _reset_q_values(self):
        """Set every state column to 0.0, replacing it with a float64 column.

        Assigning whole columns (rather than writing an integer 0 through
        ``.loc``) keeps the value columns numeric regardless of how the
        installed pandas treats in-place writes into the empty frame.
        """
        state_columns = [c for c in self.Q_table.columns if c[0] == self.state_col]
        for col in state_columns:
            self.Q_table[col] = 0.0

    def _q_values(self, s, k):
        """Return Q(s, 1..k) as a float ndarray (position i holds action i+1)."""
        return np.asarray(self.Q_table.loc[:k, (self.state_col, s)], dtype=float)

    def simulate(self, n_run=5000, alpha=0.05, method='MC'):
        """Reset the Q-values and learn them from ``n_run`` self-played episodes.

        Parameters
        ----------
        n_run : int
            Number of episodes to play.
        alpha : float
            Step size of the Q-value update.
        method : str
            ``'MC'`` updates once per episode from the recorded returns;
            any other value applies the one-step TD update after every move.
        """
        self._reset_q_values()
        # Drop records left behind if an earlier MC run was interrupted mid-episode.
        self.sa.clear()
        self.r_history.clear()

        for i in range(1, n_run+1):
            # Number of matches currently on the table.
            s = self.nim_game.reset()

            # Whether the game has ended.
            done = False

            while not done:
                k = self.nim_game.max_action()
                # exploration
                if np.random.random() < self.epsilon_scheduler(i):
                    a = np.random.randint(1, k+1)
                # exploitation: best-valued legal action (ties -> smallest action)
                else:
                    a = int(np.argmax(self._q_values(s, k))) + 1
                s_next, a, r, done = self.nim_game.step(a)

                if method == 'MC':
                    self._mc('record', s, a, r)
                else:
                    self._td(s, a, r, s_next, alpha=alpha)
                s = s_next

            if method == 'MC':
                self._mc('update', alpha=alpha)

    def _mc(self, state, s=None, a=None, r=None, alpha=0.05):
        """Monte-Carlo bookkeeping.

        With ``state == 'record'`` the pair ``(s, a)`` and reward ``r`` are
        appended to the episode buffers. With any other value every recorded
        pair is moved towards its undiscounted return (the sum of the rewards
        from that step to the end of the episode) with step size ``alpha``,
        and the buffers are emptied.
        """
        if state == 'record':
            # The visited states must be remembered during the episode; only
            # once it has finished do we know which entries to update.
            self.sa.append((s, a))
            self.r_history.append(r)
        else:
            # Reverse cumulative sum: G[t] = r[t] + r[t+1] + ... + r[-1].
            G = np.array(self.r_history)[::-1].cumsum()[::-1]
            for idx, (s, a) in enumerate(self.sa):
                Gt = G[idx]
                Q_old = self.Q_table.loc[a, (self.state_col, s)]
                self.Q_table.loc[a, (self.state_col, s)] = Q_old + alpha * (Gt - Q_old)
            self.sa.clear()
            self.r_history.clear()

    def _td(self, s, a, r, s_next, alpha=0.05):
        """One-step TD update of Q(s, a) towards ``r + max_a' Q(s_next, a')``.

        The bootstrap term is 0 when ``s_next`` is 0 (no matches left). The set
        of actions considered in ``s_next`` comes from ``nim_game.max_action()``,
        so the game must already be in state ``s_next`` when this is called.
        """
        Q_old = self.Q_table.loc[a, (self.state_col, s)]
        if s_next > 0:
            k = self.nim_game.max_action()
            Q_next = self._q_values(s_next, k).max()
        else:
            Q_next = 0
        self.Q_table.loc[a, (self.state_col, s)] = Q_old + alpha * (r + Q_next - Q_old)

In [7]:
# Learn Q-values for an 8-match game with Monte-Carlo updates; the exploration
# rate decays geometrically with the episode number.
simulator = NimSimulator(
    nim_instance=Nim(matches=8),
    epsilon_scheduler=lambda n: 0.9995**(n-1),
)
simulator.simulate(method='MC')

mc_q_table = simulator.Q_table.round(3)
pretty_print(mc_q_table)

Unnamed: 0_level_0,Unnamed: 1_level_0,State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left)
Unnamed: 0_level_1,Matches picked up,1,2,3,4,5,6,7,8
1,1,-1.0,1.0,-0.081,0.231,-0.11,0.849,0,0.602
2,2,0.0,-1.0,1.0,0.107,0.416,-0.109,0,0.57
3,3,0.0,0.0,-1.0,1.0,-0.147,0.126,0,0.955


In [8]:
# Re-train the same simulator with per-move TD updates (simulate() resets the
# Q-values first) and show the resulting table.
simulator.simulate(method='TD')

td_q_table = simulator.Q_table.round(3)
pretty_print(td_q_table)

Unnamed: 0_level_0,Unnamed: 1_level_0,State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left),State (= number of matches left)
Unnamed: 0_level_1,Matches picked up,1,2,3,4,5,6,7,8
1,1,-1.0,1.0,0.126,0.469,0.202,0.998,0,0.857
2,2,0.0,-1.0,1.0,0.184,0.51,0.329,0,0.819
3,3,0.0,0.0,-1.0,1.0,-0.019,0.359,0,1.0
