In [18]:
import numpy as np

class BernoulliBandit:
    """K-armed Bernoulli bandit whose success probabilities are drawn at random.

    Attributes (all set once in ``__init__``):
        probs: array of ``K`` per-arm success probabilities, sampled with
            ``np.random.uniform`` (default range [0, 1)).
        K: number of arms.
        best_idx: index of the arm with the highest success probability.
        best_prob: success probability of that best arm.
    """

    def __init__(self, K):
        """Create a bandit with ``K`` arms.

        Raises:
            ValueError: if ``K`` is smaller than 1.
        """
        # Fail early with a clear message instead of NumPy's indirect
        # "negative dimensions" / "argmax of an empty sequence" errors.
        if K < 1:
            raise ValueError("K must be a positive integer, got {!r}".format(K))
        self.probs = np.random.uniform(size=K)
        self.K = K
        self.best_idx = np.argmax(self.probs)
        self.best_prob = self.probs[self.best_idx]

    def step(self, k: int):
        """Pull arm ``k`` once and return the reward: 1 on success, 0 otherwise.

        Raises:
            IndexError: if ``k`` is outside ``[0, K)``.
        """
        # Negative indices would otherwise wrap around silently and pull an
        # arm from the other end of ``probs``; reject them explicitly.
        if not 0 <= k < self.K:
            raise IndexError(
                "arm index {!r} is out of range for a bandit with {} arms".format(k, self.K)
            )
        if np.random.rand() < self.probs[k]:
            return 1
        return 0

    def status(self):
        """Print ``probs``, ``K``, ``best_prob`` and ``best_idx`` on one line."""
        print(self.probs, self.K, self.best_prob, self.best_idx)

# Smoke test: build a 10-armed bandit, pull arm 2 once, then show the
# sampled reward followed by the bandit's internal state.
bb = BernoulliBandit(K=10)
reward = bb.step(k=2)
print(reward)
bb.status()

1
[0.68023941 0.45007456 0.83983248 0.99371238 0.42765172 0.83419885
 0.53664422 0.19829556 0.14298862 0.36819451] 10 0.9937123793337669 3


In [21]:
class Solver:
    """Baseline bandit solver that picks arms uniformly at random.

    Attributes:
        bandit: the bandit being played; must expose ``K``, ``probs`` and
            ``best_prob``.
        counter: list of length ``bandit.K`` with the number of pulls per arm.
        regrets: chronological log of per-step regrets, one entry per pull.
    """

    def __init__(self, bandit):
        """Attach the solver to ``bandit`` and reset all statistics."""
        self.bandit = bandit
        self.counter = [0] * bandit.K
        # Log of the regret incurred at every step (one entry per pull).
        # It is NOT a per-arm table, and it is not the reward.
        self.regrets = []

    def policy(self):
        """Return the index of the arm to pull next (uniform over all arms)."""
        return np.random.randint(self.bandit.K)

    def update_regret(self, k):
        """Append the regret of pulling arm ``k``: ``best_prob - probs[k]``."""
        env = self.bandit
        self.regrets.append(env.best_prob - env.probs[k])

    def run(self, max_step):
        """Choose an arm ``max_step`` times, updating pull counts and regrets.

        Only the choice is recorded; ``bandit.step`` is not called here.
        """
        for _ in range(max_step):
            arm = self.policy()
            self.counter[arm] += 1
            self.update_regret(arm)

    def status(self):
        """Print the per-arm pull counts and the full regret log."""
        print("counter:", self.counter)
        print("regrets:", self.regrets)
# Run the random-policy solver for 10 steps on the bandit created above,
# then print how often each arm was chosen and the regret of every step.
s = Solver(bandit=bb)
s.run(max_step=10)
s.status()

counter: [0, 2, 0, 2, 1, 1, 1, 2, 1, 0]
regrets: [0.15951352703764154, 0.5660606615284806, 0.8507237622377661, 0.5436378152795822, 0.7954168219216354, 0.0, 0.5436378152795822, 0.45706816404935324, 0.0, 0.7954168219216354]


MDP：马尔科夫决策过程  
MP：马尔科夫过程  
MP 可以看作策略 $\pi(a|s)$ 已经固定的 MDP：策略一旦确定，状态转移就不再取决于动作的选择。而在 MDP 中，策略 $\pi(a|s)$ 是不确定的、会变化的。强化学习就是要不断迭代调整 $\pi(a|s)$，最终得到一个最优的 $\pi(a|s)$，所以研究的主要是 MDP。