In [1]:
import os
import sys
from matplotlib import pyplot as plt
import random
import numpy as np
import pandas as pd
import tqdm
from abc import ABCMeta, abstractmethod, abstractproperty
from copy import copy
%matplotlib inline
plt.style.use("ggplot")
%load_ext autoreload
%autoreload 2

## Exercise 5.1 估计 blackjack 各个状态的 value

In [2]:
class Env(metaclass=ABCMeta):
    """Abstract base class for an episodic environment.

    Subclasses implement the game dynamics; this base class only owns the
    random number generator exposed as ``self.rng``.

    Args:
        rng_seed: Seed forwarded to ``np.random.RandomState``. ``None``
            (the default) lets NumPy pick a fresh seed for every instance.
    """
    def __init__(self,rng_seed=None):
        # The default must not be a call such as random.randint(...): default
        # values are evaluated once, when the function is defined, so every
        # instance created without an explicit seed would share the same seed.
        self.rng = np.random.RandomState(rng_seed)

    @abstractmethod
    def get_actions(self):
        """Return the collection of actions accepted by ``step``."""
        pass

    @abstractmethod
    def step(self,action):
        """Apply ``action`` and return ``(done, observation, reward, extra)``."""
        pass

    @abstractmethod
    def get_observation(self):
        """Return the current observation."""
        pass

    @abstractmethod
    def observation_to_state(self,observation):
        """Convert an observation into a state."""
        pass

    @abstractmethod
    def get_history(self,):
        """Return a description of what has happened in the current episode."""
        pass

    @abstractmethod
    def get_state_shape(self):
        """Return the shape of an array that has one entry per state."""
        pass

    @abstractmethod
    def reset(self):
        """Start a new episode."""
        pass
    
class BlackJackEnv(Env):
    """Blackjack environment that draws cards with replacement.

    An ace is stored as 1 and is counted as 11 whenever that does not push the
    hand above ``bust_threshold``. Observations are
    ``(has_unused_ace, player_cards, dealer_showcard)`` tuples and states are
    ``(int(has_unused_ace), player_total, dealer_showcard)`` tuples.

    Args:
        dealer_stick: The dealer keeps drawing while their best total is
            below this value.
        bust_threshold: Highest total a hand may reach without busting.
    """
    def __init__(self,dealer_stick=17,bust_threshold=21):
        super().__init__()
        self.dealer_stick = dealer_stick
        self.bust_threshold = bust_threshold
        self.cards = [1,2,3,4,5,6,7,8,9,10,10,10,10]
        self.actions = ["stick","hit"]
        # Deal the first hand only once every attribute is configured.
        self.reset()

    def get_actions(self):
        """Return the list of valid action names."""
        return self.actions

    def dealer_play(self):
        """Draw cards for the dealer until their best total reaches ``dealer_stick``."""
        while self.get_max_number(self.dealer_cards) < self.dealer_stick:
            self.dealer_cards.append(self.get_random_card())

    def get_reward(self):
        """Compare both hands and return -1 (loss), 0 (draw) or 1 (win) for the player."""
        dealer_num = self.get_max_number(self.dealer_cards)
        player_num = self.get_max_number(self.player_cards)
        # A busted player loses even when the dealer is busted as well.
        if player_num > self.bust_threshold:
            return -1
        if dealer_num > self.bust_threshold:
            return 1
        if dealer_num > player_num:
            return -1
        if dealer_num == player_num:
            return 0
        return 1

    def _settle_against_dealer(self):
        """Let the dealer finish their hand and return the terminal step tuple."""
        self.dealer_play()
        return True,self.get_observation(),self.get_reward(),None

    def step(self,action):
        """Apply ``action`` ("stick" or "hit") to the current episode.

        Returns:
            ``(done, observation, reward, extra)``; ``extra`` is always ``None``.

        Raises:
            AssertionError: if ``action`` is not one of ``self.actions``.
        """
        assert(action in self.actions)
        # Natural: a two-card hand that already equals the threshold ends the
        # episode at once (the action is ignored and not recorded). It is a
        # draw only when the dealer's two cards match it.
        if len(self.player_cards) == 2 and self.get_max_number(self.player_cards) == self.bust_threshold:
            if self.get_max_number(self.dealer_cards) == self.bust_threshold:
                return True,self.get_observation(),0,None
            else:
                return True,self.get_observation(),1,None
        self.action_history.append(action)

        if action == "stick":
            return self._settle_against_dealer()

        # action == "hit"
        self.player_cards.append(self.get_random_card())
        player_num = self.get_max_number(self.player_cards)
        if player_num > self.bust_threshold:
            # player busted
            return True,self.get_observation(),-1,None
        if player_num == self.bust_threshold:
            # Reaching the threshold by hitting is not an automatic win: the
            # player cannot improve, so the hand is settled against the dealer,
            # who may still tie it.
            return self._settle_against_dealer()
        return False,self.get_observation(),0,None

    def has_unused_ace(self,cards):
        """Return True when ``cards`` holds an ace that can count as 11 without busting."""
        return bool(1 in cards and sum(cards) + 10 <= self.bust_threshold)

    def get_max_number(self,cards):
        """Return the best total of ``cards``, counting one ace as 11 when possible."""
        sumcards = sum(cards)
        if self.has_unused_ace(cards):
            return sumcards + 10
        else:
            return sumcards

    def get_max_unbusted_number(self,cards):
        """Same as ``get_max_number`` but asserts that the hand is not busted."""
        assert(sum(cards) <= self.bust_threshold)
        return self.get_max_number(cards)

    def get_observation(self):
        """Return ``(has_unused_ace, player_cards, dealer_showcard)``.

        The player's cards are copied so that an observation kept by the
        caller does not change when more cards are drawn.
        """
        return (self.has_unused_ace(self.player_cards),list(self.player_cards),self.dealer_showcard)

    def observation_to_state(self,observation):
        """Convert an observation into ``(int(unused_ace), player_total, dealer_showcard)``.

        Raises:
            AssertionError: if the observed player hand is busted.
        """
        unused_ace,player_cards,dealer_showcard = observation
        # Use the cards carried by the observation, not the live environment hand.
        return int(unused_ace),self.get_max_unbusted_number(player_cards),dealer_showcard

    def get_state_shape(self):
        """Return ``[2, bust_threshold + 1, max(card) + 1]`` ([2, 22, 11] by default)."""
        return [2,self.bust_threshold + 1,max(self.cards) + 1]

    def get_random_card(self):
        """Draw one card value, with replacement, as a plain Python int."""
        return int(self.rng.choice(self.cards))

    def reset(self):
        """Deal two cards to the player and the dealer and clear the action history."""
        self.player_cards = [self.get_random_card() for i in range(2)]

        self.dealer_cards = [self.get_random_card() for i in range(2)]
        # Only the dealer's first card is visible to the player.
        self.dealer_showcard = self.dealer_cards[0]
        self.action_history = []

    def get_history(self):
        """Return the dealt cards, the drawn cards and the actions taken as text."""
        str_player =  f"player [{self.player_cards[:2]}] -> {self.player_cards[2:]} dealer [{self.dealer_cards[:2]}] -> {self.dealer_cards[2:]}"
        return "{} \n {}".format(str_player,"|".join(self.action_history))

### 21点环境使用示例

In [3]:
# Create a blackjack environment with the default dealer_stick and bust_threshold.
bje = BlackJackEnv()

In [4]:
# Play one episode with uniformly random actions, printing every observation
# and the state derived from it, then print the final reward.
bje.reset()
print(bje.get_observation())
done = False
while not done:
    print(bje.get_observation())
    print(bje.observation_to_state(bje.get_observation()))
    done, observation, reward, extra = bje.step(random.choice(["hit", "stick"]))
print(reward)

(True, [5, 1], 4)
(True, [5, 1], 4)
(1, 16, 4)
(False, [5, 1, 10], 4)
(0, 16, 4)
-1


In [5]:
# Show the cards dealt and drawn plus the actions taken in the episode above.
print(bje.get_history())

player [[5, 1]] -> [10] dealer [[4, 8]] -> [1, 4] 
 hit|stick


### 进行Monte Carlo rollout

In [37]:
class Strategy(metaclass=ABCMeta):
    """Abstract base class for a policy that picks actions for an environment.

    Args:
        env: Environment the strategy reads when choosing an action.
    """
    def __init__(self,env:Env):
        self.env = env

    # Declared abstract (it used to be a @staticmethod that took ``self``) so
    # that every concrete strategy has to provide its own decision rule.
    @abstractmethod
    def choose_action(self):
        """Return the action to take in the environment's current situation."""
        pass
    
class ThresholdBlackjackStrategy(Strategy):
    """Blackjack policy that sticks once the player's total reaches a threshold.

    Args:
        env: Environment exposing ``player_cards`` and ``get_max_unbusted_number``.
        stick_threshold: Smallest player total on which the strategy sticks.
    """
    def __init__(self,env:Env,stick_threshold=20):
        # Delegate to the base class so its initialisation is never skipped.
        super().__init__(env)
        self.stick_threshold = stick_threshold

    def choose_action(self):
        """Return "stick" when the player's total is at least ``stick_threshold``, else "hit"."""
        player_total = self.env.get_max_unbusted_number(self.env.player_cards)
        if player_total >= self.stick_threshold:
            return "stick"
        return "hit"

class ValueEstimator(metaclass=ABCMeta):
    """Abstract base class for estimating the state values of a strategy.

    Args:
        env: Environment to interact with; ``get_state_shape()`` gives the
            shape of the value table.
        strategy: Strategy whose values are estimated.
    """
    def __init__(self,env:Env,strategy:Strategy):
        self.env = env
        self.strategy = strategy
        # np.float was only an alias of the builtin float and has been removed
        # from NumPy; np.float64 is the same dtype.
        self.values = np.zeros(self.env.get_state_shape(),dtype=np.float64)

    @abstractmethod
    def estimate_value(self,steps):
        """Run the estimation for ``steps`` iterations and update ``self.values``."""
        pass

class MonteCarloValueEstimator(ValueEstimator):
    """Estimate state values by averaging undiscounted Monte Carlo returns.

    ``values_sum`` accumulates the returns observed from each state and
    ``values_count`` the number of times each state was recorded.
    """
    def __init__(self,env:Env,strategy:Strategy):
        super().__init__(env,strategy)
        # np.float / np.int were removed from NumPy; use explicit dtypes.
        self.values_sum = np.zeros(self.env.get_state_shape(),dtype=np.float64)
        self.values_count = np.zeros(self.env.get_state_shape(),dtype=np.int64)

    def monte_carlo_run(self):
        """Play one episode with the strategy and add its returns to the accumulators."""
        self.env.reset()
        states = []
        rewards = []
        while True:
            # Record the state the action is chosen from, then the reward it yields.
            states.append(self.env.observation_to_state(self.env.get_observation()))
            done,_observation,reward,_extra = self.env.step(self.strategy.choose_action())
            rewards.append(reward)
            if done:
                break

        # value calculation: walk the episode backwards so the running sum is
        # the undiscounted return that follows each recorded state.
        episode_return = 0
        for one_state,one_reward in zip(reversed(states),reversed(rewards)):
            episode_return += one_reward
            # A tuple index addresses exactly one cell of the table.
            state_index = tuple(one_state)
            self.values_count[state_index] += 1
            self.values_sum[state_index] += episode_return

    def estimate_value(self,steps):
        """Run ``steps`` episodes, then set ``self.values`` to the mean return per state.

        States that were never recorded keep a value of 0.
        """
        for _ in tqdm.tqdm(range(steps)):
            self.monte_carlo_run()
        # Divide only where a state was visited; this avoids both division by
        # zero and the small bias of adding an epsilon to every count.
        self.values = np.divide(
            self.values_sum,
            self.values_count,
            out=np.zeros_like(self.values_sum),
            where=self.values_count > 0,
        )

In [38]:
# Wire a fresh environment, the threshold strategy (default stick_threshold=20)
# and the Monte Carlo estimator together.
bje = BlackJackEnv()
tbs = ThresholdBlackjackStrategy(bje)
mce = MonteCarloValueEstimator(bje,tbs)

In [39]:
# Run 500000 Monte Carlo episodes and refresh mce.values.
mce.estimate_value(500000)

100%|██████████| 500000/500000 [00:51<00:00, 9632.99it/s] 


In [45]:
# Value table for states without an unused ace (first axis index 0): rows are
# the player total; the unused dealer-showcard column 0 is dropped.
pd.DataFrame(mce.values[0][:,1:])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.535865,-0.475,-0.43318,-0.345833,-0.379913,-0.427273,-0.278261,-0.403162,-0.469027,-0.49332
5,-0.531513,-0.487059,-0.528541,-0.5,-0.459566,-0.493534,-0.484018,-0.51049,-0.519751,-0.505826
6,-0.581843,-0.522175,-0.517665,-0.521008,-0.500709,-0.509383,-0.485714,-0.527086,-0.486804,-0.525087
7,-0.547248,-0.529959,-0.519598,-0.529286,-0.553517,-0.44317,-0.539337,-0.471658,-0.507114,-0.532063
8,-0.585771,-0.541329,-0.498363,-0.5601,-0.522388,-0.550922,-0.547562,-0.518882,-0.545079,-0.557273
9,-0.598653,-0.55155,-0.511312,-0.514544,-0.494451,-0.492023,-0.491182,-0.465506,-0.446521,-0.530926


In [46]:
# Value table for states with an unused ace (first axis index 1): rows are
# the player total; the unused dealer-showcard column 0 is dropped.
pd.DataFrame(mce.values[1][:,1:])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 使用threshold strategy的总收益

In [47]:
# Collect the estimated value of every starting situation: each ordered pair of
# player cards combined with each possible dealer show card.
values = []
for first_card in bje.cards:
    for second_card in bje.cards:
        starting_hand = [first_card, second_card]
        ace_index = int(bje.has_unused_ace(starting_hand))
        hand_total = bje.get_max_unbusted_number(starting_hand)
        values.extend(
            mce.values[ace_index, hand_total, dealer_showcard]
            for dealer_showcard in bje.cards
        )

In [48]:
# Mean of the collected starting-situation values.
total_average_value = np.average(values)

In [49]:
print(total_average_value)
# With this naive threshold strategy a game of blackjack has an expected return
# of about -0.33, i.e. the player loses on average however long they play.

-0.3356978396668988
