In [1]:
import rlcard
import numpy as np
import random
import gym
from tqdm import tqdm

In [2]:
all_cards = ['SA','S2','S3','S4','S5','S6','S7','S8','S9','ST','SJ','SQ','SK',
             'HA','H2','H3','H4','H5','H6','H7','H8','H9','HT','HJ','HQ','HK',
             'CA','C2','C3','C4','C5','C6','C7','C8','C9','CT','CJ','CQ','CK',
             'DA','D2','D3','D4','D5','D6','D7','D8','D9','DT','DJ','DQ','DK']

In [3]:
class BlackjackEnv(gym.Env):
    """Wraps Blackjack as an OpenAI Gym environment."""
    
    def __init__(self):
        """Defines Action and Observation Spaces for the environment"""
        self._rlcard_env = rlcard.make('blackjack')
        self.action_space = gym.spaces.Discrete(self._rlcard_env.num_actions)
        self.observation_space = gym.spaces.Box(0, 31, shape=self._rlcard_env.state_shape[0], dtype=np.int32)

    def seed(self, seed):
        """Seed for generation of random behavior."""
        self._rlcard_env.seed(seed)

    def reset(self):
        """Resets the environment to an initial state and returns the initial observation."""
        all_obs, _ = self._rlcard_env.reset()
        obs = tuple(all_obs["obs"].tolist())
        player0_hand = all_obs["raw_obs"]["player0 hand"]
        dealer_hand = all_obs["raw_obs"]["dealer hand"]
        unknown_card_list = [card for card in all_cards if card not in player0_hand+dealer_hand]
        
        return obs, player0_hand, dealer_hand, unknown_card_list

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        
        Args:
            action: an action provided by the agent
        
        Returns:
            observation: an observation provided by the environment
            reward (float): the reward returned as a result of taking the action
            terminated (bool): whether a terminal state is reached
            info (dictionary): an empty dictionary to conform to gym.Env        
        """
        all_obs, _ = self._rlcard_env.step(action)
        obs = tuple(all_obs["obs"].tolist())
        player0_hand = all_obs["raw_obs"]["player0 hand"]
        dealer_hand = all_obs["raw_obs"]["dealer hand"]
        unknown_card_list = [card for card in all_cards if card not in player0_hand+dealer_hand]
        done = False
        reward = 0.0
        if self._rlcard_env.is_over():
            done = True
            reward = float(self._rlcard_env.get_payoffs()[0])
        return obs, player0_hand, dealer_hand, unknown_card_list, reward, done, {}

In [4]:
rank2score = {"A":11, "2":2, "3":3, "4":4, "5":5, "6":6, "7":7, "8":8, "9":9, "T":10, "J":10, "Q":10, "K":10}
def get_score(hand):
    score = 0
    count_a = 0
    for card in hand:
        score += rank2score[card[1:]]
        if card[1] == 'A':
            count_a += 1
    while score > 21 and count_a > 0:
        count_a -= 1
        score -= 10
    return score

In [5]:
def player0_hit_card(player0_hand, remain_card_list): 
    now_player0_hand = player0_hand.copy()
    now_remain_card_list = remain_card_list.copy()
    
    card = random.choice(now_remain_card_list)
    now_remain_card_list.remove(card)
    now_player0_hand.append(card)
    now_player0_score = get_score(now_player0_hand)
    return now_player0_score, now_remain_card_list
    
def dealer_draw_card(dealer_score, hidden, dealer_hand, remain_card_list):
    now_dealer_hand = dealer_hand.copy()
    now_remain_card_list = remain_card_list.copy()
    now_dealer_score = dealer_score
    while now_dealer_score<17:
        #dealer draws
        card = random.choice(now_remain_card_list)
        now_remain_card_list.remove(card)
        now_dealer_hand.append(card)
        now_dealer_score = get_score(now_dealer_hand + [hidden])
    return now_dealer_score

def policy(obs, player0_hand, dealer_hand, unknown_card_list, trials =1000): 
    assert obs[0]<=21
    
#     if obs[0]<17:      
        
    player0_hit_wintie_trials = 0
    player0_stand_wintie_trials = 0
    #loop for all unknown cards as hidden card in dealer's hand
    for hidden in unknown_card_list:
        dealer_score = get_score(dealer_hand+[hidden])
        remain_card_list = [card for card in unknown_card_list if card != hidden] 
        if dealer_score >= 17:
            # if the dealer already >=17, directly compare the score
            #hit win trials
            for _ in range(trials):
                now_player0_score, _ = player0_hit_card(player0_hand, remain_card_list)
                if now_player0_score<=21 and now_player0_score>=dealer_score:
                    player0_hit_wintie_trials += 1

            #stand win trials
            if obs[0]>= dealer_score:
                player0_stand_wintie_trials += trials

        else:
            #if the dealer <17, still need to draw
            #hit win trials
            for _ in range(trials):
                player0_score, now_remain_card_list = player0_hit_card(player0_hand, remain_card_list)
                # if player0 >21, bust and pass
                if player0_score>21: 
                    continue
                else:
                    dealer_score_after_draw = dealer_draw_card(dealer_score, hidden, dealer_hand, now_remain_card_list)
                    if dealer_score_after_draw >21: #dealer busts
                         player0_hit_wintie_trials += 1
                    else: #dealer doesn't bust but small than player
                        if dealer_score_after_draw<=player0_score:
                            player0_hit_wintie_trials += 1

            #stand win trials
            for _ in range(trials):
                dealer_score_after_draw = dealer_draw_card(dealer_score, hidden, dealer_hand, remain_card_list)
                if dealer_score_after_draw >21:
                     player0_stand_wintie_trials += 1
                else:
                    if dealer_score_after_draw<=obs[0]:
                        player0_stand_wintie_trials += 1
        
    if player0_hit_wintie_trials>player0_stand_wintie_trials:
        return 'hit'
    else: return 'stand'

    

In [6]:
env = BlackjackEnv()
actions = {'hit':0, 'stand':1}


In [12]:
mean_reward_list_10 = []
# 10 trials
for j in range(10):
    reward_list = []
    with tqdm(range(1000)) as tbar:
        for _ in tbar:
            obs, player0_hand, dealer_hand, unknown_card_list = env.reset()
            while True:
                action = policy(obs, player0_hand, dealer_hand, unknown_card_list, trials=10)
                action = actions[action]
                obs, player0_hand, dealer_hand, unknown_card_list, reward, done, _ = env.step(action)
                if done:
                    reward_list.append(reward)
                    tbar.set_postfix({'mean reward':sum(reward_list)/len(reward_list)})
                    break
    mean_reward_list_10.append(sum(reward_list)/len(reward_list))

100%|███████████████████| 1000/1000 [00:05<00:00, 184.75it/s, mean reward=-.079]
100%|███████████████████| 1000/1000 [00:05<00:00, 188.39it/s, mean reward=-.075]
100%|███████████████████| 1000/1000 [00:05<00:00, 190.14it/s, mean reward=-.054]
100%|████████████████████| 1000/1000 [00:05<00:00, 190.32it/s, mean reward=-.07]
100%|███████████████████| 1000/1000 [00:05<00:00, 191.54it/s, mean reward=-.064]
100%|███████████████████| 1000/1000 [00:05<00:00, 190.98it/s, mean reward=-.039]
100%|███████████████████| 1000/1000 [00:05<00:00, 191.95it/s, mean reward=-.055]
100%|███████████████████| 1000/1000 [00:05<00:00, 189.11it/s, mean reward=-.022]
100%|███████████████████| 1000/1000 [00:05<00:00, 192.91it/s, mean reward=-.042]
100%|███████████████████| 1000/1000 [00:05<00:00, 190.42it/s, mean reward=-.094]


In [14]:
mean_reward_10 = np.array(mean_reward_list_10)
print(mean_reward_10.mean())
print(mean_reward_10.std())

-0.059399999999999994
0.020308618859981593


In [18]:
mean_reward_list_50 = []
# 50 trials
for j in range(10):
    reward_list = []
    with tqdm(range(1000)) as tbar:
        for _ in tbar:
            obs, player0_hand, dealer_hand, unknown_card_list = env.reset()
            while True:
                action = policy(obs, player0_hand, dealer_hand, unknown_card_list, trials=50)
                action = actions[action]
                obs, player0_hand, dealer_hand, unknown_card_list, reward, done, _ = env.step(action)
                if done:
                    reward_list.append(reward)
                    tbar.set_postfix({'mean reward':sum(reward_list)/len(reward_list)})
                    break
    mean_reward_list_50.append(sum(reward_list)/len(reward_list))

100%|████████████████████| 1000/1000 [00:19<00:00, 51.38it/s, mean reward=-.048]
100%|████████████████████| 1000/1000 [00:19<00:00, 51.96it/s, mean reward=-.004]
100%|████████████████████| 1000/1000 [00:19<00:00, 51.15it/s, mean reward=-.064]
100%|████████████████████| 1000/1000 [00:19<00:00, 51.50it/s, mean reward=-.068]
100%|████████████████████| 1000/1000 [00:18<00:00, 52.86it/s, mean reward=-.041]
100%|████████████████████| 1000/1000 [00:19<00:00, 50.55it/s, mean reward=-.081]
100%|████████████████████| 1000/1000 [00:18<00:00, 52.80it/s, mean reward=-.069]
100%|████████████████████| 1000/1000 [00:19<00:00, 51.88it/s, mean reward=-.057]
100%|████████████████████| 1000/1000 [00:19<00:00, 51.75it/s, mean reward=-.036]
100%|████████████████████| 1000/1000 [00:19<00:00, 52.32it/s, mean reward=-.047]


In [19]:
mean_reward_50 = np.array(mean_reward_list_50)
print(mean_reward_50.mean())
print(mean_reward_50.std())

-0.051500000000000004
0.020674863965695157


In [43]:
mean_reward_list_100 = []
# 100 trials
for j in range(10):
    reward_list = []
    with tqdm(range(1000)) as tbar:
        for _ in tbar:
            obs, player0_hand, dealer_hand, unknown_card_list = env.reset()
            while True:
                action = policy(obs, player0_hand, dealer_hand, unknown_card_list, trials=100)
                action = actions[action]
                obs, player0_hand, dealer_hand, unknown_card_list, reward, done, _ = env.step(action)
                if done:
                    reward_list.append(reward)
                    tbar.set_postfix({'mean reward':sum(reward_list)/len(reward_list)})
                    break
    mean_reward_list_100.append(sum(reward_list)/len(reward_list))

100%|████████████████████| 1000/1000 [00:36<00:00, 27.58it/s, mean reward=-.023]
100%|████████████████████| 1000/1000 [00:37<00:00, 26.80it/s, mean reward=-.061]
100%|████████████████████| 1000/1000 [00:36<00:00, 27.68it/s, mean reward=-.068]
100%|████████████████████| 1000/1000 [00:37<00:00, 26.85it/s, mean reward=-.073]
100%|████████████████████| 1000/1000 [00:37<00:00, 26.91it/s, mean reward=-.032]
100%|████████████████████| 1000/1000 [00:35<00:00, 28.08it/s, mean reward=-.072]
100%|████████████████████| 1000/1000 [00:35<00:00, 28.10it/s, mean reward=-.044]
100%|████████████████████| 1000/1000 [00:36<00:00, 27.24it/s, mean reward=0.014]
100%|████████████████████| 1000/1000 [00:35<00:00, 27.96it/s, mean reward=-.022]
100%|████████████████████| 1000/1000 [00:37<00:00, 26.92it/s, mean reward=-.029]


In [49]:
mean_reward_100 = np.array(mean_reward_list_100)
print(mean_reward_100.mean())
print(mean_reward_100.std())

-0.041
0.026566896694947266


In [43]:
# 500 trials
mean_reward_list_500 = []
# 100 trials
for j in range(10):
    reward_list = []
    with tqdm(range(1000)) as tbar:
        for _ in tbar:
            obs, player0_hand, dealer_hand, unknown_card_list = env.reset()
            while True:
                action = policy(obs, player0_hand, dealer_hand, unknown_card_list, trials=500)
                action = actions[action]
                obs, player0_hand, dealer_hand, unknown_card_list, reward, done, _ = env.step(action)
                if done:
                    reward_list.append(reward)
                    tbar.set_postfix({'mean reward':sum(reward_list)/len(reward_list)})
                    break
    mean_reward_list_500.append(sum(reward_list)/len(reward_list))

100%|████████████████████| 1000/1000 [02:55<00:00,  5.70it/s, mean reward=-.019]
100%|████████████████████| 1000/1000 [02:51<00:00,  5.83it/s, mean reward=-.054]
100%|████████████████████| 1000/1000 [02:50<00:00,  5.85it/s, mean reward=-.083]
100%|████████████████████| 1000/1000 [02:50<00:00,  5.88it/s, mean reward=-.049]
100%|████████████████████| 1000/1000 [02:54<00:00,  5.74it/s, mean reward=-.055]
100%|████████████████████| 1000/1000 [02:54<00:00,  5.73it/s, mean reward=-.035]
100%|████████████████████| 1000/1000 [02:47<00:00,  5.95it/s, mean reward=-.033]
100%|█████████████████████| 1000/1000 [02:53<00:00,  5.76it/s, mean reward=-.07]
100%|████████████████████| 1000/1000 [02:55<00:00,  5.69it/s, mean reward=-.033]
100%|████████████████████| 1000/1000 [02:53<00:00,  5.78it/s, mean reward=-.047]


In [44]:
mean_reward_500 = np.array(mean_reward_list_500)
print(mean_reward_500.mean())
print(mean_reward_500.std())

-0.0478
0.018043281298034456


In [38]:
# 1000 trials
mean_reward_list_1000 = []
# 100 trials
for j in range(10):
    reward_list = []
    with tqdm(range(1000)) as tbar:
        for _ in tbar:
            obs, player0_hand, dealer_hand, unknown_card_list = env.reset()
            while True:
                action = policy(obs, player0_hand, dealer_hand, unknown_card_list, trials=1000)
                action = actions[action]
                obs, player0_hand, dealer_hand, unknown_card_list, reward, done, _ = env.step(action)
                if done:
                    reward_list.append(reward)
                    tbar.set_postfix({'mean reward':sum(reward_list)/len(reward_list)})
                    break
    mean_reward_list_1000.append(sum(reward_list)/len(reward_list))

100%|████████████████████| 1000/1000 [05:52<00:00,  2.84it/s, mean reward=-.052]
100%|████████████████████| 1000/1000 [05:43<00:00,  2.91it/s, mean reward=-.018]
100%|████████████████████| 1000/1000 [05:44<00:00,  2.90it/s, mean reward=0.003]
100%|████████████████████| 1000/1000 [05:38<00:00,  2.95it/s, mean reward=-.035]
100%|████████████████████| 1000/1000 [05:39<00:00,  2.94it/s, mean reward=-.072]
100%|████████████████████| 1000/1000 [05:44<00:00,  2.90it/s, mean reward=-.105]
100%|████████████████████| 1000/1000 [05:45<00:00,  2.89it/s, mean reward=-.062]
100%|████████████████████| 1000/1000 [05:42<00:00,  2.92it/s, mean reward=0.037]
100%|████████████████████| 1000/1000 [05:35<00:00,  2.98it/s, mean reward=-.075]
100%|████████████████████| 1000/1000 [05:39<00:00,  2.94it/s, mean reward=-.051]


In [40]:
mean_reward_1000 = np.array(mean_reward_list_1000)
print(mean_reward_1000.mean())
print(mean_reward_1000.std())

-0.043
0.03926830783214372


In [None]:
def chart_policy(dealer_hand, player0_hand):
    
    if get_score(player0_hand) == 21:
        return 1
    
    
    
    dealer_dim = ['2','3','4','5','6','7','8','9','T','J','Q','K','A']
    hard_dim = [i for i in range(20, 3, -1)] #[20~4]
    soft_dim = [i for i in range(21, 1, -1)] #[21~2]
    
    hard_chart = 1-np.array(
        [
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,1,1,1,1,1,1,1,1],
         [0,0,0,0,0,1,1,1,1,1,1,1,1],
         [0,0,0,0,0,1,1,1,1,1,1,1,1],
         [0,0,0,0,0,1,1,1,1,1,1,1,1],
         [1,1,0,0,0,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1]])
    soft_chart = 1-np.array(
        [[0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,0,0,0,0,0,0],
         [0,0,0,0,0,0,0,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],
         [1,1,1,1,1,1,1,1,1,1,1,1,1],])
    
    has_A = False
    no_A_hand = player0_hand.copy()
    for card in player0_hand:
        if card[1] == 'A':
            has_A = True
            no_A_hand.remove(card)
            break
            
    dealer_index = dealer_dim.index(dealer_hand[0][1])
    
    if has_A:
        soft_score = int(get_score(no_A_hand))
        soft_index = soft_dim.index(soft_score)
        
        action = soft_chart[soft_index, dealer_index]
    else:
        hard_score = int(get_score(player0_hand))
        hard_index = hard_dim.index(hard_score)
        action = hard_chart[hard_index, dealer_index]
    
    return action
   

In [41]:
mean_chart_reward_list = []
for j in range(10):
    reward_list = []
    with tqdm(range(1000)) as tbar:
        for _ in tbar:
            obs, player0_hand, dealer_hand, unknown_card_list = env.reset()
            while True:
                action = chart_policy(dealer_hand, player0_hand)
                obs, player0_hand, dealer_hand, unknown_card_list, reward, done, _ = env.step(action)
                if done:
                    reward_list.append(reward)
                    tbar.set_postfix({'mean reward':sum(reward_list)/len(reward_list)})
                    break
    mean_chart_reward_list.append(sum(reward_list)/len(reward_list))

100%|███████████████████| 1000/1000 [00:02<00:00, 345.06it/s, mean reward=-.012]
100%|███████████████████| 1000/1000 [00:02<00:00, 346.83it/s, mean reward=-.033]
100%|███████████████████| 1000/1000 [00:02<00:00, 348.32it/s, mean reward=-.015]
100%|███████████████████| 1000/1000 [00:02<00:00, 338.65it/s, mean reward=-.098]
100%|███████████████████| 1000/1000 [00:02<00:00, 338.09it/s, mean reward=-.052]
100%|███████████████████| 1000/1000 [00:02<00:00, 347.09it/s, mean reward=-.028]
100%|███████████████████| 1000/1000 [00:02<00:00, 341.22it/s, mean reward=-.034]
100%|████████████████████| 1000/1000 [00:02<00:00, 344.32it/s, mean reward=-.02]
100%|███████████████████| 1000/1000 [00:02<00:00, 342.13it/s, mean reward=0.003]
100%|███████████████████| 1000/1000 [00:02<00:00, 339.31it/s, mean reward=-.076]


In [42]:
mean_chart_reward = np.array(mean_chart_reward_list)
print(mean_chart_reward.mean())
print(mean_chart_reward.std())

-0.036500000000000005
0.029272000273298713
