In [287]:
import numpy as np
import pickle
import pandas as pd

In [396]:
class BlackJack:
    """Simplified blackjack environment with a tabular Q-value learner.

    A state is the tuple ``(player hand value, number of aces still counted
    as 11, dealer's first card)``. Three actions are available at every
    decision point: ``0`` double, ``1`` draw, ``2`` stay.

    ``Q_values`` is not created by ``__init__``; call
    ``initialize_Q_values()`` or ``load_policy()`` before ``choose_action()``
    or ``assign_reward()``.
    """

    def __init__(self, learn_rate=0.01, exp_rate=0.1, n_step=True):
        """Set up an empty table and the learning hyper-parameters.

        Args:
            learn_rate: Step size used by ``assign_reward``.
            exp_rate: Probability that ``choose_action`` picks a random action.
            n_step: When true, ``assign_reward`` performs its additional
                updates on other recorded state/action pairs.
        """
        # Cards are drawn uniformly from this list, with replacement.
        # 11 is an ace; the four 10s presumably stand for 10/J/Q/K.
        self.cards = [2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 11]

        self.player = []
        self.dealer = []

        self.stayed = None
        self.doubled = None
        self.blackjack = None
        self.option_aces = None

        # (player hand value, # of optionable aces, dealer's visible card)
        self.state = (0, 0, 0)

        self.actions = {
            0: self.double,
            1: self.draw,
            2: self.stay,
        }

        self.exp_rate = exp_rate
        self.learn_rate = learn_rate
        self.n_step = n_step

    def initialize_Q_values(self):
        """Initialize every state/action value to 0.

        Covers hand values 4..21, 0 or 1 optionable aces and dealer cards
        2..11, with the three action ids 0..2 for each state.
        """
        self.Q_values = {}
        for hand_value in range(4, 22):
            for aces in range(0, 2):
                for dealer_card in range(2, 12):
                    self.Q_values[(hand_value, aces, dealer_card)] = {
                        action_id: 0 for action_id in range(0, 3)
                    }

    def initialize_game(self):
        """Deal two cards to each side and reset all per-episode flags."""
        self.player = []
        self.dealer = []

        self.draw()
        self.draw()
        self.draw('D')
        self.draw('D')

        self.stayed = False
        self.doubled = False
        # A two-card 21 ends the episode immediately (see check_end).
        self.blackjack = sum(self.player) == 21

        hand_value, aces = self.observe_cards(self.player)
        self.state = (hand_value, aces, self.dealer[0])

        # (state, action_id) pairs recorded by take_action during the episode.
        self.state_actions = []

    def draw(self, party='P'):
        """Draw one random card for the player (``'P'``) or dealer (``'D'``).

        Any other ``party`` value draws a card and discards it.
        """
        card = np.random.choice(self.cards)
        if party == 'P':
            self.player.append(card)
        if party == 'D':
            self.dealer.append(card)

    def double(self):
        """Double the stake, take exactly one more card and end the turn.

        Sets the ``doubled`` flag read by ``score``. Assigning to an attribute
        named ``double`` instead would shadow this method on the instance and
        leave the stake undoubled.
        """
        self.doubled = True
        self.draw()
        self.stayed = True

    def stay(self):
        """End the player's turn without drawing."""
        self.stayed = True

    @staticmethod
    def observe_cards(hand):
        """Return ``(hand value, aces still counted as 11)`` for ``hand``.

        Aces (stored as 11) are downgraded to 1, one at a time, only while
        the hand would otherwise exceed 21.

        Args:
            hand: List of card values.
        """
        hand_value = sum(hand)
        option_aces = hand.count(11)

        while option_aces > 0 and hand_value > 21:
            hand_value -= 10
            option_aces -= 1

        return (hand_value, option_aces)

    def dealer_play(self):
        """Let the dealer draw until the dealer's hand value is at least 17."""
        while self.observe_cards(self.dealer)[0] < 17:
            self.draw('D')

    def score(self):
        """Play out the dealer's hand and return the player's payoff.

        Returns:
            ``0`` on equal hand values, otherwise plus or minus the stake:
            1.5 when the player was dealt a blackjack, 2 when the player
            doubled, 1 otherwise.
        """
        self.dealer_play()

        gain_loss = 2 if self.doubled else 1  # double or not
        gain_loss = 1.5 if self.blackjack else gain_loss  # if blackjack

        player_value = self.observe_cards(self.player)[0]
        dealer_value = self.observe_cards(self.dealer)[0]

        # A busted player loses regardless of what the dealer holds.
        if player_value > 21 or player_value < dealer_value <= 21:
            return -gain_loss

        elif player_value == dealer_value:
            return 0

        else:
            return gain_loss

    def check_end(self):
        """Return True once the player is bust, has stayed or has blackjack."""
        return any((
            self.observe_cards(self.player)[0] > 21,
            self.stayed,
            self.blackjack,
        ))

    def take_action(self, action_id):
        """Execute ``action_id``, record it and refresh ``self.state``.

        The recorded pair holds the state the action was taken from, not the
        state it leads to.
        """
        self.actions[action_id]()

        # Store past state and action
        self.state_actions.append((self.state, action_id))

        # Update state to new state
        hand_value, aces = self.observe_cards(self.player)
        self.state = (hand_value, aces, self.dealer[0])

    def choose_action(self):
        """Pick an action id for the current state, epsilon-greedily."""
        # Exploration
        if np.random.uniform(0, 1) <= self.exp_rate:
            action_id = np.random.choice(list(self.actions.keys()))
        # Greedy (ties resolve to the first action id in the dict)
        else:
            state_values = self.Q_values[self.state]
            action_id = max(state_values, key=state_values.get)

        return action_id

    def assign_reward(self):
        """Score the finished episode and back the result up the trajectory.

        Walks the recorded state/action pairs from last to first. Each value
        is moved towards the running target by ``learn_rate``, and the
        updated value becomes the target for the preceding pair.
        """
        reward = self.score()

        for i, (state, action) in enumerate(reversed(self.state_actions)):
            curr_value = self.Q_values[state][action]
            reward = curr_value + self.learn_rate * (reward - curr_value)
            self.Q_values[state][action] = reward

            # Propagate reward for all past states rather than last one
            # NOTE(review): `i` counts from the END of the episode but is used
            # here to slice the forward-ordered list, and the extra update is
            # scaled by `i` (so it is a no-op on the first pass). Behaviour is
            # kept as-is; confirm the intended n-step rule before relying on it.
            if self.n_step and len(self.state_actions[i:]) > 1:
                for earl_state, earl_action in self.state_actions[i + 1::-1]:
                    earl_value = self.Q_values[earl_state][earl_action]
                    earl_reward = earl_value + self.learn_rate * i * (reward - earl_value)
                    self.Q_values[earl_state][earl_action] = earl_reward

    def save_policy(self, file='policy'):
        """Pickle ``Q_values`` to ``file``."""
        with open(file, 'wb') as fw:
            pickle.dump(self.Q_values, fw)

    def load_policy(self, file='policy'):
        """Replace ``Q_values`` with the table pickled in ``file``.

        Unpickling can execute arbitrary code; only load files you trust.
        """
        with open(file, 'rb') as fr:
            self.Q_values = pickle.load(fr)

In [508]:
# Alternative set-ups (kept for reference, presumably toggled by hand):
#   - start from an all-zero table instead of loading one:
#         BJ = BlackJack(); BJ.initialize_Q_values()
#   - train the n-step variant:
#         policy = 'policy_n_step'; BJ = BlackJack(exp_rate=0.01, n_step=True)
policy = 'policy'
BJ = BlackJack(exp_rate=0.01, n_step=False)
BJ.load_policy(policy)

n_episodes = 10001
checkpoint_every = 10000

for i in range(n_episodes):
    # Play one full episode with the epsilon-greedy policy.
    BJ.initialize_game()
    while not BJ.check_end():
        BJ.take_action(BJ.choose_action())

    # Update the Q table from the episode's outcome.
    BJ.assign_reward()

    # Persist the table on the first episode and on every checkpoint multiple.
    if not i % checkpoint_every:
        BJ.save_policy(file=policy)

In [509]:
BJ.Q_values

{(4, 0, 2): {0: -0.22351683457917343,
  1: -0.11728259017959684,
  2: -0.2661197585329125},
 (4, 0, 3): {0: -0.18158081220365832,
  1: -0.07767619573341197,
  2: -0.191878352290415},
 (4, 0, 4): {0: -0.17445901036319056,
  1: -0.04501513027098267,
  2: -0.13617756108181933},
 (4, 0, 5): {0: -0.10699130745591669,
  1: -0.04028081418320858,
  2: -0.2560606297940849},
 (4, 0, 6): {0: -0.04351239429135396,
  1: 0.029308811973913176,
  2: -0.17813068233445},
 (4, 0, 7): {0: -0.4672361371278004,
  1: -0.13748519792431665,
  2: -0.46623740441432837},
 (4, 0, 8): {0: -0.4984637221370272,
  1: -0.19063589267604,
  2: -0.4006079209719746},
 (4, 0, 9): {0: -0.42303507246778815,
  1: -0.28327476348756786,
  2: -0.4830861344608231},
 (4, 0, 10): {0: -0.7775471338358101,
  1: -0.36134332084645543,
  2: -0.6847054455515267},
 (4, 0, 11): {0: -0.7587137991942633,
  1: -0.4826715941846207,
  2: -0.7287356264560575},
 (4, 1, 2): {0: 0, 1: 0, 2: 0},
 (4, 1, 3): {0: 0, 1: 0, 2: 0},
 (4, 1, 4): {0: 0, 1: 0

In [510]:
# Play 10000 episodes without updating the Q table and bucket each payoff by
# outcome. choose_action still explores with probability exp_rate, so this is
# not a strictly greedy evaluation.
count_results = dict.fromkeys(('win', 'draw', 'loose'), 0)
end_score = {outcome: [] for outcome in count_results}

for _ in range(10000):
    BJ.initialize_game()
    while not BJ.check_end():
        BJ.take_action(BJ.choose_action())

    reward = BJ.score()

    if reward == 0:
        outcome = 'draw'
    elif reward > 0:
        outcome = 'win'
    else:
        outcome = 'loose'

    count_results[outcome] += 1
    end_score[outcome].append(reward)

In [511]:
count_results

{'win': 4211, 'draw': 966, 'loose': 4823}

In [512]:
# All non-zero payoffs in one list (draws are deliberately left out).
total_win_loose = [*end_score['win'], *end_score['loose']]

In [513]:
# Average payoff over the decided games only; drawn games are not in this list.
np.mean(total_win_loose)

-0.043446978082798314

In [514]:
def worded_action(value_dict):
    """Return the name of the best-valued action in ``value_dict``.

    ``value_dict`` maps action ids (0, 1, 2) to values. When several actions
    share the highest value, the one that comes first in the dict wins.
    """
    action_names = {0: 'double', 1: 'draw', 2: 'stay'}
    best_action = max(value_dict, key=lambda action_id: value_dict[action_id])
    return action_names[best_action]


def highlight_action(action):
    """Return the CSS background rule used to colour a cell for ``action``.

    ``action`` must be one of ``'draw'``, ``'double'`` or ``'stay'``; any
    other value raises ``KeyError``.
    """
    palette = {'draw': 'blue', 'double': 'yellow', 'stay': 'red'}
    return f'background-color:{palette[action]}'

In [515]:
# Best action per (hand value, dealer card) for states with no ace counted as 11.
Q_values_no_option = {
    (hand_value, dealer_card): worded_action(action_values)
    for (hand_value, option_aces, dealer_card), action_values in BJ.Q_values.items()
    if option_aces == 0
}

# Rows are player hand values, columns are the dealer's visible card.
df_no_option = pd.DataFrame()
for (hand_value, dealer_card), best_action in Q_values_no_option.items():
    df_no_option.loc[hand_value, dealer_card] = best_action

df_no_option.style.applymap(highlight_action)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11
4,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
5,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
6,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
7,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
8,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
9,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
10,double,draw,double,draw,draw,draw,draw,double,draw,draw
11,draw,double,double,double,double,double,draw,draw,double,draw
12,draw,stay,stay,stay,draw,draw,draw,draw,draw,draw
13,stay,draw,stay,stay,stay,draw,draw,draw,draw,draw


In [516]:
# Best action per (hand value, dealer card) for states with one ace counted as 11.
# States whose values are all still 0 show 'double', because ties resolve to
# the first action id.
Q_values_option = {}
for key in list(BJ.Q_values.keys()):
    if key[1] == 1:
        Q_values_option[(key[0], key[2])] = worded_action(BJ.Q_values[key])

# Rows are player hand values, columns are the dealer's visible card.
# This table needs its own frame; reusing df_no_option here would leave
# df_option undefined and wipe the previous table.
df_option = pd.DataFrame()

for key, value in Q_values_option.items():
    df_option.loc[key[0], key[1]] = value

df_option.style.applymap(highlight_action)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11
4,double,double,double,double,double,double,double,double,double,double
5,double,double,double,double,double,double,double,double,double,double
6,double,double,double,double,double,double,double,double,double,double
7,double,double,double,double,double,double,double,double,double,double
8,double,double,double,double,double,double,double,double,double,double
9,double,double,double,double,double,double,double,double,double,double
10,double,double,double,double,double,double,double,double,double,double
11,double,double,double,double,double,double,double,double,double,double
12,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
13,draw,draw,draw,draw,draw,draw,draw,draw,draw,draw
