In [1]:
import torch
import random
import torch.nn as nn
import matplotlib.pyplot as plt
from itertools import chain
from collections import Counter

### New strategy: generate a very large number of random rounds and recalibrate afterwards. This better accounts for stochasticity.

In [184]:
class Blackjack:
    def __init__(self):
        self.reset()

        self.trajectories = []

        self.terminal_states = {} # dictionary of dictionaries
        self.non_terminal_states = {} # dictionary of tuples

        self.Q = {}

    def init_q(self):
        for x in self.trajectories:
            J = len(x) - 1

            for j in range(J):
                self.Q[x[j]] = 0.

    def calibrate_q(self):
        for x in self.Q.keys():
            numerator = 0.
            denominator = 0.

            if x in self.terminal_states.keys():
                numerator += self.terminal_states[x][0]
                denominator += self.terminal_states[x][1]

                #print(x, numerator, denominator)

            if x in self.non_terminal_states.keys():
                
                for y, n in self.non_terminal_states[x].items():

                    other_y_options = [(*y[:-1], i) for i in range(4)]

                    best_mean = -100.
                    best_bottom = 0.
                    
                    for yi in other_y_options:
                        if yi not in self.Q.keys():
                            continue

                        if self.Q[yi] > best_mean:
                            best_mean = self.Q[yi]
                            best_bottom = n

                    numerator += best_mean * n
                    denominator += n

                #print(x, numerator, denominator)

            self.Q[x] = numerator / denominator


    def recalibrate_aux(self):
        for x in self.trajectories:

            # non terminal states
            J = len(x) - 2

            for j in range(J):
                if x[j] not in self.non_terminal_states.keys():
                    sub_dict = {x[j+1]: 1}
                    self.non_terminal_states[x[j]] = sub_dict

                else:
                    if x[j+1] not in self.non_terminal_states[x[j]].keys():
                        self.non_terminal_states[x[j]][x[j+1]] = 1
                        
                    else:
                        self.non_terminal_states[x[j]][x[j+1]] += 1

            # terminal state
            if x[-2] not in self.terminal_states.keys():
                count_and_reward = [x[-1], 1]
                self.terminal_states[x[-2]] = count_and_reward

            else:
                self.terminal_states[x[-2]][0] += x[-1]
                self.terminal_states[x[-2]][1] += 1

            
    def reset(self):
        self.deck = 4 * (list(range(2, 12)) + [10, 10, 10])
        self.shoe = 8 * self.deck
        random.shuffle(self.shoe)

        self.policy = None
        self.value = None

        self.running_count = 0.

    def get_multiplier(self):
        return 1. + (1. * max(0, self.get_capped_true_count()))

    def is_soft(self, c1, c2):
        return c1 == 11 or c2 == 11

    def get_score(self, hand, c, soft):
        """returns the score and its softness given the existing hand and a new card"""
        if c != 11 and not soft:
            return False, hand + c
    
        if c == 11 and not soft: # if valid with 1 or 11, then is still soft (and add 11). else add 1 only
            if hand + 11 <= 21: # and by extension vaild with hand + 1
                return True, hand + 11
    
            return False, hand + 1
    
        if c != 11 and soft: # if total > 21, then subtract 10 and it's no longer soft. else, add the card and it's still soft 
            if hand + c > 21:
                return False, hand + c - 10
    
            return True, hand + c
    
        # else c == 11 and it's soft
        # my argument: we add one and it remains soft since by construction, a hand cannot be soft
        # unless it is of total at least 11
        return True, hand + 1

    def get_true_count(self):
        """returns the true count exactly based on the running count and length of the shoe"""
        return round(self.running_count / round(len(self.shoe) / 52))

    def get_capped_true_count(self):
        return 0. #max(min(self.get_true_count(), 2), -2)

    def deal_card(self, hidden = False):
        card = self.shoe.pop(0)

        if not hidden:
            self.unhide(card)
    
        return card

    def unhide(self, card):
        if card <= 6:
            self.running_count += 1
        elif card >= 10:
            self.running_count -= 1

    def blackjack(self, c1, c2):
        return (c1 == 11 and c2 == 10) or (c1 == 10 and c2 == 11)

    def get_action(self, doublable, splittable):
        p = 0.3 if doublable else 0.01
        q = 0.3 if splittable else 0.01
        
        probabilities = torch.tensor([(1 - p - q) / 2, (1 - p - q) / 2, p, q])
        return torch.distributions.Categorical(probabilities).sample().item()

    def get_trained_action(self, y):
        other_y_options = [(*y[:-1], i) for i in range(4)]

        best_option = 0
        value = -1e10
        
        for yi in other_y_options:
            if yi in bj.Q.keys():
                if bj.Q[yi] > value:
                    best_option = yi[-1]
                    value = bj.Q[yi]
                    
        return best_option


    def evaluate_player_hand(self, c1, c2, d1, eval = False):
        """two cards plus the dealer upcard"""
        
        doublable = True
        is_second = False
        first = False
        
        if c2 == None:
            c2 = self.deal_card()
            splittable = False
            hand = 1

        else:
            splittable = c1 == c2
            hand = 0

        player = 0.
        is_second = False
        multiplier = 1.

        soft, player = self.get_score(player, c1, False)
        soft, player = self.get_score(player, c2, soft)

            
        while player < 21:

            if not eval:
                action = self.get_action(doublable, splittable)
                self.trajectory.append((soft, player, doublable, splittable, self.get_capped_true_count(), hand, is_second, d1, action))
                
            else:
                action = self.get_trained_action((soft, player, doublable, splittable, self.get_capped_true_count(), hand, is_second, d1, None))


            #print('State:', 'soft' if soft else '', player, 'on', d1, '| Action:', action)
            
            if action == 0: # stand
                break

            elif action == 1: # hit
                c = self.deal_card()
                soft, player = self.get_score(player, c, soft)

                doublable = False
                splittable = False

            elif action == 2: # double
                
                # punish if not allowed
                if not doublable:
                    return _, _, _, True
                
                multiplier = 2.

                c = self.deal_card()
                soft, player = self.get_score(player, c, soft)
                
                break
                
            else: # split

                # punish if not allowed
                if not splittable:
                    return _, _, _, True

                cnew = self.deal_card()
                is_second = True

                # now reset everything
                player = 0.

                soft, player = self.get_score(player, c1, False)
                soft, player = self.get_score(player, cnew, soft)

                splittable = False

        #print('Exiting with', player)
                
        return player, is_second, multiplier, False

    def evaluate_dealer_hand(self, d1, d2):
        dealer = 0.
        self.unhide(d2)

        soft, dealer = self.get_score(dealer, d1, False)
        soft, dealer = self.get_score(dealer, d2, soft)

        while dealer < 17:
            d = self.deal_card()
            soft, dealer = self.get_score(dealer, d, soft)

        return dealer

    def blackjack_round(self, eval):
        # deal player cards
        c1 = self.deal_card()
        c2 = self.deal_card()

        d1 = self.deal_card()
        d2 = self.deal_card(hidden = True)

        player_natural = self.blackjack(c1, c2)
        dealer_natural = self.blackjack(d1, d2)

        if dealer_natural:
            
            self.unhide(d2)
            if not player_natural:
                return -1. # dealer blackjack

            return 0. # tie

        if player_natural:
            return 1.5 # player blackjack
            

        # evaluate player hand
        player, is_second, multiplier, did_illegal_move = self.evaluate_player_hand(c1, c2, d1, eval)
        
        if did_illegal_move:
            return -33.

        # if there's a second then do the second
        if is_second:
            player2, _, multiplier2, did_illegal_move = self.evaluate_player_hand(c2, None, d1, eval)

            if did_illegal_move:
                return -33.    
        
        # if both busted then we're done
        if player > 21 and not is_second:
            self.unhide(d2)
            return -1. * multiplier

        if player > 21 and is_second and player2 > 21:
            self.unhide(d2)
            return -1. * (multiplier + multiplier2)
            
        # if not then process dealer hand
        dealer = self.evaluate_dealer_hand(d1, d2)

        #if is_second:
        #    print('At this point we have player', player, 'and second player', player2, 'and dealer', dealer)
        #print('Dealer hit to', dealer)

        if not is_second:
            if dealer > 21:
                return 1 * multiplier

            elif player > dealer:
                return 1 * multiplier

            elif player < dealer:
                return -1 * multiplier

            return 0

        else: # then there is a second
            final_return = 0.

            # first hand
            if player > 21: 
                final_return -= multiplier
                
            elif dealer > 21:
                final_return += multiplier

            elif player > dealer:
                final_return += multiplier

            elif player < dealer:
                final_return -= multiplier
                
            # second hand
            if player2 > 21: 
                final_return -= multiplier2
                
            elif dealer > 21:
                final_return += multiplier2

            elif player2 > dealer:
                final_return += multiplier2

            elif player2 < dealer:
                final_return -= multiplier2

            return final_return


    def do_game(self, eval):

        winnings = 0.
        hands = 0.

        self.shoe = [6, 6] + self.shoe
        
        while len(self.shoe) > 60:

            self.trajectory = []

            multiplier = self.get_multiplier()
            w = self.blackjack_round(eval)

            winnings += (multiplier * 10 * w)
            hands += 1

            if len(self.trajectory) == 0:
                continue # skip if it's a blackjack, no processing to do
                
            self.trajectory.append(w)
            self.trajectories.append(self.trajectory)

            #print('Won', w)
            #print()

        return winnings / hands

    def do_games(self, n_games, eval = False):

        winnings = []
        
        for n in range(n_games):
            self.reset() # reset everything
            
            w = self.do_game(eval)
            winnings.append(w)


            if n % 4000 == 0:
                print('Game', n, '...')

        return winnings
        

In [None]:
# Fix both random number generators so that Restart & Run All reproduces the same games:
# the shoe is shuffled with `random`, exploratory actions are sampled with `torch`.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Play 10000 games with random (exploratory) actions; every decision is recorded in
# bj.trajectories and w holds the average winnings per hand of each game.
bj = Blackjack()
w = bj.do_games(10000)

Game 0 ...


In [None]:
# Distribution of the per-game results held in w (one value per game).
fig, ax = plt.subplots()
ax.hist(w, bins = 30)
ax.set(title = 'Average winnings per hand, by game',
       xlabel = 'average winnings per hand',
       ylabel = 'number of games');

In [None]:
# Build the learning tables from the recorded games, then iterate the value backup.
bj.init_q()            # one zero-valued Q entry per state-action found in bj.trajectories
bj.recalibrate_aux()   # count transitions and accumulate end-of-round rewards

# Each sweep updates Q in place from the tables above; repeating it lets end-of-round
# rewards reach decisions made earlier in a round.
# NOTE(review): 50 sweeps is a fixed number with no convergence check - confirm it is enough.
for _ in range(50):
    bj.calibrate_q()


In [None]:
# Play 200 games using the learned table.
w = bj.do_games(200, eval = True) # keep in mind the calibration cell above must be run first to populate the Q
print(sum(w) / len(w))

fig, ax = plt.subplots()
ax.hist(w, bins = 30)
ax.set(title = 'Average winnings per hand, by game (learned policy)',
       xlabel = 'average winnings per hand',
       ylabel = 'number of games');

## best hands by payout (1.0 scale)

In [None]:
# List every state-action whose learned value exceeds 2.0.
high_value_entries = [(state_action, value) for state_action, value in bj.Q.items() if value > 2.0]

for state_action, value in high_value_entries:
    print(state_action, value)

In [None]:
# Show the learned value of every known action at one state (the last slot of y is the action).
y = (True, 20.0, True, False, 0, 0, True, 2, 2)
other_y_options = [y[:-1] + (i,) for i in range(4)]

for yi in other_y_options:
    if yi not in bj.Q:
        continue
    print(yi, bj.Q[yi])

In [None]:
# tuple layout: soft, player, doublable, splittable, self.get_capped_true_count(), hand, is_second, d1, action
y = (False, 20.0, True, True, 0, 0, False, 6, 2)
other_y_options = [y[:-1] + (i,) for i in range(4)]

# print the learned value of every action that has an entry for this state
for yi in other_y_options:
    if yi not in bj.Q:
        continue
    print(yi, bj.Q[yi])

## how many hands

In [None]:
# Flatten all recorded rounds into one list of state-action tuples
# (the last element of each trajectory is the reward and is dropped).
state_action_pairs = [
    pair
    for trajectory in bj.trajectories
    for pair in trajectory[:-1]
]
len(state_action_pairs)

In [None]:
# Number of distinct state-action tuples that were visited.
table = Counter(state_action_pairs)
len(table)

In [None]:
# This was 4306 (and then we can allow the count to vary and just multiply :)