In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain
from collections import Counter
import pandas as pd

### New strategy: vanilla Q-learning with weights

In [51]:
class Blackjack:
    """Multi-deck blackjack simulator that learns a playing policy with tabular Q-learning.

    Cards are integers 2-11, where 11 is an ace and the face cards appear as
    extra 10s. A running Hi-Lo count (+1 for cards <= 6, -1 for cards >= 10) is
    updated whenever a card becomes visible and is used to scale the bet.

    The Q-table is kept in three parallel containers:
      * ``state_actions`` - DataFrame with one row per (state, action) pair,
      * ``rewards``       - learned value of each row,
      * ``seen``          - number of times each row was visited.

    A state is the tuple ``(player_total, is_soft, dealer_upcard)``.
    """

    # Action codes stored in the 'action' column and in recorded sequences.
    STAND, HIT, DOUBLE, SPLIT = 0, 1, 2, 3

    # Factor applied to every hand outcome before it enters the Q-table.
    REWARD_SCALE = 10

    # Stake per round before the count-based multiplier is applied.
    BASE_BET = 10

    def __init__(self):
        self.reset()

        # Q-table: row i of state_actions pairs with rewards[i] and seen[i]
        self.state_actions = pd.DataFrame(columns = ['state', 'action'])
        self.rewards = np.array([])
        self.seen = np.array([])

        self.epsilon = 0.99 #0.1   (probability of taking a random action)
        self.alpha = 0.2    # learning rate of the Q-update

    def reset(self):
        """Build and shuffle a fresh 8-deck shoe and zero the running count."""
        self.deck = 4 * (list(range(2, 12)) + [10, 10, 10])
        self.shoe = 8 * self.deck
        random.shuffle(self.shoe)

        self.running_count = 0.

    def get_multiplier(self):
        """Bet multiplier: 1 plus the capped true count when that count is positive."""
        return 1. + (1. * max(0, self.get_capped_true_count()))

    def is_soft(self, c1, c2):
        """Return True when either of the two cards is an ace (11)."""
        return c1 == 11 or c2 == 11

    def get_score(self, hand, c, soft):
        """Add card ``c`` to a hand worth ``hand``.

        Returns a ``(soft, total)`` tuple: whether the new hand is soft,
        followed by its new total.
        """
        if c != 11 and not soft:
            return False, hand + c

        if c == 11 and not soft:
            # count the ace as 11 when that does not bust, otherwise as 1
            if hand + 11 <= 21:
                return True, hand + 11

            return False, hand + 1

        if c != 11 and soft:
            # on overflow the ace already in the hand drops from 11 to 1
            if hand + c > 21:
                return False, hand + c - 10

            return True, hand + c

        # c == 11 on a soft hand: the new ace counts as 1 and the hand stays soft
        # (a soft hand is worth at least 11, so a second 11 can never fit)
        return True, hand + 1

    def get_true_count(self):
        """Running count divided by the number of decks left, rounded to an integer.

        The deck estimate is floored at one so that a nearly exhausted shoe
        (fewer than 26 cards) no longer raises ZeroDivisionError.
        """
        decks_left = max(1, round(len(self.shoe) / 52))
        return round(self.running_count / decks_left)

    def get_capped_true_count(self):
        """True count clamped to the range [-7, 7]."""
        return max(min(self.get_true_count(), 7), -7)

    def deal_card(self, hidden = False):
        """Take the next card from the shoe; count it unless it is dealt face down."""
        card = self.shoe.pop(0)

        if not hidden:
            self.unhide(card)

        return card

    def unhide(self, card):
        """Update the running count for a card that just became visible."""
        if card <= 6:
            self.running_count += 1
        elif card >= 10:
            self.running_count -= 1

    def blackjack(self, c1, c2):
        """Return True when the two cards are an ace plus a ten-valued card."""
        return (c1 == 11 and c2 == 10) or (c1 == 10 and c2 == 11)

    # ------------------------------------------------------------------
    # Q-table helpers
    # ------------------------------------------------------------------

    def _state_mask(self, state):
        """Boolean array marking the table rows whose state equals ``state``."""
        states = self.state_actions['state']
        # explicit element-wise tuple comparison, independent of how pandas
        # treats a tuple operand in ``Series == tuple``
        return np.fromiter((s == state for s in states), dtype = bool, count = len(states))

    def _pair_mask(self, state, action):
        """Boolean array marking the table rows for the pair (state, action)."""
        actions = self.state_actions['action'].to_numpy()
        return self._state_mask(state) & (actions == action)

    def _add_pair(self, state, action, reward):
        """Append a new (state, action) row with initial value ``reward`` and one visit."""
        row = pd.DataFrame({'state': [state], 'action': [action]})

        if self.state_actions.empty:
            self.state_actions = row
        else:
            # ignore_index keeps row labels 0..n-1 instead of all-zero labels
            self.state_actions = pd.concat([self.state_actions, row], ignore_index = True)

        self.rewards = np.append(self.rewards, reward)
        self.seen = np.append(self.seen, 1)

    def _choose_action(self, state, doublable, splittable):
        """Epsilon-greedy choice among the actions that are legal right now.

        With probability ``epsilon``, or when no legal action has been recorded
        for ``state`` yet, a legal action is drawn uniformly at random.
        Otherwise the recorded legal action with the highest value is returned.
        """
        legal = [self.STAND, self.HIT]
        if doublable:
            legal.append(self.DOUBLE)
        if splittable:
            legal.append(self.SPLIT)

        # the table is only scanned when this step exploits
        if random.random() >= self.epsilon:
            actions = self.state_actions['action']
            candidates = self._state_mask(state) & actions.isin(legal).to_numpy(dtype = bool)

            if candidates.any():
                best = self.rewards[candidates].argmax()
                return int(actions.to_numpy()[candidates][best])

        # a single uniform draw; summing independent Bernoulli draws is not uniform
        return random.choice(legal)

    @staticmethod
    def _settle(player, dealer, multiplier):
        """Outcome of one finished hand against the dealer, in units of the bet."""
        if player > 21:
            return -multiplier
        if dealer > 21 or player > dealer:
            return multiplier
        if player < dealer:
            return -multiplier
        return 0

    # ------------------------------------------------------------------
    # Playing a round
    # ------------------------------------------------------------------

    def evaluate_player_hand(self, c1, c2, d1, eval = False):
        """Play one player hand against dealer upcard ``d1``.

        ``c1`` and ``c2`` are the two starting cards. Passing ``c2=None`` marks
        the second hand of a split: its second card is drawn here and it cannot
        be split again. ``eval`` is accepted for interface compatibility and is
        not used.

        Returns ``(player, is_second, multiplier, sequence)``:
          * ``player``     - final total (above 21 means bust),
          * ``is_second``  - True when this hand was split, so a second hand must be played,
          * ``multiplier`` - 2. after a double, otherwise 1.,
          * ``sequence``   - flat list ``[state, action, state, action, ...]``.
        """
        doublable = True
        is_second = False
        multiplier = 1.
        sequence = []

        if c2 is None:
            c2 = self.deal_card()
            splittable = False
        else:
            splittable = c1 == c2

        soft, player = self.get_score(0., c1, False)
        soft, player = self.get_score(player, c2, soft)

        while player < 21:
            state = (player, soft, d1)
            sequence.append(state)

            action = self._choose_action(state, doublable, splittable)
            sequence.append(action)

            if action == self.STAND:
                break

            elif action == self.HIT:
                c = self.deal_card()
                soft, player = self.get_score(player, c, soft)

                # after the first hit only stand / hit remain available
                doublable = False
                splittable = False

            elif action == self.DOUBLE:
                multiplier = 2.

                # exactly one more card, then the hand is over
                c = self.deal_card()
                soft, player = self.get_score(player, c, soft)

                break

            else: # split
                cnew = self.deal_card()
                is_second = True

                # restart this hand from its first card plus the new one
                soft, player = self.get_score(0., c1, False)
                soft, player = self.get_score(player, cnew, soft)

                splittable = False

        return player, is_second, multiplier, sequence

    def evaluate_dealer_hand(self, d1, d2):
        """Reveal the hole card and draw until the dealer total is at least 17."""
        dealer = 0.
        self.unhide(d2)

        soft, dealer = self.get_score(dealer, d1, False)
        soft, dealer = self.get_score(dealer, d2, soft)

        while dealer < 17:
            d = self.deal_card()
            soft, dealer = self.get_score(dealer, d, soft)

        return dealer

    def blackjack_round(self, eval):
        """Deal and play one round.

        The shape of the return value tells the caller what happened:
          * ``[outcome]``                         - a natural decided the round,
          * ``(sequence, r)``                     - one hand was played,
          * ``(sequence1, r1, sequence2, r2)``    - the hand was split.
        Outcomes are in units of the bet (doubling is already included).
        """
        c1 = self.deal_card()
        c2 = self.deal_card()

        d1 = self.deal_card()
        d2 = self.deal_card(hidden = True)

        player_natural = self.blackjack(c1, c2)
        dealer_natural = self.blackjack(d1, d2)

        if dealer_natural or player_natural:
            # the hole card is counted on every path that ends the round,
            # including the one where only the player has a natural
            self.unhide(d2)

            if dealer_natural and player_natural:
                return [0.] # tie

            if dealer_natural:
                return [-1.] # dealer blackjack

            return [1.5] # player blackjack

        player, is_second, multiplier, sequence = self.evaluate_player_hand(c1, c2, d1, eval)

        if not is_second:
            if player > 21:
                # bust: the dealer does not draw, but the hole card is still counted
                self.unhide(d2)
                return sequence, (-1. * multiplier)

            dealer = self.evaluate_dealer_hand(d1, d2)
            return sequence, self._settle(player, dealer, multiplier)

        # the hand was split: play the second hand before the dealer acts
        player2, _, multiplier2, sequence2 = self.evaluate_player_hand(c2, None, d1, eval)

        if player > 21 and player2 > 21:
            self.unhide(d2)
            return sequence, -1. * multiplier, sequence2, -1 * multiplier2

        dealer = self.evaluate_dealer_hand(d1, d2)

        r1 = self._settle(player, dealer, multiplier)
        r2 = self._settle(player2, dealer, multiplier2)

        return sequence, r1, sequence2, r2

    # ------------------------------------------------------------------
    # Learning
    # ------------------------------------------------------------------

    def calibrate(self, w):
        """Update the Q-table from one played hand.

        ``w`` is ``(sequence, r)`` where ``sequence`` is the flat
        ``[state, action, ...]`` list of the hand and ``r`` its outcome.
        Unseen pairs are added with the scaled outcome as their initial value.
        Values are then updated backwards: the last pair moves toward the
        scaled outcome, and every earlier pair moves toward the best value
        recorded for the state that followed it.
        """
        seq, r = w
        r *= self.REWARD_SCALE

        steps = [(seq[k], seq[k + 1]) for k in range(0, len(seq) - 1, 2)]

        # pass 1: make sure every visited pair has a row, and count the visit
        for state, action in steps:
            mask = self._pair_mask(state, action)

            if mask.any():
                self.seen[mask] += 1
            else:
                self._add_pair(state, action, r)

        # pass 2: back the outcome up from the last decision to the first
        next_state = None

        for state, action in reversed(steps):
            mask = self._pair_mask(state, action)

            if next_state is None: # terminal step
                target = r
            else:
                # bootstrap from the state reached AFTER this action
                # NOTE: the max runs over every action recorded for that state,
                # including ones that were not available at that point
                target = self.rewards[self._state_mask(next_state)].max()

            self.rewards[mask] = ((1 - self.alpha) * self.rewards[mask]) + (self.alpha * target)
            next_state = state

    def calibrate_split(self, w):
        """Update the Q-table from a round in which the hand was split.

        ``w`` is ``(seq1, r1, seq2, r2)``. The split decision (the first pair of
        ``seq1``) is credited with the combined outcome of both hands, on the
        same scale that ``calibrate`` uses; the rest of each hand is then
        passed through ``calibrate``.
        """
        seq1, r1, seq2, r2 = w

        split_state, split_action = seq1[0], seq1[1]
        combined = self.REWARD_SCALE * (r1 + r2)

        mask = self._pair_mask(split_state, split_action)

        if mask.any():
            self.seen[mask] += 1
            self.rewards[mask] = ((1 - self.alpha) * self.rewards[mask]) + (self.alpha * combined)
        else:
            self._add_pair(split_state, split_action, combined)

        # now pass the actual hands through the table
        self.calibrate((seq1[2:], r1))
        self.calibrate((seq2, r2))

    def do_game(self, eval):
        """Play one shoe (until 60 or fewer cards remain), learning after every round.

        Returns the total winnings of the shoe. ``eval`` is only forwarded to
        ``blackjack_round``.
        """
        winnings = 0.
        self.reset()

        while len(self.shoe) > 60:

            # the bet is fixed from the count before the cards are dealt
            multiplier = self.get_multiplier()
            w = self.blackjack_round(eval)

            if len(w) == 2:
                self.calibrate(w)
                w_round = w[-1]

            elif len(w) == 4:
                self.calibrate_split(w)
                w_round = w[-1] + w[-3]

            else: # a natural: nothing to learn from
                w_round = w[0]

            winnings += (multiplier * self.BASE_BET * w_round)

        return winnings

    def do_games_until_convergence(self, n_games = 4001):
        """Play ``n_games`` shoes in a row and return the sampled winnings.

        Despite the name there is no convergence test; the loop runs a fixed
        number of games. The winnings of every 10th game are stored in
        ``self.winnings`` (also returned) and progress is printed every 20th game.
        """
        self.winnings = []

        for n in range(n_games):

            w = self.do_game(eval = False)

            if n % 10 == 0:
                self.winnings.append(w)

            if n % 20 == 0:
                print('=== Game', n, '| Seen s-a pairs:', self.seen.size,'===')

        return self.winnings
        

In [52]:
# Train a fresh agent. Long-running: each game plays a whole shoe, and progress
# is printed every 20 games.
bj = Blackjack()
w = bj.do_games_until_convergence()

=== Game 0 | Seen s-a pairs: 62 ===
=== Game 20 | Seen s-a pairs: 311 ===
=== Game 40 | Seen s-a pairs: 390 ===
=== Game 60 | Seen s-a pairs: 438 ===
=== Game 80 | Seen s-a pairs: 465 ===
=== Game 100 | Seen s-a pairs: 499 ===
=== Game 120 | Seen s-a pairs: 522 ===
=== Game 140 | Seen s-a pairs: 542 ===
=== Game 160 | Seen s-a pairs: 559 ===
=== Game 180 | Seen s-a pairs: 584 ===
=== Game 200 | Seen s-a pairs: 592 ===
=== Game 220 | Seen s-a pairs: 605 ===
=== Game 240 | Seen s-a pairs: 622 ===
=== Game 260 | Seen s-a pairs: 632 ===
=== Game 280 | Seen s-a pairs: 643 ===
=== Game 300 | Seen s-a pairs: 651 ===
=== Game 320 | Seen s-a pairs: 659 ===
=== Game 340 | Seen s-a pairs: 667 ===
=== Game 360 | Seen s-a pairs: 673 ===
=== Game 380 | Seen s-a pairs: 678 ===
=== Game 400 | Seen s-a pairs: 684 ===
=== Game 420 | Seen s-a pairs: 692 ===
=== Game 440 | Seen s-a pairs: 696 ===
=== Game 460 | Seen s-a pairs: 700 ===
=== Game 480 | Seen s-a pairs: 703 ===
=== Game 500 | Seen s-a pairs: 7

KeyboardInterrupt: 

In [None]:
# Learned table: one row per (state, action) pair encountered so far
bj.state_actions

In [None]:
# Visit count for each row of bj.state_actions
bj.seen

In [None]:
# Learned reward estimate for each row of bj.state_actions
bj.rewards

In [None]:
# Only the state tuples: (player total, soft flag, dealer upcard)
bj.state_actions.state

In [None]:
# Rows whose state has a player total (first tuple element) of 4
inds = (bj.state_actions.state.map(lambda x: x[0]) == 4.0)
bj.state_actions[inds]

In [None]:
# Out of curiosity: rank the state-action pairs seen more than 200 times by learned reward
inds = bj.seen > 200
print(inds.sum())

# argsort is ascending, so rank[:5] are the worst pairs and rank[-5:] the best
rank = bj.rewards[inds].argsort()

In [None]:
# Learned rewards of the 5 worst frequently seen pairs
bj.rewards[inds][rank[:5]]

In [None]:
bj.state_actions[inds].iloc[rank[:5]] # the 5 worst (lowest learned reward)

In [None]:
# Learned rewards of the 5 best frequently seen pairs, then the pairs themselves
print(bj.rewards[inds][rank[-5:]])
bj.state_actions[inds].iloc[rank[-5:]] # the 5 best (highest learned reward)