In [1]:
import random
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain
from collections import Counter
import pandas as pd

### new strategy: vanilla q learning with weights

In [54]:
class Blackjack:
    def __init__(self):
        self.reset()

        self.state_actions = pd.DataFrame(columns = ['state', 'action'])
        self.rewards = np.array([])
        self.seen = np.array([])

        self.epsilon = 0.99 #0.1
        self.alpha = 0.2

            
    def reset(self):
        self.deck = 4 * (list(range(2, 12)) + [10, 10, 10])
        self.shoe = 8 * self.deck
        random.shuffle(self.shoe)

        self.running_count = 0.

    def get_multiplier(self):
        return 1. + (1. * max(0, self.get_capped_true_count()))

    def is_soft(self, c1, c2):
        return c1 == 11 or c2 == 11

    def get_score(self, hand, c, soft):
        """returns the score and its softness given the existing hand and a new card"""
        if c != 11 and not soft:
            return False, hand + c
    
        if c == 11 and not soft: # if valid with 1 or 11, then is still soft (and add 11). else add 1 only
            if hand + 11 <= 21: # and by extension vaild with hand + 1
                return True, hand + 11
    
            return False, hand + 1
    
        if c != 11 and soft: # if total > 21, then subtract 10 and it's no longer soft. else, add the card and it's still soft 
            if hand + c > 21:
                return False, hand + c - 10
    
            return True, hand + c
    
        # else c == 11 and it's soft
        # my argument: we add one and it remains soft since by construction, a hand cannot be soft
        # unless it is of total at least 11
        return True, hand + 1

    def get_true_count(self):
        """returns the true count exactly based on the running count and length of the shoe"""
        return round(self.running_count / round(len(self.shoe) / 52))

    
    def get_capped_true_count(self):
        return max(min(self.get_true_count(), 7), -7)

    
    def deal_card(self, hidden = False):
        card = self.shoe.pop(0)

        if not hidden:
            self.unhide(card)
    
        return card

    
    def unhide(self, card):
        if card <= 6:
            self.running_count += 1
        elif card >= 10:
            self.running_count -= 1

    
    def blackjack(self, c1, c2):
        return (c1 == 11 and c2 == 10) or (c1 == 10 and c2 == 11)      


    def evaluate_player_hand(self, c1, c2, d1, eval = False):
        """two cards plus the dealer upcard"""
        
        doublable = True
        is_second = False
        first = False

        sequence = []
        
        if c2 == None:
            c2 = self.deal_card()
            splittable = False
            hand = 1

        else:
            splittable = c1 == c2
            hand = 0

        player = 0.
        is_second = False
        multiplier = 1.

        soft, player = self.get_score(player, c1, False)
        soft, player = self.get_score(player, c2, soft)
            
        while player < 21:

            #print(player, soft, d1, doublable, splittable)
            state = (player, soft, d1)
            sequence.append(state)

            options = 2 + doublable + splittable

            inds = self.state_actions.state == state

            if not doublable:
                inds = inds & (self.state_actions.action != 2)

            if not splittable:
                inds = inds & (self.state_actions.action != 3)

            
            if (random.random() < self.epsilon) or inds.sum() == 0:
                if options == 2:
                    action = 0 + (random.random() < 0.5)
    
                elif options == 3:
                    action = 0 + (random.random() < 0.3333) + (random.random() < 0.6666)
    
                else:
                    action = 0 + (random.random() < 0.25) + (random.random() < 0.50) + (random.random() < 0.75)

            else:
                the_max = self.rewards[inds].argmax()
                action = self.state_actions[inds].iloc[the_max].action
                    
            sequence.append(action)
            # now get to this
            
            if action == 0: # stand
                break

            elif action == 1: # hit
                c = self.deal_card()
                soft, player = self.get_score(player, c, soft)

                doublable = False
                splittable = False

            elif action == 2: # double
                
                multiplier = 2.

                c = self.deal_card()
                soft, player = self.get_score(player, c, soft)
                
                break
                
            else: # split

                cnew = self.deal_card()
                is_second = True

                # now reset everything
                player = 0.

                soft, player = self.get_score(player, c1, False)
                soft, player = self.get_score(player, cnew, soft)

                splittable = False


        return player, is_second, multiplier, sequence

    
    def evaluate_dealer_hand(self, d1, d2):
        dealer = 0.
        self.unhide(d2)

        soft, dealer = self.get_score(dealer, d1, False)
        soft, dealer = self.get_score(dealer, d2, soft)

        while dealer < 17:
            d = self.deal_card()
            soft, dealer = self.get_score(dealer, d, soft)

        return dealer

    
    def blackjack_round(self, eval):
        # deal player cards
        c1 = self.deal_card()
        c2 = self.deal_card()

        d1 = self.deal_card()
        d2 = self.deal_card(hidden = True)

        player_natural = self.blackjack(c1, c2)
        dealer_natural = self.blackjack(d1, d2)

        if dealer_natural:
            
            self.unhide(d2)
            if not player_natural:
                return [-1.] # dealer blackjack

            return [0.] # tie

        if player_natural:
            return [1.5] # player blackjack
            

        # evaluate player hand
        player, is_second, multiplier, sequence = self.evaluate_player_hand(c1, c2, d1, eval)

        # if there's a second then do the second
        if is_second:
            player2, _, multiplier2, sequence2 = self.evaluate_player_hand(c2, None, d1, eval) 
        
        # if both busted then we're done
        if player > 21 and not is_second:
            self.unhide(d2)
            return sequence, (-1. * multiplier)

        if player > 21 and is_second and player2 > 21:
            self.unhide(d2)
            return sequence, -1. * multiplier, sequence2, -1 * multiplier2
            
        # if not then process dealer hand
        dealer = self.evaluate_dealer_hand(d1, d2)

        if not is_second:
            if dealer > 21:
                return sequence, 1 * multiplier

            elif player > dealer:
                return sequence, 1 * multiplier

            elif player < dealer:
                return sequence, -1 * multiplier

            return sequence, 0

        else: # then there is a second
            r1 = 0
            r2 = 0

            # first hand
            if player > 21: 
                r1 -= multiplier
                
            elif dealer > 21:
                r1 += multiplier

            elif player > dealer:
                r1 += multiplier

            elif player < dealer:
                r1 -= multiplier
                
            # second hand
            if player2 > 21: 
                r2 -= multiplier2
                
            elif dealer > 21:
                r2 += multiplier2

            elif player2 > dealer:
                r2 += multiplier2

            elif player2 < dealer:
                r2 -= multiplier2

            return sequence, r1, sequence2, r2

    def calibrate(self, w):
        seq, r = w
        r *= 10

        for i in range(int(len(seq) / 2)):
            inds = (self.state_actions.state == seq[2*i]) & (self.state_actions.action == seq[2*i + 1])

            if inds.sum() == 0:
                tup = pd.DataFrame(data = [[None, seq[2*i + 1]]], columns = ['state', 'action'])
                tup['state'] = [seq[2*i]]
                self.state_actions = pd.concat([self.state_actions, tup])
                
                self.rewards = np.append(self.rewards, r)
                self.seen = np.append(self.seen, 1)

            else:
                self.seen[inds] += 1

        # now everything is here. we can do the q learning loop now
        seq_reversed = list(reversed(seq))

        for i in range(int(len(seq) / 2)):
            inds = (self.state_actions.action == seq_reversed[2*i]) & (self.state_actions.state == seq_reversed[2*i + 1])
            
            if i == 0: # terminal state
                self.rewards[inds] = ((1 - self.alpha) * self.rewards[inds]) + (self.alpha * r)

            else:
                inds_for_max = (self.state_actions.state == seq_reversed[2*i + 1])
                r_best = self.rewards[inds_for_max].max()

                self.rewards[inds] = ((1 - self.alpha) * self.rewards[inds]) + (self.alpha * r_best)

    
    def calibrate_split(self, w):
        seq1, r1, seq2, r2 = w

        #print(r1, r2)

        # add the split to the table
        inds = (self.state_actions.state == seq1[0]) & (self.state_actions.action == seq1[1])

        # add to table if necessary
        if inds.sum() == 0:

            #print('before', self.state_actions.shape)
            
            tup = pd.DataFrame(data = [[None, seq1[1]]], columns = ['state', 'action'])
            tup['state'] = [seq1[0]]
            self.state_actions = pd.concat([self.state_actions, tup])
            
            self.rewards = np.append(self.rewards, r1 + r2)
            self.seen = np.append(self.seen, 1)

            #print('after', self.state_actions.shape)

        else:
            self.seen[inds] += 1
            self.rewards[inds] = ((1 - self.alpha) * self.rewards[inds]) + (self.alpha * (r1 + r2))

        # now pass the actual hands through the table
        self.calibrate((seq1[2:], r1))
        self.calibrate((seq2, r2))
        
    
    def do_game(self, eval):

        winnings = 0.
        self.reset()
        
        while len(self.shoe) > 60:

            multiplier = self.get_multiplier()
            w = self.blackjack_round(eval)

            if len(w) == 2:
                self.calibrate(w)
                w_round = w[-1]

            elif len(w) == 4:

                self.calibrate_split(w)
                w_round = w[-1] + w[-3]

            else:
                w_round = w[0]

            winnings += (multiplier * 10 * w_round)

        return winnings
        

    def do_games_until_convergence(self):
        self.winnings = []

        n = 0
        done = False

        while not done:

            w = self.do_game(eval = False)

            if n == 4000:
                done = True

            if n % 10 == 0:
                self.winnings.append(w)
                
            if n % 20 == 0:
                print('=== Game', n, '| Seen s-a pairs:', self.seen.size,'===')

            n += 1
        

In [55]:
bj = Blackjack()
w = bj.do_games_until_convergence()

=== Game 0 | Seen s-a pairs: 73 ===
=== Game 20 | Seen s-a pairs: 509 ===
=== Game 40 | Seen s-a pairs: 626 ===
=== Game 60 | Seen s-a pairs: 687 ===
=== Game 80 | Seen s-a pairs: 730 ===
=== Game 100 | Seen s-a pairs: 750 ===
=== Game 120 | Seen s-a pairs: 762 ===
=== Game 140 | Seen s-a pairs: 775 ===
=== Game 160 | Seen s-a pairs: 786 ===
=== Game 180 | Seen s-a pairs: 792 ===
=== Game 200 | Seen s-a pairs: 801 ===
=== Game 220 | Seen s-a pairs: 810 ===
=== Game 240 | Seen s-a pairs: 815 ===
=== Game 260 | Seen s-a pairs: 825 ===
=== Game 280 | Seen s-a pairs: 827 ===
=== Game 300 | Seen s-a pairs: 830 ===
=== Game 320 | Seen s-a pairs: 836 ===
=== Game 340 | Seen s-a pairs: 840 ===
=== Game 360 | Seen s-a pairs: 843 ===
=== Game 380 | Seen s-a pairs: 844 ===
=== Game 400 | Seen s-a pairs: 846 ===
=== Game 420 | Seen s-a pairs: 849 ===
=== Game 440 | Seen s-a pairs: 852 ===
=== Game 460 | Seen s-a pairs: 854 ===
=== Game 480 | Seen s-a pairs: 855 ===
=== Game 500 | Seen s-a pairs: 8

In [56]:
bj.state_actions

Unnamed: 0,state,action
0,"(6.0, False, 5)",1
0,"(15.0, False, 5)",0
0,"(20.0, False, 8)",1
0,"(20.0, False, 5)",1
0,"(11.0, False, 7)",2
...,...,...
0,"(16.0, False, 8)",3
0,"(12.0, True, 6)",3
0,"(6.0, False, 2)",3
0,"(4.0, False, 7)",0


In [57]:
bj.seen

array([ 212.,  650., 1349., 1287.,  225.,  212.,  249., 1004.,  686.,
        451., 2003.,  358.,  367.,  780., 1285., 4394., 2473.,  120.,
       1175.,  711.,  206.,   11.,  623.,  282.,  183.,  593., 4641.,
         70.,  679., 2511., 1200., 3904.,  695.,  411., 1578., 2458.,
       2350.,  500.,  626., 1139.,  712., 1436.,  146., 1133., 3974.,
       2842., 1149.,  662.,  468.,  677., 1042., 1010.,  133.,  328.,
        598.,  787.,  782.,   62., 1083.,  714., 4138.,  213.,  459.,
       1143., 1200., 4271.,  228.,   86., 1079.,  650.,  423.,  623.,
        616.,  163., 1143.,  339.,  570.,  238.,  131.,  101.,  286.,
        605.,  648., 1065.,  330.,  257.,  562., 2419.,   53.,  561.,
        127., 1081., 1307.,  864.,  413.,  582., 1317., 1136.,  615.,
        335., 1120.,  842.,  345.,  147.,  832., 1298., 1061., 1769.,
       2198.,  607., 1080.,  133.,  210., 1089., 1151., 1170.,  247.,
        409.,  579., 1009., 3719., 1009.,  356.,  986.,  222.,   57.,
        389.,  721.,

In [58]:
bj.rewards

array([ 9.14098837e+00, -4.03657983e-01, -9.26186636e+00, -9.51283242e+00,
       -5.63215747e-01, -8.16635556e+00, -9.57221287e+00, -5.63122573e+00,
       -4.04166742e+00,  1.61787742e+01, -2.38187229e+00, -1.46498346e+01,
       -5.36952088e+00, -1.83029314e+01,  6.56254806e+00, -6.85094494e+00,
       -3.51334021e+00, -4.43562874e+00, -1.56210311e+00,  9.75560323e+00,
       -6.29153118e+00, -3.94979635e-01,  9.93070322e+00,  1.02281880e+01,
        1.08654988e+01,  1.75071492e+00, -7.76925209e+00, -1.17406912e+01,
       -2.64035724e-01,  7.25311997e+00, -1.20249941e+01, -9.55893395e+00,
       -8.19231997e+00, -2.26008299e+00,  1.05868408e+01, -6.70963545e-01,
       -5.18274628e+00,  1.19635410e+01,  4.89203110e+00, -1.72892675e+00,
        8.93801317e+00, -3.03173423e+00,  1.02293511e+01, -8.15143295e+00,
       -8.05741023e+00, -1.64974174e+01, -3.79139115e+00,  1.99192994e+00,
        1.33089333e+01,  8.15419491e+00, -7.15994307e+00, -7.66125421e+00,
        1.43490765e+00, -

In [59]:
bj.state_actions.state

0      (6.0, False, 5)
0     (15.0, False, 5)
0     (20.0, False, 8)
0     (20.0, False, 5)
0     (11.0, False, 7)
           ...        
0     (16.0, False, 8)
0      (12.0, True, 6)
0      (6.0, False, 2)
0      (4.0, False, 7)
0    (14.0, False, 11)
Name: state, Length: 880, dtype: object

In [60]:
inds = (bj.state_actions.state.map(lambda x: x[0]) == 4.0)
bj.state_actions[inds]

Unnamed: 0,state,action
0,"(4.0, False, 10)",2
0,"(4.0, False, 11)",1
0,"(4.0, False, 9)",3
0,"(4.0, False, 9)",1
0,"(4.0, False, 7)",1
0,"(4.0, False, 10)",0
0,"(4.0, False, 5)",2
0,"(4.0, False, 6)",2
0,"(4.0, False, 8)",1
0,"(4.0, False, 3)",2


In [61]:
# out of curiosity, wat is the best hand we've seen over 100 times
inds = bj.seen > 200
print(inds.sum())

rank = bj.rewards[inds].argsort()

393


In [62]:
bj.rewards[inds][rank[:5]]

array([-19.96757525, -19.93924928, -19.3800984 , -19.16918442,
       -18.90652404])

In [63]:
bj.state_actions[inds].iloc[rank[:5]] # the 5 worst

Unnamed: 0,state,action
0,"(19.0, False, 10)",2
0,"(20.0, False, 11)",2
0,"(20.0, False, 4)",2
0,"(16.0, False, 3)",2
0,"(19.0, False, 2)",2


In [64]:
print(bj.rewards[inds][rank[-5:]])
bj.state_actions[inds].iloc[rank[-5:]] # the 5 best

[17.11394892 17.11678605 17.42301962 17.902848   18.18042457]


Unnamed: 0,state,action
0,"(9.0, False, 3)",1
0,"(8.0, False, 6)",1
0,"(9.0, False, 11)",1
0,"(8.0, False, 4)",1
0,"(8.0, False, 7)",1
