In [2]:
import random
import sys
import time
import pandas as pd
import numpy as np

In [9]:
class QLearn:
    def __init__(self):
        # key as states
        # value will have {Action: self.QVal}
        # initiate self.Q Table 
        self.QTable = {}
        for i in range(22):
            self.QTable[i] = {'h':0, 's':0}
        # initiate alpha,gamma
    
    def getMax(self,newstate):
        if newstate not in self.QTable:
            return 0
        hit = self.QTable[newstate]['h']
        stick = self.QTable[newstate]['s']
        return max(hit,stick)
    
    def myRand(self, a, p_a):
        if a=='h':
            b = 's'
        else:
            b = 'h'
        return a if random.uniform(0,100) < p_a else b 
    
    def epsilon(self,action,number):
        return self.myRand(action,number)


    def decision(self,state):
        if self.QTable[state]['h']>=self.QTable[state]['s']:
            l2 = self.epsilon('h',88)
        else:
            l2 = self.epsilon('s',88)
        return l2

    def updateQ(self,state,action,newstate,reward):
        oldQ = self.QTable[state][action]
        newQ = oldQ + 0.8 * ((reward + 0.99*self.getMax(newstate)) - oldQ)
        self.QTable[state][action] = newQ

In [None]:
class fakeBlackjack:
    def __init__(self):
        self.Q = QLearn()
    
    def getCard(self):
        c1 = 'B' if random.uniform(0,1) < 0.66 else 'R' 
        # cardType = ['R', 'B', 'B']
        cards = [i for i in range(1,11)]
        # c1 = random.choice(cardType)
        c2 = random.choice(cards)
        return c1,c2
    
    def getFirstCard(self):
        cards = [i for i in range(1,11)]
        return random.choice(cards)          

    def play(self):    
        aiScore = 0
        computerScore = 0
        aiAlive = True
        computerAlive = True

        # 1. Both agent and computer draw card
        aiScore += self.getFirstCard()
        # print('Agent draws black card of '+str(aiScore))
        computerScore += self.getFirstCard()
        # print('Computer draws black card of '+str(computerScore))

        # 2. Agent plays first
        
        while aiAlive:
            oldState = aiScore
            decision = self.Q.decision(aiScore)
            if decision == 'h':
                oldState = aiScore
                cardType,cardVal = self.getCard()
                # print('Agent draws ' + cardType + str(cardVal))
                if cardType == 'B':
                    aiScore += cardVal
                else:
                    aiScore -= cardVal
                # self.updateScore('ai', cardType, cardVal)
                newstate = aiScore
                if aiScore>21 or aiScore<1:
                    reward = -1
                    aiAlive = False
                else:
                    reward = 0
                self.Q.updateQ(oldState, decision, newstate,reward)
                # print('Agent score is ' + str(aiScore))
            else:
                break
        else:
            return 'C'
        
        # 3. Agent score gets recorded 
        # 4. Now computer will play
        while computerAlive:
            oldState = computerScore
            if oldState<17:
                decision = 'h'
            else:
                decision = 's'
            if decision == 'h':
                cardType,cardVal = self.getCard()
                if cardType == 'B':
                    computerScore += cardVal
                else:
                    computerScore -= cardVal
                newstate = computerScore
                if computerScore>21 or computerScore<1:
                    reward = 1
                    computerAlive = False
                else:
                    reward = 0
                self.Q.updateQ(oldState, decision, newstate,reward)
            
            else:
                # The dealer sticks, scores are compared now!
                if aiScore>computerScore:
                    self.Q.updateQ(aiScore, 's', 30, 1)
                    return 'A'
                elif aiScore==computerScore:
                    self.Q.updateQ(aiScore, 's', 30, 0)
                    return 'D'
                else:
                    self.Q.updateQ(aiScore, 's', 30, -1)            
                    return 'C'

        else:
            self.Q.updateQ(aiScore, 's', 30,1)
            return 'A'

    def train(self, number):
        count = 0
        for i in range(number):
            outcome = self.play()
            if outcome == 'A':
                count +=1
        return count

In [8]:
scores = []  
x = fakeBlackjack()  
win = x.train(10000)
reward = 0  
for i in range(1000):
    result = x.play()
    if result == 'A':
        reward-=1
    elif result=='C':
        reward+=1

print('Average reward over 1000 games is ' + str(reward/1000))

Average reward over 1000 games is 0.251
