In [1]:
import sys
sys.path.insert(0, './utils')

import numpy as np
import random
import itertools
import matplotlib.pyplot as plt

from tools import *

# Let's play one against the other 

In [2]:
cards = ["Ace", "King", "Queen", "Jack", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
signs = ["Club", "Diamond", "Heart", "Spade"]
initialW = 100
n = 7
m = 11
it = int(5*1e3)
lr = .5
exp = .4
alpha = 3.
decay = 1.
maxBet = 10
retreat = 1
parameters = [[1, 1] for i in range(maxBet)]

In [3]:
class Cards: 
    
    def __init__(self, firstPlayer, secondPlayer):
        self.cards = 4*list(range(n, m+1))
        self.firstPlayer = firstPlayer
        self.secondPlayer = secondPlayer
        self.turn = random.randint(0, 1)
        
    def deal(self):
        random.shuffle(self.cards)
        self.turn = 1 - self.turn
        self.firstPlayer.hand = [self.cards[0], self.cards[2]]
        self.firstPlayer.sum = sum(self.firstPlayer.hand)
        self.secondPlayer.hand = [self.cards[1], self.cards[3]]
        self.secondPlayer.sum = sum(self.secondPlayer.hand)
        
    def restart(self):
        self.secondPlayer.reset()
        self.firstPlayer.reset()
        self.turn = random.randint(0, 1)
        
    def reward(self, first, second, trainable):
        first.update(second, trainable)
        second.update(first, trainable)
            
    def play(self, first, second, trainable):
        
        first.lastAction = None
        actions = list(range(1, min(maxBet, first.worth, second.worth)+1))
        first.action = first.chooseAction(actions, None)    
        
        second.lastAction = None
        actions = list(range(first.action, min(maxBet, first.worth, second.worth)+1))
        actions.append(0)
        second.action = second.chooseAction(actions, first.action)

        while first.action*second.action > 0 and first.action != second.action:
            
            first.lastAction = first.action
            actions = list(range(second.action , min(maxBet, first.worth, second.worth)+1))
            actions.append(0)
            first.action = first.chooseAction(actions, second.action)
            
            if first.action > 0 and second.action != first.action:
                
                second.lastAction = second.action
                actions = list(range(first.action, min(maxBet, first.worth, second.worth)+1))
                actions.append(0)
                second.action = second.chooseAction(actions, first.action)

        self.reward(first, second, trainable)

    def train(self, rounds = 10, trainable = True):

        score = 0
        for _ in range(rounds):
            self.restart()
            while self.firstPlayer.worth * self.secondPlayer.worth > 0:
                self.deal()
                if self.turn:
                    self.play(self.firstPlayer, self.secondPlayer, trainable)
                else:
                    self.play(self.secondPlayer, self.firstPlayer, trainable)
            score += (self.firstPlayer.worth > 0)
            
        return score
    
    def human(self, results = True, trainable = False):

        self.restart()
        while self.firstPlayer.worth * self.secondPlayer.worth > 0:
            self.deal()
            if self.turn:
                self.play(self.firstPlayer, self.secondPlayer, trainable)
            else:
                self.play(self.secondPlayer, self.firstPlayer, trainable)
            
            if results:
                print("-------------------------")
                print("The {0} hand and last action were: {1}-{2}".format(self.secondPlayer.name, self.secondPlayer.hand, self.secondPlayer.action))
                print("-------------------------")
                print("The {0} hand and last action were: {1}-{2}".format(self.firstPlayer.name, self.firstPlayer.hand, self.firstPlayer.action))
                print("-------------------------")
                print("Current Net Worth of {0}: {1} - Current Net Worth of {2}: {3}".format(self.firstPlayer.name, self.firstPlayer.worth, self.secondPlayer.name, self.secondPlayer.worth))
                print("=========================")
            
        if self.secondPlayer.worth > 0:
            print("\n And the final winner is {0}! :)".format(self.secondPlayer.name))
        else:
            print("\n And the final winner is {0}! :>".format(self.firstPlayer.name))

In [4]:
class Player:
    
    def __init__(self, name = "Player", lr = lr, exp = exp):
        self.worth = initialW
        self.statDict = {}
        self.lr = lr
        self.exp = exp
        self.name = name
        
    def reset(self):
        self.worth = initialW
        
    def chooseAction(self, actions, advAction):
        
        if advAction is None:
            return random.choice(actions)
        
        elif random.random() < exp:
            return random.choice(actions)

        action = None
        vmax = - 1e10
        state = str(self.sum) + "-" + str(advAction)
        for act in actions:
            current = 0 if self.statDict.get(state) is None or self.statDict.get(state).get(str(act)) is None else self.statDict.get(state).get(str(act))
            if vmax < current:
                vmax = current
                action = act
                    
            return action
        
    def update(self, adverser, trainable):
        if trainable:
            if adverser.action > 0:
                if self.lastAction is None:
                    state = str(self.sum) + "-" + str(adverser.action)
                else:
                    state = str(self.sum) + "-" + str(adverser.action) + "-" + str(self.lastAction)
                    
                if self.statDict.get(state) is None:
                    self.statDict[state] = {}
                if self.statDict.get(state).get(str(self.action)) is None:
                    self.statDict[state][str(self.action)] = 0

                if self.action == adverser.action:
                    if self.sum > adverser.sum:
                        self.statDict[state][str(self.action)] += self.lr*(  self.action - self.statDict[state][str(self.action)])
                    elif self.sum < adverser.sum:
                        self.worth -= self.action
                        self.statDict[state][str(self.action)] += self.lr*(- self.action - self.statDict[state][str(self.action)])

                elif self.action == 0:
                    if self.lastAction is None:
                        self.worth -= retreat
                        self.statDict[state][str(self.action)] += self.lr*(- retreat - self.statDict[state][str(self.action)])

                    else:
                        self.worth -= self.lastAction
                        self.statDict[state][str(self.action)] += self.lr*(- self.lastAction - self.statDict[state][str(self.action)])
        
        else:
            if self.action == adverser.action and self.sum < adverser.sum:
                self.worth -= self.action
            elif self.action == 0:
                if self.lastAction is None:
                    self.worth -= retreat
                else:
                    self.worth -= self.lastAction

In [5]:
class HumanPlayer(Player):
    
    def __init__(self, name = "HumanPlayer"):
        self.name = name
        
    def chooseAction(self, actions, advAction):
        print("State: ", self.hand)
        
        if advAction is None: pass
        else:
            print("Adverser played: ", advAction)
            
        action = int(input("Input your action: ")) 
        
        while action not in actions:
            print("Try a number in ", actions)
            action = int(input("Input your action: "))  
            
        return action
    
    def update(self, adverser, trainable):
        if self.action == adverser.action and self.sum < adverser.sum:
            self.worth -= self.action
        elif self.action == 0:
            if self.lastAction is None:
                self.worth -= retreat
            else:
                self.worth -= self.lastAction

In [6]:
## TO TRAIN A BASIC SIMPLE MODEL YOU CAN USE, IT IS NOT VERY SMART
# firstPlayer = Player()
# secondPlayer = Player()
# cards = Cards(firstPlayer, secondPlayer)
# cards.train(it)

In [7]:
# TO PLAY AGAINST A VIRTUAL PLAYER, YOU CAN UNCOMMENT THE FOLLOWING LINES AFTER TRAINING THE AGENT
# firstPlayer.reset()
# human = HumanPlayer()
# cards = Cards(firstPlayer, human)
# cards.human()

In [8]:
class GreedyPlayer(Player):
    
    def __init__(self, name = "GreedyPlayer"):
        self.worth = initialW
        self.name = name
        
    def chooseAction(self, actions, advAction):
        if advAction is None:
            return random.choice(actions)
        if self.sum > m + n:
            return advAction
        return 0
    
    def update(self, adverser, trainable):
        if self.action == adverser.action and self.sum < adverser.sum:
            self.worth -= self.action
        elif self.action == 0:
            if self.lastAction is None:
                self.worth -= retreat
            else:
                self.worth -= self.lastAction

In [9]:
## TO TRAIN A BASIC SIMPLE MODEL YOU CAN USE, IT IS NOT VERY SMART
# firstPlayer = GreedyPlayer()
# secondPlayer = Player()
# cards = Cards(firstPlayer, secondPlayer)
# cards.train(it)

In [10]:
# TO PLAY AGAINST A VIRTUAL PLAYER, YOU CAN UNCOMMENT THE FOLLOWING LINES AFTER TRAINING THE AGENT
# human = HumanPlayer()
# cards = Cards(firstPlayer, human)
# cards.human()

# Thompson Sampling

In [11]:
class ThompsonSampling(Player):
    def __init__(self, parameters = parameters, name = "Thompson", exp = exp):
        self.exp = exp
        self.name = name
        self.parameters = parameters
        self.nbDraws = np.zeros((2*(m-n) + 1, maxBet + 1))
        self.cumRewards = np.zeros((2*(m-n) + 1, maxBet + 1))

    def chooseAction(self, actions, advAction):
        if advAction is None:
            return random.choice(actions)
        
        elif random.random() < self.exp:
            return random.choice(actions)
        
        """
        hand, action = state.split("-")
        cards = [int(card) for card in hand[1:-1].split(",")]
        action = int(action)
        
        estimated = random.betavariate(self.parameters[action-1][0], self.parameters[action-1][1])
        estimated = 2*n + int(2*(m-n)*estimated)
    
        if sum(cards) >= estimated: 
            return int(action)
        return 0 
        """
        action = advAction
        
        expression = np.zeros(maxBet + 1)
        for i in range(len(actions)):
            act = actions[i]
            expression[act] = np.random.beta(self.parameters[action - 1][0] + self.cumRewards[self.sum-2*n, act],
                                             self.parameters[action - 1][1] + self.nbDraws[self.sum-2*n, act] - self.cumRewards[self.sum-2*n, act])
        return randmax(expression)
        
    def update(self, adverser, trainable):
        if trainable:
            if self.action == adverser.action:
                if self.sum > adverser.sum:
                    self.parameters[adverser.action - 1][0] += 0.5
                elif self.sum < adverser.sum:
                    self.worth -= adverser.action
                    self.parameters[adverser.action - 1][1] += 0.5
                    
            elif self.action == 0:
                if self.lastAction is None:
                    self.worth -= retreat
                else:
                    self.worth -= self.lastAction
                self.parameters[adverser.action - 1][1] += 0.5
            
        else:
            if self.action == adverser.action and self.sum < adverser.sum:
                self.worth -= self.action
            elif self.action == 0:
                if self.lastAction is None:
                    self.worth -= retreat
                else:
                    self.worth -= self.lastAction

In [12]:
# firstPlayer = ThompsonSampling()
# secondPlayer = Player()
# cards = Cards(firstPlayer, secondPlayer)
# cards.train(it, bandit = True)

In [13]:
# cards = Cards(firstPlayer, HumanPlayer())
# cards.human(human = True, bandit = True, result = True)

# UCB Sampling

In [14]:
class UCB(Player):
    def __init__(self, alpha = alpha, name = "UCB", exp = exp):
        self.exp = exp
        self.name = name
        self.alpha = alpha
        self.nbDraws = np.zeros((2*(m-n) + 1, maxBet + 1))
        self.cumRewards = np.zeros((2*(m-n) + 1, maxBet + 1))

    def chooseAction(self, actions, advAction):
        
        if random.random() < self.exp:
            final = random.choice(actions)
            self.cumRewards[self.sum-2*n, final] += 1
            return final
        
        expression = np.zeros(maxBet + 1)
        calls = np.sum(self.nbDraws)
        
        for i in range(len(actions)):
            act = actions[i]
            if self.nbDraws[self.sum-2*n, act] < 1: expression[act] = np.inf
            else:
                expression[act] = self.cumRewards[self.sum-2*n, act]/self.nbDraws[self.sum-2*n, act] + np.sqrt((self.alpha*np.log(calls+1))/self.nbDraws[self.sum-2*n, act])
        
        final = randmax(expression)
        self.cumRewards[self.sum-2*n, final] += 1
        return final
        
    def update(self, adverser, trainable): 
        if trainable:
            if self.action == adverser.action:
                if self.sum > adverser.sum:
                    self.cumRewards[self.sum-2*n, adverser.action] += adverser.action
                elif self.sum < adverser.sum:
                    self.worth -= adverser.action
                    self.cumRewards[self.sum-2*n, adverser.action] -= adverser.action
            elif self.action == 0:
                if self.lastAction is None:
                    self.worth -= retreat
                else:
                    self.worth -= self.lastAction
                self.cumRewards[self.sum-2*n, adverser.action] -= adverser.action
        else:
            if self.action == adverser.action and self.sum < adverser.sum:
                self.worth -= adverser.action
            elif self.action == 0:
                if self.lastAction is None:
                    self.worth -= retreat
                else:
                    self.worth -= self.lastAction

In [15]:
# firstPlayer = UCB()
# secondPlayer = Player()
# cards = Cards(firstPlayer, secondPlayer)
# cards.train(it, bandit = True)

In [16]:
# cards = Cards(firstPlayer, HumanPlayer())
# cards.human(human = True, bandit = True, result = True)

# Who's better

## TRAIN

In [17]:
firstPlayer = Player(lr = .1, exp = .1)
secondPlayer = Player(lr = .9, exp = 0)
cards = Cards(firstPlayer, secondPlayer)
cards.train(it)

2537

In [18]:
greedyPlayer = GreedyPlayer()
cards = Cards(firstPlayer, greedyPlayer)
cards.train(it)

1402

In [19]:
UCBPlayer = UCB()
cards = Cards(firstPlayer, UCBPlayer)
cards.train(it)

2766

In [20]:
ThompsonPlayer = ThompsonSampling()
cards = Cards(firstPlayer, ThompsonPlayer)
cards.train(it)

2709

## EVALUATE

In [21]:
cards = Cards(greedyPlayer, ThompsonPlayer)
cards.train(it, trainable = False)

3433

In [22]:
cards = Cards(greedyPlayer, UCBPlayer)
cards.train(it, trainable = False)

3420

In [23]:
cards = Cards(UCBPlayer, ThompsonPlayer)
cards.train(it, trainable = False)

2459

In [24]:
cards = Cards(ThompsonPlayer, UCBPlayer)
cards.train(it, trainable = False)

2538

## INFO

# Who's better Zayed or the computer ?

In [25]:
cards = Cards(GreedyPlayer(), HumanPlayer())
cards.human(results = True)