In [58]:
import copy
import numpy as np

# Game environment
Variables and functions that's needed to get the game running. I tried to stay close to the format that was originally used by Saskia (and thus close to the framework from Deepmind).

In [96]:
# Set game
NUM_PLAYER = 3
NUM_HAND = 2
# omit offset=0 as you can't give hints to yourself
TARGET_OFFSETS = list(range(1,NUM_PLAYER))

# colors
GREEN = 0
YELLOW = 1
WHITE = 2
BLUE = 3
RED = 4
ALL_COLORS = [GREEN, YELLOW, WHITE, BLUE, RED]

# ranks
ONE = 0
TWO = 1
THREE = 2
FOUR = 3
FIVE = 4
ALL_RANKS = [ONE, TWO, THREE, FOUR, FIVE]

# intention
PLAY = 0
DISCARD = 1
KEEP = 2


# action
ACTION_TYPES = ['PLAY', 'DISCARD', 'REVEAL_COLOR', 'REVEAL_RANK', 'DEAL', 'INVALID']
# dictionary with all possible values for all possible actions 
possible_actions_lexicon = {'action_type': ACTION_TYPES,'target_offset': TARGET_OFFSETS,
                            'color': ALL_COLORS, 'rank': ALL_RANKS, 
                            'player_index': list(range(NUM_PLAYER)),'card_index': list(range(NUM_HAND))}

# misc.
HINT_MAX = 7
LIFE_MAX = 3
COUNT = [3,2,2,2,1]


def init_context(num_player, num_card):
    '''
    initalize the game
    '''
    return {'life_tokens': LIFE_MAX, # game ends if ==0
            'information_tokens': HINT_MAX,
            # top of played cards 0-index based (-1 means no card for the color played)
            'fireworks': {GREEN: -1, YELLOW: -1, WHITE: -1, BLUE: -1, RED: -1},  
            # trashed/fasely played in forms of e.g. {'color':BLUE, 'rank':ONE}
            'discard_pile': [],  
            # num.instance for [player][card][color][rank]
            'knowledge': [[[COUNT for col in ALL_COLORS]
                           for card in range(num_card)] for plyr in range(num_player)],
            # possible actions: HINT_COLOR, HINT_NUMBER, PLAY, DISCARD
            'last_action': {'action_type':None, 'target_offset':None, 'player_index':None, 'card_index':None, 'color':None, 'rank':None} 
           }


def CardPlayable(card, fireworks):
    '''
    return True if the card can be played
    '''
    # rank should be one high up than the current highest to be played
    if fireworks[card['color']] +1 == card['rank']:
        return True
    else:
        return False

    
def CardUseless(card, fireworks):
    '''
    return True if the card can be surely discarded
    '''
    if fireworks[card['color']] > int(card['rank']):
            return True
    else:
        return False

    
def remaining_copies(card, discard_pile):
    '''
    return number of instance of a given card that is still left in the game
    '''
    if card['rank'] == ONE:  # rank one
        total_copies = 3
    elif card['rank'] == FIVE:  # rank five
        total_copies = 1
    else:
        total_copies = 2
    
    # count how many of the sort given by `card` is discarded
    count = 0
    for discarded in discard_pile:
        col, rank = discarded['color'], discarded['rank']
        if (col == card['color']) and (rank == card['rank']):
            count += 1
    return total_copies - count


def update_game(context, last_action):
    '''
    Adapt context accordingly based on last_action
    '''
    nc = copy.deepcopy(context)
    # update last_action
    nc['last_action'] = copy.deepcopy(last_action)

    # 1) PLAY
    if last_action['action_type'] == 'PLAY':
        # i) update other stuff (hint token, error token, discard piles, fireworks)
        card = {'color':last_action['color'], 'rank':last_action['rank']}
        # if played successfully
        if CardPlayable(card, context['fireworks']):
            # hint token
            if (last_action['rank']  == FIVE) and ( # +1 hint for completing the deck
                nc['information_tokens'] < HINT_MAX): # if hint token isn't full
                nc['information_tokens'] += 1
            # fireworks
            nc['fireworks'][last_action['color']] = last_action['rank']
        # wrong playing
        else:
            # error token
            nc['life_tokens'] -= 1
            # discard piles
            nc['discard_pile'].append(card)
        
        # ii) update knowledge
        # for a player that played the card, any cards newer than played card shift down by 1
        for i in range(last_action['card_index'], NUM_HAND-1):
            # knowledge[player][card]
            nc['knowledge'][last_action['player_index']][i] = context['knowledge'][last_action['player_index']][i+1]
        # draw a new card (highest card index, which is num_card-1)
        nc['knowledge'][nc['last_action']['card_index']][NUM_HAND-1] = [[remaining_copies({'color':col, 'rank':rank}, nc['discard_pile'])
                                                for rank in ALL_RANKS] for col in ALL_COLORS]
                
#     elif nc['last_action']['type'] == 'DISCARD':
# TODO: update for discard, hint etc.
    return nc

# Update
 
1. Factorize  <br>
 $P(i_{\text{total}}|a,c)  \stackrel{i.d.}{=}   \prod_{\text{card}} P(i_{\text{card}}|a,c) $


2. Listener <br>
$P(i|a,c)  =  \sum_{r} p(i,r|a,c)  =  \sum_{r} \dfrac {P(a|i,r,c) P(i,r|c)} {P(a|c)}  =  \dfrac {\sum_{r} P(a|i,r,c) P(i,r|c)} {\sum_{i^*,r^*} P(a|i^*,r^*,c) P(i^*,r^*|c)}  \stackrel{?}{=}  \sum_{r} \dfrac { P(a|i,r,c) P(i|c)P(r|c)} {\sum_{i^*} P(a|i^*,r,c) P(i^*|c)P(r|c)}$
 + iteration over possible realisations are implemented by get_realisations()
 + $P(a|i,r,c)$: comes from speaker
 + $P(i|c)$: prior as we had beforehand, possibly drop c dependence as well
 + $P(r|c)$: compute by fraction of number of instance of the specific realisation
 
 
3. Speaker  <br>
$P(a|r,i,c)  =  \dfrac{P(i,a|r,c)}{\sum_{a^*}P(i,a^*|r,c)}  =  \dfrac {P(i|a,r,c) P(a|r,c)}{\sum_{a^*} P(i|a^*,r,c) P(a^*|r,c)}   \stackrel{(*)}{=}   \dfrac {P(i|r,c_{\text{new}}) P(a|r,c)} {\sum_{a^*} P(i|,r,c_{\text{new}}^*) P(a^*|r,c)}  =  \dfrac {\exp(\alpha U(i|r,c_{\text{new}})) P(a|r,c)} {\sum_{a^*}\exp(\alpha U(i|r,c_{\text{new}}^*)) P(a^*|r,c)}$

 + (*): use update_game() to get new game_state
 + $U(i|r,c_{\text{new}})$: comes from utility
 + $P(a|r,c)$ is $0$ when $a$ is not legal and $\frac{1}{\text{#.action}}$ when $a$ is legal. Since the term is in both numerator and denominator, simply iterating over legal actions is sufficient.


4. Utility <br>
$U(i,r,c_{\text{new}})$

### Notes
- Currently the function update_game() is implemented only for the action_type PLAY. 
- action should contain a key 'player_index' to indicate which player performed the action (or possibly other way of representation in Deepmind environment?)
- realisation here is considered only with a realisation of a single card (at first we thought this doesn't make sense, but the utilty function currently only consider one card anyway)


## 2. Pragmatic listener

### Aux functions

In [89]:
def get_realisations_probs(knowledge, player_index, card_index):
    mylist = []
    for col in ALL_COLORS:
        for rank in ALL_RANKS:
            # realisations that are not possible
            if knowledge[player_index][card_index][col][rank] == 0:
                pass
            else:
                mylist.append(({'color':col, 'rank':rank},  # realisation
                              context['knowledge'][player_index][card_index][realisation['color']][realisation['rank']] /  \
    np.sum(context['knowledge'][player_index][card_index])))  # P(r|c)
    return mylist
                
    
    
def get_intention_prior(intention, context):
    # TODO: currently not context dependent
    return 1/3

### listener

In [49]:
def pragmatic_listener(action, context, player_index, card_index):
    '''
    return a 3 dim simplex
    '''
    # 3 dim simplex with prob for each intention
    probs = []
    
    # compute probability for each intention
    for intention in [PLAY, DISCARD]:  
        numerator = 0
        denominator = 0
        # sum over r
        for r,p in get_realisations_probs(context['knowledge'], player_index, card_index):
            numerator += pragmatic_speaker(action, intention, r, context, player_index, card_index) * \
            get_intention_prior(intention, context) * p
            # sum over i in denom
            for i in [PLAY, DISCARD, KEEP]:
                denominator += pragmatic_speaker(action, i, r, context, player_index, card_index) * \
                get_intention_prior(i, context) * p
        # save the probability
        probs.append(numerator/denominator)
    # P(KEEP|a,c) = 1 - P(PLAY|a,c) - P(DISCARD|a,c)
    probs.append(1 - probs[0] - probs[1])
    return probs
            

## 3. Pragmatic speaker

### Aux functions

In [50]:
# function for creating an instance of an action
def create_action_instance(act_type, info, target_offset, card_index):
    
    # is it a color or rank hint or no hint at all
    if act_type is 'REVEAL_RANK':
        rank_tag = info
        color_tag = None
    elif act_type is 'REVEAL_COLOR':
        color_tag = info
        rank_tag = None
    else:
        color_tag = None
        rank_tag = None
    
    # TODO: include 'player_index'
    action = {'action_type': act_type, 'target_offset': target_offset, 'rank': rank_tag, 'color': color_tag,
             'card_index': card_index}
    return action

# creating all possible actions
# (Note: currently it is not encoded which specific card is hinted at, only the player the hint goes to)
possible_actions = []

# create all play actions
for position in possible_actions_lexicon['card_index']:
    possible_actions.append(create_action_instance(act_type='PLAY', info=None, 
                                                        target_offset=None, card_index=position))

# create all discard actions
for position in possible_actions_lexicon['card_index']:
    possible_actions.append(create_action_instance(act_type='DISCARD', info=None, 
                                                        target_offset=None, card_index=position))      
        
# create all color hints
for offset in possible_actions_lexicon['target_offset']:
        for color in possible_actions_lexicon['color']:
            possible_actions.append(create_action_instance(act_type='REVEAL_COLOR', info=color, 
                                                        target_offset=offset, card_index=None))
            
# create all rank hints
for offset in possible_actions_lexicon['target_offset']:
        for rank in possible_actions_lexicon['rank']:
            possible_actions.append(create_action_instance(act_type='REVEAL_RANK', info=rank, 
                                                        target_offset=offset, card_index=None))
            
print(possible_actions)

current_player = 1

def compute_legal_actions(context):
    legal_actions = []
    
    ####### PLAY ACTIONS ############

    # create all play actions
    for position in possible_actions_lexicon['card_index']:
        legal_actions.append(create_action_instance(act_type='PLAY', info=None, 
                                                        target_offset=None, card_index=position))
    
    ##### DISCARD ACTIONS ############
    
    # discarding a card is not possibly when you have 8 information token
    if context['information_tokens'] is not 8:
        # create all discard actions
        for position in possible_actions_lexicon['card_index']:
            legal_actions.append(create_action_instance(act_type='DISCARD', info=None, 
                                                        target_offset=None, card_index=position))      
    
    ######## HINT ACTIONS ############
    
    # you can only give a hint if there is at least 1 information token
    if context['information_tokens'] is not 0:
        # create all color hints
        
        # TODO: option for not allowing null hints?
        # if so: TODO: seen players loop
        #seen_player = 2
        #if context['estimated_hands'][seen_player][]
        for offset in possible_actions_lexicon['target_offset']:
            for color in possible_actions_lexicon['color']:
                legal_actions.append(create_action_instance(act_type='REVEAL_COLOR', info=color, 
                                                        target_offset=offset, card_index=None))
            
        # create all rank hints
        for offset in possible_actions_lexicon['target_offset']:
            for rank in possible_actions_lexicon['rank']:
                    legal_actions.append(create_action_instance(act_type='REVEAL_RANK', info=rank, 
                                                        target_offset=offset, card_index=None))
    return legal_actions
            
print(compute_legal_actions(context = init_context(num_player=NUM_PLAYER, num_card=NUM_HAND)))

[{'action_type': 'PLAY', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 0}, {'action_type': 'PLAY', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 1}, {'action_type': 'PLAY', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 2}, {'action_type': 'PLAY', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 3}, {'action_type': 'PLAY', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 4}, {'action_type': 'DISCARD', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 0}, {'action_type': 'DISCARD', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 1}, {'action_type': 'DISCARD', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 2}, {'action_type': 'DISCARD', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 3}, {'action_type': 'DISCARD', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 4}, {'action_type': 'REVEAL_COLOR', 'target_offs

  if act_type is 'REVEAL_RANK':
  elif act_type is 'REVEAL_COLOR':
  if context['information_tokens'] is not 8:
  if context['information_tokens'] is not 0:


### speaker

In [100]:
def pragmatic_speaker(action, intention, realisation, context, player_index, card_index):
    '''
    return a scala which is P(action|intention,realisation,context)
    '''
    # TODO: adjust rationality parameter dynamically
    alpha = 1
    
    # compute numerator
    numerator = np.exp(alpha * utility(intention, realisation, 
                                       # compute new context
                                       update_game(context, action), 
                                       player_index, card_index))
    
    # compute denominator
    denominator = 0
    # automatically only select actions with P(a*|r,c) != 0
    for a in compute_legal_actions(context):
        print('legal action:', a)
        denominator += np.exp(alpha * utility(intention, realisation, 
                                              # this is how different actions makes a difference
                                              update_game(context, a), 
                                              player_index, card_index))
    
    return numerator/denominator


## 4. Utility
The utility as it is coded currently obviously doesn't fit with our idea. Currently, the utility function only takes intention and realisation as input. However, it should take newly updated knowledge into account as well. This means in terms of code, context['knowledge'] isn't used below.

In [60]:
def utility(intention, card, context, player_index, card_index):
    '''
    return a utility for a given card (decided by card_index, player_index) of a given realisation 
    from various realisations
    '''
    score = 0

    if intention == PLAY:
        # in intention is play and card is playable, 
        # this results in one more card on the fireworks.
        # reward this.
        if CardPlayable(card, context['fireworks']):
            score += 10

        # if intention is play and card is not playable at the time
        else:
            # punish loosing a card from stack
            score -= 1
            # and punish getting a bomb depending on the number of bombs
            if context['life_tokens'] == 3:
                score -= 1
            elif context['life_tokens'] == 2:
                score -= 3
            elif context['life_tokens'] == 1: # game would end directly
                score -= 25

            # if card would still have been relevant in the future,
            # punish loosing it depending on the remaining copies of this card in the deck
            if not CardUseless(card, context['fireworks']):
                if remaining_copies(card, context['discard_pile']) == 2:
                    score -= 1
                elif remaining_copies(card, context['discard_pile']) == 1:
                    score -= 2
                elif remaining_copies(card, context['discard_pile']) == 0:
                    score -= 5


    elif intention == DISCARD:
        # punish loosing a card from stack
        score -= 1

        # reward gaining a hint token:
        score += 0.5

        # punish discarding a playable card
        if CardPlayable(card, context['fireworks']):
            score -= 5

        # if card is not playable right now but would have been relevant in the future, punish
        # discarding it depending on the number of remaining copies in the game
        elif not CardUseless(card, context['fireworks']):
            if remaining_copies(card, context['discard_pile']) == 2:
                score -= 1
            elif remaining_copies(card, context['discard_pile']) == 1:
                score -= 2
            elif remaining_copies(card, context['discard_pile']) == 0:
                score -= 5

        # do we want to reward this additionally? I think rewarding gaining a hint token should be
        # enough, so nothing happens here
        elif CardUseless(card, context['fireworks']):
            pass

    elif intention == KEEP:
        # keeping a playable card is punished, because it does not help the game
        if CardPlayable(card, context['fireworks']):
            score -= 2

        # if card is not playable right now but is relevant in the future of the game reward keeping
        # this card depending on the remaining copies in the game
        elif not CardUseless(card, context['fireworks']):
            if remaining_copies(card, context['discard_pile']) == 2:
                score += 1
            elif remaining_copies(card, context['discard_pile']) == 1:
                score += 2
            elif remaining_copies(card, context['discard_pile']) == 0:
                score += 5

        # punish keeping a useless card
        elif CardUseless(card, context['fireworks']):
            score -= 1

    return score

### Example
the function computing legal_action() doesn't seems to be working correctly as it produces action PLAY without any color and rank information

In [101]:
situation1 = {'life_tokens': 1,
             'information_tokens': 7,
             'fireworks': {0: -1, 1: -1, 2: -1, BLUE: -1, 4: -1},
             'discard_pile': [{'color': BLUE, 'rank': TWO}, {'color': RED, 'rank': FOUR}],
             'knowledge': [[[[3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 1, 2, 2, 1],
                                [3, 2, 2, 1, 1]],
                                # let's say this card was BLUE ONE
                               [[3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 1, 2, 2, 1],
                                [3, 2, 2, 1, 1]]],
                               #
                              [[[3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 1, 2, 2, 1],
                                [3, 2, 2, 1, 1]],
                               [[3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 1, 2, 2, 1],
                                [3, 2, 2, 1, 1]]],
                              [[[3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 2, 2, 2, 1],
                                [3, 1, 2, 2, 1],
                                [3, 2, 2, 1, 1]],
                               # let's say that (2,1) card is fixed to be BLUE TWO, which should be kept
                               [[0, 0, 0, 0, 0],
                                [0, 0, 0, 0, 0],
                                [0, 0, 0, 0, 0],
                                [0, 1, 0, 0, 0],
                                [0, 0, 0, 0, 0]]]]}

# in fact this is the only possible realisation
r1 = {'color':BLUE, 'rank':TWO}

a1 =  {'action_type':'PLAY', 'target_offset':0, 'color':BLUE, 'rank':ONE, 'player_index':0 ,'card_index':1}

pragmatic_speaker(action=a1, intention=PLAY, realisation=r1, context=situation1, player_index=2, card_index=1)

legal action: {'action_type': 'PLAY', 'target_offset': None, 'rank': None, 'color': None, 'card_index': 0}


KeyError: None