In [1]:
from IPython.display import HTML
HTML('''<style>.CodeMirror{min-width:100% !important;}</style>''')

In [2]:
import copy

import numpy as np
#first some imports
import torch
torch.set_default_dtype(torch.float64)  # double precision for numerical stability

import matplotlib.pyplot as plt

import pyro
import pyro.distributions as dist
import pyro.poutine as poutine

from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV

# Game environment
Variables and functions that are needed to get the game running. I tried to stay close to the format originally used by Saskia (and thus close to the DeepMind framework).

In [29]:
# Card colors, encoded as consecutive 0-based integers
GREEN, YELLOW, WHITE, BLUE, RED = range(5)
ALL_COLORS = [GREEN, YELLOW, WHITE, BLUE, RED]

# Card ranks, 0-based (ONE == 0, ..., FIVE == 4)
ONE, TWO, THREE, FOUR, FIVE = range(5)
ALL_RANKS = [ONE, TWO, THREE, FOUR, FIVE]

# Intentions that can be attached to a card
PLAY, DISCARD, KEEP = range(3)

# Token limits
HINT_MAX, LIFE_MAX = 7, 3
# Number of instances per rank (index = rank); used as the initial knowledge row of every color
COUNT = [3, 2, 2, 2, 1]


def init_context(num_player, num_card):
    '''
    Initialize the game state (context).

    :param num_player: int, number of players
    :param num_card: int, number of cards in each player's hand
    :returns: dict with the keys 'life_tokens', 'information_tokens', 'fireworks',
              'discard_pile', 'hand', 'knowledge' and 'last_action'
    '''
    return {'life_tokens': LIFE_MAX,  # game ends if ==0
            'information_tokens': HINT_MAX,
            # top of played cards 0-index based (-1 means no card for the color played)
            'fireworks': {GREEN: -1, YELLOW: -1, WHITE: -1, BLUE: -1, RED: -1},
            # trashed/falsely played cards in the form of e.g. {'color':BLUE, 'rank':ONE}
            'discard_pile': [],
            # specific realisation allowed by knowledge, used for update step
            'hand': [[{'color': None, 'rank': None} for _ in range(num_card)]
                     for _ in range(num_player)],
            # num. instances for [player][card][color][rank].
            # Every row is its own copy of COUNT: sharing the single COUNT object
            # would make an in-place edit of one entry show up in every card of
            # every player (and in COUNT itself).
            'knowledge': [[[list(COUNT) for _ in ALL_COLORS]
                           for _ in range(num_card)] for _ in range(num_player)],
            # possible actions: HINT_COLOR, HINT_NUMBER, PLAY, DISCARD
            'last_action': {'type': None, 'pnr': None, 'cnr': None, 'color': None, 'rank': None}
           }


def CardPlayable(card, fireworks):
    '''
    Tell whether `card` can be played right now.

    :param card: dict with keys 'color' and 'rank'
    :param fireworks: dict mapping color -> highest rank already played (-1 if none)
    :returns: True if the card's rank is exactly one above the top of its color's firework
    '''
    next_rank = fireworks[card['color']] + 1
    return bool(next_rank == card['rank'])

    
def CardUseless(card, fireworks):
    '''
    Tell whether `card` can surely be discarded.

    :param card: dict with keys 'color' and 'rank'
    :param fireworks: dict mapping color -> highest rank already played (-1 if none)
    :returns: True if the card's rank is already on its color's firework
    '''
    # `fireworks` stores the top rank that HAS been played, so a card whose rank
    # equals that value is already covered too; hence >= rather than >.
    return bool(fireworks[card['color']] >= int(card['rank']))

    
def remaining_copies(card, discard_pile):
    '''
    Count how many instances of `card` are not in the discard pile.

    :param card: dict with keys 'color' and 'rank'
    :param discard_pile: list of card dicts (same format as `card`)
    :returns: total copies of this rank (3 for ONE, 1 for FIVE, 2 otherwise)
              minus the number of identical cards in `discard_pile`
    '''
    wanted = (card['color'], card['rank'])

    # how many copies of this rank exist per color
    if card['rank'] == ONE:
        total = 3
    elif card['rank'] == FIVE:
        total = 1
    else:
        total = 2

    # identical cards that have already been discarded
    gone = sum(1 for d in discard_pile if (d['color'], d['rank']) == wanted)
    return total - gone


def update_game(context, last_action):
    '''
    Return a new context that reflects `last_action`; `context` itself is left untouched.

    Only actions of type 'PLAY' change the game state so far; any other action
    is merely recorded under 'last_action'.

    :param context: dict, game state in the format produced by init_context
    :param last_action: dict with keys 'type', 'pnr' (player index), 'cnr' (card index),
                        'color' and 'rank'
    :returns: dict, updated deep copy of `context`
    '''
    nc = copy.deepcopy(context)
    # update last_action
    nc['last_action'] = copy.deepcopy(last_action)

    # 1) PLAY
    if last_action['type'] == 'PLAY':
        # i) update other stuff (hint token, error token, discard pile, fireworks)
        card = {'color': last_action['color'], 'rank': last_action['rank']}
        if CardPlayable(card, context['fireworks']):
            # +1 hint for completing a firework, unless the hint tokens are already full
            if last_action['rank'] == FIVE and nc['information_tokens'] < HINT_MAX:
                nc['information_tokens'] += 1
            nc['fireworks'][last_action['color']] = last_action['rank']
        else:
            # wrong play: lose a life token, the card goes to the discard pile
            nc['life_tokens'] -= 1
            nc['discard_pile'].append(card)

        # ii) update knowledge
        # The hand size comes from the context itself rather than from a notebook
        # global, and only the deep copy is edited, so nothing in the result
        # aliases the caller's `context`.
        player_knowledge = nc['knowledge'][last_action['pnr']]
        # cards newer than the played one shift down by one slot ...
        del player_knowledge[last_action['cnr']]
        # ... and the newly drawn card takes the highest index
        player_knowledge.append(
            [[remaining_copies({'color': col, 'rank': rank}, nc['discard_pile'])
              for rank in ALL_RANKS] for col in ALL_COLORS])

    # TODO: update for discard, hint etc.
    return nc

### Demonstrate usage

In [4]:
# game with 3 players, each holding 5 cards
num_player = 3 
num_card = 5

# initial game state: nothing played or discarded, every card still fully unknown
context = init_context(num_player, num_card)
context

{'life_tokens': 3,
 'information_tokens': 7,
 'fireworks': {0: -1, 1: -1, 2: -1, 3: -1, 4: -1},
 'discard_pile': [],
 'hand': [[{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}]],
 'knowledge': [[[[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2

In [5]:
# Player 0 (pnr=0) misplays BLUE TWO: no BLUE ONE is on the fireworks yet,
# so the play costs a life token and the card lands on the discard pile
context = copy.deepcopy(update_game(context, {'type':'PLAY', 'pnr':0, 'cnr':0, 'color':BLUE, 'rank':TWO}) )
context

{'life_tokens': 2,
 'information_tokens': 7,
 'fireworks': {0: -1, 1: -1, 2: -1, 3: -1, 4: -1},
 'discard_pile': [{'color': 3, 'rank': 1}],
 'hand': [[{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}]],
 'knowledge': [[[[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3,

In [6]:
# Player 1 (pnr=1) correctly plays BLUE ONE, which starts the BLUE firework
context = copy.deepcopy(update_game(context, {'type':'PLAY', 'pnr':1, 'cnr':0, 'color':BLUE, 'rank':ONE}) )
context

{'life_tokens': 2,
 'information_tokens': 7,
 'fireworks': {0: -1, 1: -1, 2: -1, 3: 0, 4: -1},
 'discard_pile': [{'color': 3, 'rank': 1}],
 'hand': [[{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}]],
 'knowledge': [[[[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 

# Update
After each action, intention, i.e. what should be done with the card, for each hand of each player will be updated. We assume that all agents share the same intention distribution. The computational process of updating is described from bottom to top below.

### *Variables
i: **intention** distribution for each card. Describes the common belief about what should be done with each card.  <br>
e.g.) [PLAY:0.7, DISCARD:0.25, KEEP:0.05] for 2nd card of my hand 

a: **action** that just happened. <br>
e.g.) Player 3 discarded their 2nd card

c: **context** which includes trash piles, played cards (top board), # hint tokens, # error tokens and knowledge structure of different players. <br>
e.g.) firework=[R:0,G:0,Y:1,B:3,W:0], trash=[R1,B1], errorcoin=2, knowledge[player][card][color][rank] = 2 representing number of instance for that specific combination of color and rank <br>

r: a single **specific** realisation, identical with context up to hands of each player. <br>
e.g.) top board=[R:0,G:0,Y:1,B:3,W:0], trash=[R1,B1], errorcoin=2, hand[player][card] = (Blue,2) representing a specific realisation for each hand of each player

In [7]:
# Example of what the context looks like at this point (after the two plays above)
context

{'life_tokens': 2,
 'information_tokens': 7,
 'fireworks': {0: -1, 1: -1, 2: -1, 3: 0, 4: -1},
 'discard_pile': [{'color': 3, 'rank': 1}],
 'hand': [[{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}],
  [{'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None},
   {'color': None, 'rank': None}]],
 'knowledge': [[[[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1],
    [3, 2, 2, 2, 1]],
   [[3, 

### 4. Utility function (hand-crafted)
- $U(i,r)$
- Hand-crafted utilities suggested by Saskia
- Compute utility based on a given realisation and a single intention type (PLAY, DISCARD, KEEP)

In [30]:
def utility(intention, realisation, player_index, card_index):
    '''
    Hand-crafted utility U(i, r) of attaching `intention` to one card of one realisation.

    :param intention: PLAY, DISCARD or KEEP
    :param realisation: dict with at least 'hand', 'fireworks', 'life_tokens' and 'discard_pile'
    :param player_index: int, 0-based index of the player holding the card
    :param card_index: int, 0-based index of the card in that player's hand
    :returns: numeric score (0 for an unknown intention)
    '''
    card = copy.deepcopy(realisation['hand'][player_index][card_index])  # the card under consideration
    fireworks = realisation['fireworks']

    def scarcity():
        # weight of a still-relevant card by its remaining copies:
        # 2 copies -> 1, 1 copy -> 2, 0 copies -> 5, anything else -> 0
        left = remaining_copies(card, realisation['discard_pile'])
        return {2: 1, 1: 2, 0: 5}.get(left, 0)

    if intention == PLAY:
        # a successful play puts one more card on the fireworks: reward it
        if CardPlayable(card, fireworks):
            return 10
        # failed play: -1 for losing the card, plus a penalty that grows as
        # life tokens run out (with 1 token left the game would end directly)
        score = -1 - {3: 1, 2: 3, 1: 25}.get(realisation['life_tokens'], 0)
        # losing a card that would still have been needed later costs extra
        if not CardUseless(card, fireworks):
            score -= scarcity()
        return score

    if intention == DISCARD:
        # -1 for losing a card from the stack, +0.5 for gaining a hint token
        score = 0 - 1 + 0.5
        if CardPlayable(card, fireworks):
            # discarding a playable card is punished
            score -= 5
        elif not CardUseless(card, fireworks):
            # discarding a card that is still needed later is punished by scarcity
            score -= scarcity()
        # discarding a useless card gets no extra reward: the hint token is enough
        return score

    if intention == KEEP:
        # keeping a playable card does not help the game
        if CardPlayable(card, fireworks):
            return -2
        # keeping a card that is needed later is rewarded by scarcity
        if not CardUseless(card, fireworks):
            return scarcity()
        # keeping a useless card is punished
        return -1

    return 0

example

In [35]:
# assume there are only two possible realisations; they differ only in
# card 0 of player 2 (BLUE ONE vs. BLUE FIVE)
ex_r_set = [{'life_tokens': 3,
             'information_tokens': 7,
             # BLUE ONE is playable
             'fireworks': {RED: THREE, YELLOW: TWO, GREEN: FOUR, WHITE: -1, BLUE: -1},
             'discard_pile': [{'color':BLUE,'rank':ONE}, {'color':BLUE,'rank':ONE}, {'color':BLUE,'rank':THREE}],
             'hand': [[{'color':BLUE,'rank':TWO}, {'color':RED,'rank':ONE}], 
                     [{'color':BLUE,'rank':FOUR}, {'color':YELLOW,'rank':ONE}],
                      # BLUE ONE is the realisation that is selected
                     [{'color':BLUE,'rank':ONE}, {'color':WHITE,'rank':THREE}]],
             'knowledge_structure': None},

            {'life_tokens': 3,
             'information_tokens': 7,
             'fireworks': {RED: THREE, YELLOW: TWO, GREEN: FOUR, WHITE: -1, BLUE: -1},
             'discard_pile': [{'color':BLUE,'rank':ONE}, {'color':BLUE,'rank':ONE}, {'color':BLUE,'rank':THREE}],
             'hand': [[{'color':BLUE,'rank':TWO}, {'color':RED,'rank':ONE}],
                     [{'color':BLUE,'rank':FOUR}, {'color':YELLOW,'rank':ONE}],
                      # Here it's BLUE FIVE
                     [{'color':BLUE,'rank':FIVE}, {'color':WHITE,'rank':THREE}]],
             'knowledge_structure': None}]
# index of realisation for which the utility should be computed
ex_r = 0

# intention
ex_i = PLAY

# indices of the card: player 2, card 0
ex_plyr = 2
ex_card = 0

utility(PLAY, ex_r_set[1], ex_plyr, ex_card)  # NOTE: this passes realisation 1 (BLUE FIVE, not playable), not ex_r_set[ex_r]

-4

### 3. Pragmatic speaker
- $P(r|i) = \frac { exp( \alpha U(i,r) )} {\sum_{r^*}  exp( \alpha U(i,r^*) )}$
- For a given intention, how well does the chosen realisation correspond to it compared with the other possible realisations?

In [10]:
def pragmatic_speaker(intention, realisation_set, reali_index, player_index, card_index):
    '''
    Softmax P(r|i) of one realisation against all realisations in `realisation_set`.

    :param intention: PLAY, DISCARD or KEEP
    :param realisation_set: list of realisation dicts
    :param reali_index: int, index into `realisation_set` of the realisation of interest
    :param player_index: int, 0-based index of the player holding the card
    :param card_index: int, 0-based index of the card in that player's hand
    :returns: exp(alpha * U(i, r)) normalised over every r* in `realisation_set`
    '''
    alpha = 1  # TODO: fit(adjust) rationality parameter according to behaviour of coplyr

    def unnormalised(realisation):
        # exp(alpha * U(i, r)) for a single realisation
        return np.exp(alpha * utility(intention, realisation, player_index, card_index))

    chosen = unnormalised(realisation_set[reali_index])

    # TODO: possible_realisations should come from emul_marg
    total = 0
    for candidate in realisation_set:
        total += unnormalised(candidate)

    return chosen / total

Example

In [11]:
pragmatic_speaker(intention=ex_i,
                  realisation_set= ex_r_set,
                  reali_index=ex_r,
                  player_index=ex_plyr,
                  card_index=ex_card
                 )
# Realisation 0 (BLUE ONE, playable) has a far higher PLAY utility than realisation 1 (BLUE FIVE),
# so P(r|i=PLAY) puts almost all of its mass on realisation 0

0.9999991684719723

### 2. Pragmatic listener
- $P(i|r) \propto P(r|i) P(i)$
- given the realisation of a board, compute how likely each intention is
- computing simply by multiplying likelihood (given by the pragmatic speaker) with the prior and then renormalizing

In [12]:
def pragmatic_listener(realisation_set, reali_index, player_index, card_index):
    '''
    Posterior P(i|r) over the intentions for one card of one realisation.

    :param realisation_set: list of realisation dicts
    :param reali_index: int, index into `realisation_set` of the realisation of interest
    :param player_index: int, 0-based index of the player holding the card
    :param card_index: int, 0-based index of the card in that player's hand
    :returns: np.array of length 3, indexed by PLAY, DISCARD, KEEP, summing to 1
    '''
    # TODO: think about a meaningful prior
    prior = {PLAY: 1/3, DISCARD: 1/3, KEEP: 1/3}

    # unnormalised posterior: likelihood from the pragmatic speaker times the prior
    posterior = np.zeros(3)
    for intent, p_intent in prior.items():
        likelihood = pragmatic_speaker(intent, realisation_set, reali_index,
                                       player_index, card_index)
        posterior[intent] = likelihood * p_intent

    # normalize
    return posterior / posterior.sum()

In [13]:
pragmatic_listener(ex_r_set, ex_r, ex_plyr, ex_card)
# For realisation 0 (player 2 holds BLUE ONE) most of the posterior mass goes to PLAY

array([0.93860391, 0.04451415, 0.01688194])

### 1. Marginalise
-  $P(i|a,c) = \sum_r P(i,r|a,c) = \sum_r P(i|r,a,c) P(r|a,c) = \sum_r P(i|r) P(r|a,c)$
- Compute expected value by summing over possible realisations given by the knowledge structure
- Context is updated based on new action, constraining possible realisations
- Since different realisations in all hands of all players must be taken into account, the realisation space is high-dimensional. For efficient sampling, an appropriate convergence criterion and efficient sampling algorithm is necessary. Currently, take only parts of samples.


In [14]:
def emul_marg(context, player_index, card_index, threshold=1000):
    '''
    Estimate P(i|a,c) for one card by summing the listener posterior over sampled realisations.

    :param context: dict, updated general game situation and knowledge structure
    :param player_index: int, 0-based index of player for which the intention should be updated
    :param card_index: int, 0-based index of card " "
    :param threshold: int, number of realisations to sample (ideally one would sum over all)
    :returns: (3,) np.array, a simplex with one probability per intention (PLAY, DISCARD, KEEP)
    :raises ValueError: if `threshold` is smaller than 1, or if the knowledge of some card
                        has no positive entry (no realisation could ever be accepted)
    '''
    if threshold < 1:
        raise ValueError('threshold must be at least 1')

    # knowledge[player][card][color][rank]; all sizes are taken from the context
    # itself instead of notebook-level globals
    knowledge = np.asarray(context['knowledge'], dtype=float)
    n_player, n_card, n_color, n_rank = knowledge.shape
    totals = knowledge.sum(axis=(2, 3))  # total num. possible instances per card
    if not np.all(totals > 0):
        # without this check the rejection loop below would never terminate
        raise ValueError('knowledge allows no realisation for at least one card')
    player_idx, card_idx = np.indices((n_player, n_card))

    # TODO: maybe also implement time limit (Minseok)
    realisation_set = []  # list of each realisation, at the end length of threshold
    weights = np.zeros((threshold, n_player, n_card))  # P(r|c), one factor per card
    samples = np.zeros((threshold, 3))  # each sample is a simplex with 3 categories (play, discard, keep)

    # TODO: instead of going into prag_listen directly, first get all indices
    for sample_i in range(threshold):
        # Rejection sampling: draw a (color, rank) for every card of every player
        # and retry until no card has probability 0 under the knowledge structure
        while True:
            colors = np.random.randint(low=0, high=n_color, size=(n_player, n_card))
            ranks = np.random.randint(low=0, high=n_rank, size=(n_player, n_card))
            # numerator: num. instances for the chosen realisation of each card
            # denominator: total num. possible instances for that card
            probs = knowledge[player_idx, card_idx, colors, ranks] / totals
            if np.all(probs > 0):
                break

        # fix color and rank for each card of each player
        r = copy.deepcopy(context)
        r['hand'] = [[{'color': int(colors[pi, ci]), 'rank': int(ranks[pi, ci])}
                      for ci in range(n_card)] for pi in range(n_player)]
        realisation_set.append(r)
        weights[sample_i] = probs

    # Since we don't iterate over all realisations, renormalize P(r|a,c).
    # The per-card probabilities are multiplied in log space.
    weights = np.exp(np.sum(np.log(weights), axis=(1, 2)))
    weights /= np.sum(weights)

    # Use RSA to reason about P(i|r) for each realisation and take the weighted sum.
    # NOTE(review): every pragmatic_listener call walks the whole realisation_set
    # again, so this loop is quadratic in `threshold`.
    for sample_i in range(threshold):
        samples[sample_i] = weights[sample_i] * pragmatic_listener(
            realisation_set, sample_i, player_index, card_index)
    return np.sum(samples, axis=0)

Accuracy depending on num.samples (500 not enough, 1000 good enough)

In [273]:
# with 1000 samples: repeat the estimate ten times to eyeball the run-to-run variance
for _ in range(10):
    print(emul_marg(context, ex_plyr, ex_card))

[0.2500643  0.43742528 0.31251042]
[0.29425413 0.39726415 0.30848172]
[0.28524759 0.40563587 0.30911654]
[0.29439947 0.41168367 0.29391686]
[0.26720647 0.43055285 0.30224068]


KeyboardInterrupt: 

Examples

In [18]:
# Hand-built context with 3 players holding 2 cards each. BLUE TWO and RED FOUR
# are in the discard pile, hence the 1s in the BLUE and RED rows of the open cards.
situation1 = {'life_tokens': 1,
             'information_tokens': 7,
             'fireworks': {0: -1, 1: -1, 2: -1, BLUE: -1, 4: -1},
             'discard_pile': [{'color': BLUE, 'rank': TWO}, {'color': RED, 'rank': FOUR}],
             'hand': [[{'color': None, 'rank': None}, {'color': None, 'rank': None}],
              [{'color': None, 'rank': None}, {'color': None, 'rank': None}],
              [{'color': None, 'rank': None}, {'color': None, 'rank': None}]],
             'knowledge': [[[[3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 1, 2, 2, 1],
                [3, 2, 2, 1, 1]],
               [[3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 1, 2, 2, 1],
                [3, 2, 2, 1, 1]]],
              [[[3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 1, 2, 2, 1],
                [3, 2, 2, 1, 1]],
               [[3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 1, 2, 2, 1],
                [3, 2, 2, 1, 1]]],
              [[[3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 2, 2, 2, 1],
                [3, 1, 2, 2, 1],
                [3, 2, 2, 1, 1]],
               # let's say that card 1 of player 2 is known to be BLUE TWO, which should be kept
               [[0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0],
                [0, 1, 0, 0, 0],
                [0, 0, 0, 0, 0]]]],
             'last_action': {'type': 'PLAY', 'pnr': 0, 'cnr': 0, 'color': 3, 'rank': 1}}
# hand layout of this scenario: 3 players with 2 cards each
num_player=3
num_card=2

emul_marg(context=situation1, player_index=2, card_index=1)
# expect keep to be high (the recorded output below is uniform instead)

array([0.33333333, 0.33333333, 0.33333333])

In [20]:
def _r1_realisation(hand_pairs):
    '''Build one realisation of r1 from (color, rank) pairs given per player and per card.'''
    open_card = [[3, 2, 2, 2, 1], [3, 2, 2, 2, 1], [3, 2, 2, 2, 1], [3, 1, 2, 2, 1], [3, 2, 2, 1, 1]]
    # only knowledge[3][1] (color 3, rank 1) is possible for the last card of player 2
    fixed_card = [[0] * 5 for _ in range(5)]
    fixed_card[3][1] = 1
    knowledge = [[copy.deepcopy(open_card), copy.deepcopy(open_card)] for _ in range(3)]
    knowledge[2][1] = fixed_card
    return {'life_tokens': 1,
            'information_tokens': 7,
            'fireworks': {0: -1, 1: -1, 2: -1, 3: -1, 4: -1},
            'discard_pile': [{'color': 3, 'rank': 1}, {'color': 4, 'rank': 3}],
            'hand': [[{'color': color, 'rank': rank} for color, rank in player]
                     for player in hand_pairs],
            'knowledge': knowledge,
            'last_action': {'type': 'PLAY', 'pnr': 0, 'cnr': 0, 'color': 3, 'rank': 1}}


# three realisations that differ only in the hands; card 1 of player 2 is (3, 1) in all of them
r1 = [_r1_realisation(hand_pairs) for hand_pairs in (
    [[(1, 4), (0, 1)], [(1, 0), (1, 2)], [(3, 0), (3, 1)]],
    [[(2, 2), (2, 2)], [(2, 3), (3, 2)], [(3, 4), (3, 1)]],
    [[(2, 3), (2, 4)], [(0, 4), (1, 4)], [(1, 1), (3, 1)]],
)]
# Card 1 of player 2 is BLUE TWO in every realisation of r1, so its utility is the same
# in all of them: P(r|i) is uniform for each intention and the posterior comes out uniform
pragmatic_listener(r1,0,2,1)

array([0.33333333, 0.33333333, 0.33333333])

In [22]:
# same card in all three realisations -> equal utilities -> P(r|i=DISCARD) = 1/3
pragmatic_speaker(DISCARD, r1, 0,2,1)

0.3333333333333333

In [28]:
# BLUE TWO is not playable here: -1 (card lost) -25 (last life token) -2 (one copy left) = -28
utility(PLAY,r1[0],2,1)

-28

1. Marginalise $P(i_n|a,c) = P(i|a,c) = \sum_r P(i,r|a,c) = \sum_r P(i|r,a,c) P(r|a,c)$ <br>
$P(r|a,c)$ is already informed by last_action
2. Listener $P(i|r,a,c) = \dfrac{P(a|i,r,c) P(i|r,c)}{\sum_{i^*}P(a|i^*,r,c) P(i^*|r,c)}  $     
3. Speaker $P(a|r,i,c) =  \dfrac{P(i,a|r,c)}{\sum_{a^*}P(i,a^*|r,c)} =  \dfrac{P(i|a,r,c) P(a|r,c)}{\sum_{a^*}P(i|a^*,r,c) P(a^*|r,c)} =  \dfrac{P(i|r,cN) P(a|r,c)}{\sum_{a^*}P(i|r,cN^*) P(a^*|r,c)}$
4. Utility $U(i,r,cN)$

The problem with this approach is that the marginalisation over r happens one level above the listener. This makes it hard to give the right realisation a high weight.

# Revised approach
 
1. Factorize  <br>
 $P(i_{\text{total}}|a,c)  \stackrel{i.d.}{=}   \prod_{\text{card}} P(i_{\text{card}}|a,c) $


2. Listener <br>
$P(i|a,c)  =  \sum_{r} p(i,r|a,c)  =  \sum_{r} \dfrac {P(a|i,r,c) P(i,r|c)} {P(a|c)}  =  \dfrac {\sum_{r} P(a|i,r,c) P(i,r|c)} {\sum_{i^*,r^*} P(a|i^*,r^*,c) P(i^*,r^*|c)}  \stackrel{?}{=}  \dfrac {\sum_{r} P(a|i,r,c) P(i|c)P(r|c)} {\sum_{i^*,r^*} P(a|i^*,r^*,c) P(i^*|c)P(r^*|c)}$
 
 + $P(a|i,r,c)$: comes from speaker
 + $P(i|c)$: prior as we had beforehand, possibly drop c dependence as well
 + $P(r|c)$: compute by fraction of number of instance of the specific realisation
 
 
3. Speaker  <br>
$P(a|r,i,c)  =  \dfrac{P(i,a|r,c)}{\sum_{a^*}P(i,a^*|r,c)}  =  \dfrac {P(i|a,r,c) P(a|r,c)}{\sum_{a^*} P(i|a^*,r,c) P(a^*|r,c)}   \stackrel{(*)}{=}   \dfrac {P(i|r,c_{\text{new}}) P(a|r,c)} {\sum_{a^*} P(i|r,c_{\text{new}}^*) P(a^*|r,c)}  =  \dfrac {\exp(\alpha U(i|r,c_{\text{new}})) P(a|r,c)} {\sum_{a^*}\exp(\alpha U(i|r,c_{\text{new}}^*)) P(a^*|r,c)}$

 + (*): use update_game() to get new game_state
 + $U(i|r,c_{\text{new}})$: comes from utility
 + $P(a|r,c)$ is $0$ when $a$ is not legal and $\frac{1}{\#\text{actions}}$ when $a$ is legal. Since the term is in both numerator and denominator, simply iterating over legal actions is sufficient.


4. Utility <br>
$U(i,r,c_{\text{new}})$

In [None]:
def pragmatic_speaker(intention, realisation_set, reali_index, player_index, card_index):
    '''
    Softmax P(r|i) of one realisation against all realisations in `realisation_set`.

    NOTE: this redefines the pragmatic_speaker from the earlier cell with the same
    behaviour and shadows it once this cell is run.

    :param intention: PLAY, DISCARD or KEEP
    :param realisation_set: list of realisation dicts
    :param reali_index: int, index into `realisation_set` of the realisation of interest
    :param player_index: int, 0-based index of the player holding the card
    :param card_index: int, 0-based index of the card in that player's hand
    :returns: exp(alpha * U(i, r)) normalised over every r* in `realisation_set`
    '''
    alpha = 1  # TODO: fit(adjust) rationality parameter according to behaviour of coplyr

    def weight_of(realisation):
        # unnormalised softmax weight exp(alpha * U(i, r))
        score = utility(intention, realisation, player_index, card_index)
        return np.exp(alpha * score)

    # numerator: weight of the realisation of interest
    top = weight_of(realisation_set[reali_index])

    # denominator: weights of all realisations, added up in list order
    # TODO: possible_realisations should come from emul_marg
    bottom = sum(weight_of(other) for other in realisation_set)

    return top / bottom

# Future plans
- Fix pragmatic speaker
- Finish the update_game() function
- Integrate into Deepmind framework
- Find more efficient sampling algorithm (don't sample random realisations)