# ME bias in the single agent setting

## Setup the training

In [None]:
# IMPORTS 

import tensorflow as tf
import numpy as np
import sys
import os
import pickle

sys.path.append("/Users/xxAlv1Nxx/Documents/01a Masters/05 CS/CS 428B Probabilistic Models of Cognition – Language/Project/pragmatic_agents_me_bias")
from RSA_communication_agents import RSASpeaker0, RSAListener0, RSASpeaker1, RSAListener1

In [None]:
# GENERATE INPUT MESSAGES 

def generate_messages(size, n, ME=1):
    """ Generates the message input for the listener
        inputs: 
            size - number of data points
            n - number of states and messages in the system
            ME - number of messages that are withheld from training, here 1 or 2
                 (1: the last message is never drawn, 2: the first and the last are never drawn)
        outputs: 
            data - float32 array of shape (size, n, n); for every sample the row of the 
                   drawn message is filled with ones and all other rows are zero
            labels - one-hot encoding (depth n) of the drawn message index
        raises:
            ValueError - if ME is neither 1 nor 2
    """
    
    if ME == 1:
        # draw from 0 .. n-2
        selection = np.random.choice(n-1, size=(size))
    elif ME == 2:
        # draw from 1 .. n-2
        selection = np.random.choice(list(range(1, n-1)), size=(size))
    else:
        # previously this fell through and failed later with an UnboundLocalError
        raise ValueError("ME must be 1 or 2, got " + repr(ME))

    # boolean one-hot over messages, repeated along the last axis so that the
    # whole row of the drawn message is set to one
    hits = selection[:, None] == np.arange(n)
    data = np.broadcast_to(hits[:, :, None], (size, n, n)).astype(np.float32)
    labels = tf.one_hot(selection, depth=n)
    
    return data, labels

In [None]:
# GENERATE BILINGUAL INPUT MESSAGES 

def generate_bilingual_messages(size, n, ME=1, n_blocks=0, lang_edit=0):
    """ Generates the message input for the listener
        inputs: 
            size - number of data points
            n - number of states in the system (n_messages = 2*n)
            ME - number of messages per language that are withheld from training, here 1 or 2
                 (1: the last message of each language, 2: the first and the last)
            n_blocks - number of language blocks (>=1), or not blocked (0, i.e. random)
            lang_edit - the drawn message indices are shifted by n * lang_edit; with
                        n_blocks=1 a value of 1 yields data from the second language only
        outputs: 
            data - float32 array of shape (size, 2*n, n); for every sample the row of the 
                   drawn message is filled with ones and all other rows are zero
            labels - one-hot encoding (depth n) of the drawn message index modulo n
        raises:
            ValueError - if ME is neither 1 nor 2, or if n_blocks is negative
    """
    
    if ME not in (1, 2):
        # previously this fell through and failed later with an UnboundLocalError
        raise ValueError("ME must be 1 or 2, got " + repr(ME))
    if n_blocks < 0:
        raise ValueError("n_blocks must be >= 0, got " + repr(n_blocks))

    # message indices of ONE language that may be drawn
    first = 0 if ME == 1 else 1
    per_language = list(range(first, n-1))

    if n_blocks == 0:
        # not blocked: draw from both languages at random
        candidates = per_language + [i + n for i in per_language]
        selection = np.random.choice(candidates, size=(size))
    else:
        selection = np.random.choice(per_language, size=(size))
        # switch language every other block; at least one sample per block so that
        # n_blocks > size no longer divides by zero
        divisor = max(1, size // n_blocks)
        odd_block = (np.arange(size) // divisor) % 2 == 1
        selection = np.where(odd_block, selection + n, selection)

    # NOTE(review): if this shift pushes an index outside 0 .. 2n-1 (e.g. lang_edit=1
    # together with alternating blocks) the sample keeps an all-zero input while its
    # label is still set - confirm whether that combination is ever intended.
    selection = selection + n * lang_edit

    # boolean one-hot over messages, repeated along the last axis so that the
    # whole row of the drawn message is set to one
    hits = selection[:, None] == np.arange(2*n)
    data = np.broadcast_to(hits[:, :, None], (size, 2*n, n)).astype(np.float32)
    labels = tf.one_hot(tf.math.floormod(selection, n), depth=n)
    
    return data, labels

In [None]:
# TRAIN A LISTENER AND SAVE THE PARAMETERS AS WELL AS THE REWARDS AND THE LEXICA

def run_listener(n=3, reasoning=0, n_epochs=50, ME=1, learning_rate=0.001, runs=100):
    """ Trains listeners on a single agent Lewis game with unblocked bilingual input
        (data from generate_bilingual_messages with n_blocks=0).
        inputs: 
            n - number of states; the listener has 2*n messages
            reasoning - reasoning level of the listener, 0 for literal, 1 for pragmatic 
            n_epochs - number of training epochs per listener
            ME - number of messages per language that are left out during training, here 1 or 2
            learning_rate - learning rate for the Adam optimizer
            runs - number of listeners that are trained one after the other
        raises:
            ValueError - if reasoning is neither 0 nor 1
        By default 100 agents are trained and all their lexica and rewards for every epoch are saved (agent-wise)
        below data/bilingual/L<reasoning>/<n>_states/, together with a param_dict.pickle.
    """
    
    # fail before anything is written to disk (previously a NameError after the pickle was saved)
    if reasoning not in (0, 1):
        raise ValueError("reasoning must be 0 (literal) or 1 (pragmatic), got " + repr(reasoning))
    
    # setup the training and save the parameters 
    
    n_states = n                      # number of states 
    n_messages = 2*n                  # number of messages
    batch_size = 32                   # batch size
    datasize = 1000                   # number of training data points
    batches = datasize // batch_size  # number of full batches per epoch (the remainder is not used)
    
    init_mean = 0.5                   # mean for initialization of lexicon entries
    init_std = 0.01                   # std for initialization of lexicon entries
    
    constraint = tf.keras.constraints.NonNeg() # constrains the lexica to have entries >= 0
    
    filename = f'data/bilingual/L{reasoning}/{n}_states/'
    os.makedirs(filename, exist_ok=True)
    
    param_dict = {"n_states": n_states, "n_messages": n_messages, "n_epochs": n_epochs, "batch_size": batch_size,
              "datasize": datasize, "initializer_truncated_normal_mean_std": [init_mean, init_std], 
              "learning_rate": learning_rate, "runs": runs, "constraint": constraint}    
    with open(filename + 'param_dict.pickle', 'wb') as handle:
        pickle.dump(param_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    # prefix of the per-run output files (the pragmatic listener always uses alpha=5.)
    suffix = 'missing_' if reasoning == 0 else 'missing_5.0alpha_'
    filename_full = f'{filename}L{reasoning}_{ME}{suffix}'
    
    
    # run the listeners

    for run in range(1, runs+1):
        
        # create data 
        data, labels = generate_bilingual_messages(datasize, n, ME=ME)
        data = tf.convert_to_tensor(data)
        lexica = []
        all_rewards = []
        
        # create listener
        initial_lexicon = tf.initializers.TruncatedNormal(mean=init_mean, stddev=init_std)([n_states, n_messages])
        lexicon = tf.Variable(initial_lexicon,
                              name="lexicon", 
                              trainable=True, 
                              dtype=tf.float32,
                              constraint=tf.keras.constraints.NonNeg())
        
        if reasoning == 0: 
            listener = RSAListener0(n_states, n_messages, lexicon)
        else: 
            listener = RSAListener1(n_states, n_messages, lexicon, alpha=5.)
                
        listener.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.CategoricalCrossentropy())
        
        # train
        mean_reward = float('nan')    # stays nan only if n_epochs == 0
        for _ in range(n_epochs):
            average_reward = []

            # reshuffle data and labels with the same permutation
            shuffle_indices = np.random.permutation(datasize)
            data = tf.gather(data, shuffle_indices)
            labels = tf.gather(labels, shuffle_indices)

            for j in range(batches):
                batch = slice(j*batch_size, (j+1)*batch_size)
                data_batch = data[batch]
                labels_batch = labels[batch]
    
                _, actions = listener.get_states(data_batch)
    
                # per-sample dot product of label and action
                rewards = tf.einsum('ij,ij->i', labels_batch, actions)
                average_reward.append(np.mean(rewards))
                
                # RL: 
                # Note that we implemented REINFORCE with a work-around using categorical crossentropy. 
                # This can be done by setting the labels to the agent's actions, and weighting the loss
                # function by the rewards. 
                listener.train_on_batch(data_batch, actions, sample_weight=rewards)
            
            mean_reward = np.mean(average_reward)
            all_rewards.append(mean_reward)
            lexica.append(np.copy(listener.lexicon[:]))
                        
        print('run ' + str(run), 'average reward ' + str(ME) + ' ' + str(mean_reward))
        
        # save rewards and lexica 
        np.save(filename_full + 'lexicon_run' + str(run), lexica)
        np.save(filename_full + 'rewards_run' + str(run), all_rewards)

In [None]:
# TRAIN A LISTENER AND SAVE THE PARAMETERS AS WELL AS THE REWARDS AND THE LEXICA

def run_listener_blocked(n=3, reasoning=0, n_epochs=50, ME=1, learning_rate=0.001, runs=100):
    """ Trains listeners on a single agent Lewis game with bilingual input that is blocked
        within each epoch (data from generate_bilingual_messages with n_blocks=2: the first 
        half of the data uses one language, the second half the other).
        inputs: 
            n - number of states; the listener has 2*n messages
            reasoning - reasoning level of the listener, 0 for literal, 1 for pragmatic 
            n_epochs - number of training epochs per listener
            ME - number of messages per language that are left out during training, here 1 or 2
            learning_rate - learning rate for the Adam optimizer
            runs - number of listeners that are trained one after the other
        raises:
            ValueError - if reasoning is neither 0 nor 1
        By default 100 agents are trained and all their lexica and rewards for every epoch are saved (agent-wise)
        below data/bilingual_blocked/L<reasoning>/<n>_states/, together with a param_dict.pickle.
    """
    
    # fail before anything is written to disk (previously a NameError after the pickle was saved)
    if reasoning not in (0, 1):
        raise ValueError("reasoning must be 0 (literal) or 1 (pragmatic), got " + repr(reasoning))
    
    # setup the training and save the parameters 
    
    n_states = n                      # number of states 
    n_messages = 2*n                  # number of messages
    batch_size = 32                   # batch size
    datasize = 1000                   # number of training data points
    batches = datasize // batch_size  # number of full batches per epoch (the remainder is not used)
    half = datasize // 2              # size of one language block
    
    init_mean = 0.5                   # mean for initialization of lexicon entries
    init_std = 0.01                   # std for initialization of lexicon entries
    
    constraint = tf.keras.constraints.NonNeg() # constrains the lexica to have entries >= 0
    
    filename = f'data/bilingual_blocked/L{reasoning}/{n}_states/'
    os.makedirs(filename, exist_ok=True)
    
    param_dict = {"n_states": n_states, "n_messages": n_messages, "n_epochs": n_epochs, "batch_size": batch_size,
              "datasize": datasize, "initializer_truncated_normal_mean_std": [init_mean, init_std], 
              "learning_rate": learning_rate, "runs": runs, "constraint": constraint}    
    with open(filename + 'param_dict.pickle', 'wb') as handle:
        pickle.dump(param_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    # prefix of the per-run output files (the pragmatic listener always uses alpha=5.)
    suffix = 'missing_' if reasoning == 0 else 'missing_5.0alpha_'
    filename_full = f'{filename}L{reasoning}_{ME}{suffix}'
    
    
    # run the listeners

    for run in range(1, runs+1):
        
        # create data 
        data, labels = generate_bilingual_messages(datasize, n, ME=ME, n_blocks=2)
        data = tf.convert_to_tensor(data)
        lexica = []
        all_rewards = []
        
        # create listener
        initial_lexicon = tf.initializers.TruncatedNormal(mean=init_mean, stddev=init_std)([n_states, n_messages])
        lexicon = tf.Variable(initial_lexicon,
                              name="lexicon", 
                              trainable=True, 
                              dtype=tf.float32,
                              constraint=tf.keras.constraints.NonNeg())
        
        if reasoning == 0: 
            listener = RSAListener0(n_states, n_messages, lexicon)
        else: 
            listener = RSAListener1(n_states, n_messages, lexicon, alpha=5.)
                
        listener.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.CategoricalCrossentropy())
        
        # train
        mean_reward = float('nan')    # stays nan only if n_epochs == 0
        for _ in range(n_epochs):
            average_reward = []
            
            # shuffle inside each half separately so that the two language blocks
            # keep their position in the epoch
            shuffle_indices_1 = np.random.permutation(half)
            shuffle_indices_2 = np.random.permutation(half) + half
            shuffle_indices = np.concatenate((shuffle_indices_1, shuffle_indices_2))
            data = tf.gather(data, shuffle_indices)
            labels = tf.gather(labels, shuffle_indices)

            for j in range(batches):
                batch = slice(j*batch_size, (j+1)*batch_size)
                data_batch = data[batch]
                labels_batch = labels[batch]
    
                _, actions = listener.get_states(data_batch)
    
                # per-sample dot product of label and action
                rewards = tf.einsum('ij,ij->i', labels_batch, actions)
                average_reward.append(np.mean(rewards))
                
                # RL: 
                # Note that we implemented REINFORCE with a work-around using categorical crossentropy. 
                # This can be done by setting the labels to the agent's actions, and weighting the loss
                # function by the rewards. 
                listener.train_on_batch(data_batch, actions, sample_weight=rewards)
            
            mean_reward = np.mean(average_reward)
            all_rewards.append(mean_reward)
            lexica.append(np.copy(listener.lexicon[:]))
                        
        print('run ' + str(run), 'average reward ' + str(ME) + ' ' + str(mean_reward))
        
        # save rewards and lexica 
        np.save(filename_full + 'lexicon_run' + str(run), lexica)
        np.save(filename_full + 'rewards_run' + str(run), all_rewards)

In [None]:
# TRAIN A LISTENER AND SAVE THE PARAMETERS AS WELL AS THE REWARDS AND THE LEXICA

def run_listener_blocked_e(n=3, reasoning=0, n_epochs=50, ME=1, learning_rate=0.001, runs=100, n_blocks=2):
    """ Trains listeners on a single agent Lewis game with bilingual input that is blocked
        across epochs: every epoch uses data from one language only and the language
        switches after each block of epochs.
        inputs: 
            n - number of states; the listener has 2*n messages
            reasoning - reasoning level of the listener, 0 for literal, 1 for pragmatic 
            n_epochs - number of training epochs per listener
            ME - number of messages per language that are left out during training, here 1 or 2
            learning_rate - learning rate for the Adam optimizer
            runs - number of listeners that are trained one after the other
            n_blocks - number of language blocks the n_epochs are divided into (>=1); 
                       a block lasts n_epochs // n_blocks epochs (at least one)
        raises:
            ValueError - if reasoning is neither 0 nor 1, or if n_blocks < 1
        By default 100 agents are trained and all their lexica and rewards for every epoch are saved (agent-wise)
        below data/bilingual_blocked_e/L<reasoning>/<n>_states/<n_blocks>_blocks/, together with a param_dict.pickle.
    """
    
    # fail before anything is written to disk
    if reasoning not in (0, 1):
        raise ValueError("reasoning must be 0 (literal) or 1 (pragmatic), got " + repr(reasoning))
    if n_blocks < 1:
        raise ValueError("n_blocks must be >= 1, got " + repr(n_blocks))
    
    # setup the training and save the parameters 
    
    n_states = n                      # number of states 
    n_messages = 2*n                  # number of messages
    batch_size = 32                   # batch size
    datasize = 1000                   # number of training data points
    batches = datasize // batch_size  # number of full batches per epoch (the remainder is not used)
    
    # Length of one language block in epochs. This used to be the constant 100 // n_blocks,
    # which only matches the schedule when n_epochs == 100 (with the default n_epochs=50 and
    # n_blocks=2 the second language was never trained) and divided by zero for n_blocks > 100.
    epochs_per_block = max(1, n_epochs // n_blocks)
    
    init_mean = 0.5                   # mean for initialization of lexicon entries
    init_std = 0.01                   # std for initialization of lexicon entries
    
    constraint = tf.keras.constraints.NonNeg() # constrains the lexica to have entries >= 0
    
    filename = f'data/bilingual_blocked_e/L{reasoning}/{n}_states/{n_blocks}_blocks/'
    os.makedirs(filename, exist_ok=True)
    
    param_dict = {"n_states": n_states, "n_messages": n_messages, "n_epochs": n_epochs, "batch_size": batch_size,
              "datasize": datasize, "initializer_truncated_normal_mean_std": [init_mean, init_std], 
              "learning_rate": learning_rate, "runs": runs, "constraint": constraint}    
    with open(filename + 'param_dict.pickle', 'wb') as handle:
        pickle.dump(param_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    # prefix of the per-run output files (the pragmatic listener always uses alpha=5.)
    suffix = 'missing_' if reasoning == 0 else 'missing_5.0alpha_'
    filename_full = f'{filename}L{reasoning}_{ME}{suffix}'
    
    
    # run the listeners

    for run in range(1, runs+1):
        
        # create data: one full data set per language 
        # (n_blocks=1 keeps every sample in the first language, lang_edit=1 shifts it to the second)
        data1, labels1 = generate_bilingual_messages(datasize, n, ME=ME, n_blocks=1)
        data1 = tf.convert_to_tensor(data1)
        data2, labels2 = generate_bilingual_messages(datasize, n, ME=ME, n_blocks=1, lang_edit=1)
        data2 = tf.convert_to_tensor(data2)
        data_all = [data1, data2]
        labels_all = [labels1, labels2]
        lexica = []
        all_rewards = []
        
        # create listener
        initial_lexicon = tf.initializers.TruncatedNormal(mean=init_mean, stddev=init_std)([n_states, n_messages])
        lexicon = tf.Variable(initial_lexicon,
                              name="lexicon", 
                              trainable=True, 
                              dtype=tf.float32,
                              constraint=tf.keras.constraints.NonNeg())
        
        if reasoning == 0: 
            listener = RSAListener0(n_states, n_messages, lexicon)
        else: 
            listener = RSAListener1(n_states, n_messages, lexicon, alpha=5.)
                
        listener.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.CategoricalCrossentropy())
        
        # train
        mean_reward = float('nan')    # stays nan only if n_epochs == 0
        for epoch in range(n_epochs):
            # even blocks train on the first language, odd blocks on the second
            language = (epoch // epochs_per_block) % 2
            data = data_all[language]
            labels = labels_all[language]
            average_reward = []

            # reshuffle data and labels with the same permutation
            shuffle_indices = np.random.permutation(datasize)
            data = tf.gather(data, shuffle_indices)
            labels = tf.gather(labels, shuffle_indices)

            for j in range(batches):
                batch = slice(j*batch_size, (j+1)*batch_size)
                data_batch = data[batch]
                labels_batch = labels[batch]
    
                _, actions = listener.get_states(data_batch)
    
                # per-sample dot product of label and action
                rewards = tf.einsum('ij,ij->i', labels_batch, actions)
                average_reward.append(np.mean(rewards))
                
                # RL: 
                # Note that we implemented REINFORCE with a work-around using categorical crossentropy. 
                # This can be done by setting the labels to the agent's actions, and weighting the loss
                # function by the rewards. 
                listener.train_on_batch(data_batch, actions, sample_weight=rewards)
            
            mean_reward = np.mean(average_reward)
            all_rewards.append(mean_reward)
            lexica.append(np.copy(listener.lexicon[:]))
                        
        print('run ' + str(run), 'average reward ' + str(ME) + ' ' + str(mean_reward))
        
        # save rewards and lexica 
        np.save(filename_full + 'lexicon_run' + str(run), lexica)
        np.save(filename_full + 'rewards_run' + str(run), all_rewards)

## Run the training 

for different agent types, different numbers of states (3 and 10), and different numbers of states being withheld from the training (1 and 2). 

### Literal listener

In [None]:
# Literal listener (reasoning=0), 3 states / 6 messages, one message per language withheld.
# Output goes to data/bilingual/L0/3_states/ with the file prefix L0_1missing_.
run_listener(n=3, reasoning=0, n_epochs=50, ME=1)

In [None]:
# Literal listener, 3 states / 6 messages, two messages per language withheld.
# Same output directory as the ME=1 run (data/bilingual/L0/3_states/), prefix L0_2missing_.
run_listener(learning_rate=0.001, n=3, reasoning=0, n_epochs=50, ME=2)

In [None]:
# Literal listener, 10 states / 20 messages, one message per language withheld, 100 epochs.
# Output goes to data/bilingual/L0/10_states/ with the file prefix L0_1missing_.
run_listener(learning_rate=0.001, n=10, reasoning=0, n_epochs=100, ME=1)

In [None]:
# Literal listener, 10 states / 20 messages, two messages per language withheld, 100 epochs.
# Same output directory as the ME=1 run (data/bilingual/L0/10_states/), prefix L0_2missing_.
run_listener(learning_rate=0.001, n=10, reasoning=0, n_epochs=100, ME=2)

### Pragmatic listener

In [None]:
# Pragmatic listener (reasoning=1; alpha is fixed to 5. inside run_listener), 3 states / 6 messages,
# one message per language withheld, 100 epochs.
# Output goes to data/bilingual/L1/3_states/ with the file prefix L1_1missing_5.0alpha_.
run_listener(learning_rate=0.001, n=3, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, 3 states / 6 messages, two messages per language withheld, 50 epochs.
# NOTE: this writes to the same directory as the n=3, reasoning=1, ME=1 run (data/bilingual/L1/3_states/),
# so param_dict.pickle is overwritten and afterwards records n_epochs=50, not the 100 used for ME=1.
run_listener(learning_rate=0.001, n=3, reasoning=1, n_epochs=50, ME=2)

In [None]:
# Pragmatic listener, sweep over the number of states: n=4 (8 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/4_states/.
run_listener(learning_rate=0.001, n=4, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=5 (10 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/5_states/.
run_listener(learning_rate=0.001, n=5, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=6 (12 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/6_states/.
run_listener(learning_rate=0.001, n=6, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=7 (14 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/7_states/.
run_listener(learning_rate=0.001, n=7, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=8 (16 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/8_states/.
run_listener(learning_rate=0.001, n=8, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=9 (18 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/9_states/.
run_listener(learning_rate=0.001, n=9, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=10 (20 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/10_states/ with the file prefix L1_1missing_5.0alpha_.
run_listener(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=15 (30 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/15_states/.
run_listener(learning_rate=0.001, n=15, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=20 (40 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/20_states/.
run_listener(learning_rate=0.001, n=20, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=25 (50 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/25_states/.
run_listener(learning_rate=0.001, n=25, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=30 (60 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/30_states/.
run_listener(learning_rate=0.001, n=30, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=40 (80 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/40_states/.
run_listener(learning_rate=0.001, n=40, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=50 (100 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/50_states/.
run_listener(learning_rate=0.001, n=50, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=60 (120 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/60_states/.
run_listener(learning_rate=0.001, n=60, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=80 (160 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/80_states/.
run_listener(learning_rate=0.001, n=80, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, sweep over the number of states: n=100 (200 messages), ME=1, 100 epochs.
# Output goes to data/bilingual/L1/100_states/.
run_listener(learning_rate=0.001, n=100, reasoning=1, n_epochs=100, ME=1)

In [None]:
# Literal listener with the same settings as the n=100 pragmatic run (200 messages, ME=1, 100 epochs).
# Output goes to data/bilingual/L0/100_states/.
run_listener(learning_rate=0.001, n=100, reasoning=0, n_epochs=100, ME=1)

In [None]:
# Pragmatic listener, 10 states / 20 messages, two messages per language withheld, 100 epochs.
# Same output directory as the n=10, reasoning=1, ME=1 run (data/bilingual/L1/10_states/),
# file prefix L1_2missing_5.0alpha_.
run_listener(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=2)

In [None]:
# Pragmatic listener with language blocks inside every epoch: the first half of the data comes from
# one language, the second half from the other, and shuffling happens only within each half.
# Output goes to data/bilingual_blocked/L1/10_states/.
run_listener_blocked(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1)

In [25]:
# Pragmatic listener with language blocks across epochs, default n_blocks=2:
# epochs 0-49 use the first language only, epochs 50-99 the second language only.
# Output goes to data/bilingual_blocked_e/L1/10_states/2_blocks/.
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1)

run 1 average reward 1 0.7731855
run 2 average reward 1 0.7741935


2021-12-01 20:15:23.555405: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-12-01 20:15:23.635906: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-12-01 20:15:23.697900: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 3 average reward 1 0.8860887


2021-12-01 20:16:32.473726: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 4 average reward 1 0.99495965
run 5 average reward 1 0.8770161
run 6 average reward 1 0.8921371
run 7 average reward 1 0.90120965
run 8 average reward 1 0.875
run 9 average reward 1 0.8840726
run 10 average reward 1 0.7530242
run 11 average reward 1 0.9969758
run 12 average reward 1 0.8991935
run 13 average reward 1 0.99495965
run 14 average reward 1 0.7741935
run 15 average reward 1 0.8719758
run 16 average reward 1 0.8820565
run 17 average reward 1 0.91028225
run 18 average reward 1 0.7016129
run 19 average reward 1 0.90221775
run 20 average reward 1 0.7752016
run 21 average reward 1 0.7701613
run 22 average reward 1 0.88709676
run 23 average reward 1 0.7721774
run 24 average reward 1 0.9989919
run 25 average reward 1 0.8860887
run 26 average reward 1 0.765121
run 27 average reward 1 0.8951613
run 28 average reward 1 0.8860887
run 29 average reward 1 0.6582661
run 30 average reward 1 0.88810486
run 31 average reward 1 0.99596775
run 32 average reward 1 0.9979839
run 33 average re

2021-12-01 20:41:34.919048: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 79 average reward 1 0.9979839
run 80 average reward 1 0.8810484
run 81 average reward 1 0.79334676
run 82 average reward 1 0.7822581
run 83 average reward 1 0.76008064
run 84 average reward 1 0.87096775
run 85 average reward 1 0.9032258
run 86 average reward 1 0.8810484
run 87 average reward 1 0.86995965
run 88 average reward 1 0.8729839
run 89 average reward 1 0.672379
run 90 average reward 1 0.90120965
run 91 average reward 1 0.77116936
run 92 average reward 1 0.99495965
run 93 average reward 1 0.8830645
run 94 average reward 1 0.77116936
run 95 average reward 1 0.87903225
run 96 average reward 1 0.88004035
run 97 average reward 1 1.0
run 98 average reward 1 0.875
run 99 average reward 1 0.9919355
run 100 average reward 1 0.8951613


In [None]:
# As above with n_blocks=4: the language switches every 25 epochs.
# Output goes to data/bilingual_blocked_e/L1/10_states/4_blocks/.
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1, n_blocks=4)

In [None]:
# As above with n_blocks=8: the language switches every 100 // 8 = 12 epochs.
# NOTE: 8 * 12 = 96, so epochs 96-99 form a short ninth block that returns to the first language.
# Output goes to data/bilingual_blocked_e/L1/10_states/8_blocks/.
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1, n_blocks=8)

In [None]:
# As above with n_blocks=16: the language switches every 100 // 16 = 6 epochs.
# NOTE: 16 * 6 = 96, so epochs 96-99 form a short seventeenth block that returns to the first language.
# Output goes to data/bilingual_blocked_e/L1/10_states/16_blocks/.
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1, n_blocks=16)