# ME bias in the single agent setting

## Set up the training

In [None]:
# IMPORTS 

import tensorflow as tf
import numpy as np
import sys
import os
import pickle

sys.path.append("/Users/xxAlv1Nxx/Documents/01a Masters/05 CS/CS 428B Probabilistic Models of Cognition – Language/Project/pragmatic_agents_me_bias")
from RSA_communication_agents import RSASpeaker0, RSAListener0, RSASpeaker1, RSAListener1

In [None]:
# GENERATE INPUT MESSAGES 

def generate_messages(size, n, ME=1):
    """ Generates the message input for the listener
        inputs: 
            size - number of data points
            n - number of states and messages in the system
            ME - number of messages that are withheld from training, here 1 or 2
                 (1 withholds the last message, 2 withholds the first and the last message)
        outputs: 
            data - training data, float32 array of shape (size, n, n); for every data point
                   the row of the selected message is set to ones, all other rows are zero
            labels - training labels, one-hot encoding (depth n) of the selected message index
        raises:
            ValueError - if ME is neither 1 nor 2
    """

    # reject unsupported values early; otherwise no selection would ever be drawn
    if ME not in (1, 2):
        raise ValueError("ME must be 1 or 2, got " + repr(ME))

    # ME=1 samples from 0 .. n-2, ME=2 samples from 1 .. n-2
    candidates = np.arange(ME - 1, n - 1)
    selection = np.random.choice(candidates, size=size)

    # mark the selected message row of every data point in a single assignment
    data = np.zeros((size, n, n))
    data[np.arange(size), selection, :] = 1.
    labels = tf.one_hot(selection, depth=n)

    return np.float32(data), labels

In [None]:
# GENERATE BILINGUAL INPUT MESSAGES 

def generate_bilingual_messages(size, n, ME=1, n_blocks=0, lang_edit=0):
    """ Generates the bilingual message input for the listener
        inputs: 
            size - number of data points
            n - number of states in the system (n_messages = 2*n); message indices 0 .. n-1
                form the first language and n .. 2n-1 the second one
            ME - number of messages per language that are withheld from training, here 1 or 2
                 (1 withholds the last message, 2 withholds the first and the last message)
            n_blocks - number of language blocks (>=1), or not blocked (0, i.e. random);
                       consecutive blocks of size // n_blocks data points alternate between
                       the first and the second language
            lang_edit - multiple of n that is added to every selected message index, e.g.
                        n_blocks=1 with lang_edit=1 yields data of the second language only;
                        indices pushed outside 0 .. 2n-1 leave the data rows at zero
        outputs: 
            data - training data, float32 array of shape (size, 2*n, n); the row of the
                   selected message is set to ones
            labels - training labels, one-hot encoding (depth n) of the message index modulo n
        raises:
            ValueError - if ME is neither 1 nor 2, or if n_blocks is negative
    """

    # reject unsupported values early; otherwise no selection would ever be drawn
    if ME not in (1, 2):
        raise ValueError("ME must be 1 or 2, got " + repr(ME))
    if n_blocks < 0:
        raise ValueError("n_blocks must be >= 0, got " + repr(n_blocks))

    data = np.zeros((size, 2*n, n))

    # candidate indices within one language: ME=1 gives 0 .. n-2, ME=2 gives 1 .. n-2
    base = np.arange(ME - 1, n - 1)

    if n_blocks == 0:
        # not blocked: draw from the candidates of both languages at once
        selection = np.random.choice(np.concatenate((base, base + n)), size=size)
    else:
        selection = np.random.choice(base, size=size)
        # switch language every other block; at least one data point per block so that
        # n_blocks > size does not cause a division by zero
        divisor = max(1, size // n_blocks)
        in_second_language = (np.arange(size) // divisor) % 2 == 1
        selection = np.where(in_second_language, selection + n, selection)

    selection = selection + n * lang_edit

    # comparing against every valid row keeps out-of-range indices (possible via lang_edit) harmless
    for i in range(2*n):
        data[selection == i, i, :] = 1.
    labels = tf.one_hot(tf.math.floormod(selection, n), depth=n)

    return np.float32(data), labels

In [None]:
# TRAIN A LISTENER AND SAVE THE PARAMETERS AS WELL AS THE REWARDS AND THE LEXICA

def run_listener(n=3, reasoning=0, n_epochs=50, ME=1, learning_rate=0.001, runs=100):
    """ Trains the listener on randomly interleaved bilingual messages in a single agent Lewis game.
        inputs: 
            n - number of states; the lexicon has 2*n messages
            reasoning - reasoning level of the listener, 0 for literal, 1 for pragmatic 
            n_epochs - number of training epochs per run
            ME - number of messages that are left out during training, here 1 or 2
                 (passed on to generate_bilingual_messages)
            learning_rate - learning rate for the Adam optimizer
            runs - number of listeners that are trained one after another
        By default 100 agents are trained and all their lexica and rewards for every epoch are saved (agent-wise)
        below data/bilingual/.
        raises:
            ValueError - if reasoning is neither 0 nor 1
    """

    # fail before anything is written to disk; any other value would leave the listener undefined
    if reasoning not in (0, 1):
        raise ValueError("reasoning must be 0 (literal) or 1 (pragmatic), got " + repr(reasoning))

    # setup the training and save the parameters 

    n_states = n                      # number of states 
    n_messages = 2*n                  # number of messages (twice the number of states)
    batch_size = 32                   # batch size
    datasize = 1000                   # number of training data points
    batches = datasize // batch_size  # number of full batches per epoch (the remainder is not used)

    init_mean = 0.5                   # mean for initialization of lexicon entries
    init_std = 0.01                   # std for initialization of lexicon entries
    alpha = 5.                        # alpha of the pragmatic listener, also part of the file names

    constraint = tf.keras.constraints.NonNeg() # constrains the lexica to have entries >= 0

    filename = 'data/bilingual/L' + str(reasoning) + '/' + str(n) + '_states/'
    os.makedirs(filename, exist_ok=True)

    param_dict = {"n_states": n_states, "n_messages": n_messages, "n_epochs": n_epochs, "batch_size": batch_size,
              "datasize": datasize, "initializer_truncated_normal_mean_std": [init_mean, init_std], 
              "learning_rate": learning_rate, "runs": runs, "constraint": constraint}    
    with open(filename + 'param_dict.pickle', 'wb') as handle:
        pickle.dump(param_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # common prefix of the files that are saved for every run
    filename_full = filename + 'L' + str(reasoning) + '_' + str(ME) + 'missing_'
    if reasoning == 1:
        filename_full += str(alpha) + 'alpha_'

    # run the listeners

    for run in range(1, runs+1):

        # create data 
        data, labels = generate_bilingual_messages(datasize, n, ME=ME)
        data = tf.convert_to_tensor(data)
        lexica = []
        all_rewards = []
        mean_reward = float('nan')    # only reported as NaN if n_epochs is 0

        # create listener
        initial_lexicon = tf.initializers.TruncatedNormal(mean=init_mean, stddev=init_std)([n_states, n_messages])
        lexicon = tf.Variable(initial_lexicon,
                              name="lexicon", 
                              trainable=True, 
                              dtype=tf.float32,
                              constraint=constraint)

        if reasoning == 0: 
            listener = RSAListener0(n_states, n_messages, lexicon)
        else: 
            listener = RSAListener1(n_states, n_messages, lexicon, alpha=alpha)

        listener.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.CategoricalCrossentropy())

        # train
        for _ in range(n_epochs):
            average_reward = []

            # reshuffle the whole data set at the start of every epoch
            shuffle_indices = np.random.permutation(datasize)
            data = tf.gather(data, shuffle_indices)
            labels = tf.gather(labels, shuffle_indices)

            for j in range(batches):
                batch = slice(j*batch_size, (j+1)*batch_size)
                data_batch = data[batch]
                labels_batch = labels[batch]

                _, actions = listener.get_states(data_batch)

                # per data point: dot product of label and action
                rewards = tf.einsum('ij,ij->i', labels_batch, actions)
                average_reward.append(np.mean(rewards))

                # RL: 
                # Note that we implemented REINFORCE with a work-around using categorical crossentropy. 
                # This can be done by setting the labels to the agent's actions, and weighting the loss
                # function by the rewards. 
                listener.train_on_batch(data_batch, actions, sample_weight=rewards)

            mean_reward = np.mean(average_reward)
            all_rewards.append(mean_reward)
            lexica.append(np.copy(listener.lexicon[:]))

        print('run ' + str(run), 'average reward ' +str(ME)+ ' ' + str(mean_reward))

        # save rewards and lexica 
        np.save(filename_full + 'lexicon_run' + str(run), lexica)
        np.save(filename_full + 'rewards_run' + str(run), all_rewards)

In [None]:
# TRAIN A LISTENER ON LANGUAGE-BLOCKED DATA (TWO BLOCKS WITHIN EVERY EPOCH) AND SAVE THE PARAMETERS AS WELL AS THE REWARDS AND THE LEXICA

def run_listener_blocked(n=3, reasoning=0, n_epochs=50, ME=1, learning_rate=0.001, runs=100):
    """ Trains the listener on language-blocked bilingual messages in a single agent Lewis game.
        The data is generated with two language blocks and every epoch presents the first
        half of the data before the second half.
        inputs: 
            n - number of states; the lexicon has 2*n messages
            reasoning - reasoning level of the listener, 0 for literal, 1 for pragmatic 
            n_epochs - number of training epochs per run
            ME - number of messages that are left out during training, here 1 or 2
                 (passed on to generate_bilingual_messages)
            learning_rate - learning rate for the Adam optimizer
            runs - number of listeners that are trained one after another
        By default 100 agents are trained and all their lexica and rewards for every epoch are saved (agent-wise)
        below data/bilingual_blocked/.
        raises:
            ValueError - if reasoning is neither 0 nor 1
    """

    # fail before anything is written to disk; any other value would leave the listener undefined
    if reasoning not in (0, 1):
        raise ValueError("reasoning must be 0 (literal) or 1 (pragmatic), got " + repr(reasoning))

    # setup the training and save the parameters 

    n_states = n                      # number of states 
    n_messages = 2*n                  # number of messages (twice the number of states)
    batch_size = 32                   # batch size
    datasize = 1000                   # number of training data points
    batches = datasize // batch_size  # number of full batches per epoch (the remainder is not used)
    half = datasize // 2              # size of one language block

    init_mean = 0.5                   # mean for initialization of lexicon entries
    init_std = 0.01                   # std for initialization of lexicon entries
    alpha = 5.                        # alpha of the pragmatic listener, also part of the file names

    constraint = tf.keras.constraints.NonNeg() # constrains the lexica to have entries >= 0

    filename = 'data/bilingual_blocked/L' + str(reasoning) + '/' + str(n) + '_states/'
    os.makedirs(filename, exist_ok=True)

    param_dict = {"n_states": n_states, "n_messages": n_messages, "n_epochs": n_epochs, "batch_size": batch_size,
              "datasize": datasize, "initializer_truncated_normal_mean_std": [init_mean, init_std], 
              "learning_rate": learning_rate, "runs": runs, "constraint": constraint}    
    with open(filename + 'param_dict.pickle', 'wb') as handle:
        pickle.dump(param_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # common prefix of the files that are saved for every run
    filename_full = filename + 'L' + str(reasoning) + '_' + str(ME) + 'missing_'
    if reasoning == 1:
        filename_full += str(alpha) + 'alpha_'

    # run the listeners

    for run in range(1, runs+1):

        # create data: two blocks, i.e. the second half uses the other language
        data, labels = generate_bilingual_messages(datasize, n, ME=ME, n_blocks=2)
        data = tf.convert_to_tensor(data)
        lexica = []
        all_rewards = []
        mean_reward = float('nan')    # only reported as NaN if n_epochs is 0

        # create listener
        initial_lexicon = tf.initializers.TruncatedNormal(mean=init_mean, stddev=init_std)([n_states, n_messages])
        lexicon = tf.Variable(initial_lexicon,
                              name="lexicon", 
                              trainable=True, 
                              dtype=tf.float32,
                              constraint=constraint)

        if reasoning == 0: 
            listener = RSAListener0(n_states, n_messages, lexicon)
        else: 
            listener = RSAListener1(n_states, n_messages, lexicon, alpha=alpha)

        listener.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.CategoricalCrossentropy())

        # train
        for _ in range(n_epochs):
            average_reward = []

            # shuffle each half on its own so that the two language blocks stay separated
            first_half = np.random.permutation(half)
            second_half = np.random.permutation(half) + half
            shuffle_indices = np.concatenate((first_half, second_half))
            data = tf.gather(data, shuffle_indices)
            labels = tf.gather(labels, shuffle_indices)

            for j in range(batches):
                batch = slice(j*batch_size, (j+1)*batch_size)
                data_batch = data[batch]
                labels_batch = labels[batch]

                _, actions = listener.get_states(data_batch)

                # per data point: dot product of label and action
                rewards = tf.einsum('ij,ij->i', labels_batch, actions)
                average_reward.append(np.mean(rewards))

                # RL: 
                # Note that we implemented REINFORCE with a work-around using categorical crossentropy. 
                # This can be done by setting the labels to the agent's actions, and weighting the loss
                # function by the rewards. 
                listener.train_on_batch(data_batch, actions, sample_weight=rewards)

            mean_reward = np.mean(average_reward)
            all_rewards.append(mean_reward)
            lexica.append(np.copy(listener.lexicon[:]))

        print('run ' + str(run), 'average reward ' +str(ME)+ ' ' + str(mean_reward))

        # save rewards and lexica 
        np.save(filename_full + 'lexicon_run' + str(run), lexica)
        np.save(filename_full + 'rewards_run' + str(run), all_rewards)

In [19]:
# TRAIN A LISTENER WITH THE LANGUAGE BLOCKED ACROSS EPOCHS AND SAVE THE PARAMETERS AS WELL AS THE REWARDS AND THE LEXICA

def run_listener_blocked_e(n=3, reasoning=0, n_epochs=50, ME=1, learning_rate=0.001, runs=100, n_blocks=2):
    """ Trains the listener in a single agent Lewis game with the language blocked across epochs:
        every epoch uses data of one language only and the language alternates between
        consecutive blocks of epochs.
        inputs: 
            n - number of states; the lexicon has 2*n messages
            reasoning - reasoning level of the listener, 0 for literal, 1 for pragmatic 
            n_epochs - number of training epochs per run
            ME - number of messages that are left out during training, here 1 or 2
                 (passed on to generate_bilingual_messages)
            learning_rate - learning rate for the Adam optimizer
            runs - number of listeners that are trained one after another
            n_blocks - number of language blocks (>=1) the n_epochs epochs are divided into;
                       each block spans n_epochs // n_blocks epochs (at least one)
        By default 100 agents are trained and all their lexica and rewards for every epoch are saved (agent-wise)
        below data/bilingual_blocked_e/.
        raises:
            ValueError - if reasoning is neither 0 nor 1, or if n_blocks is smaller than 1
    """

    # fail before anything is written to disk
    if reasoning not in (0, 1):
        raise ValueError("reasoning must be 0 (literal) or 1 (pragmatic), got " + repr(reasoning))
    if n_blocks < 1:
        raise ValueError("n_blocks must be >= 1, got " + repr(n_blocks))

    # setup the training and save the parameters 

    n_states = n                      # number of states 
    n_messages = 2*n                  # number of messages (twice the number of states)
    batch_size = 32                   # batch size
    datasize = 1000                   # number of training data points
    batches = datasize // batch_size  # number of full batches per epoch (the remainder is not used)

    init_mean = 0.5                   # mean for initialization of lexicon entries
    init_std = 0.01                   # std for initialization of lexicon entries
    alpha = 5.                        # alpha of the pragmatic listener, also part of the file names

    # Number of epochs per language block. This is derived from n_epochs (it used to be a
    # hard-coded 100), so that e.g. n_epochs=50 with n_blocks=2 switches after 25 epochs.
    # The lower bound of 1 avoids a division by zero when n_blocks exceeds n_epochs.
    epochs_per_block = max(1, n_epochs // n_blocks)

    constraint = tf.keras.constraints.NonNeg() # constrains the lexica to have entries >= 0

    filename = 'data/bilingual_blocked_e/L' + str(reasoning) + '/' + str(n) + '_states/' + str(n_blocks) + '_blocks/'
    os.makedirs(filename, exist_ok=True)

    param_dict = {"n_states": n_states, "n_messages": n_messages, "n_epochs": n_epochs, "batch_size": batch_size,
              "datasize": datasize, "initializer_truncated_normal_mean_std": [init_mean, init_std], 
              "learning_rate": learning_rate, "runs": runs, "constraint": constraint}    
    with open(filename + 'param_dict.pickle', 'wb') as handle:
        pickle.dump(param_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # common prefix of the files that are saved for every run
    filename_full = filename + 'L' + str(reasoning) + '_' + str(ME) + 'missing_'
    if reasoning == 1:
        filename_full += str(alpha) + 'alpha_'

    # run the listeners

    for run in range(1, runs+1):

        # create data: one full data set per language (lang_edit=1 shifts the message indices by n)
        data1, labels1 = generate_bilingual_messages(datasize, n, ME=ME, n_blocks=1)
        data1 = tf.convert_to_tensor(data1)
        data2, labels2 = generate_bilingual_messages(datasize, n, ME=ME, n_blocks=1, lang_edit=1)
        data2 = tf.convert_to_tensor(data2)
        data_all = [data1, data2]
        labels_all = [labels1, labels2]
        lexica = []
        all_rewards = []
        mean_reward = float('nan')    # only reported as NaN if n_epochs is 0

        # create listener
        initial_lexicon = tf.initializers.TruncatedNormal(mean=init_mean, stddev=init_std)([n_states, n_messages])
        lexicon = tf.Variable(initial_lexicon,
                              name="lexicon", 
                              trainable=True, 
                              dtype=tf.float32,
                              constraint=constraint)

        if reasoning == 0: 
            listener = RSAListener0(n_states, n_messages, lexicon)
        else: 
            listener = RSAListener1(n_states, n_messages, lexicon, alpha=alpha)

        listener.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss=tf.keras.losses.CategoricalCrossentropy())

        # train
        for epoch in range(n_epochs):
            # even blocks use the first data set, odd blocks the second one
            language = (epoch // epochs_per_block) % 2
            average_reward = []

            shuffle_indices = np.random.permutation(datasize)
            data = tf.gather(data_all[language], shuffle_indices)
            labels = tf.gather(labels_all[language], shuffle_indices)

            for j in range(batches):
                batch = slice(j*batch_size, (j+1)*batch_size)
                data_batch = data[batch]
                labels_batch = labels[batch]

                _, actions = listener.get_states(data_batch)

                # per data point: dot product of label and action
                rewards = tf.einsum('ij,ij->i', labels_batch, actions)
                average_reward.append(np.mean(rewards))

                # RL: 
                # Note that we implemented REINFORCE with a work-around using categorical crossentropy. 
                # This can be done by setting the labels to the agent's actions, and weighting the loss
                # function by the rewards. 
                listener.train_on_batch(data_batch, actions, sample_weight=rewards)

            mean_reward = np.mean(average_reward)
            all_rewards.append(mean_reward)
            lexica.append(np.copy(listener.lexicon[:]))

        print('run ' + str(run), 'average reward ' +str(ME)+ ' ' + str(mean_reward))

        # save rewards and lexica 
        np.save(filename_full + 'lexicon_run' + str(run), lexica)
        np.save(filename_full + 'rewards_run' + str(run), all_rewards)

## Run the training 

Run the training for different agent types, different numbers of states (3 and 10), and different numbers of states being withheld from the training (1 and 2).

### Literal listener

In [None]:
# learning_rate is not passed here, so the function default (0.001) applies,
# which matches the value given explicitly in the other cells
run_listener(n=3, reasoning=0, n_epochs=50, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=3, reasoning=0, n_epochs=50, ME=2)

In [None]:
run_listener(learning_rate=0.001, n=10, reasoning=0, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=10, reasoning=0, n_epochs=100, ME=2)

### Pragmatic listener

In [None]:
run_listener(learning_rate=0.001, n=3, reasoning=1, n_epochs=50, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=3, reasoning=1, n_epochs=50, ME=2)

In [None]:
# first cell of a sweep over the number of states (n = 5 ... 100) for the
# pragmatic listener with ME=1; only n changes between these cells
run_listener(learning_rate=0.001, n=5, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=6, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=7, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=8, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=9, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=15, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=20, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=25, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=30, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=40, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=50, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=60, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=80, reasoning=1, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=100, reasoning=1, n_epochs=100, ME=1)

In [None]:
# NOTE: reasoning=0 trains a literal listener, although this cell is located
# in the pragmatic-listener section
run_listener(learning_rate=0.001, n=100, reasoning=0, n_epochs=100, ME=1)

In [None]:
run_listener(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=2)

In [None]:
# blocking within every epoch: the data is generated with two language blocks
# and each half is shuffled separately; results are saved below data/bilingual_blocked/
run_listener_blocked(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1)

In [17]:
# blocking across epochs with the default n_blocks=2: the training language
# changes once, after 50 of the 100 epochs
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1)

run 1 average reward 1 0.7883065


2021-11-29 22:43:54.557097: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-29 22:43:54.577182: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 2 average reward 1 0.8820565
run 3 average reward 1 0.7903226
run 4 average reward 1 0.89314514
run 5 average reward 1 0.78629035


2021-11-29 22:45:10.696438: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-29 22:45:10.728398: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-29 22:45:10.871493: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-29 22:45:36.920198: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 6 average reward 1 0.7731855


2021-11-29 22:46:33.429052: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 7 average reward 1 0.875
run 8 average reward 1 0.88709676
run 9 average reward 1 0.6935484
run 10 average reward 1 0.8810484
run 11 average reward 1 0.859879


2021-11-29 22:48:47.581157: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


KeyboardInterrupt: 

In [20]:
# n_blocks=4 with 100 epochs: the training language changes every 25 epochs
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1, n_blocks=4)

2021-11-29 22:54:57.320393: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 1 average reward 1 0.9979839
run 2 average reward 1 0.9989919
run 3 average reward 1 0.8820565
run 4 average reward 1 0.86995965
run 5 average reward 1 0.9979839
run 6 average reward 1 1.0
run 7 average reward 1 0.99596775
run 8 average reward 1 0.9989919
run 9 average reward 1 0.9939516
run 10 average reward 1 0.9969758
run 11 average reward 1 0.9989919
run 12 average reward 1 0.8991935


2021-11-29 23:00:49.276340: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 13 average reward 1 0.9979839
run 14 average reward 1 0.9979839
run 15 average reward 1 1.0
run 16 average reward 1 0.9969758
run 17 average reward 1 0.8921371
run 18 average reward 1 0.9989919
run 19 average reward 1 0.9092742
run 20 average reward 1 0.99495965
run 21 average reward 1 0.9979839
run 22 average reward 1 0.8921371
run 23 average reward 1 0.891129
run 24 average reward 1 0.9969758
run 25 average reward 1 1.0
run 26 average reward 1 0.9969758
run 27 average reward 1 0.9989919
run 28 average reward 1 0.9979839
run 29 average reward 1 0.9979839


2021-11-29 23:06:04.184626: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 30 average reward 1 1.0
run 31 average reward 1 0.9989919
run 32 average reward 1 0.9979839
run 33 average reward 1 0.9989919
run 34 average reward 1 1.0
run 35 average reward 1 1.0
run 36 average reward 1 0.89616936
run 37 average reward 1 1.0
run 38 average reward 1 0.9989919
run 39 average reward 1 1.0
run 40 average reward 1 0.99495965
run 41 average reward 1 1.0
run 42 average reward 1 0.9989919
run 43 average reward 1 0.9969758
run 44 average reward 1 0.9989919
run 45 average reward 1 0.9979839
run 46 average reward 1 0.9969758
run 47 average reward 1 0.9989919


2021-11-29 23:12:13.754643: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 48 average reward 1 0.99495965
run 49 average reward 1 1.0
run 50 average reward 1 0.9989919
run 51 average reward 1 0.9979839
run 52 average reward 1 0.9969758
run 53 average reward 1 0.99596775
run 54 average reward 1 0.9979839
run 55 average reward 1 0.9989919
run 56 average reward 1 1.0
run 57 average reward 1 0.8951613
run 58 average reward 1 0.99596775
run 59 average reward 1 0.88508064
run 60 average reward 1 1.0
run 61 average reward 1 0.9969758
run 62 average reward 1 0.88709676
run 63 average reward 1 0.9989919
run 64 average reward 1 1.0
run 65 average reward 1 0.8981855
run 66 average reward 1 0.87903225
run 67 average reward 1 0.9969758
run 68 average reward 1 0.9969758
run 69 average reward 1 0.9969758
run 70 average reward 1 0.9979839
run 71 average reward 1 1.0
run 72 average reward 1 1.0
run 73 average reward 1 1.0
run 74 average reward 1 0.88810486
run 75 average reward 1 0.9052419
run 76 average reward 1 0.9989919
run 77 average reward 1 0.8891129
run 78 average 

In [21]:
# n_blocks=8 with 100 epochs: the training language changes every 12 epochs
# (100 // 8), so the last 4 epochs form an additional, shorter block
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1, n_blocks=8)

2021-11-30 00:02:58.600664: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 1 average reward 1 1.0


2021-11-30 00:03:24.216494: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 2 average reward 1 0.9989919
run 3 average reward 1 1.0


2021-11-30 00:03:55.896905: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-30 00:03:56.106370: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 4 average reward 1 0.9979839
run 5 average reward 1 0.9979839
run 6 average reward 1 0.9989919
run 7 average reward 1 0.9989919
run 8 average reward 1 1.0
run 9 average reward 1 0.9979839
run 10 average reward 1 1.0
run 11 average reward 1 0.9979839
run 12 average reward 1 0.99596775
run 13 average reward 1 0.9989919
run 14 average reward 1 0.9969758
run 15 average reward 1 0.9989919
run 16 average reward 1 0.9989919
run 17 average reward 1 0.9989919
run 18 average reward 1 0.9989919
run 19 average reward 1 0.9989919


2021-11-30 00:09:48.173834: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-30 00:09:48.201019: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-30 00:09:48.221691: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 20 average reward 1 1.0
run 21 average reward 1 0.9989919
run 22 average reward 1 1.0
run 23 average reward 1 0.9989919
run 24 average reward 1 0.9989919
run 25 average reward 1 1.0
run 26 average reward 1 0.9989919
run 27 average reward 1 1.0
run 28 average reward 1 0.9989919
run 29 average reward 1 0.9979839
run 30 average reward 1 0.9989919
run 31 average reward 1 0.9989919
run 32 average reward 1 0.9969758
run 33 average reward 1 0.99495965
run 34 average reward 1 0.9979839
run 35 average reward 1 1.0
run 36 average reward 1 1.0
run 37 average reward 1 0.9979839
run 38 average reward 1 0.99596775
run 39 average reward 1 0.9989919
run 40 average reward 1 0.9979839
run 41 average reward 1 0.9969758
run 42 average reward 1 0.9969758


2021-11-30 00:17:05.171536: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 43 average reward 1 0.9979839
run 44 average reward 1 0.9989919


2021-11-30 00:17:30.584705: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 45 average reward 1 1.0


2021-11-30 00:17:58.788839: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-30 00:17:58.861413: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-30 00:17:59.158870: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled
2021-11-30 00:17:59.251767: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 46 average reward 1 0.9979839
run 47 average reward 1 0.9979839
run 48 average reward 1 0.9989919
run 49 average reward 1 1.0
run 50 average reward 1 0.9969758
run 51 average reward 1 1.0
run 52 average reward 1 0.9989919
run 53 average reward 1 0.9989919
run 54 average reward 1 0.9989919
run 55 average reward 1 1.0
run 56 average reward 1 0.9989919
run 57 average reward 1 0.9979839
run 58 average reward 1 1.0
run 59 average reward 1 0.9979839
run 60 average reward 1 1.0
run 61 average reward 1 0.9979839


2021-11-30 00:22:47.394643: W tensorflow/core/data/root_dataset.cc:167] Optimization loop failed: Cancelled: Operation was cancelled


run 62 average reward 1 1.0
run 63 average reward 1 0.9989919
run 64 average reward 1 0.9989919
run 65 average reward 1 1.0
run 66 average reward 1 0.9969758
run 67 average reward 1 1.0
run 68 average reward 1 1.0
run 69 average reward 1 0.9979839
run 70 average reward 1 0.9989919
run 71 average reward 1 0.9989919
run 72 average reward 1 0.9989919
run 73 average reward 1 0.9989919
run 74 average reward 1 1.0
run 75 average reward 1 1.0
run 76 average reward 1 1.0
run 77 average reward 1 0.9979839
run 78 average reward 1 0.9979839
run 79 average reward 1 1.0
run 80 average reward 1 0.9989919
run 81 average reward 1 0.9979839
run 82 average reward 1 0.9969758
run 83 average reward 1 0.9989919
run 84 average reward 1 0.9979839
run 85 average reward 1 0.9969758
run 86 average reward 1 0.9979839
run 87 average reward 1 0.9989919
run 88 average reward 1 0.9989919
run 89 average reward 1 0.9979839
run 90 average reward 1 0.9969758
run 91 average reward 1 1.0
run 92 average reward 1 1.0
run 93

In [22]:
# n_blocks=16 with 100 epochs: the training language changes every 6 epochs
# (100 // 16), so the last 4 epochs form an additional, shorter block
run_listener_blocked_e(learning_rate=0.001, n=10, reasoning=1, n_epochs=100, ME=1, n_blocks=16)

run 1 average reward 1 0.9989919
run 2 average reward 1 0.9989919
run 3 average reward 1 0.9979839
run 4 average reward 1 1.0
run 5 average reward 1 1.0
run 6 average reward 1 0.9989919
run 7 average reward 1 1.0
run 8 average reward 1 1.0
run 9 average reward 1 0.9989919
run 10 average reward 1 1.0
run 11 average reward 1 1.0
run 12 average reward 1 0.9989919
run 13 average reward 1 1.0
run 14 average reward 1 1.0
run 15 average reward 1 1.0
run 16 average reward 1 1.0
run 17 average reward 1 1.0
run 18 average reward 1 0.9979839
run 19 average reward 1 1.0
run 20 average reward 1 0.9989919
run 21 average reward 1 0.9989919
run 22 average reward 1 1.0
run 23 average reward 1 0.9989919
run 24 average reward 1 1.0
run 25 average reward 1 0.9969758
run 26 average reward 1 0.9979839
run 27 average reward 1 1.0
run 28 average reward 1 0.9989919
run 29 average reward 1 0.9989919
run 30 average reward 1 1.0
run 31 average reward 1 1.0
run 32 average reward 1 1.0
run 33 average reward 1 1.0
r