In [None]:
#@title
from numba import jit, cuda, roc
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output
import math
import itertools
%matplotlib inline

This notebook attempts to reproduce the results of the paper <a href = "https://arxiv.org/abs/1906.07865">Adapting Behavior via Intrinsic Reward: a Survey and Empirical Study.</a>
The code base is taken from http://jair.adaptingbehavior.com, and I have modified it to reproduce Experiment 2 of that paper.

## Data Generation

In [None]:
#@title

def generate_action_data(phase_one = 50000, num_steps=150000):
    """Sample the four two-phase target signals used in the experiment.

    Every target behaves one way for the first ``phase_one`` steps and another
    way for the remaining ``num_steps - phase_one`` steps:

    * target 1: distractor (unit Gaussian noise) -> drifter (random walk)
    * target 2: drifter -> distractor centred on the drifter's last value
    * target 3: constant drawn from U(-50, 50) -> drifter
    * target 4: distractor -> constant (U(-50, 50) draw plus last noise value)

    Random-walk increments are standard normal draws scaled by 0.1.

    Returns:
        Array of shape ``(4, num_steps)``, one row per target.
    """
    phase_two = num_steps - phase_one
    last = phase_one - 1
    drift_scale = 0.1

    # The global NumPy RNG is consumed in a fixed order below; keep that order
    # so that seeded runs stay reproducible.

    # Target 1: noise, then a random walk starting from the last noise sample.
    noise_a = np.random.randn(phase_one)
    walk_a = np.cumsum(np.random.randn(phase_two) * drift_scale) + noise_a[last]

    # Target 2: random walk from zero, then noise around where the walk ended.
    walk_b = np.cumsum(np.random.randn(phase_one) * drift_scale)
    noise_b = np.random.randn(phase_two) + walk_b[last]

    # Target 3: flat level, then a random walk starting from that level.
    level_c = np.random.uniform(-50, 50)
    flat_c = np.full(phase_one, level_c)
    walk_c = np.cumsum(np.random.randn(phase_two) * drift_scale) + level_c

    # Target 4: noise, then a flat level offset by the last noise sample.
    noise_d = np.random.randn(phase_one)
    level_d = np.random.uniform(-50, 50) + noise_d[last]
    flat_d = np.full(phase_two, level_d)

    return np.vstack((
        np.concatenate((noise_a, walk_a)),
        np.concatenate((walk_b, noise_b)),
        np.concatenate((flat_c, walk_c)),
        np.concatenate((noise_d, flat_d)),
    ))

In [None]:
#@title
# Draw one freshly sampled realisation of the four targets.
# `data` is stored as (steps, targets), so column i holds target i.
data = generate_action_data().T
colors = ["red", "green", "blue", "black"]
for i, line_color in enumerate(colors):
    plt.plot(data[:, i], color=line_color)

target_labels = ["Target 1: Distractor -> Drifter",
                 "Target 2: Drifter -> Distractor",
                 "Target 3: Constant -> Drifter",
                 "Target 4: Distractor -> Constant"]
plt.legend(target_labels)

## Setup

In [None]:
#@title

class Autostep():
    """Scalar step-size adaptation state (alpha, n, h) with meta step-size k.

    ``alpha`` is the adapted step-size (capped at 0.5), ``n`` a running
    normaliser for ``|delta * h|`` and ``h`` a trace of recent updates.
    """

    def __init__(self, alpha=1.0, n=1.0, h=0.0, k=0.1):
        self.alpha, self.n, self.h, self.k = alpha, n, h, k

    def update(self, delta):
        """Adapt the step-size from prediction error ``delta``.

        Returns:
            Tuple ``(alpha, h, n)`` after the update.
        """
        # Normaliser: jump up to |delta * h| at once, otherwise decay slowly.
        trace_mag = np.abs(delta * self.h)
        decayed_n = self.n + (1.0/10000.0) * self.alpha * (trace_mag - self.n)
        self.n = max(trace_mag, decayed_n)

        # Multiplicative step-size change, clipped from above at 0.5.
        grown_alpha = self.alpha * np.exp(self.k * delta * self.h / self.n)
        self.alpha = min(grown_alpha, 0.5)

        # The trace uses the freshly adapted step-size.
        self.h = self.h * (1 - self.alpha) + (self.alpha * delta)
        return self.alpha, self.h, self.n

In [None]:
#@title
class MovingAverage:
    """Running average of a scalar stream.

    ``method="window"`` gives an exponentially weighted average whose rate is
    ``beta / one``, where ``one`` is itself an exponential average of the
    constant 1 (this removes the bias from starting at zero).
    ``method="time"`` gives the plain incremental mean of all samples.
    Any other ``method`` leaves ``avg`` untouched.
    """

    def __init__(self, beta=0.001, one=0.0, method="window"):
        self.method = method

        # state of the windowed average
        self.beta = beta
        self.one = one

        # sample counter of the time-based average
        self.t = 0

        self.avg = 0.0

    def update(self, x):
        """Fold sample ``x`` into the average and return the new value."""
        mode = self.method

        if mode == "window":
            self.one = ((1.0 - self.beta) * self.one) + (self.beta * 1.0)
            rate = self.beta / self.one
            self.avg = ((1.0 - rate) * self.avg) + (rate * x)
        elif mode == "time":
            self.t += 1
            self.avg += (x - self.avg) / self.t

        return self.avg

class IncrementalVariance:
    """Running variance built from two ``MovingAverage`` trackers.

    One tracker follows the mean of the samples; the other averages the
    products ``(x - mean_before) * (x - mean_after)``.
    """

    def __init__(self, beta=0.001, one=0.0, method="window"):
        averager_args = dict(beta=beta, one=one, method=method)
        self.sample_mean = MovingAverage(**averager_args)
        self.moving_variance = MovingAverage(**averager_args)
        self.type = method
        self.differences = []

    def update(self, x):
        """Fold sample ``x`` in and return the current variance estimate."""
        mean_before = self.sample_mean.avg
        mean_after = self.sample_mean.update(x)

        # The product uses the mean from before and after seeing x.
        return self.moving_variance.update((x - mean_before) * (x - mean_after))


In [None]:
#@title
def softmax(x):
    """Return the softmax distribution over the preferences ``x``.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``np.exp`` from overflowing to ``inf``
    (and the ratio from becoming ``nan``) when preferences grow large.

    Args:
        x: 1-D array-like of real-valued preferences.

    Returns:
        Float array of the same length whose entries sum to 1. Empty input
        yields an empty array.
    """
    scores = np.asarray(x, dtype=float)
    if scores.size == 0:
        return scores
    weights = np.exp(scores - np.max(scores))
    return weights / np.sum(weights)

def select_action(probs):
    """Sample an action index from the distribution ``probs``.

    The number of actions is taken from ``len(probs)`` rather than fixed at
    four, so the helper works for any number of arms.

    Args:
        probs: 1-D sequence of probabilities summing to 1.

    Returns:
        Integer index in ``range(len(probs))`` drawn with NumPy's global RNG.
    """
    return np.random.choice(len(probs), p=probs)

## Intrinsic reward functions: each returns the reward and the prediction error

In [None]:
#@title
class IntrinsicReward:
    """Base class: a scalar predictor trained by a delta rule.

    Subclasses set ``stepsize`` / ``is_introspective`` (and an
    ``autostep_learner`` when introspective) and turn the prediction error
    into an intrinsic reward.
    """

    def __init__(self):
        self.prediction = 0.0
        self.stepsize = 0.0
        self.autostep_learner = None
        self.is_introspective = None

    def update_prediction(self, x):
        """Move the prediction towards ``x``.

        Returns:
            The prediction error ``x - prediction`` measured before the update.
        """
        delta = x - self.prediction

        if not self.is_introspective:
            # fixed step-size learner
            rate = self.stepsize
        else:
            # adapt the step-size first, then learn with the adapted value
            self.autostep_learner.update(delta)
            rate = self.autostep_learner.alpha

        self.prediction += rate * delta
        return delta

class ErrorDerivative(IntrinsicReward):
    """Reward = |difference between two windowed means of the squared error|.

    Both windows hold ``eta + 1`` squared errors; the older window ends
    ``tau`` steps before the newest sample, the recent window ends at it.

    Keyword Args:
        eta: window length minus one (default 1000).
        tau: lag, in steps, between the two windows (default 100).
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.03125, is_introspective=False, **kwargs):
        # Initialise the base attributes so ``autostep_learner`` always exists
        # (None for fixed step-size learners), as in the sibling reward classes.
        super().__init__()
        self.stepsize = stepsize
        self.is_introspective = is_introspective
        # eta/tau are used as slice bounds, so they have to be integers.
        self.eta = int(kwargs.get("eta", 1000))
        self.tau = int(kwargs.get("tau", 100))
        self.squared_errors = []
        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``.

        The reward is 0.0 until more than ``eta + tau + 1`` errors are stored.
        """
        delta = self.update_prediction(x)

        self.squared_errors.append(np.square(delta))
        length = len(self.squared_errors)
        window = self.eta + 1

        if length > (self.eta + self.tau + 1):
            # Explicit end indices: a negative-index slice ``[-k:-tau]`` would
            # be empty for tau == 0 instead of covering the last k items.
            older_end = length - self.tau
            first = (1.0 / window) * np.sum(self.squared_errors[older_end - window:older_end])
            second = (1.0 / window) * np.sum(self.squared_errors[length - window:])
            return np.abs(first - second), delta

        return 0.0, delta

class ExpectedError(IntrinsicReward):
    """Reward = |exponentially weighted average of the prediction error|.

    Keyword Args:
        beta: rate of the error average (default 0.01).
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.0625, is_introspective=False, **kwargs):
        # Initialise the base attributes so ``autostep_learner`` always exists
        # (None for fixed step-size learners), as in the sibling reward classes.
        super().__init__()
        self.stepsize = stepsize
        self.is_introspective = is_introspective
        self.one = 0.0
        self.beta = kwargs.get("beta", 0.01)
        self.delta_bar = 0.0
        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``."""
        delta = self.update_prediction(x)

        # ``one`` averages the constant 1 at the same rate, which removes the
        # bias from starting the average at zero: the first sample gets rate 1.
        self.one = (1.0 - self.beta) * self.one + self.beta * 1.0
        rate = self.beta / self.one
        self.delta_bar = (1.0 - rate) * self.delta_bar + rate * delta

        return np.abs(self.delta_bar), delta

class StepSizeChange(IntrinsicReward):
    """Reward = |change of the adapted step-size between consecutive updates|.

    Keyword Args:
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        # Store the requested step-size. Without this the base-class default
        # of 0.0 stays in place and a fixed step-size learner never moves its
        # prediction.
        self.stepsize = stepsize
        self.previous_stepsize = 0.0
        self.is_introspective = is_introspective
        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``."""
        delta = self.update_prediction(x)

        if self.is_introspective:
            stepsize = self.autostep_learner.alpha
        else:
            # A fixed step-size never changes, so the reward stays at zero.
            stepsize = 0.0

        stepsize_change = np.abs(self.previous_stepsize - stepsize)
        self.previous_stepsize = stepsize

        return stepsize_change, delta

class ErrorReduction(IntrinsicReward):
    """Reward = decrease of the absolute prediction error since the last update.

    Keyword Args:
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        # Store the requested step-size. Without this the base-class default
        # of 0.0 stays in place and a fixed step-size learner never moves its
        # prediction.
        self.stepsize = stepsize
        self.previous_error = 0.0
        self.is_introspective = is_introspective
        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``.

        The reward is positive when the error shrank and negative when it grew.
        """
        delta = self.update_prediction(x)
        im_reward = np.abs(self.previous_error) - np.abs(delta)
        self.previous_error = delta

        return im_reward, delta

class SquaredError(IntrinsicReward):
    """Reward = squared prediction error.

    Keyword Args:
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        self.k = kwargs.get("k", 0.1)
        self.is_introspective = is_introspective
        self.stepsize = stepsize

        if is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``."""
        error = self.update_prediction(x)
        return np.square(error), error

class BayesianSurprise(IntrinsicReward):
    """Reward = KL-style divergence between successive Gaussian beliefs.

    The belief is summarised by the current prediction (mean) and a running
    variance estimate that is floored at 0.01.

    Keyword Args:
        beta: rate of the variance estimate (default 0.01).
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        self.stepsize = stepsize
        self.is_introspective = is_introspective

        self.previous_prediction = 0.0
        self.previous_variance = 0.0
        self.eta = 1.0
        self.b = kwargs.get("beta", 0.01)

        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``.

        The reward is 0.0 on the first call, when no previous variance exists.
        """
        delta = self.update_prediction(x)

        # Use the rate configured in __init__; it used to be overwritten with
        # a hard-coded 0.01 here, which silently ignored the ``beta`` argument.
        self.eta = (1 - self.b) * self.eta + 1.0

        welfords_var = (1 - self.b) * self.previous_variance + self.b * (x - self.previous_prediction) * (x - self.prediction)
        # The floor keeps the variance strictly positive for the divisions below.
        var = max(welfords_var / self.eta, 10**-2)

        if self.previous_variance != 0.0:
            # KL(previous || current) for Gaussians:
            #   log(sigma_new / sigma_old) + (var_old + (mu_old - mu_new)^2) / (2 var_new) - 1/2
            # The whole numerator is divided by 2 * var; previously only the
            # squared mean shift was, due to operator precedence.
            # NOTE(review): the log term uses base 2 while the other terms are
            # in nats -- confirm against the paper which base is intended.
            first = 0.5 * np.log2(var / self.previous_variance)
            second = (self.previous_variance + np.square(self.previous_prediction - self.prediction)) / (2 * var)
            im_reward = first + second - 0.5
        else:
            im_reward = 0.0

        self.previous_variance = var
        self.previous_prediction = self.prediction

        return im_reward, delta

class UDE(IntrinsicReward):
    """Unexpected Demon Error: |windowed mean error| / (std of the error + eps).

    Keyword Args:
        beta: rate of the windowed error average (default 0.01).
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        self.stepsize = stepsize
        self.is_introspective = is_introspective

        self.one = 0.0
        self.beta = kwargs.get("beta", 0.01)
        self.delta_bar = 0.0

        self.count = 0

        # Running means over all samples of delta^2 and delta.
        self.delta_squared_avg = MovingAverage(method="time")
        self.delta_avg = MovingAverage(method="time")

        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``.

        The reward is 0.0 for the first 30 updates while the statistics settle.
        """
        self.count += 1
        delta = self.update_prediction(x)

        # Bias-corrected exponential average of the recent error.
        self.one = (1.0 - self.beta) * self.one + self.beta * 1.0
        rate = self.beta / self.one
        self.delta_bar = (1.0 - rate) * self.delta_bar + rate * delta

        s_bar = self.delta_squared_avg.update(np.square(delta))
        x_bar = self.delta_avg.update(delta)

        if self.count > 30:
            # s_bar and x_bar are already means, so the variance of the error
            # is mean(delta^2) - mean(delta)^2. They used to be divided by the
            # sample count a second time. Clamp at zero against rounding.
            error_variance = max(s_bar - np.square(x_bar), 0.0)
            im_reward = np.abs(self.delta_bar / (np.sqrt(error_variance) + 0.00001))
        else:
            im_reward = 0.0

        return im_reward, delta

class UncertaintyChange(IntrinsicReward):
    """Reward = |change in the running variance of the prediction|.

    Keyword Args:
        beta_var: rate of the variance estimate (default 0.001).
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        self.stepsize = stepsize
        self.is_introspective = is_introspective

        self.beta = kwargs.get("beta_var", 0.001)

        self.variance = IncrementalVariance(beta=self.beta)
        self.previous_variance = 0.0

        self.k = kwargs.get("k", 0.1)

        if self.is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``."""
        delta = self.update_prediction(x)

        var = self.variance.update(self.prediction)
        im_reward = np.abs(self.previous_variance - var)
        # Remember the variance for the next step. A misspelt attribute name
        # used to leave ``previous_variance`` at 0.0 forever, so the reward was
        # the variance itself instead of its change.
        self.previous_variance = var

        return im_reward, delta

class VarianceOfPrediction(IntrinsicReward):
    """Reward = running variance of the learner's prediction.

    Keyword Args:
        beta_var: rate of the variance estimate (default 0.001).
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        self.is_introspective = is_introspective
        self.stepsize = stepsize

        self.beta = kwargs.get("beta_var", 0.001)
        self.variance = IncrementalVariance(beta=self.beta)

        self.k = kwargs.get("k", 0.1)
        if is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``."""
        error = self.update_prediction(x)
        # The variance tracks the prediction after it has been updated.
        prediction_variance = self.variance.update(self.prediction)
        return prediction_variance, error

class WeightChange(IntrinsicReward):
    """Reward = |step-size * prediction error|.

    Keyword Args:
        k: Autostep meta step-size, used only when introspective (default 0.1).
    """

    def __init__(self, stepsize=0.1, is_introspective=False, **kwargs):
        super().__init__()
        self.is_introspective = is_introspective
        self.stepsize = stepsize
        self.k = kwargs.get("k", 0.1)

        if is_introspective:
            self.autostep_learner = Autostep(k=self.k)

    def update(self, x):
        """Learn from ``x``; return ``(reward, prediction_error)``."""
        error = self.update_prediction(x)

        # Introspective learners report the step-size Autostep has just adapted.
        if self.is_introspective:
            self.stepsize = self.autostep_learner.alpha

        weight_step = self.stepsize * error
        return np.abs(weight_step), error

## Gradient Bandit Algorithm

In [None]:
#@title
class BehaviorAgent:
    """Gradient-bandit agent over four targets, driven by intrinsic reward.

    One intrinsic-reward learner is kept per action. Action preferences are
    updated with the reward of the chosen action relative to a running
    average-reward baseline.

    Keyword Args:
        beta_r: rate of the average-reward baseline (default 0.1).
    """

    def __init__(self, behavior_stepsize, intrinsic_reward, is_introspective, *args, **kwargs):
        # one independent learner per action
        self.intrinsic_rewards = []
        for _ in range(4):
            learner = choose_reward(intrinsic_reward, is_introspective)[0]
            self.intrinsic_rewards.append(learner)

        self.behavior_stepsize = behavior_stepsize
        self.beta = kwargs.get("beta_r", 0.1)

        # running baseline and its bias-correction term
        self.one = 0.0
        self.r_bar = 0.0

        self.action_values = np.zeros(4)
        self.probs = np.ones(4) / 4
        self.delta = 0

    def calculate_probs(self):
        """Refresh ``probs`` from the current action preferences."""
        self.probs = softmax(self.action_values)

    def choose_action(self):
        """Refresh the policy and sample an action index from it."""
        self.calculate_probs()
        return select_action(self.probs)

    def update(self, x, chosen_action):
        """Learn from observation ``x`` of the target selected by ``chosen_action``."""
        learner = self.intrinsic_rewards[chosen_action]
        im_reward, self.delta = learner.update(x)

        # bias-corrected exponential average of the reward
        self.one = (1.0 - self.beta) * self.one + self.beta * 1.0
        rate = self.beta / self.one
        self.r_bar = (1.0 - rate) * self.r_bar + rate * im_reward

        # gradient-bandit step: raise the chosen preference and lower the
        # others in proportion to the baseline-corrected reward
        chosen_mask = (np.arange(4) == chosen_action) * 1.0
        scaled_advantage = self.behavior_stepsize * (im_reward - self.r_bar)
        self.action_values += scaled_advantage * (chosen_mask - self.probs)

In [None]:
#@title
def run_experiment(num_steps=150000, is_introspective=True,
                   record_stepsizes=False,
                   intrinsic_reward="Error Derivative",
                   phase_one=50000):
    """Run one bandit experiment on freshly generated targets.

    Args:
        num_steps: total number of time steps.
        is_introspective: use Autostep learners instead of fixed step-sizes.
        record_stepsizes: also record every learner's step-size at each step.
        intrinsic_reward: name of the reward, as understood by ``choose_reward``.
        phase_one: length of the first phase of the generated targets.

    Returns:
        Tuple ``(probs, stepsizes, rmse)``: action probabilities per step with
        shape ``(num_steps, 4)``, recorded step-sizes (an empty array unless
        ``record_stepsizes``), and the root mean squared prediction error of
        the chosen learners.
    """
    data = generate_action_data(phase_one, num_steps)
    track_probs = []
    track_stepsizes = []
    track_error = []

    # Look the configuration up once; the learner built by this call is
    # discarded because BehaviorAgent creates its own learners.
    _, behavior_stepsize, agent_kwargs = choose_reward(intrinsic_reward, is_introspective)
    behavior_agent = BehaviorAgent(behavior_stepsize,
                                   intrinsic_reward,
                                   is_introspective,
                                   **agent_kwargs)

    for step_data in data.T:
        chosen_action = behavior_agent.choose_action()

        behavior_agent.update(step_data[chosen_action], chosen_action)

        track_probs.append(np.copy(behavior_agent.probs))
        track_error.append(np.square(behavior_agent.delta))

        if record_stepsizes:
            # Fixed step-size learners have no Autostep object; record their
            # constant step-size instead of failing on ``None.alpha``.
            current = []
            for learner in behavior_agent.intrinsic_rewards:
                autostep = getattr(learner, "autostep_learner", None)
                current.append(learner.stepsize if autostep is None else autostep.alpha)
            track_stepsizes.append(np.array(current))

    return np.array(track_probs), np.array(track_stepsizes), np.sqrt(np.mean(track_error))



## Try Different hyper-parameters

In [None]:
# Candidate values for every hyper-parameter, plus the (initially empty)
# containers that the following cells fill with concrete combinations.

# powers of two: 2^-2 ... 2^-8 and 2^-2 ... 2^-7
alpha = [math.pow(2, -exponent) for exponent in range(2, 9)]
alpha_p = [math.pow(2, -exponent) for exponent in range(2, 8)]

# powers of ten: 10^-1 ... 10^-6 (two independent lists)
beta = [math.pow(10, -exponent) for exponent in range(1, 7)]
beta_r = [math.pow(10, -exponent) for exponent in range(1, 7)]

# window length / lag candidates
eta = [1, 5, 10, 25, 100, 1000]
tau = [1, 5, 10, 25, 100]

# reward name -> list of combinations, and the map from the introspective
# flag to one of those two dictionaries
options_non_introspective = {}
options_introspective = {}
combination_options = {}

## Non-introspective learning hyper-parameters

In [None]:
# Hyper-parameter combinations for the NON-introspective (fixed step-size) learners.
#
# For every reward name, `options` lists the candidate values per
# hyper-parameter and itertools.product expands them into tuples.
# choose_reward() reads those tuples by position when is_introspective is False:
#   [0] alpha_p -> step-size handed to the reward learner
#   [1] alpha   -> second element of choose_reward's return value
#                  (run_experiment passes it to BehaviorAgent as behavior_stepsize)
#   [2] beta_r  -> "beta_r" entry of the returned kwargs
#   [3:]        -> reward-specific entries (beta / beta_var / eta, tau)
#
# NOTE: every list below is an empty placeholder. A product that involves an
# empty list is empty, so a reward gets no combinations until its lists are
# filled in. Each assignment also rebinds the module-level name, replacing
# the list of the same name built in the previous cell.

# Index of the combination currently in use; read as a global by choose_reward() and run().
track_options=0
is_introspective = False

#UDE
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r, beta]
options_non_introspective["UDE"] = list(itertools.product(*options))

#Weight Change
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r]
options_non_introspective["Weight Change"] = list(itertools.product(*options))

#Bayesian Surprise
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r, beta]
options_non_introspective["Bayesian Surprise"] = list(itertools.product(*options))

#Squared Error
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r]
options_non_introspective["Squared Error"] = list(itertools.product(*options))


#Expected Error
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r, beta]
options_non_introspective["Expected Error"] = list(itertools.product(*options))

#Variance of Prediction
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_var=[] # put your desired list of values here
options = [alpha_p, alpha, beta_r, beta_var]
options_non_introspective["Variance of Prediction"] = list(itertools.product(*options))


#Error Derivative
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
eta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
tau=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r, eta, tau]
options_non_introspective["Error Derivative"] = list(itertools.product(*options))

#Step-size Change
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r]
options_non_introspective["Step-size Change"] = list(itertools.product(*options))

#Error Reduction
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha_p, alpha, beta_r]
options_non_introspective["Error Reduction"] = list(itertools.product(*options))

#Uncertainty Change
alpha_p=[] # put your desired list of values here or use the corresponding variable from the previous cell.
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_var=[] # put your desired list of values here
options = [alpha_p, alpha, beta_r, beta_var]
options_non_introspective["Uncertainty Change"] = list(itertools.product(*options))


# Register the grids under the key False, i.e. the non-introspective setting.
combination_options[is_introspective] = options_non_introspective

## Introspective learning hyper-parameters

In [None]:
# Hyper-parameter combinations for the INTROSPECTIVE (Autostep) learners.
#
# For every reward name, `options` lists the candidate values per
# hyper-parameter and itertools.product expands them into tuples.
# choose_reward() reads those tuples by position when is_introspective is True:
#   [0] alpha  -> second element of choose_reward's return value
#                 (run_experiment passes it to BehaviorAgent as behavior_stepsize)
#   [1] beta_r -> "beta_r" entry of the returned kwargs
#   [2:]       -> reward-specific entries (beta / beta_var / eta, tau)
# There is no alpha_p here: choose_reward() builds introspective learners
# with stepsize=1.0.
#
# NOTE: every list below is an empty placeholder. A product that involves an
# empty list is empty, so a reward gets no combinations until its lists are
# filled in. Each assignment also rebinds the module-level name, replacing
# any list of the same name defined in an earlier cell.

# Index of the combination currently in use; read as a global by choose_reward() and run().
track_options=0
is_introspective = True

#UDE
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r, beta]
options_introspective["UDE"] = list(itertools.product(*options))

#Weight Change
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r]
options_introspective["Weight Change"] = list(itertools.product(*options))

#Bayesian Surprise
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r, beta]
options_introspective["Bayesian Surprise"] = list(itertools.product(*options))

#Squared Error
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r]
options_introspective["Squared Error"] = list(itertools.product(*options))


#Expected Error
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r, beta]
options_introspective["Expected Error"] = list(itertools.product(*options))

#Variance of Prediction
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_var=[] # put your desired list of values here
options = [alpha, beta_r, beta_var]
options_introspective["Variance of Prediction"] = list(itertools.product(*options))


#Error Derivative
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
eta=[] # put your desired list of values here or use the corresponding variable from the previous cell.
tau=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r, eta, tau]
options_introspective["Error Derivative"] = list(itertools.product(*options))

#Step-size Change
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r]
options_introspective["Step-size Change"] = list(itertools.product(*options))

#Error Reduction
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
options = [alpha, beta_r]
options_introspective["Error Reduction"] = list(itertools.product(*options))

#Uncertainty Change
alpha=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_r=[] # put your desired list of values here or use the corresponding variable from the previous cell.
beta_var=[] # put your desired list of values here
options = [alpha, beta_r, beta_var]
options_introspective["Uncertainty Change"] = list(itertools.product(*options))


# Register the grids under the key True, i.e. the introspective setting.
combination_options[is_introspective] = options_introspective

In [None]:
#@title
def choose_reward(name, is_introspective):
    """Build the learner and settings for the current hyper-parameter combination.

    The combination is read from the globals
    ``combination_options[is_introspective][name][track_options]``. Its layout is

    * introspective:     ``(alpha, beta_r, *extras)``; the learner's step-size
      is fixed at 1.0 and Autostep's meta step-size ``k`` is added
    * non-introspective: ``(alpha_p, alpha, beta_r, *extras)``; ``alpha_p`` is
      the learner's step-size

    where ``extras`` are the reward-specific values named in the table below.

    The reward-specific values and ``k`` are passed to the learner's
    constructor as well as returned. Previously they were only returned, and
    the one consumer of that dictionary (``BehaviorAgent``) reads just
    ``beta_r``, so the learners always ran with their built-in defaults.

    Args:
        name: reward name, e.g. ``"UDE"`` or ``"Error Derivative"``.
        is_introspective: selects the Autostep or the fixed step-size setup.

    Returns:
        Tuple ``(learner, behavior_stepsize, kwargs)`` where ``kwargs`` holds
        ``beta_r``, the reward-specific values and, when introspective, ``k``.

    Raises:
        ValueError: if ``name`` is not a known reward (this used to return
            ``None``, which failed later with an unrelated ``TypeError``).
        IndexError: if the stored combination has too few entries.
    """
    # name -> (learner class, names of the extras following beta_r, Autostep k)
    registry = {
        "UDE": (UDE, ("beta",), 0.01),
        "Weight Change": (WeightChange, (), 0.1),
        "Bayesian Surprise": (BayesianSurprise, ("beta",), 0.1),
        "Squared Error": (SquaredError, (), 0.1),
        "Expected Error": (ExpectedError, ("beta",), 0.1),
        "Variance of Prediction": (VarianceOfPrediction, ("beta_var",), 0.1),
        "Error Derivative": (ErrorDerivative, ("eta", "tau"), 0.1),
        "Step-size Change": (StepSizeChange, (), 0.1),
        "Error Reduction": (ErrorReduction, (), 0.1),
        "Uncertainty Change": (UncertaintyChange, ("beta_var",), 0.1),
    }
    if name not in registry:
        raise ValueError("Unknown intrinsic reward: %r" % (name,))
    learner_cls, extra_names, autostep_k = registry[name]

    combo = combination_options[is_introspective][name][track_options]

    if is_introspective:
        learner_stepsize = 1.0
        behavior_stepsize = combo[0]
        first_setting = 1
    else:
        learner_stepsize = combo[0]
        behavior_stepsize = combo[1]
        first_setting = 2

    # Key order matches the historical dictionaries: beta_r, extras, then k.
    settings = {"beta_r": combo[first_setting]}
    for offset, key in enumerate(extra_names, start=first_setting + 1):
        settings[key] = combo[offset]
    if is_introspective:
        settings["k"] = autostep_k

    # beta_r belongs to the behaviour agent; everything else configures the learner.
    learner_settings = {key: value for key, value in settings.items() if key != "beta_r"}
    learner = learner_cls(stepsize=learner_stepsize,
                          is_introspective=bool(is_introspective),
                          **learner_settings)

    return (learner, behavior_stepsize, settings)

## Find the best hyper-parameters

In [None]:
#@title
def run(reward, choose_introspective = True, choose_runs = 2):
    """Evaluate one hyper-parameter candidate for `reward` and plot the averaged outcome.

    The candidate being evaluated is
    ``combination_options[choose_introspective][reward][track_options]``;
    ``track_options`` is a notebook-level global that the calling cell advances
    between calls.

    Args:
        reward: Intrinsic-reward name. Used as a key into ``combination_options``,
            forwarded to ``run_experiment`` and shown as the figure title.
        choose_introspective: Selects the sub-table of ``combination_options`` and
            is forwarded to ``run_experiment`` as ``is_introspective``.
        choose_runs: Number of ``run_experiment`` calls whose results are averaged.

    Returns:
        None. Prints the candidate tuple and the mean of element 2 of every run
        result (labelled RMSE), and draws one figure with four curves.
    """
    output = widgets.Output()

    candidate = combination_options[choose_introspective][reward][track_options]
    print("Running for "+ str(candidate))

    results = [
        run_experiment(intrinsic_reward=reward, is_introspective=choose_introspective)
        for _ in range(choose_runs)
    ]
    # Element 0 of each result is what gets plotted; element 2 is reported as RMSE.
    probs_per_run = [result[0] for result in results]
    rmse_per_run = [result[2] for result in results]

    print("*****RMSE**********")
    print(np.mean(rmse_per_run))
    print("******************")

    # Average over runs (axis 0 is the run axis of the stacked results).
    action_probs = np.mean(probs_per_run, axis=0)
    curve_colors = ["red", "green", "blue", "black"]

    # NOTE(review): `output` is never passed to display() inside this function.
    with output:
        fig = plt.figure(figsize=(4, 5))
        fig.suptitle(reward, fontsize=10, y=0.92)
        # Curve order according to the (disabled) legend:
        #   red   - Target 1: Distractor -> Drifter
        #   green - Target 2: Drifter -> Distractor
        #   blue  - Target 3: Constant -> Drifter
        #   black - Target 4: Distractor -> Constant
        for target, color in enumerate(curve_colors):
            plt.plot(action_probs.T[target], color=color)
        plt.ylim([0.0, 1.0])
        plt.xticks(np.arange(0, 150000 + 1, 50000))

## Non-Introspective Learning: The sample code below covers two rewards. To use it for any other reward, just change the value of `reward`.

In [None]:
# Sweep every hyper-parameter candidate for this reward (non-introspective).
# run() reads the global `track_options` to pick the candidate, so the counter
# is advanced here after each call.
reward = "Bayesian Surprise"
choose_introspective = False
track_options = 0
for i in range(len(combination_options[choose_introspective][reward])):
    run(reward=reward, choose_introspective=choose_introspective, choose_runs=200)
    track_options += 1

In [None]:
# Sweep every hyper-parameter candidate for this reward (non-introspective).
# run() reads the global `track_options` to pick the candidate, so the counter
# is advanced here after each call.
reward = "Variance of Prediction"
choose_introspective = False
track_options = 0
for i in range(len(combination_options[choose_introspective][reward])):
    run(reward=reward, choose_introspective=choose_introspective, choose_runs=200)
    track_options += 1

## Introspective Learning

In [None]:
# Sweep every hyper-parameter candidate for this reward with the INTROSPECTIVE
# learner. run() reads the global `track_options` to pick the candidate, so the
# counter is advanced here after each call.
track_options = 0
choose_introspective = True
reward = "Bayesian Surprise"
for i in range(len(combination_options[choose_introspective][reward])):
    # The flag passed to run() must be the same one used for the loop bound;
    # otherwise the loop is sized from the introspective table while run()
    # indexes (and trains) the non-introspective one.
    run(choose_introspective = choose_introspective, choose_runs = 200, reward = reward)
    track_options = track_options + 1

## Plot Figure with all hyper-parameters

In [None]:
#@title
def run(reward, choose_introspective = True, choose_runs = 2):
    """Evaluate one hyper-parameter candidate and title the plot with that candidate.

    The candidate is
    ``combination_options[choose_introspective][reward][track_options]``;
    ``track_options`` is a notebook-level global advanced by the calling cell.

    Args:
        reward: Intrinsic-reward name; key into ``combination_options`` and
            forwarded to ``run_experiment``.
        choose_introspective: Selects the sub-table of ``combination_options`` and
            is forwarded to ``run_experiment`` as ``is_introspective``.
        choose_runs: Number of ``run_experiment`` calls whose results are averaged.

    Returns:
        None. Prints the candidate tuple and the mean of element 2 of every run
        result (labelled RMSE), and draws one figure with four curves whose
        title is the candidate tuple.
    """
    output = widgets.Output()

    candidate = combination_options[choose_introspective][reward][track_options]
    print("Running for "+ str(candidate))

    results = [
        run_experiment(intrinsic_reward=reward, is_introspective=choose_introspective)
        for _ in range(choose_runs)
    ]
    # Element 0 of each result is what gets plotted; element 2 is reported as RMSE.
    probs_per_run = [result[0] for result in results]
    rmse_per_run = [result[2] for result in results]

    print("*****RMSE**********")
    print(np.mean(rmse_per_run))
    print("******************")

    # Average over runs (axis 0 is the run axis of the stacked results).
    action_probs = np.mean(probs_per_run, axis=0)
    curve_colors = ["red", "green", "blue", "black"]

    # NOTE(review): `output` is never passed to display() inside this function.
    with output:
        fig = plt.figure(figsize=(4, 5))
        # Show the hyper-parameters on the figure
        fig.suptitle(str(candidate), fontsize=10, y=0.92)
        # Curve order according to the (disabled) legend:
        #   red   - Target 1: Distractor -> Drifter
        #   green - Target 2: Drifter -> Distractor
        #   blue  - Target 3: Constant -> Drifter
        #   black - Target 4: Distractor -> Constant
        for target, color in enumerate(curve_colors):
            plt.plot(action_probs.T[target], color=color)
        plt.ylim([0.0, 1.0])
        plt.xticks(np.arange(0, 150000 + 1, 50000))
        
# Plot one figure per hyper-parameter candidate (only 5 runs each here).
# run() reads the global `track_options` to pick the candidate, so the counter
# is advanced here after each call.
reward = "Variance of Prediction"
choose_introspective = False
track_options = 0
for i in range(len(combination_options[choose_introspective][reward])):
    run(reward=reward, choose_introspective=choose_introspective, choose_runs=5)
    track_options += 1

## Best Hyper-parameters

I was not able to check all combinations of the hyper-parameters due to limited computational resources and time constraints.

In [None]:
#@title
def choose_reward(name, is_introspective):
    """Return the hand-picked ("best") configuration for one intrinsic reward.

    Args:
        name: Intrinsic-reward name; one of "Error Derivative", "Expected Error",
            "Step-size Change", "Error Reduction", "Squared Error",
            "Bayesian Surprise", "UDE", "Uncertainty Change",
            "Variance of Prediction" or "Weight Change".
        is_introspective: Truthy selects the introspective configuration
            (reward object built with ``stepsize=1.0`` and an extra ``"k"``
            entry); falsy selects the non-introspective configuration.

    Returns:
        A 3-tuple ``(reward_object, step_size, extra_params)``:
        the reward object constructed with ``stepsize`` and ``is_introspective``;
        a float (presumably the behaviour step-size, called ``alpha`` in the
        sweep cells - TODO confirm against ``run_experiment``); and a dict of
        reward-specific parameters (always ``"beta_r"``, plus ``"eta"``/``"tau"``,
        ``"beta"``, ``"beta_var"`` and ``"k"`` depending on the reward).

    Raises:
        ValueError: If ``name`` is not one of the rewards listed above. Without
            this the function would fall off the end and return ``None``, which
            only surfaces later as an unrelated-looking error in the caller.
    """
    # NOTE(review): some values look like truncated powers of two
    # (0.003906 ~ 2**-8, 0.007812 ~ 2**-7); they are kept exactly as given.
    if name == "Error Derivative":
        if is_introspective:
            return (ErrorDerivative(stepsize=1.0,
                                    is_introspective=True),
                    0.003906,
                    {"beta_r": 0.1,
                     "eta": 1000,
                     "tau": 1,
                     "k": 0.1})
        else:  # (0.03125, 0.007812, 0.001)
            return (ErrorDerivative(stepsize=0.03125,
                                    is_introspective=False),
                    0.007812,
                    {"beta_r": 0.001,
                     "eta": 1000,
                     "tau": 100})

    elif name == "Expected Error":
        if is_introspective:
            return (ExpectedError(stepsize=1.0,
                                  is_introspective=True),
                    0.0078125,
                    {"beta_r": 0.1,
                     "beta": 0.001,
                     "k": 0.1})
        else:
            return (ExpectedError(stepsize=0.0625,
                                  is_introspective=False),
                    0.0078125,
                    {"beta_r": 0.1,
                     "beta": 0.1})

    elif name == "Step-size Change":
        if is_introspective:
            return (StepSizeChange(stepsize=1.0,
                                   is_introspective=True),
                    0.25,
                    {"beta_r": 0.0001,
                     "k": 0.1})
        else:
            return (StepSizeChange(stepsize=0.1,
                                   is_introspective=False),
                    0.1,
                    {"beta_r": 0.001})

    elif name == "Error Reduction":
        if is_introspective:
            return (ErrorReduction(stepsize=1.0,
                                   is_introspective=True),
                    0.003906,
                    {"beta_r": 0.1,
                     "k": 0.1})
        else:
            return (ErrorReduction(stepsize=0.125,
                                   is_introspective=False),
                    0.003906,
                    {"beta_r": 0.1})

    elif name == "Squared Error":
        if is_introspective:
            return (SquaredError(stepsize=1.0,
                                 is_introspective=True),
                    0.0078125,
                    {"beta_r": 0.1,
                     "k": 0.1})
        else:
            return (SquaredError(stepsize=0.015625,
                                 is_introspective=False),
                    0.003906,
                    {"beta_r": 0.01})

    elif name == "Bayesian Surprise":
        if is_introspective:
            return (BayesianSurprise(stepsize=1.0,
                                     is_introspective=True),
                    0.003906,
                    {"beta_r": 0.1,
                     "beta": 0.01,
                     "k": 0.1})
        else:
            return (BayesianSurprise(stepsize=0.03125,
                                     is_introspective=False),
                    0.003906,
                    {"beta_r": 0.1,
                     "beta": 0.01})

    elif name == "UDE":
        if is_introspective:
            return (UDE(stepsize=1.0,
                        is_introspective=True),
                    0.25,
                    {"beta_r": 0.1,
                     "beta": 0.001,
                     "k": 0.01})
        else:
            return (UDE(stepsize=0.0625,
                        is_introspective=False),
                    0.125,
                    {"beta_r": 0.1,
                     "beta": 0.1})

    elif name == "Uncertainty Change":
        if is_introspective:
            return (UncertaintyChange(stepsize=1.0,
                                      is_introspective=True),
                    0.0078125,
                    {"beta_r": 0.0001,
                     "beta_var": 0.1,
                     "k": 0.1})
        else:
            return (UncertaintyChange(stepsize=0.25,
                                      is_introspective=False),
                    0.00390625,
                    {"beta_r": 0.01,
                     "beta_var": 0.01})

    elif name == "Variance of Prediction":
        if is_introspective:
            return (VarianceOfPrediction(stepsize=1.0,
                                         is_introspective=True),
                    0.003906,
                    {"beta_r": 0.0001,
                     "beta_var": 0.1,
                     "k": 0.1})
        else:
            return (VarianceOfPrediction(stepsize=0.25,
                                         is_introspective=False),
                    0.25,
                    {"beta_r": 0.00001,
                     "beta_var": 0.000001
                    })

    elif name == "Weight Change":
        if is_introspective:
            return (WeightChange(stepsize=1.0,
                                 is_introspective=True),
                    0.007812,
                    {"beta_r": 0.01,
                     "k": 0.1})
        else:
            return (WeightChange(stepsize=0.0625,
                                 is_introspective=False),
                    0.003906,
                    {"beta_r": 0.1})

    # Every known name returns above; reaching this point means a typo or an
    # unsupported reward, so fail loudly instead of returning None.
    raise ValueError("Unknown intrinsic reward name: " + repr(name))

## Run non-introspective and introspective learners for all intrinsic rewards

In [None]:
#@title
def run(choose_introspective = True, choose_runs = 2):
    """Run every intrinsic reward in turn and plot its run-averaged curves.

    Args:
        choose_introspective: Forwarded to ``run_experiment`` as ``is_introspective``.
        choose_runs: Number of ``run_experiment`` calls averaged per reward.

    Returns:
        None. For each reward name, prints a progress line and draws one figure
        (titled with the reward name) containing four curves taken from
        element 0 of the ``run_experiment`` results, averaged over runs.
    """
    output = widgets.Output()
    reward_names = [
        "Error Derivative", "Expected Error", "Step-size Change",
        "Error Reduction", "Squared Error", "Bayesian Surprise",
        "UDE", "Uncertainty Change", "Variance of Prediction", "Weight Change",
    ]
    curve_colors = ["red", "green", "blue", "black"]

    for reward in reward_names:
        print("Running for "+ reward)
        # Keep only element 0 of each result; that is what gets plotted.
        probs_per_run = [
            run_experiment(intrinsic_reward=reward, is_introspective=choose_introspective)[0]
            for _ in range(choose_runs)
        ]
        action_probs = np.mean(probs_per_run, axis=0)

        # NOTE(review): `output` is never passed to display() inside this function.
        with output:
            fig = plt.figure(figsize=(4, 5))
            fig.suptitle(reward, fontsize=10, y=0.92)
            # Curve order according to the (disabled) legend:
            #   red   - Target 1: Distractor -> Drifter
            #   green - Target 2: Drifter -> Distractor
            #   blue  - Target 3: Constant -> Drifter
            #   black - Target 4: Distractor -> Constant
            for target, color in enumerate(curve_colors):
                plt.plot(action_probs.T[target], color=color)
            plt.ylim([0.0, 1.0])
            plt.xticks(np.arange(0, 150000 + 1, 50000))

In [None]:
# All rewards, non-introspective learners, averaged over 200 runs each.
run(choose_runs=200, choose_introspective=False)

In [None]:
# All rewards, introspective learners, averaged over 200 runs each.
run(choose_runs=200, choose_introspective=True)

## Test Individual Intrinsic Reward

In [None]:
#@title
def run():
    """Show a small widget form and run the selected experiment on button click.

    Builds a reward dropdown, a run-count box, an "introspective" checkbox and a
    "Run Experiment" button, displays them, and registers a click handler. The
    handler averages element 0 of ``run_experiment`` over the requested number
    of runs and plots it as four curves titled with the reward name.
    """
    output = widgets.Output()
    run_button = widgets.Button(description="Run Experiment")

    reward_picker = widgets.Dropdown(
        options=["Error Derivative", "Expected Error", "Step-size Change",
                 "Error Reduction", "Squared Error", "Bayesian Surprise",
                 "UDE", "Uncertainty Change", "Variance of Prediction", "Weight Change"],
        value="Weight Change",
        description="Choose Reward:",
        disabled=False,
        style={'description_width': 'initial'},
    )
    runs_box = widgets.IntText(
        description="Choose number of runs:",
        value=2,
        style={'description_width': 'initial'},
    )
    introspective_box = widgets.Checkbox(value=True, description="Use Introspective Learners?")

    def reset_displays():
        # Wipe the current output and show the form controls again, in order.
        clear_output()
        for control in (reward_picker, runs_box, introspective_box, run_button):
            display(control)

    def on_button_click(_clicked):
        reset_displays()
        reward = reward_picker.value
        print("Running")
        # Keep only element 0 of each result; that is what gets plotted.
        probs_per_run = [
            run_experiment(intrinsic_reward=reward, is_introspective=introspective_box.value)[0]
            for _ in range(runs_box.value)
        ]
        action_probs = np.mean(probs_per_run, axis=0)

        # NOTE(review): `output` is never passed to display() inside this function.
        with output:
            fig = plt.figure(figsize=(4, 5))
            fig.suptitle(reward, fontsize=10, y=0.92)

            # Curve order according to the (disabled) legend:
            #   red   - Target 1: Distractor -> Drifter
            #   green - Target 2: Drifter -> Distractor
            #   blue  - Target 3: Constant -> Drifter
            #   black - Target 4: Distractor -> Constant
            for target, color in enumerate(["red", "green", "blue", "black"]):
                plt.plot(action_probs.T[target], color=color)
            plt.ylim([0.0, 1.0])
            plt.xticks(np.arange(0, 150000 + 1, 50000))

    run_button.on_click(on_button_click)
    reset_displays()

In [None]:
# Show the interactive form; the experiment itself starts on button click.
run()

## Effect of Step-size parameter alpha_p

In [None]:
def run(reward, choose_introspective = True, choose_runs = 2):
    """Evaluate one candidate and title the plot with its first entry only.

    The candidate is
    ``combination_options[choose_introspective][reward][track_options]``;
    ``track_options`` is a notebook-level global advanced by the calling cell.
    Only ``candidate[0]`` is shown in the title (the cells that use this
    variant vary just that first entry, which they call ``alpha_p``).

    Args:
        reward: Intrinsic-reward name; key into ``combination_options`` and
            forwarded to ``run_experiment``.
        choose_introspective: Selects the sub-table of ``combination_options`` and
            is forwarded to ``run_experiment`` as ``is_introspective``.
        choose_runs: Number of ``run_experiment`` calls whose results are averaged.

    Returns:
        None. Prints the candidate tuple and the mean of element 2 of every run
        result (labelled RMSE), and draws one figure with four curves.
    """
    output = widgets.Output()

    candidate = combination_options[choose_introspective][reward][track_options]
    print("Running for "+ str(candidate))

    results = [
        run_experiment(intrinsic_reward=reward, is_introspective=choose_introspective)
        for _ in range(choose_runs)
    ]
    # Element 0 of each result is what gets plotted; element 2 is reported as RMSE.
    probs_per_run = [result[0] for result in results]
    rmse_per_run = [result[2] for result in results]

    print("*****RMSE**********")
    print(np.mean(rmse_per_run))
    print("******************")

    # Average over runs (axis 0 is the run axis of the stacked results).
    action_probs = np.mean(probs_per_run, axis=0)
    curve_colors = ["red", "green", "blue", "black"]

    # NOTE(review): `output` is never passed to display() inside this function.
    with output:
        fig = plt.figure(figsize=(4, 5))
        fig.suptitle(str(candidate[0]), fontsize=10, y=0.92)
        for target, color in enumerate(curve_colors):
            plt.plot(action_probs.T[target], color=color)
        plt.ylim([0.0, 1.0])
        plt.xticks(np.arange(0, 150000 + 1, 50000))

In [None]:
# Squared Error: vary only the first entry (alpha_p = 2^-2 ... 2^-7) and keep
# the other two entries fixed at a single value each.
alpha_p = [math.pow(2, -exponent) for exponent in range(2, 8)]
alpha = [0.003906]
beta = [0.01]

# Cartesian product of the three lists -> one tuple per candidate.
options = [alpha_p, alpha, beta]
options_non_introspective["Squared Error"] = list(itertools.product(*options))

# Re-register the non-introspective table so the sweep below sees the new grid.
combination_options[False] = options_non_introspective
print(combination_options[False]["Squared Error"])

In [None]:
# Run every candidate of the Squared Error grid (non-introspective).
# run() reads the global `track_options` to pick the candidate, so the counter
# is advanced here after each call.
reward = "Squared Error"
choose_introspective = False
track_options = 0
for i in range(len(combination_options[choose_introspective][reward])):
    run(reward=reward, choose_introspective=choose_introspective, choose_runs=200)
    track_options += 1

In [None]:
# Bayesian Surprise: vary only the first entry (alpha_p = 2^-2 ... 2^-7) and
# keep alpha, beta_r and beta fixed at a single value each.
alpha_p = [math.pow(2, -exponent) for exponent in range(2, 8)]

alpha = [0.003906]
beta_r = [0.1]
beta = [0.01]

# Cartesian product in the order (alpha_p, alpha, beta_r, beta).
options = [alpha_p, alpha, beta_r, beta]
options_non_introspective["Bayesian Surprise"] = list(itertools.product(*options))

# Re-register the non-introspective table so the sweep below sees the new grid.
combination_options[False] = options_non_introspective
print(combination_options[False]["Bayesian Surprise"])

In [None]:
# Run every candidate of the Bayesian Surprise grid (non-introspective).
# run() reads the global `track_options` to pick the candidate, so the counter
# is advanced here after each call.
reward = "Bayesian Surprise"
choose_introspective = False
track_options = 0
for i in range(len(combination_options[choose_introspective][reward])):
    run(reward=reward, choose_introspective=choose_introspective, choose_runs=200)
    track_options += 1