In [None]:
# Notebook for RL Work
# Written By Bryan

# ==== Bandit Imports ====
from Bandits import *    # Custom bandit classes

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import log_loss
from scipy.optimize import minimize



# ==== Mermaid display imports ====
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

def mm(graph):
    """Render a Mermaid diagram inline via the mermaid.ink image service.

    Args:
        graph: Mermaid diagram source as a string.

    The source is UTF-8 encoded, base64 encoded, and appended to the
    mermaid.ink image URL; the resulting URL is shown with IPython's
    ``display``. Nothing is returned.
    """
    # Use the URL-safe alphabet: standard base64 can emit "+" and "/", and a
    # "/" inside the encoded diagram would be read as a URL path separator.
    encoded = base64.urlsafe_b64encode(graph.encode("utf8")).decode("ascii")
    display(Image(url="https://mermaid.ink/img/" + encoded))

## eGreedy 

#### Trial Configurations

In [None]:
# Build the testbed environment
env = Testbed(num_sim=1000)

# Trial parameters reused by every bandit task created in the cells below
steps = 300
k = env.k
reward_values = env.q_star
trial_params = [k, steps]

# Display the environment
env.show_mean()
env.show_testbed()

In [None]:
# eGreedy model parameters: [alpha, epsilon]
model_type = "EG"
alpha, epsilon = 0, 0
model_params = [alpha, epsilon]
start_val = 0

# Two eGreedy tasks on the same trial setup: one with epsilon = 0, one with epsilon = 0.1
eg_0 = create_bandit_task(model_type, model_params, trial_params, reward_values, start_val)
eg_01 = create_bandit_task(
    model_type=model_type,
    model_params=[0, 0.1],
    trial_params=trial_params,
    reward_values=reward_values,
    start_val=start_val,
)

# Simulate each task, then show its results and actions
tasks = [eg_0, eg_01]
for task in tasks:
    task.simulate()
    task.show_results()
    task.show_actions()


## Softmax

In [None]:
# Softmax model parameters: [alpha, temp]
model_type = "SM"
alpha = temp = 1
model_params = [alpha, temp]
start_val = 1

# Build the softmax task on the shared trial setup, simulate it, then show its outputs
SM = create_bandit_task(model_type, model_params, trial_params, reward_values, start_val)
SM.simulate()
SM.show_results()
SM.show_actions()

## Softmax_UCB

In [None]:
# Softmax-UCB model parameters: [alpha, temp, uncertParam]
model_type = "SMUCB"
alpha, temp, uncertParam = 1, 1, 2
model_params = [alpha, temp, uncertParam]
start_val = 1

# Build the task on the shared trial setup
SMUCB = create_bandit_task(
    model_type=model_type,
    model_params=model_params,
    trial_params=trial_params,
    reward_values=reward_values,
    start_val=start_val,
)

# Simulate, then show the results
SMUCB.simulate()
SMUCB.show_results()

## Model Validation

#### Param Recovery
Fit data to the model it was simulated with
<ol>
<li>Simulate choices and rewards using chosen parameter values (these act as the ground truth)</li>
<li>
Run the fitting procedure (negative log-likelihood function) on the simulated choice AND reward data.
See if we can recover the simulated parameter values (the estimates are the fitted parameter values)
</li>
</ol>

In [None]:
# Run this cell to display a Mermaid flowchart of the parameter-recovery procedure.
# The diagram source below is passed to mm(). Edge labels (1)-(4) give the order of the steps:
#   (1) input parameters go into the RL model, which creates the simulated data
#   (2) noise may optionally be added to the simulated data
#   (3) fitting the data to the model outputs parameters
#   (4) input and output parameters are compared
mm("""
flowchart LR
   subgraph input-parameters
   direction TB
   id1([epsilon])
   id2([alpha])
   id3([etc...])
   id1 ~~~ id2 ~~~ id3
   end

   subgraph output-parameters
   direction TB
   id4([epsilon])
   id5([alpha])
   id6([etc...])
   id4 ~~~ id5 ~~~ id6
   end

   simData[(Simulated Data)]
   model((RL Model))
   n([noise])
   fm{{fit data to model}}

   input-parameters --"(1) input"--> model
   model --"(1) creates"--> simData
   simData -."(2) (optional) Add".-> n
   simData --> fm
   n -.-> fm
   fm ~~~ output-parameters
   fm --"(3) Outputs"--> output-parameters
   input-parameters o-- "(4) compare"  --o output-parameters

""")

In [None]:
# ---- Parameter-recovery configuration ----
num_models = 2      # number of models simulated per agent in the loop below (eGreedy, Softmax)
num_sims = 30       # number of agents to fit
num_trials = 1000   # trials per simulated agent; also passed as `steps` to every bandit below

# Storage indexed by [model, agent, trial] for simulated rewards and choices.
# The trial axis must match the number of simulated steps, otherwise the arrays are empty.
# NOTE(review): nothing in this cell writes into these arrays yet.
reward_modelPR = np.zeros(shape=[num_models, num_sims, num_trials])
choices_modelPR = np.zeros(shape=[num_models, num_sims, num_trials])

# Create the parameters used for simulation.
# Indexed below as paraSimPR[agent, columns]: columns 0:2 feed eGreedy, 2:4 feed Softmax.
paraSimPR = paramCreate(num_sims)

# Storage for fitted parameters and per-model log-likelihoods, one row per agent.
# NOTE(review): num_Param is not defined in this notebook; presumably it is provided by
# `from Bandits import *` — confirm, otherwise this line raises NameError.
paraFitPR = np.zeros(shape=[num_sims, num_Param])
LLArray = np.zeros(shape=[num_sims, num_models])

# number of fitting attempts
num_fit_attempts = 5

# NOTE(review): tqdm is not imported in this notebook; presumably it is re-exported by
# `from Bandits import *` — confirm.
for agent in tqdm(range(num_sims)):

    # generate environment (non stationary bandit environment from Daw et al in TF Doc)
    environment = Testbed()

    # eGreedy
    EG = SimpleBandit(arms=10, epsilon=paraSimPR[agent, 0:2], steps=num_trials, q_star=environment.q_star, initial_Q=0.1)
    EG.simulate()

    # Softmax
    SM = SimpleBandit(arms=10, epsilon=paraSimPR[agent, 2:4], steps=num_trials, q_star=environment.q_star, initial_Q=0, argmax_func=softmax)
    SM.simulate()

    # Softmax UCB (softmax distribution with upper confidence bound action selection)
    # TODO: not enabled yet; the draft below still refers to names not defined in this cell.
    #SMUCB_Choice = SimpleBandit(arms=10, epsilon=paraSimPR[param_count, 4:7], steps=1000, q_star=reward_values.q_star, initial_Q=0, argmax_func=UCB)



#### Model Recovery

Fit data to all models
<ol>
<li>
Use the simulated choice and reward data from the parameter recovery step
</li>

<li>
Calculate negative log likelihood values using each model on the simulated data
</li>
</ol>
