In [1]:
# Bandits

In [2]:
import warnings ; warnings.filterwarnings('ignore')

import gymnasium as gym
# import gym
import numpy as np
import gym_bandits
from scipy.special import softmax as softmax_fn
from pprint import pprint
from tqdm import tqdm_notebook as tqdm

from itertools import cycle

import sys
import random
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

%matplotlib inline

In [3]:
sns.set_theme()
plt.style.use('fivethirtyeight')
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
pylab.rcParams.update(params)
np.set_printoptions(suppress=True)

# Baseline strategies

In [4]:
Q = np.array([7, 3])
np.argmax(Q)

np.int64(0)

In [6]:
def pure_exploitation(env, n_episodes=1000):
    Q = np.zeros((env.action_space.n), dtype=np.float64)
    N = np.zeros((env.action_space.n), dtype=np.int32)
    
    Qe = np.empty((n_episodes, env.action_space.n), dtype=np.float64)
    returns = np.empty(n_episodes, dtype=np.float64)
    actions = np.empty(n_episodes, dtype=np.int32)
    
    name = 'Pure exploitation'
    es = tqdm(range(n_episodes), desc='Episodes for: ' + name, leave=False)
    for e in es:
        action = np.argmax(Q)
        print(action)
        print("env.step(action)")
        print(env.step(action))
        
        next_state, reward, done, truncated, info = env.step(action)
        N[action] += 1
        Q[action] += (reward - Q[action]) / N[action]            
        
        Qe[e] = Q
        returns[e] = reward
        actions[e] = action
    return name, returns, Qe, actions

In [7]:
def pure_exploration(env, n_episodes=1000):
    Q = np.zeros((env.action_space.n), dtype=np.float64)
    N = np.zeros((env.action_space.n), dtype=np.int32)
    
    Qe = np.empty((n_episodes, env.action_space.n), dtype=np.float64)
    returns = np.empty(n_episodes, dtype=np.float64)
    actions = np.empty(n_episodes, dtype=np.int32)
    
    name = 'Pure exploration'
    es = tqdm(range(n_episodes), desc='Episodes for: ' + name, leave=False)
    for e in es:
        action = np.random.randint(len(Q))
        next_state, reward, done, truncated, info = env.step(action)
        N[action] += 1
        Q[action] += (reward - Q[action]) / N[action]            
        
        Qe[e] = Q
        returns[e] = reward
        actions[e] = action
    return name, returns, Qe, actions

# Simple strategies

In [8]:
def epsilon_greedy(env, epsilon=0.1, n_episodes=1000):
    Q = np.zeros((env.action_space.n), dtype=np.float64)
    N = np.zeros((env.action_space.n), dtype=np.int32)
    
    Qe = np.empty((n_episodes, env.action_space.n), dtype=np.float64)
    returns = np.empty(n_episodes, dtype=np.float64)
    actions = np.empty(n_episodes, dtype=np.int32)
    
    name = 'Pure exploration'
    es = tqdm(range(n_episodes), desc='Episodes for: ' + name, leave=False)
    for e in es:
        if np.random.random() <= epsilon:
            action = np.random.randint(len(Q))
        else:
            action = np.argmax(len(Q))
        
        next_state, reward, done, truncated, info = env.step(action)
        N[action] += 1
        Q[action] += (reward - Q[action]) / N[action]            
        
        Qe[e] = Q
        returns[e] = reward
        actions[e] = action
    return name, returns, Qe, actions

In [9]:
np.random.uniform(low=2, high=100, size=(2, 2)), np.random.random((2, 2))
max(0.2, .5)
np.maximum(0.2, .5)

np.float64(0.5)

In [10]:
init_epsilon = 1.0
min_epsilon = 0.01
decay_ratio = 0.5 
n_episodes = 10

def lin_decay(init_epsilon, min_epsilon, decay_ratio, n_episodes):
    decay_range = init_epsilon - min_epsilon
    decay_episodes = int(n_episodes * decay_ratio)
    decay = decay_range / decay_episodes
    return init_epsilon, decay

epsilon, decay = lin_decay(init_epsilon, min_epsilon, decay_ratio, n_episodes)
for e in range(n_episodes):
    print(epsilon)
    epsilon = max(epsilon - decay, min_epsilon)

1.0
0.802
0.6040000000000001
0.4060000000000001
0.20800000000000007
0.010000000000000064
0.01
0.01
0.01
0.01


In [11]:
def lin_dec_epsilon_greedy(
    env, init_epsilon=1.0, min_epsilon=0.01, decay_ratio=0.05, n_episodes=1000):
    Q = np.zeros((env.action_space.n), dtype=np.float64)
    N = np.zeros((env.action_space.n), dtype=np.int32)
    
    Qe = np.empty((n_episodes, env.action_space.n), dtype=np.float64)
    returns = np.empty(n_episodes, dtype=np.float64)
    actions = np.empty(n_episodes, dtype=np.int32)
    
    name = f"Lin Epsilon-Greedy {init_epsilon}, {min_epsilon}, {decay_ratio}"
    es = tqdm(
        range(n_episodes), 
        desc='Episodes for: ' + name, 
        leave=False
    )
    
    epsilon, decay = lin_decay(init_epsilon, min_epsilon, decay_ratio, n_episodes)
    for e in es:
        if np.random.random() <= epsilon:
            action = np.random.randint(len(Q))
        else:
            action = np.argmax(len(Q))
        epsilon = max(epsilon - decay, min_epsilon)
        
        next_state, reward, done, truncated, info = env.step(action)
        N[action] += 1
        Q[action] += (reward - Q[action]) / N[action]            
        
        Qe[e] = Q
        returns[e] = reward
        actions[e] = action
    return name, returns, Qe, actions

In [12]:
init_epsilon = 1.0
min_epsilon = 0.01
decay_ratio = 0.1
n_episodes = 10

def exp_decay(init_epsilon, min_epsilon, decay_ratio, n_episodes):
    decay_episodes = int(n_episodes * decay_ratio)
    min_epsilon = max(min_epsilon, 1e-10)
    decay = np.exp(-np.log(init_epsilon / min_epsilon) / decay_episodes)
    return init_epsilon, decay


decay, epsilon = exp_decay(init_epsilon, min_epsilon, decay_ratio, n_episodes)
for e in range(n_episodes):
    print(f"{epsilon:f}")
    epsilon = max(epsilon * decay, min_epsilon)

0.010000
0.010000
0.010000
0.010000
0.010000
0.010000
0.010000
0.010000
0.010000
0.010000


In [13]:
def exp_dec_epsilon_greedy(
    env, init_epsilon=1.0, min_epsilon=0.01, decay_ratio=0.05, n_episodes=1000):
    Q = np.zeros((env.action_space.n), dtype=np.float64)
    N = np.zeros((env.action_space.n), dtype=np.int32)
    
    Qe = np.empty((n_episodes, env.action_space.n), dtype=np.float64)
    returns = np.empty(n_episodes, dtype=np.float64)
    actions = np.empty(n_episodes, dtype=np.int32)
    
    name = f"Exp Epsilon-Greedy {init_epsilon}, {min_epsilon}, {decay_ratio}"
    es = tqdm(
        range(n_episodes), 
        desc='Episodes for: ' + name, 
        leave=False
    )
    
    epsilon, decay = exp_decay(init_epsilon, min_epsilon, decay_ratio, n_episodes)
    for e in es:
        if np.random.random() <= epsilon:
            action = np.random.randint(len(Q))
        else:
            action = np.argmax(len(Q))
        epsilon = max(epsilon * decay, min_epsilon)
        
        next_state, reward, done, truncated, info = env.step(action)
        N[action] += 1
        Q[action] += (reward - Q[action]) / N[action]            
        
        Qe[e] = Q
        returns[e] = reward
        actions[e] = action
    return name, returns, Qe, actions

In [14]:
np.full((2), 1.0)
np.full((2), 100)

array([100, 100])

In [15]:
def optimistic_initialization(
    env, optimistic_estimate=1.0, initial_count=100, n_episodes=1000):
    Q = np.full((env.action_space.n), optimistic_estimate, dtype=np.float64)
    N = np.full((env.action_space.n), initial_count, dtype=np.int32)
    
    Qe = np.empty((n_episodes, env.action_space.n), dtype=np.float64)
    returns = np.empty(n_episodes, dtype=np.float64)
    actions = np.empty(n_episodes, dtype=np.int32)
    
    name = f"Optimistic {optimistic_estimate}, {initial_count}"
    es = tqdm(
        range(n_episodes),
        desc='Episodes for: ' + name, 
        leave=False
    )
    
    for e in es:
        action = np.argmax(Q)
        next_state, reward, done, truncated, info = env.step(action)
        N[action] += 1
        Q[action] += (reward - Q[action]) / N[action]
        
        Qe[e] = Q
        returns[e] = reward
        actions[e] = action
    return name, returns, Qe, actions

# Two-Armed Bandit environments

In [16]:
import buffalo_gym

SEEDS = (12, 34, 56, 78, 90)

In [18]:
b2_Vs = []
for seed in SEEDS:
    env_name = 'Bandit-v0'
    env = gym.make(env_name, seed=seed)
    env.reset()
    
    b2_Q = np.array(env.env.p_dist * env.env.r_dist)
    
    print('Two-Armed Bandit environment with seed', seed)
    print('Probability of reward:', env.env.p_dist)
    print('Reward:', env.env.r_dist)
    print('Q(.):', b2_Q)
    b2_Vs.append(np.max(b2_Q))
    print('V*:', b2_Vs[-1])
    print()

AttributeError: 'OrderEnforcing' object has no attribute 'p_dist'

In [None]:
def b2_run_simple_strategies_experiment(env_name='BanditTwoArmedUniform-v0'):
    results = {}
    experiments = [
        # baseline strategies
        lambda env: pure_exploitation(env),
        lambda env: pure_exploration(env),
        
        # epsilon greedy
        lambda env: epsilon_greedy(env, epsilon=0.07),
        lambda env: epsilon_greedy(env, epsilon=0.1),
        
        # epsilon greedy linearly decaying
        lambda env: lin_dec_epsilon_greedy(
            env, init_epsilon=1.0, min_epsilon=0.0, decay_ratio=0.1
        ),
        lambda env: lin_dec_epsilon_greedy(
            env, init_epsilon=0.3, min_epsilon=0.001, decay_ratio=0.1
        ),
        
        # epsilon greedy exponentially decaying
        lambda env: exp_dec_epsilon_greedy(
            env, init_epsilon=1.0, min_epsilon=0.0, decay_ratio=0.1
        ),
        lambda env: exp_dec_epsilon_greedy(
            env, init_epsilon=0.3, min_epsilon=0.0, decay_ratio=0.3
        ),
        
        # epsilon greedy
        lambda env: optimistic_initialization(
            env, optimistic_estimate=1.0, initial_count=10
        ),
        lambda env: optimistic_initialization(
            env, optimistic_estimate=1.0, initial_count=50
        ),
    ]
    
    for env_seed in tqdm(SEEDS, desc='All experiments'):
        env = gym.make(env_name, seed=env_seed)
        env.reset()
        true_Q = np.array(env.unwrapped.p_dist * env.unwrapped.r_dist)
        opt_V = np.max(true_Q)
        for seed in tqdm(SEEDS, desc='All environments', leave=False):
            for experiment in tqdm(
                experiments, desc=f'Experiments with seed {seed}', leave=False):
                env.seed(seed); np.random.seed(seed); random.seed(seed)
                name, Re, Qe, Ae = experiment(env)
                Ae = np.expand_dims(Ae, -1)
                
                episode_mean_rew = np.cumsum(Re) / (np.arange(len(Re)) + 1)
                Q_selected = np.take_along_axis(
                    np.tile(true_Q, Ae.shape), Ae, axis=1).squeeze()
                regret = opt_V - Q_selected
                cum_regret = np.cumsum(regret)
                
                if name not in results.keys(): results[name] = {}
                if 'Re' not in results[name].keys(): results[name]['Re'] = []
                if 'Qe' not in results[name].keys(): results[name]['Qe'] = []
                if 'Ae' not in results[name].keys(): results[name]['Ae'] = []
                if 'cum_regret' not in results[name].keys(): 
                    results[name]['cum_regret'] = []
                if 'episode_mean_rew' not in results[name].keys(): 
                    results[name]['episode_mean_rew'] = []

                results[name]['Re'].append(Re)
                results[name]['Qe'].append(Qe)
                results[name]['Ae'].append(Ae)
                results[name]['cum_regret'].append(cum_regret)
                results[name]['episode_mean_rew'].append(episode_mean_rew)
    return results

b2_results_s = b2_run_simple_strategies_experiment()

# Running simple strategies on Two-Armed Bandit environments

# Plotting results of simple strategies on Two-Armed Bandit environments

# Advanced strategies

# Running advanced strategies on Two-Armed Bandit environments

# Plotting results of advanced strategies on Two-Armed Bandit  environments

# 10-Armed Gaussian Bandit environments

# Running simple strategies on 10-Armed Bandit environments

# Plotting results of simple strategies on 10-Armed Bandit environments

# Running advanced strategies on 10-Armed Bandit environments

# Plotting results of advanced strategies on 10-Armed Bandit environments