# Learners

## Paul's stuff

In [None]:
from typing import Hashable, Sequence, Dict, Any
import coba.random

class Batched:
    """Learner wrapper that buffers feedback and replays it in averaged batches.

    Every call to ``learn`` is stored in ``self.mem``. Once ``delay``
    interactions are buffered they are walked in insertion order, grouped
    into consecutive runs of ``batchsize``, and handed to the wrapped
    learner. Each interaction in a run is taught with the run's mean reward
    rather than its own reward.
    """

    def __init__(self, delay: int, batchsize: int, learner):
        """Build the wrapper.

        Args:
            delay: Number of buffered interactions that triggers a flush.
            batchsize: Number of interactions that share one averaged reward.
            learner: Zero-argument callable that returns the learner to wrap.
        """
        self.learner = learner()
        self.delay = delay
        self.batchsize = batchsize
        self.mem = {}

        # A flush must split into whole batches, otherwise the tail of the
        # buffer would be dropped without ever being learned.
        assert self.delay % self.batchsize == 0

    def init(self):
        """Forward initialisation to the wrapped learner."""
        self.learner.init()

    @property
    def family(self) -> str:
        """Name of this learner family."""
        return "Batched Learner"

    @property
    def params(self) -> Dict[str,Any]:
        """Wrapped learner's parameters extended with ``delay`` and ``batchsize``."""
        merged = dict(self.learner.params)
        merged['delay'] = self.delay
        merged['batchsize'] = self.batchsize
        return merged

    def predict(self, key: int, context: Hashable, actions: Sequence[Hashable]) -> int:
        """Delegate the action choice to the wrapped learner unchanged."""
        return self.learner.predict(key, context, actions)

    def learn(self, key: int, context: Hashable, action: Hashable, reward: float, probability: float) -> None:
        """Buffer one interaction and flush the buffer once it holds ``delay`` entries."""
        self.mem[key] = dict(context=context, action=action, reward=reward, prob=probability)

        if len(self.mem) < self.delay:
            return

        running_total = 0
        pending = []
        for stored_key, record in self.mem.items():
            running_total += record['reward']
            pending.append((stored_key, record))

            if len(pending) % self.batchsize != 0:
                continue

            # The batch is complete: everyone in it gets the same mean reward.
            shared_reward = running_total / self.batchsize
            for k, v in pending:
                self.learner.learn(k, v['context'], v['action'], shared_reward, v['prob'])

            running_total = 0
            pending = []

        self.mem = {}


class BatchedSuffix:
    """Learner wrapper that buffers feedback and replays it with suffix-mean rewards.

    Interactions are buffered until ``delay`` of them are available, then
    processed in insertion order in consecutive batches of ``batchsize``.
    Within a batch the interactions are replayed one by one (optionally in a
    shuffled order); each one is taught with the mean reward of the
    interactions of that batch that have not been replayed yet, itself
    included.
    """

    def __init__(self, delay: int, batchsize: int, learner, reorder=True):
        """Build the wrapper.

        Args:
            delay: Number of buffered interactions that triggers a flush.
            batchsize: Number of interactions per batch.
            learner: Zero-argument callable that returns the learner to wrap.
            reorder: When true, the replay order inside each batch is
                shuffled with ``coba.random.shuffle``.
        """
        self.learner = learner()
        self.batchsize = batchsize
        self.delay = delay
        self.mem = {}
        self.reorder = reorder

        # A flush must split into whole batches, otherwise the tail of the
        # buffer would be dropped without ever being learned.
        assert self.delay % self.batchsize == 0

    def init(self):
        """Forward initialisation to the wrapped learner."""
        self.learner.init()

    @property
    def family(self) -> str:
        """Name of this learner family."""
        return "BatchedSuffix"

    @property
    def params(self) -> Dict[str,Any]:
        """Wrapped learner's parameters extended with this wrapper's settings.

        ``reorder`` is reported as well so that wrappers that differ only in
        their replay order do not present identical parameters.
        """
        return {
                 **self.learner.params,
                 'delay': self.delay,
                 'batchsize': self.batchsize,
                 'reorder': self.reorder,
               }

    def predict(self, key: int, context: Hashable, actions: Sequence[Hashable]) -> int:
        """Choose which action index to take (delegated to the wrapped learner)."""
        return self.learner.predict(key, context, actions)

    def learn(self, key: int, context: Hashable, action: Hashable, reward: float, probability: float) -> None:
        """Buffer one interaction and flush the buffer once it holds ``delay`` entries."""

        self.mem[key] = { 'context': context,
                          'action': action,
                          'reward': reward,
                          'prob': probability
                        }

        if len(self.mem) < self.delay:
            return

        sumreward = 0
        batch = []
        # Distinct loop names so the ``key`` argument is not shadowed.
        for stored_key, values in self.mem.items():
            sumreward += values['reward']
            batch.append((stored_key, values))

            if len(batch) % self.batchsize == 0:
                order = list(range(self.batchsize))
                if self.reorder:
                    order = coba.random.shuffle(order)

                for idx, i in enumerate(order):
                    k, v = batch[i]
                    # Mean over the (batchsize - idx) interactions still to be
                    # replayed, including the current one.
                    self.learner.learn(k,
                                       v['context'],
                                       v['action'],
                                       sumreward / (self.batchsize - idx),
                                       v['prob'])
                    sumreward = sumreward - v['reward']

                sumreward = 0
                batch = []

        self.mem = {}

In [None]:
from typing import Hashable, Sequence, Dict, Any

class Advantage:
    """Learner wrapper that trains the wrapped learner on baseline-corrected rewards.

    A separate Vowpal Wabbit model (``self.baseline``) is asked for a reward
    prediction from the context; the wrapped learner is then taught with
    ``reward - prediction`` instead of the raw reward.
    """

    def __init__(self, seed: int, flags: str, learner):
        """Build the wrapper.

        Args:
            seed: Value passed to the baseline model as ``--random_seed``.
            flags: Extra command-line flags for the baseline model.
            learner: Zero-argument callable that returns the learner to wrap.
        """
        self.learner = learner()
        self.flags = flags
        self.seed = seed
        self.baseline = None

    def init(self):
        """Initialise the wrapped learner and create the baseline VW model."""
        from os import devnull
        from coba import execution

        # The wrapped learner has to be initialised too, as the other
        # wrappers in this file do; previously it was never initialised.
        self.learner.init()

        with open(devnull, 'w') as f, execution.redirect_stderr(f):
            from vowpalwabbit import pyvw
            # No '$' before the placeholder: it used to end up as a literal
            # character in front of the flags handed to VW.
            self.baseline = pyvw.vw(f'--quiet {self.flags} --random_seed {self.seed}')

    def tovw(self, context, reward, prob):
        """Format one example as a VW text line.

        The line is ``"<reward> <1/prob> | i:v ..."`` where the features are
        the non-zero entries of ``context`` with 1-based indices.

        Args:
            context: Tuple of feature values.
            reward: Label written at the start of the line.
            prob: Probability whose inverse is written after the label.
        """
        assert isinstance(context, tuple), context

        features = ' '.join(f'{k+1}:{v}' for k, v in enumerate(context) if v != 0)
        return f'{reward} {1.0/prob} | ' + features

    @property
    def family(self) -> str:
        """Name of this learner family."""
        return "Advantage Wrapper"

    @property
    def params(self) -> Dict[str,Any]:
        """Parameters of the wrapped learner.

        ``params`` is a property on the learners in this file, so it is read
        as an attribute rather than called.
        """
        return self.learner.params

    def predict(self, key: int, context: Hashable, actions: Sequence[Hashable]) -> int:
        """Delegate the action choice to the wrapped learner unchanged."""
        return self.learner.predict(key, context, actions)

    def learn(self, key: int, context: Hashable, action: Hashable, reward: float, probability: float) -> None:
        """Update the baseline, then teach the wrapped learner the corrected reward."""
        # NOTE(review): the probability used for the baseline example comes from
        # the wrapped learner's private ``_probs`` mapping, not from the
        # ``probability`` argument -- confirm this is intentional.
        prob = self.learner._probs[key]
        exstr = self.tovw(context, reward, prob)

        # Predict before learning so the current example does not influence
        # its own baseline value.
        vhat = self.baseline.predict(exstr)
        self.baseline.learn(exstr)

        self.learner.learn(key, context, action, reward - vhat, probability)

In [None]:
def epsilon_greedy_learner(epsilon=0.2, flags='--coin', seed=10):
    """Create a ``VowpalLearner`` configured with ``epsilon`` exploration.

    Args:
        epsilon: Passed to ``VowpalLearner`` as ``epsilon``.
        flags: Extra Vowpal Wabbit flags, passed through unchanged.
        seed: Passed to ``VowpalLearner`` as ``seed``; defaults to the
            previously hard-coded value 10.
    """
    from coba.learners import VowpalLearner
    return VowpalLearner(seed=seed, epsilon=epsilon, flags=flags)

def squarecblearner(epsilon, flags='--coin', seed=10):
    """Create a ``VowpalLearner`` with ``--squarecb`` prepended to ``flags``.

    Args:
        epsilon: Passed to ``VowpalLearner`` as ``epsilon``.
        flags: Extra Vowpal Wabbit flags appended after ``--squarecb``.
        seed: Passed to ``VowpalLearner`` as ``seed``; defaults to the
            previously hard-coded value 10.
    """
    from coba.learners import VowpalLearner
    return VowpalLearner(seed=seed, epsilon=epsilon, flags=f'--squarecb {flags}')

def synthcoverlearner(epsilon, flags='--coin', seed=10):
    """Create a ``VowpalLearner`` with ``--synthcover`` prepended to ``flags``.

    Args:
        epsilon: Passed to ``VowpalLearner`` as ``epsilon``.
        flags: Extra Vowpal Wabbit flags appended after ``--synthcover``.
        seed: Passed to ``VowpalLearner`` as ``seed``; defaults to the
            previously hard-coded value 10.
    """
    from coba.learners import VowpalLearner
    return VowpalLearner(seed=seed, epsilon=epsilon, flags=f'--synthcover {flags}')

def baglearner(bag=5, flags='--coin', seed=10):
    """Create a ``VowpalLearner`` configured with ``bag``.

    Args:
        bag: Passed to ``VowpalLearner`` as ``bag``.
        flags: Extra Vowpal Wabbit flags, passed through unchanged.
        seed: Passed to ``VowpalLearner`` as ``seed``; defaults to the
            previously hard-coded value 10.
    """
    from coba.learners import VowpalLearner
    return VowpalLearner(seed=seed, bag=bag, flags=flags)


# Simulation

In [None]:
import coba.random

from coba.simulations import LambdaSimulation
from coba.learners.bandit import RandomLearner, EpsilonBanditLearner, UcbBanditLearner
from coba.learners.vowpal import VowpalLearner
from coba.benchmarks import Benchmark

import numpy as np

from coba.benchmarks import Benchmark
import re

def get_context(means, t):
    """Build the context for interaction ``t`` as a pair of strings.

    The first entry is ``t`` modulo the size of ``means``' first axis; the
    second is an integer drawn with ``coba.random.randint`` between 0 and
    the size of the second axis minus one.
    """
    step_label = str(t % means.shape[0])
    person_label = str(coba.random.randint(0, means.shape[1] - 1))
    return (step_label, person_label)

def get_actions(means):
    """Return the action labels: one decimal string per index of ``means``' third axis."""
    action_count = means.shape[2]
    return list(map(str, range(action_count)))

def get_reward(means, c, a):
    """Draw a 0/1 reward for taking action ``a`` in context ``c``.

    ``c`` holds the two context labels and ``a`` the action label; all three
    are converted to integers and used to index ``means``. The reward is 1
    when a fresh ``coba.random.random()`` draw is below that entry, else 0.
    """
    draw = coba.random.random()
    success_rate = means[int(c[0])][int(c[1])][int(a)]
    if draw < success_rate:
        return 1
    return 0

def print_info(title, means, epsilon=0.2):
    """Print reference performance levels for a reward-probability table.

    Args:
        title: Label printed in the header line.
        means: Array whose third axis indexes actions.
        epsilon: Exploration rate used for the blended reference value.

    Three numbers are printed: the mean of all entries, the mean over the
    first two axes of the per-row maximum along the action axis, and the
    ``epsilon``-weighted blend of the two.
    """
    random_perf = np.mean(means)
    best_perf = np.max(means, axis=2).mean()

    print(f'----{title}----')
    print(f'Random performance: {random_perf}')
    print(f'Best performance: {best_perf}')
    print(f'Best performance with {epsilon} exploration: {best_perf * (1 - epsilon) + random_perf * epsilon}')

# Experiments

In [None]:
def do_the_test(means, count, batched, batchsize, learners, baseline=None, delay=8):
    """Evaluate a baseline learner against batched wrappers of several learners.

    Args:
        means: Reward-probability table used to generate contexts, actions
            and rewards.
        count: Number of interactions passed to ``LambdaSimulation``.
        batched: Wrapper class (e.g. ``Batched`` or ``BatchedSuffix``) called
            with ``delay``, ``batchsize`` and ``learner`` keyword arguments.
        batchsize: Batch size handed to every wrapper.
        learners: Zero-argument learner factories, one wrapper is built per
            entry.
        baseline: Learner evaluated next to the wrappers. When ``None`` a
            fresh ``VowpalLearner(epsilon=0.2, seed=10, flags='--coin')`` is
            created for this call.
        delay: Delay handed to every wrapper.

    Returns:
        Whatever ``Benchmark.evaluate`` returns for the assembled learners.
    """
    # Built per call rather than as a default argument: a default is evaluated
    # once at definition time, so the same learner instance would be shared
    # by every call that relies on it.
    if baseline is None:
        baseline = VowpalLearner(epsilon=0.2, seed=10, flags='--coin')

    actions_objects = get_actions(means)

    def contexts(t):
        return get_context(means, t)

    def actions(t, c):
        return actions_objects

    def rewards(t, c, a):
        return get_reward(means, c, a)

    #define a simulation
    simulations = [
        LambdaSimulation(count, contexts, actions, rewards, seed=10),
    ]

    #define a benchmark: five shuffle seeds (0..4) are supplied
    benchmark = Benchmark(simulations, batch_size = 1, shuffle_seeds=list(range(5)))

    learner_factories = [baseline] + [batched(delay=delay, batchsize=batchsize, learner=l) for l in learners]

    return benchmark.evaluate(learner_factories)

## Simulations

In [None]:
# Dimensions of the reward-probability table: (steps, people, actions).
nsteps, npeople, nactions = 1, 8, 8

# One random value per (step, person, action) cell, laid out as a 3-D array.
means_1_8_8 = np.ndarray(
    shape=(nsteps, npeople, nactions),
    buffer=np.array(coba.random.randoms(nsteps * npeople * nactions)),
)
print_info('means_1_8_8', means_1_8_8)

In [None]:
# Number of interactions per simulation.
count = 200_000

In [None]:
# Zero-argument factories; each call builds a fresh learner for a wrapper.
learners = [
    lambda: epsilon_greedy_learner(epsilon=0.2),
    lambda: squarecblearner(epsilon=0.01),
    lambda: baglearner(),
    lambda: synthcoverlearner(epsilon=0.01),
]

# Episodic

## steps = 1

In [None]:
# Batched wrapper with batchsize=1, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=Batched,
    batchsize=1,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=1)

## steps = 2

In [None]:
# Batched wrapper with batchsize=2, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=Batched,
    batchsize=2,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=1)

## steps = 4

In [None]:
# Batched wrapper with batchsize=4, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=Batched,
    batchsize=4,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=1)

## steps = 8

In [None]:
# Batched wrapper with batchsize=8, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=Batched,
    batchsize=8,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=1)

# Suffix

## steps = 1

In [None]:
# BatchedSuffix wrapper with batchsize=1, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=BatchedSuffix,
    batchsize=1,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=2)

## steps = 2

In [None]:
# BatchedSuffix wrapper with batchsize=2, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=BatchedSuffix,
    batchsize=2,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=2)

## steps = 4

In [None]:
# BatchedSuffix wrapper with batchsize=4, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=BatchedSuffix,
    batchsize=4,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=2)

## steps = 8

In [None]:
# BatchedSuffix wrapper with batchsize=8, next to an epsilon-greedy baseline.
result = do_the_test(
    means=means_1_8_8,
    count=count,
    batched=BatchedSuffix,
    batchsize=8,
    learners=learners,
    baseline=epsilon_greedy_learner(),
)
result.standard_plot(show_err=True, figsize=[16, 6], episode_factor=2)