# Visualization

In [1]:
from coba.benchmarks import Result
#from coba.analysis import Plots
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [12.0, 9.0]
! pip list | grep coba

'grep' is not recognized as an internal or external command,
operable program or batch file.


In [17]:
class Plots():
    """Plotting helpers for coba benchmark results."""

    @staticmethod
    def standard_plot(result: Result, episode_factor: int = 1, show_err: bool = False, show_sd: bool = False) -> None:
        """Plot every learner's progressive (cumulative) average reward on two side-by-side axes.

        The left axis uses a rescaled x value: each batch index is divided by the learner's
        ``batchsize`` (itself divided by ``episode_factor`` when the batch size is not 1).
        The right axis plots the same curves against the raw batch index.

        Args:
            result: The benchmark result; only ``result.to_indexed_tuples()`` is used.
            episode_factor: Divisor applied to a learner's batch size (when it is not 1)
                before the left axis is rescaled. It also appears in the left x-axis label.
            show_err: When True, shade ``mean +/- sqrt(variance / count)`` around each curve.
            show_sd: When True, shade ``mean +/- sqrt(variance)`` around each curve.

        Returns:
            None. The figure is displayed with ``plt.show()``.
        """
        from collections import defaultdict
        from itertools import groupby
        from typing import cast, Dict, List
        import math
        from coba.utilities import check_matplotlib_support
        from coba.statistics import OnlineMean, OnlineVariance

        def _plot(axes, label, xs, ys, vs, ns):
            """Draw one curve (ys over xs) plus the optional shaded bands built from vs and ns."""
            axes.plot(xs, ys, label=label)

            if show_sd:
                ls = [ y-math.sqrt(v) for y,v in zip(ys,vs) ]
                us = [ y+math.sqrt(v) for y,v in zip(ys,vs) ]
                axes.fill_between(xs, ls, us, alpha = 0.1)

            if show_err:
                # I don't really understand what this is... For each x our distribution
                # is changing so its VAR is also changing. What does it mean to calculate
                # sample variance from a deterministic collection of random variables with
                # different distributions? For example sample variance of 10 random variables
                # from dist1 and 10 random variables from dist2... This is not the same as 20
                # random variables with 50% chance drawing from dist1 and 50% chance of drawing
                # from dist2. So the distribution can only be defined over the whole space (i.e.,
                # all 20 random variables) and not for a specific random variable. Oh well, for
                # now I'm leaving this as it is since I don't have any better ideas. I think what
                # I've done is ok, but I need to put some more thought into it.
                ls = [ y-math.sqrt(v/n) for y,v,n in zip(ys,vs,ns) ]
                us = [ y+math.sqrt(v/n) for y,v,n in zip(ys,vs,ns) ]
                axes.fill_between(xs, ls, us, alpha = 0.1)

        def _batch_size(learner_id, learner):
            """Return the batch size used to rescale a learner's x values on the left axis."""
            # The first learner is always plotted unscaled (unchanged behaviour).
            if learner_id <= 0:
                return 1

            # Learners that are not batched may have no `batchsize` at all, or an empty
            # (None / NaN) one. Plot those unscaled instead of failing or plotting NaNs.
            size = getattr(learner, 'batchsize', None)
            if size is None or size != size:
                return 1

            return size

        learners, _, batches = result.to_indexed_tuples()

        learner_index_key = lambda batch: (batch.learner_id, batch.batch_index)
        sorted_batches    = sorted(batches.values(), key=learner_index_key)
        # Outer groups: one per learner id. Inner groups: one per (learner id, batch index).
        grouped_batches   = groupby(groupby(sorted_batches , key=learner_index_key), key=lambda x: x[0][0])

        indexes     = cast(Dict[int,List[int  ]], defaultdict(list))
        cucounts    = cast(Dict[int,List[int  ]], defaultdict(list))
        cumeans     = cast(Dict[int,List[float]], defaultdict(list))
        cuvariances = cast(Dict[int,List[float]], defaultdict(list))

        for learner_id, learner_batches in grouped_batches:

            # Running statistics over every reward seen so far for this learner.
            cucount    = 0
            cumean     = OnlineMean()
            cuvariance = OnlineVariance()

            for (_, batch_index), index_batches in learner_batches:

                # index_batches is a lazy groupby group sharing its source with the outer
                # iterators, so it has to be consumed here, before the next group is requested.
                for batch in index_batches:
                    cucount    = cucount + 1
                    cumean     .update(batch.reward)
                    cuvariance .update(batch.reward)

                #sanity check, sorting above (in theory) should take care of this...
                #if this isn't the case then the cu* values will be incorrect...
                assert indexes[learner_id] == [] or batch_index > indexes[learner_id][-1]

                indexes[learner_id].append(batch_index)
                cucounts[learner_id].append(cucount)
                cumeans[learner_id].append(cumean.mean)
                cuvariances[learner_id].append(cuvariance.variance)

        check_matplotlib_support('Plots.standard_plot')
        import matplotlib.pyplot as plt #type: ignore

        fig = plt.figure()

        ax1 = fig.add_subplot(1,2,1)
        ax2 = fig.add_subplot(1,2,2)

        ax1.set_title("Progressive Average Reward")
        ax1.set_ylabel('reward (averaged over datasets)')
        ax1.set_xlabel(f"episodes / {episode_factor}")

        for learner_id in learners:
            learner    = learners[learner_id]
            batch_size = _batch_size(learner_id, learner)
            if batch_size != 1:
                batch_size = batch_size / episode_factor

            # Left axis: rescaled index. Right axis: raw batch index.
            _plot(ax1, learner.full_name, np.divide(indexes[learner_id], batch_size), cumeans[learner_id], cuvariances[learner_id], cucounts[learner_id])
            _plot(ax2, learner.full_name, indexes[learner_id], cumeans[learner_id], cuvariances[learner_id], cucounts[learner_id])

        ax2.set_title("Progressive Average Reward")
        ax2.set_xlabel('examples')
        ax2.set_ylabel('reward (averaged over datasets)')
        # Both axes show the same learners, so a single legend (on the right axis) is drawn.
        ax2.legend()

        plt.show()

# Learners

## Paul's stuff

In [3]:
from typing import Hashable, Sequence, Dict, Any
import coba.random

class Batched:
    """Learner wrapper that withholds feedback and replays it to an inner learner in batches.

    Observations are buffered until ``delay`` of them have been collected. The buffer is
    then cut into consecutive groups of ``batchsize`` and each group is handed to the inner
    learner in shuffled order, with rewards derived from the group's summed reward.
    """

    def __init__(self, delay: int, batchsize: int, learner):
        """Store the settings and build the inner learner by calling the ``learner`` factory.

        Raises:
            AssertionError: if ``delay`` is not a multiple of ``batchsize``.
        """
        self.learner = learner()
        self.delay, self.batchsize = delay, batchsize
        self.mem = {}  # buffered observations: key -> {'context', 'action', 'reward'}

        assert self.delay % self.batchsize == 0

    def init(self):
        """Forward initialisation to the inner learner."""
        self.learner.init()

    @property
    def family(self) -> str:
        """Name of this learner family."""
        return "Batched Learner"

    @property
    def params(self) -> Dict[str,Any]:
        """The wrapper's own settings (the inner learner's parameters are not included)."""
        return dict(delay=self.delay, batchsize=self.batchsize)

    def choose(self, key: int, context: Hashable, actions: Sequence[Hashable]) -> int:
        """Pick an action index by delegating to the inner learner."""
        return self.learner.choose(key, context, actions)

    def learn(self, key: int, context: Hashable, action: Hashable, reward: float) -> None:
        """Buffer one observation; once ``delay`` are buffered, replay them all and clear the buffer."""
        self.mem[key] = dict(context=context, action=action, reward=reward)

        # Nothing is forwarded until enough observations have accumulated.
        if len(self.mem) < self.delay:
            return

        pending = []
        running_total = 0

        for stored_key, stored in self.mem.items():
            running_total += stored['reward']
            pending.append((stored_key, stored))

            if len(pending) % self.batchsize != 0:
                continue

            # A full group is ready: replay its members in a random order.
            visit_order = coba.random.shuffle(list(range(self.batchsize)))
            for position, slot in enumerate(visit_order):
                member_key, member = pending[slot]
                # The reward handed on is the mean of the rewards not yet replayed in this group.
                shared_reward = running_total / (self.batchsize - position)
                self.learner.learn(member_key, member['context'], member['action'], shared_reward)
                running_total = running_total - member['reward']

            running_total = 0
            pending = []

        self.mem = {}

In [4]:
from typing import Hashable, Sequence, Dict, Any

class Advantage:
    """Learner wrapper that trains the inner learner on ``reward - baseline prediction``.

    A separate Vowpal Wabbit model (``self.baseline``) is trained on the observed rewards.
    Its prediction for each example is subtracted from the reward before the inner learner
    sees it.
    """

    def __init__(self, seed: int, flags: str, learner):
        """Store the settings and build the inner learner by calling the ``learner`` factory.

        Args:
            seed: Random seed passed to the baseline VW model.
            flags: Extra command line flags passed to the baseline VW model.
            learner: Zero-argument factory returning the learner to wrap.
        """
        self.learner = learner()
        self.flags = flags
        self.seed = seed
        self.baseline=None  # created in init()

    def _baseline_args(self) -> str:
        """Build the VW argument string for the baseline model."""
        # The flags are inserted verbatim. A stray '$' here previously corrupted the
        # first flag (e.g. '$--coin').
        return f'--quiet {self.flags} --random_seed {self.seed}'

    def init(self):
        """Create the baseline VW model and initialise the wrapped learner."""
        from os import devnull
        from coba import execution

        # stderr is redirected to the null device while VW is imported and constructed.
        with open(devnull, 'w') as f, execution.redirect_stderr(f):
            from vowpalwabbit import pyvw
            self.baseline = pyvw.vw(self._baseline_args())

        # Forward initialisation to the wrapped learner (as Batched.init does); without
        # this the inner learner is never initialised when it sits behind this wrapper.
        inner_init = getattr(self.learner, 'init', None)
        if callable(inner_init):
            inner_init()

    def tovw(self, context, reward, prob):
        """Format one example as a VW text line: ``<reward> <1/prob> | <index>:<value> ...``.

        Feature indexes are 1-based positions in ``context``; entries equal to 0 are omitted.

        Raises:
            AssertionError: if ``context`` is not a tuple.
        """
        assert type(context) is tuple, context

        features = ' '.join([ f'{k+1}:{v}' for k, v in enumerate(context) if v != 0 ])
        return f'{reward} {1.0/prob} | ' + features

    @property
    def family(self) -> str:
        """Name of this learner family."""
        return "Advantage Wrapper"

    @property
    def params(self) -> Dict[str,Any]:
        """Parameters of the wrapped learner."""
        # `params` is a property on the learners in this notebook, so calling it would
        # call the returned dict. Accept both a property and a plain method.
        inner_params = self.learner.params
        return inner_params() if callable(inner_params) else inner_params

    def choose(self, key: int, context: Hashable, actions: Sequence[Hashable]) -> int:
        """Pick an action index by delegating to the wrapped learner."""
        return self.learner.choose(key, context, actions)

    def learn(self, key: int, context: Hashable, action: Hashable, reward: float) -> None:
        """Update the baseline with ``reward`` and train the wrapped learner on the residual."""
        # The wrapped learner is expected to expose `_probs`, indexed by interaction key.
        prob = self.learner._probs[key]
        exstr = self.tovw(context, reward, prob)
        # Predict before learning so the baseline has not yet seen this example's reward.
        vhat = self.baseline.predict(exstr)
        self.baseline.learn(exstr)
        self.learner.learn(key, context, action, reward - vhat)

In [5]:
def baseLearner():
    """Factory: build a new VowpalLearner with seed 10, epsilon 0.2 and the '--coin' flag."""
    from coba.learners import VowpalLearner

    settings = dict(seed=10, epsilon=0.2, flags='--coin')
    return VowpalLearner(**settings)

def advantageLearner():
    """Factory: build a new Advantage wrapper (seed 10, '--coin') around a baseLearner."""
    wrapped = Advantage(learner=baseLearner, flags='--coin', seed=10)
    return wrapped

# Simulation

In [6]:
import coba.random

from coba.simulations import LambdaSimulation
from coba.learners import RandomLearner, EpsilonLearner, VowpalLearner, UcbTunedLearner
from coba.benchmarks import Benchmark

import numpy as np

from coba.benchmarks import Benchmark
import re

def get_context(means, t):
    """Return the context for interaction ``t`` as a pair of strings ``(step, person)``.

    ``step`` cycles through the first axis of ``means`` as ``t`` grows; ``person`` is drawn
    with ``coba.random.randint(0, means.shape[1] - 1)``.
    """
    n_steps, n_people = means.shape[0], means.shape[1]
    step = t % n_steps
    person = coba.random.randint(0, n_people - 1)
    return (str(step), str(person))

def get_actions(means):
    """Return the action labels '0', '1', ... - one per entry along the third axis of ``means``."""
    n_actions = means.shape[2]
    return list(map(str, range(n_actions)))

def get_reward(means, c, a):
    """Return a 0/1 reward for taking action ``a`` in context ``c``.

    The reward is 1 when a fresh ``coba.random.random()`` draw is below
    ``means[step][person][action]`` and 0 otherwise, where ``c`` is the ``(step, person)``
    pair and ``a`` the action label (all convertible with ``int``).
    """
    draw = coba.random.random()
    step, person, action = int(c[0]), int(c[1]), int(a)
    threshold = means[step][person][action]
    return int(draw < threshold)

def print_info(title, means, epsilon=0.2):
    """Print reference performance levels for a table of reward means.

    Reports the mean of all entries (uniformly random actions), the mean of the per-context
    maximum over the third axis (always picking the best action), and a blend of the two
    that gives weight ``epsilon`` to the random level.
    """
    random_perf = np.mean(means)
    best_perf = np.mean(np.max(means, axis=2))
    blended_perf = best_perf * (1 - epsilon) + random_perf * epsilon

    report = [
        f'----{title}----',
        f'Random perfomance: {random_perf}',
        f'Best performance: {best_perf}',
        f'Best performance with {epsilon} exploration: {blended_perf}',
    ]
    print('\n'.join(report))

# Experiments

## Simulations

In [7]:
# Axis sizes of the reward-mean table, in axis order: steps, people, actions.
nsteps, npeople, nactions = 1, 8, 8

# One value from coba.random.randoms per (step, person, action) cell.
means_1_8_8 = np.array(coba.random.randoms(nsteps * npeople * nactions)).reshape((nsteps, npeople, nactions))
print_info('means_1_8_8', means_1_8_8)

# The same 64 values arranged as 2 steps x 4 people x 8 actions.
means_2_4_8 = np.reshape(means_1_8_8, (2, 4, 8))
print_info('means_2_4_8', means_2_4_8)

----means_1_8_8----
Random perfomance: 0.4696920402065776
Best performance: 0.9034116238154579
Best performance with 0.2 exploration: 0.8166677070936819
----means_2_4_8----
Random perfomance: 0.4696920402065776
Best performance: 0.9034116238154579
Best performance with 0.2 exploration: 0.8166677070936819


In [8]:
# The same 64 values arranged as 8 steps x 1 person x 8 actions.
means_8_1_8 = np.reshape(means_1_8_8, (8, 1, 8))
print_info('means_8_1_8', means_8_1_8)

----means_8_1_8----
Random perfomance: 0.4696920402065776
Best performance: 0.9034116238154579
Best performance with 0.2 exploration: 0.8166677070936819


In [9]:
%matplotlib qt

# Experiment: means_1_8_8 table, plain learner vs. Batched wrappers with growing batch sizes.
means = means_1_8_8

actions_objects = get_actions(means)

# NOTE: these lambdas look up the global `means` when they are called, not when they are
# defined, so reassigning `means` in a later cell changes what they read.
contexts = lambda t: get_context(means, t)
actions = lambda t: actions_objects

rewards = lambda c, a: get_reward(means, c, a)

#define a simulation (first argument is 100000; seed fixed at 10)
simulations = [
    LambdaSimulation(100000, contexts, actions, rewards, seed=10),
]

#define a benchmark: shuffle_seeds holds 5 seeds (0-4), presumably one shuffled replay of
#the simulation per seed -- TODO confirm (an earlier note here said 15 times, which does not match)
benchmark = Benchmark(simulations, batch_size = 1, shuffle_seeds=list(range(5)))

#create the learner factories (the first entry becomes learner 0 in the plot)
learner_factories = [
#    RandomLearner(seed=10),
    VowpalLearner(epsilon=0.2, seed=10, flags='--coin'),
    Batched(delay=8, batchsize=1, learner=baseLearner),
    Batched(delay=8, batchsize=2, learner=baseLearner),
    Batched(delay=8, batchsize=4, learner=baseLearner),
    Batched(delay=8, batchsize=8, learner=baseLearner),
]

# `result` is kept as a global so that later cells can re-plot it without re-running.
result = benchmark.evaluate(learner_factories)
Plots.standard_plot(result, show_err=True)

2020-12-14 09:00:14 loading simulation...
2020-12-14 09:00:14   * finished after 0.0 seconds


In [18]:
# Re-plot whatever `result` currently holds, this time with episode_factor=2 and the
# sqrt(variance / count) band enabled. Requires a cell that assigns `result` to have run.
Plots.standard_plot(result, episode_factor=2, show_err=True)

In [41]:
%matplotlib qt

# Experiment: means_1_8_8 table, Batched wrappers around the Advantage learner.
means = means_1_8_8

actions_objects = get_actions(means)

# NOTE: these lambdas look up the global `means` when they are called, not when they are
# defined, so reassigning `means` in a later cell changes what they read.
contexts = lambda t: get_context(means, t)
actions = lambda t: actions_objects

rewards = lambda c, a: get_reward(means, c, a)

#define a simulation (first argument is 10000; seed fixed at 10)
simulations = [
    LambdaSimulation(10000, contexts, actions, rewards, seed=10),
]

#define a benchmark: shuffle_seeds holds 5 seeds (0-4), presumably one shuffled replay of
#the simulation per seed -- TODO confirm (an earlier note here said 15 times, which does not match)
benchmark = Benchmark(simulations, batch_size = 1, shuffle_seeds=list(range(5)))

#create the learner factories
learner_factories = [
    RandomLearner(seed=10),
    VowpalLearner(epsilon=0.2, seed=10),
    Batched(delay=8, batchsize=1, learner=advantageLearner),
    Batched(delay=8, batchsize=2, learner=advantageLearner),
    Batched(delay=8, batchsize=4, learner=advantageLearner),
    Batched(delay=8, batchsize=8, learner=advantageLearner),
]

# NOTE(review): this plots through a `standard_plot()` method on the evaluation result,
# not the notebook's own Plots.standard_plot -- confirm the installed coba still provides it.
benchmark.evaluate(learner_factories).standard_plot()

2020-12-01 01:11:43 loading simulation...
2020-12-01 01:11:43   * finished after 0.0 seconds


In [51]:
# Experiment: means_2_4_8 table, Batched wrappers around the plain base learner.
means = means_2_4_8

actions_objects = get_actions(means)

# NOTE: these lambdas look up the global `means` when they are called, not when they are
# defined, so reassigning `means` in a later cell changes what they read.
contexts = lambda t: get_context(means, t)
actions = lambda t: actions_objects

rewards = lambda c, a: get_reward(means, c, a)

#define a simulation (first argument is 10000; seed fixed at 10)
simulations = [
    LambdaSimulation(10000, contexts, actions, rewards, seed=10),
]

#define a benchmark: shuffle_seeds holds 5 seeds (0-4), presumably one shuffled replay of
#the simulation per seed -- TODO confirm (an earlier note here said 15 times, which does not match)
benchmark = Benchmark(simulations, batch_size = 1, shuffle_seeds=list(range(5)))

#create the learner factories
learner_factories = [
    RandomLearner(seed=10),
    VowpalLearner(epsilon=0.2, seed=10),
    Batched(delay=8, batchsize=1, learner=baseLearner),
    Batched(delay=8, batchsize=2, learner=baseLearner),
    Batched(delay=8, batchsize=4, learner=baseLearner),
    Batched(delay=8, batchsize=8, learner=baseLearner),
]

# NOTE(review): this plots through a `standard_plot()` method on the evaluation result,
# not the notebook's own Plots.standard_plot -- confirm the installed coba still provides it.
benchmark.evaluate(learner_factories).standard_plot()

2020-11-30 12:51:58 loading simulation...
2020-11-30 12:51:58   * finished after 0.0 seconds


In [52]:
# Experiment: means_2_4_8 table, Batched wrappers around the Advantage learner.
means = means_2_4_8

actions_objects = get_actions(means)

# NOTE: these lambdas look up the global `means` when they are called, not when they are
# defined, so reassigning `means` in a later cell changes what they read.
contexts = lambda t: get_context(means, t)
actions = lambda t: actions_objects

rewards = lambda c, a: get_reward(means, c, a)

#define a simulation (first argument is 10000; seed fixed at 10)
simulations = [
    LambdaSimulation(10000, contexts, actions, rewards, seed=10),
]

#define a benchmark: shuffle_seeds holds 5 seeds (0-4), presumably one shuffled replay of
#the simulation per seed -- TODO confirm (an earlier note here said 15 times, which does not match)
benchmark = Benchmark(simulations, batch_size = 1, shuffle_seeds=list(range(5)))

#create the learner factories
learner_factories = [
    RandomLearner(seed=10),
    VowpalLearner(epsilon=0.2, seed=10),
    Batched(delay=8, batchsize=1, learner=advantageLearner),
    Batched(delay=8, batchsize=2, learner=advantageLearner),
    Batched(delay=8, batchsize=4, learner=advantageLearner),
    Batched(delay=8, batchsize=8, learner=advantageLearner),
]

# NOTE(review): this plots through a `standard_plot()` method on the evaluation result,
# not the notebook's own Plots.standard_plot -- confirm the installed coba still provides it.
benchmark.evaluate(learner_factories).standard_plot()

2020-11-30 12:56:31 loading simulation...
2020-11-30 12:56:31   * finished after 0.0 seconds


In [54]:
# Experiment: means_8_1_8 table, Batched wrappers around the plain base learner.
means = means_8_1_8

actions_objects = get_actions(means)

# NOTE: these lambdas look up the global `means` when they are called, not when they are
# defined, so reassigning `means` in a later cell changes what they read.
contexts = lambda t: get_context(means, t)
actions = lambda t: actions_objects

rewards = lambda c, a: get_reward(means, c, a)

#define a simulation (first argument is 10000; seed fixed at 10)
simulations = [
    LambdaSimulation(10000, contexts, actions, rewards, seed=10),
]

#define a benchmark: shuffle_seeds holds 5 seeds (0-4), presumably one shuffled replay of
#the simulation per seed -- TODO confirm (an earlier note here said 15 times, which does not match)
benchmark = Benchmark(simulations, batch_size = 1, shuffle_seeds=list(range(5)))

#create the learner factories
learner_factories = [
    RandomLearner(seed=10),
    VowpalLearner(epsilon=0.2, seed=10),
    Batched(delay=8, batchsize=1, learner=baseLearner),
    Batched(delay=8, batchsize=2, learner=baseLearner),
    Batched(delay=8, batchsize=4, learner=baseLearner),
    Batched(delay=8, batchsize=8, learner=baseLearner),
]

# NOTE(review): this plots through a `standard_plot()` method on the evaluation result,
# not the notebook's own Plots.standard_plot -- confirm the installed coba still provides it.
benchmark.evaluate(learner_factories).standard_plot()

2020-11-30 13:22:49 loading simulation...
2020-11-30 13:22:49   * finished after 0.0 seconds


In [11]:
# Scratch check of the standard library shuffle.
import random
mylist = list(range(5))
# random.shuffle reorders `mylist` in place and returns None, so `res` is always None.
res=random.shuffle(mylist)
mylist

[4, 1, 2, 0, 3]

In [3]:
# Scratch check of coba's shuffle: unlike random.shuffle, the call itself evaluates to a
# shuffled list (see the cell output). Whether the argument is also modified is not shown here.
import coba.random
coba.random.shuffle(["1","2","3"])

['3', '1', '2']

In [None]:
random.s