# Helpers

In [None]:
import itertools
import json
import pandas as pd
import random
import numpy as np
import random
import matplotlib.pyplot as plt

from vowpalwabbit import pyvw

class environment:
    """Simulated contextual-bandit world with one reward rate per (person, arm).

    ``means`` is indexed as ``means[person][arm]``; its first two dimensions
    give the number of people and the number of arms. The class also builds
    the VW text lines for the shared (person) context and for each arm.
    """

    def __init__(self, means):
        self.means = means
        self.npeople = means.shape[0]
        self.narms = means.shape[1]

        # One text line per arm and one "shared" context line per person.
        self.arms = [f'| a_{arm}' for arm in range(self.narms)]
        self.people = [f'shared | p_{person}' for person in range(self.npeople)]

    def generate(self, cmd, count):
        """Simulate ``count`` interactions against a fresh VW instance.

        Each iteration picks a random person, samples an arm from the
        normalized VW prediction, draws a 0/1 reward using
        ``means[person][arm]`` as the success threshold, and learns from it.

        Returns a list of dicts with keys ``p``, ``chosen``, ``online``
        (the normalized prediction for the chosen arm) and ``r``.
        """
        vw = pyvw.vw(cmd)
        log = []
        for _ in range(count):
            person = random.randint(0, self.npeople - 1)
            pred_examples = [vw.example(line) for line in self.get_pred_ex(person)]
            pmf = vw.predict(pred_examples)
            # Renormalize so the values sum to 1, as np.random.choice requires.
            pmf = np.divide(pmf, np.sum(pmf))
            arm = np.random.choice(self.narms, p=pmf)
            reward = int(random.uniform(0, 1) < self.means[person][arm])
            learn_lines = self.get_learn_ex(person, arm, pmf[arm], reward)
            vw.learn([vw.example(line) for line in learn_lines])
            log.append({'p': person, 'chosen': arm, 'online': pmf[arm], 'r': reward})
        return log

    def get_pred_ex(self, person):
        """Return a new list: the person's shared line followed by every arm line."""
        return [self.people[person], *self.arms]

    def get_learn_ex(self, person, action, prob, reward):
        """Return the prediction lines with a label prefixed to the chosen arm.

        The label is ``action:-reward:prob`` (the reward is negated in the label).
        """
        lines = self.get_pred_ex(person)
        # Offset by one because index 0 holds the shared context line.
        slot = action + 1
        lines[slot] = f'{action}:{-reward}:{prob} {lines[slot]}'
        return lines


class steps_gen:
    """Cyclic source of episode lengths.

    ``get()`` returns the entries of ``steps`` in order, starting with the
    first one, and wraps around to the beginning after the last.
    """

    def __init__(self, steps):
        """Store the sequence to cycle through.

        Raises:
            ValueError: if ``steps`` is empty (there would be nothing to return).
        """
        if len(steps) == 0:
            raise ValueError('steps must contain at least one episode length')
        self.steps = steps
        self.i = 0  # index of the value the next get() call returns

    def get(self):
        """Return the next episode length and advance the cursor cyclically."""
        # Read before advancing so the very first call yields steps[0];
        # advancing first would skip it until the sequence wraps around.
        value = self.steps[self.i]
        self.i = (self.i + 1) % len(self.steps)
        return value

def cfe(cmd, scenario, env, get_steps, reward_f, name, with_episode_length):
    """Replay logged events through a fresh VW instance, episode by episode.

    Events from ``scenario`` are grouped into episodes whose lengths come from
    ``get_steps.get()``. When an episode is complete, the model first predicts
    on every step (recording the predicted value at the logged chosen action),
    then learns on every step using a single episode-level reward.

    Args:
        cmd: VW command-line string used to create the learner.
        scenario: iterable of event dicts with keys ``p``, ``chosen``,
            ``online`` and ``r``.
        env: object providing ``get_pred_ex(person)`` and
            ``get_learn_ex(person, action, prob, reward)``.
        get_steps: object whose ``get()`` returns the next episode length.
        reward_f: callable mapping the list of step rewards in an episode to
            the reward used for learning on every step of that episode.
        name: column name of the returned DataFrame.
        with_episode_length: currently unused; kept for interface compatibility.

    Returns:
        A single-column ``pd.DataFrame`` with one row per event that belonged
        to a completed episode. Events in a trailing, incomplete episode are
        dropped, so the frame may be shorter than ``scenario``.
    """
    from vowpalwabbit import pyvw

    result = []
    vw = pyvw.vw(cmd)
    remains = get_steps.get()
    episode = []
    for event in scenario:
        episode.append(event)
        remains -= 1
        if remains != 0:
            continue

        # Predict on the whole episode before learning from any of its steps.
        rewards = []
        for step in episode:
            ex_pred = [vw.example(e) for e in env.get_pred_ex(step['p'])]
            pred = vw.predict(ex_pred)
            result.append(pred[step['chosen']])
            rewards.append(step['r'])

        # Every step in the episode is trained with the same aggregated reward
        # and the probability logged at data-collection time.
        reward = reward_f(rewards)
        for step in episode:
            ex_learn_str = env.get_learn_ex(step['p'], step['chosen'], step['online'], reward)
            vw.learn([vw.example(e) for e in ex_learn_str])

        episode = []
        remains = get_steps.get()
    return pd.DataFrame({name: result})

def plot_action_perf(df, means, prob_columns=None):
    """Plot, per person, the running mean of probability columns on best-arm events.

    Args:
        df: DataFrame with at least the columns ``p`` and ``chosen`` plus every
            column named in ``prob_columns``.
        means: 2-D array indexed ``[person, arm]``; the best arm for a person
            is the argmax along axis 1.
        prob_columns: names of the columns to plot. Defaults to ``['online']``.

    One line is drawn per (column, person) pair using only the rows where the
    chosen arm equals that person's best arm; the x axis is logarithmic.
    """
    # None sentinel instead of a mutable list default shared across calls.
    if prob_columns is None:
        prob_columns = ['online']

    best = pd.DataFrame(
        [{'p': person, 'best': arm} for person, arm in enumerate(np.argmax(means, axis=1))]
    )
    details = pd.merge(df, best, on='p', how='left')
    # Loop-invariant mask: rows where the logged choice was the best arm.
    chose_best = details['chosen'] == details['best']
    for prob_column in prob_columns:
        for i in range(means.shape[0]):
            selected = details[(details['p'] == i) & chose_best][prob_column]
            selected.expanding().mean().plot(label=f'{prob_column}/{i}', logx=True, figsize=(10,6))
    plt.legend()

def plot(results, policies):
    """Draw one running-mean curve per policy column on a log-x axis.

    For each name in ``policies`` the plotted series is
    ``results['r'] * results[name] / results['online']``, i.e. the reward
    scaled by the ratio of the policy's column to the ``online`` column.
    """
    for policy in policies:
        weighted_reward = results['r'] * results[policy] / results['online']
        running_mean = weighted_reward.expanding().mean()
        running_mean.plot(logx=True, figsize=(8,6), label=policy)
    plt.legend()

# Setup arms configuration

In [None]:
npeople = 4
narms = 8

# Reward rate for every (person, arm) pair, drawn uniformly from [0, 1).
means = np.random.rand(npeople, narms)

# Reference levels: uniform-random arm choice, always-best arm, and a mix of
# 80% best arm with 20% uniform-random choice.
random_perf = np.mean(means)
best_perf = np.max(means, axis=1).mean()
print(f'Random performance: {random_perf}')
print(f'Best performance: {best_perf}')
print(f'Best performance with 0.2 exploration: {best_perf * 0.8 + 0.2 * random_perf}')

# Generate some data

In [None]:
# VW options for the data-collecting learner (epsilon 0.2).
vw_args = ' '.join([
    '--cb_explore_adf',
    '--dsjson',
    '--epsilon 0.2',
    '--coin',
    '--power_t 0',
    '--quiet',
    '--cb_type mtr',
    '-q ::',
])

# Simulate 1024 * 128 logged interactions and view them as a DataFrame.
env = environment(means)
events = env.generate(vw_args, 1024 * 128)
events_df = pd.DataFrame(events)
events_df.head()

In [None]:
# Per person: running mean of the logged 'online' probability on events
# where the chosen arm was that person's best arm.
plot_action_perf(df=events_df, means=means)

In [None]:
# Overall mean reward of the logged data, then its running mean over time.
reward_series = events_df['r']
print(f'r_mean = {reward_series.mean()}')
reward_series.expanding().mean().plot(figsize=(10,6), logx=True)

In [None]:
# VW options for the replayed learners: same as data collection but epsilon 0.
vw_args = ' '.join([
    '--cb_explore_adf',
    '--dsjson',
    '--epsilon 0',
    '--coin',
    '--power_t 0',
    '--quiet',
    '--cb_type mtr',
    '-q ::',
])

# Single-event episodes.
step1 = cfe(
    cmd=vw_args, scenario=events, env=env, get_steps=steps_gen([1]),
    reward_f=np.mean, name='1step', with_episode_length=False,
)

# Two-event episodes, aggregating the episode reward by mean and by sum.
step2mean = cfe(
    cmd=vw_args, scenario=events, env=env, get_steps=steps_gen([2]),
    reward_f=np.mean, name='2step.mean', with_episode_length=False,
)
step2sum = cfe(
    cmd=vw_args, scenario=events, env=env, get_steps=steps_gen([2]),
    reward_f=np.sum, name='2step.sum', with_episode_length=False,
)

In [None]:
# Four-event episodes, aggregating the episode reward by mean and by sum.
# Uses cfe: no function named `run` is defined in this notebook.
step4mean = cfe(vw_args, events, env, steps_gen([4]), np.mean, '4step.mean', with_episode_length=False)
step4sum = cfe(vw_args, events, env, steps_gen([4]), np.sum, '4step.sum', with_episode_length=False)

In [None]:
# Eight-event episodes, aggregating the episode reward by mean and by sum.
# Uses cfe: no function named `run` is defined in this notebook.
step8mean = cfe(vw_args, events, env, steps_gen([8]), np.mean, '8step.mean', with_episode_length=False)
step8sum = cfe(vw_args, events, env, steps_gen([8]), np.sum, '8step.sum', with_episode_length=False)

In [None]:
# Sixteen-event episodes, aggregating the episode reward by mean and by sum.
# Uses cfe: no function named `run` is defined in this notebook.
step16mean = cfe(vw_args, events, env, steps_gen([16]), np.mean, '16step.mean', with_episode_length=False)
step16sum = cfe(vw_args, events, env, steps_gen([16]), np.sum, '16step.sum', with_episode_length=False)

In [None]:
# Attach every replay's single column to the logged events, aligned on index.
results = (
    events_df
    .join(step1)
    .join(step2mean)
    .join(step2sum)
    .join(step4mean)
    .join(step4sum)
    .join(step8mean)
    .join(step8sum)
    .join(step16mean)
    .join(step16sum)
)

In [None]:
# Compare all replayed policies: '1step', then mean/sum for 2, 4, 8 and 16 steps.
plot(
    results,
    ['1step'] + [
        f'{length}step.{agg}'
        for length in (2, 4, 8, 16)
        for agg in ('mean', 'sum')
    ],
)