In [None]:
import argparse
import os
from datetime import datetime
from os import path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import trange

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
from tabular.algo import model_based
from tabular.algo.model_based import ModelBased
from tabular.finite_mdp import FiniteMDP

In [None]:
# Experiment configuration. The options are declared with argparse and then
# parsed from an explicit argument list (not sys.argv), so the notebook's
# settings are fixed by the list passed to parse_args() below.
parser = argparse.ArgumentParser(description="Finite-horizon MDP")
parser.add_argument("--n-episode", type=int, default=1000)
parser.add_argument("--n-action", type=int, default=2)
parser.add_argument("--n-step", type=int, default=2)
parser.add_argument("--state-per-stage", type=int, default=2)
# --p and --alpha have fractional defaults, so they must be parsed as floats;
# with type=int any explicit value such as "0.2" is rejected by argparse.
parser.add_argument("--p", type=float, default=0.05)
parser.add_argument("--alpha", type=float, default=0.1)
parser.add_argument("--n-run", type=int, default=10)
parser.add_argument("--n-pol-eval-step", type=int, default=1)
parser.add_argument("--c", type=float, default=0.1)
# random_reward defaults to True; --no-random-reward is the only way to turn
# it off, since a store_true flag with default=True can never yield False.
parser.add_argument("--random-reward", dest="random_reward", action="store_true", default=True)
parser.add_argument("--no-random-reward", dest="random_reward", action="store_false")
args = parser.parse_args(
    [
        "--n-episode", "1000",
        "--n-action", "4",
        "--n-step", "2",
        "--state-per-stage", "2",
        "--c", "0.1",
        "--n-run", "1",
    ]
)
# Plain dict view of the parsed namespace (keys use underscores, e.g. "n_episode").
setting = vars(args)

In [None]:
def exp(prev_estimate):
    """Run one policy-iteration experiment and plot its diagnostics.

    Builds a fresh ``FiniteMDP`` from the module-level ``setting`` dict, runs a
    ``ModelBased`` agent in ``POLICY_ITERATION`` mode on it, and shows a
    two-panel figure: ``info['delta']`` with a zero reference line on the
    left, and the cumulative sum of the returned regret on the right.

    Args:
        prev_estimate: Forwarded unchanged to ``ModelBased`` as
            ``using_previous_estimate``.

    Returns:
        None. The figure is displayed as a side effect.
    """
    env = FiniteMDP(setting)
    algorithm = ModelBased(
        algorithm_type=model_based.POLICY_ITERATION,
        using_previous_estimate=prev_estimate,
    )
    regret, info = algorithm.run(setting["c"], setting, env)

    # Explicit figure/axes instead of the implicit pyplot "current axes" state,
    # so each call draws on its own figure regardless of earlier plotting.
    fig, (delta_ax, regret_ax) = plt.subplots(1, 2)
    delta_ax.plot(info["delta"])
    delta_ax.axhline(y=0)  # zero reference line
    delta_ax.set_title("delta")
    regret_ax.plot(np.cumsum(regret))
    regret_ax.set_title("cumulative regret")
    plt.show()
    # This function is called repeatedly in loops; release the figure once shown.
    plt.close(fig)

# Previous estimate: runs with `using_previous_estimate=True`

In [None]:
# Repeat the experiment four times with prev_estimate=True; each call builds
# its own environment and shows its own figure.
for _ in range(0, 4):
    exp(prev_estimate=True)

# Same estimate: runs with `using_previous_estimate=False`

In [None]:
# Repeat the experiment four times with prev_estimate=False; each call builds
# its own environment and shows its own figure.
for _ in range(0, 4):
    exp(prev_estimate=False)