Example usage of QSA. Every setting below is the QSA default; the defaults are spelled out explicitly here for clarity - <br>
Environment  ====> Gridworldv2 <br>
Optimizer    ====> CMA <br>
IS Estimator ====> PDIS <br>
CIS          ====> ttest <br>

# Import modules

In [1]:
import timeit
import sys
import os
from sklearn.model_selection import train_test_split
from gym import spaces

# import QSA library
from qsa import QSA

# import environment module that you want to use
from environments.gridworldv2 import Gridworldv2
from environments.gridworld687 import Gridworld687
from environments.gridworldv1 import Gridworldv1
from environments.mountaincar import Mountaincar
from environments.cartpole import Cartpole


# import data creation modules
from data.create_dataset import Dataset
from data.create_model import Model

# import optimizer module as desired
# optimizer library gives cma, powell and bfgs from scipy
from optimizers.optimizer_library import *
# cross entropy
from optimizers.cem import *
# CMA-ES
from optimizers.cmaes import *

# import all importance sampling estimator modules
from estimators.is_estimators import *


# import CIS module
from bounds.confidence_intervals import *

# import utility modules
from utils_dir import *

# Environments

In [2]:
# Environment implementations that ship with the library, listed for reference.
# To add a new environment, inherit from the environment class and implement all of its methods.
# See https://github.com/ananyagupta27/Seldonian-RL/blob/main/environments/environment.py
# and the other environments in the same folder for examples.
env_map = {
    0: 'Mountaincar',
    1: 'Gridworldv1',
    2: 'Gridworldv2',
    3: 'Gridworld687',
    4: 'Cartpole',
}
# The environment used for the rest of this example
env = Gridworldv2()

# Importance Sampling estimators

In [3]:
# Importance sampling estimators available: PDIS, IS, WIS, DR, DR_hat.
# To add another estimator, add a function to estimators/is_estimators.py.
# The estimator chosen here is passed to QSA below.
fHat = PDIS

# Confidence Intervals

In [4]:
# Confidence intervals available: ttest, Anderson, MPeB, Phil, Hoeffding.
# To add another confidence interval, add a function to bounds/confidence_intervals.py.
cis = ttestLB
# A lower bound is used because the safety requirement in this example is to beat the
# performance of the random behavior policy, so the probabilistic guarantee is on the lower bound.

# Optimizers

In [5]:
# Optimizers available - Powell, CMA, CMAES, BFGS, CEM supported [default: CMA]

# Initial policy parameters, shaped (env.getStateDims(), env.getNumActions()).
# All zeros gives a uniform random action-selection policy once softmax is applied.
# NOTE(review): `np` is not imported explicitly in this notebook; presumably it is
# re-exported by one of the star imports above - confirm.
theta = np.zeros((env.getStateDims(), env.getNumActions()))

# Select the optimizer. Nothing is constructed here: the name is only bound and is
# handed to QSA below through its `optimizer=` argument.
optimizer = CMA

# Create Dataset

In [9]:
# Ratio used to split the data into candidate and safety sets, and the total
# number of episodes to generate.
# NOTE(review): which of the two sets receives the `split_ratio` share depends on
# split_dataset - confirm before changing it away from 0.5.
split_ratio = 0.5
episodes_of_data = 1000

# Generator that runs the selected environment to produce episodes of data
datasetGenerator = Dataset(episodes_of_data, env)
# Generate the data with the initial (uniform random) policy parameters
dataset = datasetGenerator.generate_dataset(theta)

# Split the dataset into candidate and safety datasets
candidateDataset, safetyDataset = split_dataset(dataset, split_ratio)

# QSA (Quasi Seldonian Algorithm)

In [16]:
# delta is the allowed failure rate of the safety guarantee
delta = 0.01

# Build QSA from the choices made above. Every argument has a default value
# (refer to the documentation); they are passed explicitly here for clarity.
qsa = QSA(
    env,
    int(episodes_of_data * split_ratio),
    fHat,
    delta,
    candidateDataset,
    safetyDataset,
    cis=cis,
    optimizer=optimizer,
)

# Obtaining Candidate Solution

In [11]:
# Ask QSA for a candidate solution (policy parameters); the optimizer log shown
# below is the output of this cell.
solution = qsa.getCandidateSolution()

CMA
(8_w,16)-aCMA-ES (mu_w=4.8,w_1=32%) in dimension 64 (seed=983998, Sat Dec 12 20:05:32 2020)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     16 1.000245466710725e+05 1.0e+00 4.75e-01  5e-01  5e-01 0:16.3
    2     32 1.000074403536040e+05 1.0e+00 4.55e-01  5e-01  5e-01 0:31.2
    3     48 1.000155785264909e+05 1.0e+00 4.40e-01  4e-01  4e-01 0:49.5
    4     64 1.000025429231585e+05 1.0e+00 4.23e-01  4e-01  4e-01 1:08.2
    5     80 1.000003702167943e+05 1.0e+00 4.13e-01  4e-01  4e-01 1:24.5
    6     96 1.000043218841156e+05 1.1e+00 4.06e-01  4e-01  4e-01 1:39.8
    7    112 9.999577539049044e+04 1.1e+00 4.01e-01  4e-01  4e-01 2:02.8
    8    128 9.999518255284854e+04 1.1e+00 3.97e-01  4e-01  4e-01 2:27.9
    9    144 9.998647439812800e+04 1.1e+00 3.93e-01  4e-01  4e-01 2:44.5
   10    160 9.998155029834039e+04 1.1e+00 3.88e-01  4e-01  4e-01 3:02.0
   11    176 9.998335236067725e+04 1.1e+00 3.81e-01  4e-01  4e-01 3:17.8
   12    192 9.99940986887457

# Performing Safety Test

In [17]:
# Episode count handed to both the estimator and the safety test. This is the same
# expression that was passed to the QSA constructor, named once to avoid drift.
num_episodes = int(episodes_of_data * split_ratio)

# Evaluate the candidate solution on the held-out safety dataset with the chosen
# importance sampling estimator. `result` is the overall estimate that is printed
# below; `estimates` is what the safety test consumes.
result, estimates = qsa.fHat(solution, safetyDataset, num_episodes, env)

# Run the safety test with the same failure rate `delta` that QSA was configured
# with, instead of repeating the literal 0.01, so the two cannot get out of sync.
passedSafetyTest, lb = qsa.safety_test(estimates, num_episodes, delta=delta, factor=1)

# Results

In [18]:
# Report the lower bound from the safety test, the estimate on the safety data,
# and whether the candidate solution passed.
# NOTE: the label says "PDIS" because fHat = PDIS in this example.
print("lower bound = ", lb, "PDIS estimate = ", result, "passed safety test = ", passedSafetyTest)

lower bound =  56.34744057407149 PDIS estimate =  98.6329588091057 passed safety test =  True


 (The optimal value for this domain is around 95, so this estimate is very close to the expected solution.)

# Policy Learned

In [20]:
# Show the learned policy parameters reshaped to
# (env.getStateDims(), env.getNumActions()), the same shape as the initial theta.
print(solution.reshape(env.getStateDims(), env.getNumActions()))

[[-0.23026852 -0.12605438  0.03937284 -0.09534596]
 [ 0.15681257  0.63013497  0.44250409  0.42806969]
 [-0.25162532  0.18870361  0.75923558  0.55164717]
 [-0.15134266 -0.41557884 -0.73433601  0.89512895]
 [-1.07159732  0.74582771  0.21210795 -0.14943435]
 [-0.96937218  0.69061937 -0.04948645 -0.06326128]
 [-0.16207679  0.06645783 -0.17995619  0.21348601]
 [-0.7725456  -1.28605593 -0.67668946  0.79263468]
 [-0.29855585  0.73174064  0.10162056 -0.27585082]
 [ 0.23798618  1.17084567 -1.16541824  0.28122553]
 [-0.05435876  0.91513922 -0.16608842  0.90972586]
 [ 0.27430375 -0.90226269 -0.0485854   1.20058051]
 [-0.2598932   0.33794037  0.28827164  0.14931284]
 [-1.027389   -0.63214268 -0.25042047 -0.49438158]
 [ 0.06530747  0.65102643  0.62531958 -1.45980724]
 [-0.21890324  0.03134366 -0.15776815  0.50272296]]
