Example usage of QSA. All of the settings below are QSA's default values; they are written out explicitly here for clarity: <br>
Environment  ====> Gridworldv2 <br>
Optimizer    ====> CMA <br>
IS Estimator ====> PDIS <br>
CIS          ====> ttest <br>

# Import modules

In [1]:
import timeit
import sys
import os
from sklearn.model_selection import train_test_split
from gym import spaces

# import QSA library
from qsa import QSA

# import environment module that you want to use
from environments.gridworldv2 import Gridworldv2
from environments.gridworld687 import Gridworld687
from environments.gridworldv1 import Gridworldv1
from environments.mountaincar import Mountaincar
from environments.cartpole import Cartpole


# import data creation modules
from data.create_dataset import Dataset
from data.create_model import Model

# import optimizer module as desired
# optimizer library gives cma, powell and bfgs from scipy
from optimizers.optimizer_library import *
# cross entropy
from optimizers.cem import *
# CMA-ES
from optimizers.cmaes import *

# import all importance sampling estimator modules
from estimators.is_estimators import *


# import CIS module
from bounds.confidence_intervals import *

# import utility modules
from utils_dir import *

# Environments

In [2]:
# Environment implementations that are available.
# To add a new environment, inherit from the environment class and implement all of its methods; see
# https://github.com/ananyagupta27/Seldonian-RL/blob/main/environments/environment.py
# and the other environments in the same folder.
env_map = {
    0: 'Mountaincar',
    1: 'Gridworldv1',
    2: 'Gridworldv2',
    3: 'Gridworld687',
    4: 'Cartpole',
}
# Environment used for the rest of this example
env = Gridworldv2()

# Importance Sampling estimators

In [3]:
# Importance sampling estimators available: PDIS, IS, WIS, DR, DR_hat.
# To add another estimator, add a function to estimators/is_estimators.py.
# The estimator selected here is passed to the QSA constructor further down.
fHat = PDIS

# Confidence Intervals

In [4]:
# Confidence intervals available: ttest, Anderson, MPeB, Phil, Hoeffding.
# To add another confidence interval, add a function to bounds/confidence_intervals.py.
cis = ttestLB
# A lower bound is used because the safety requirement in this example is to beat the
# performance of a random behavior policy, so the probabilistic guarantee has to be
# given on the lower bound.

# Optimizers

In [5]:
# Optimizers available - Powell, CMA, CMAES, BFGS, CEM supported [default: CMA]

# numpy is imported explicitly so that `np` does not depend on being re-exported
# by one of the wildcard imports in the first cell
import numpy as np

# initial policy parameters with shape (env.getStateDims(), env.getNumActions());
# initializing with zeros means a uniform random action selection policy after taking softmax
theta = np.zeros((env.getStateDims(), env.getNumActions()))

# select the optimizer; nothing is run here - the choice is handed to QSA
# through its `optimizer` argument when QSA is constructed
optimizer = CMA

# Create Dataset

In [6]:
# ratio used to split the generated episodes into candidate and safety datasets
split_ratio = 0.5
# number of episodes to generate
episodes_of_data = 100

# runs over the selected environment to generate episodes of data
datasetGenerator = Dataset(episodes_of_data, env)
# generate data with theta, the initial random policy
dataset = datasetGenerator.generate_dataset(theta)

# split dataset into candidate and safety datasets
# NOTE(review): which of the two parts receives the split_ratio fraction is decided
# inside split_dataset - confirm before using a ratio other than 0.5
candidateDataset, safetyDataset = split_dataset(dataset, split_ratio)

# QSA (Quasi Seldonian Algorithm)

In [7]:
# failure rate (delta) = 0.01
delta = 0.01

# build QSA from the choices made in the cells above
# (each parameter has a default value, refer to the documentation)
qsa = QSA(
    env,
    int(episodes_of_data * split_ratio),
    fHat,
    delta,
    candidateDataset,
    safetyDataset,
    cis=cis,
    optimizer=optimizer,
)

# Obtaining Candidate Solution

In [8]:
# ask QSA for a candidate solution; the optimizer's progress log is printed below
solution = qsa.getCandidateSolution()

CMA
(8_w,16)-aCMA-ES (mu_w=4.8,w_1=32%) in dimension 64 (seed=923213, Sat Dec 12 20:35:28 2020)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     16 1.000250220267198e+05 1.0e+00 4.74e-01  5e-01  5e-01 0:01.6
    2     32 1.000288589347467e+05 1.0e+00 4.55e-01  5e-01  5e-01 0:03.0
    3     48 1.000286503782887e+05 1.0e+00 4.38e-01  4e-01  4e-01 0:04.3
    6     96 1.000150668441777e+05 1.1e+00 4.06e-01  4e-01  4e-01 0:08.1
    9    144 1.000156848811528e+05 1.1e+00 3.90e-01  4e-01  4e-01 0:12.5
   13    208 1.000070053258650e+05 1.1e+00 3.92e-01  4e-01  4e-01 0:18.4
   15    240 1.000050768444706e+05 1.1e+00 4.09e-01  4e-01  4e-01 0:20.9
termination on maxiter=15 (Sat Dec 12 20:35:49 2020)
final/bestever f-value = 1.000052e+05 1.000051e+05
incumbent solution: [-0.42449132 -0.4619871   1.37357265  1.25319207 -0.42623942 -0.88361098
 -0.97652177  0.76095173 ...]
std deviations: [0.40976959 0.40818407 0.40796657 0.40648359 0.40824479 0.40957908
 0.40688403

# Performing Safety Test

In [12]:
# episode count handed to both the estimator and the safety test
# (the same expression that was given to QSA when it was constructed)
num_episodes = int(episodes_of_data * split_ratio)

# evaluate the candidate solution on the safety dataset with the selected estimator
result, estimates = qsa.fHat(solution, safetyDataset, num_episodes, env)
# reuse `delta` from the QSA cell instead of repeating the literal 0.01, so the safety
# test always runs at the same failure rate that QSA was configured with
passedSafetyTest, lb = qsa.safety_test(estimates, num_episodes, delta=delta, factor=1)

# Results

In [13]:
# report the lower bound, the estimate and the outcome of the safety test
print(
    "lower bound = ", lb,
    "PDIS estimate = ", result,
    "passed safety test = ", passedSafetyTest,
)

lower bound =  -18.208714748788637 PDIS estimate =  -4.4269274850202525 passed safety test =  False


 # Optimal value for this domain is around 95 
 # This estimate fails the safety test on the safety set (due to the small amount of data)
 # No Solution Found