Example usage of QSA. Every option is left at its QSA default value; the defaults are written out explicitly here for clarity: <br>
Environment  ====> Mountaincar <br>
Optimizer    ====> CMA <br>
IS Estimator ====> DR_hat <br>
CIS          ====> ttest <br>

# Import modules

In [1]:
import timeit
import sys
import os
from sklearn.model_selection import train_test_split
from gym import spaces

# import QSA library
from qsa import QSA

# import environment module that you want to use
from environments.gridworldv2 import Gridworldv2
from environments.gridworld687 import Gridworld687
from environments.gridworldv1 import Gridworldv1
from environments.mountaincar import Mountaincar
from environments.cartpole import Cartpole


# import data creation modules
from data.create_dataset import Dataset
from data.create_model import Model

# import optimizer module as desired
# optimizer library gives cma, powell and bfgs from scipy
from optimizers.optimizer_library import *
# cross entropy
from optimizers.cem import *
# CMA-ES
from optimizers.cmaes import *

# import all importance sampling estimator modules
from estimators.is_estimators import *


# import CIS module
from bounds.confidence_intervals import *

# import utility modules
from utils_dir import *

# Environments

In [2]:
# Environment implementations that ship with the library, keyed by index.
# To add a new environment, inherit the environment class and implement all of its methods.
# Refer to - https://github.com/ananyagupta27/Seldonian-RL/blob/main/environments/environment.py
# and to the other environments in the same folder
env_map = dict(enumerate(['Mountaincar', 'Gridworldv1', 'Gridworldv2', 'Gridworld687', 'Cartpole']))
# Choose an environment (this example uses Mountaincar, constructed with discrete=True)
env = Mountaincar(discrete=True)

# Importance Sampling estimators

In [3]:
# Importance sampling estimator used in this example.
# PDIS, IS, WIS, DR and DR_hat are supported;
# to add more estimators, add functions in estimators/is_estimators.py
fHat = DR_hat

# Confidence Intervals

In [4]:
# Confidence interval passed to QSA in this example.
# ttest, Anderson, MPeB, Phil and Hoeffding are supported;
# to add more confidence intervals, add functions in bounds/confidence_intervals.py
cis = ttestLB
# A lower bound is used because the safety requirement in this example is to beat the performance
# of a random behavior, hence probabilistic guarantees are provided on the lower bound

# Optimizers

In [5]:
# Optimizers available - Powell, CMA, CMAES, BFGS, CEM supported [default: CMA]

# Import numpy explicitly so that `np` does not depend on being re-exported
# by one of the wildcard imports in the import cell.
import numpy as np

# initialize initial policy parameter
# initializing with zeros would mean uniform random action selection policy after taking softmax
theta = np.zeros((env.getStateDims(), env.getNumActions()))

# select the optimizer; nothing is constructed here, the chosen optimizer is
# handed to QSA through its `optimizer` argument
optimizer = CMA

# Create Dataset

In [6]:
# candidate/safety split ratio and the episode budget
split_ratio = 0.5
episodes_of_data = 100

# ---- candidate data ----
# number of episodes generated for the candidate dataset
num_candidate_episodes = int(episodes_of_data * (1 - split_ratio))

# run the selected environment to generate episodes with the initial (all-zeros) policy
datasetGenerator = Dataset(num_candidate_episodes, env)
candidateDataset = datasetGenerator.generate_dataset(theta)

# build the MLE model for estimating p and R from the generated episodes
model = Model(
    env,
    candidateDataset,
    num_candidate_episodes,
    env.getNumDiscreteStates(),
    env.getNumActions(),
    env.horizonLength,
)
candidateDataset = model.makeMLEModel()

# ---- safety data ----
# NOTE(review): the safety dataset is generated with all `episodes_of_data` episodes,
# while the QSA constructor and the safety test in this notebook are given
# int(episodes_of_data*split_ratio) -- confirm which episode count is intended.
datasetGenerator = Dataset(int(episodes_of_data), env)

# the policy parameters are reset to zeros before generating the safety episodes
theta = np.zeros((env.getStateDims(), env.getNumActions()))
safetyDataset = datasetGenerator.generate_dataset(theta)

# same MLE modelling step, applied to the safety episodes
model = Model(
    env,
    safetyDataset,
    episodes_of_data,
    env.getNumDiscreteStates(),
    env.getNumActions(),
    env.horizonLength,
)
safetyDataset = model.makeMLEModel()





p and R
100 L 152 3
calculated value functions
done modeling
p and R
100 L 152 3
calculated value functions
done modeling


In [12]:
# print(safetyDataset)

# QSA (Quasi Seldonian Algorithm)

In [8]:
# delta is the failure rate; 0.01 is used in this example
delta = 0.01

# build QSA from the choices made in the cells above
# (each argument has a default value, refer to the documentation)
qsa = QSA(
    env,
    int(episodes_of_data*split_ratio),
    fHat,
    delta,
    candidateDataset,
    safetyDataset,
    cis=cis,
    optimizer=optimizer,
)

# Obtaining Candidate Solution

In [9]:
# Ask QSA for a candidate solution; it is evaluated on the safety dataset in the safety-test cell
solution = qsa.getCandidateSolution()

CMA
(11_w,22)-aCMA-ES (mu_w=6.5,w_1=26%) in dimension 459 (seed=976472, Sun Dec 13 00:40:33 2020)
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     22 1.005482245762618e+05 1.0e+00 4.93e-01  5e-01  5e-01 0:11.9
    2     44 1.005136411861163e+05 1.0e+00 4.87e-01  5e-01  5e-01 0:28.0
    3     66 1.004918277594110e+05 1.0e+00 4.81e-01  5e-01  5e-01 0:38.7
    4     88 1.004465530549273e+05 1.0e+00 4.75e-01  5e-01  5e-01 0:49.5
    5    110 1.004417032045986e+05 1.0e+00 4.70e-01  5e-01  5e-01 1:03.4
    6    132 1.004268830628257e+05 1.0e+00 4.65e-01  5e-01  5e-01 1:17.1
    7    154 1.004085957704779e+05 1.0e+00 4.61e-01  5e-01  5e-01 1:28.3
    8    176 1.004012309007210e+05 1.0e+00 4.57e-01  5e-01  5e-01 1:44.1
    9    198 1.003934124008213e+05 1.0e+00 4.54e-01  5e-01  5e-01 2:01.6
   10    220 1.003913747338216e+05 1.0e+00 4.50e-01  4e-01  5e-01 2:10.6
termination on maxiter=10 (Sun Dec 13 00:42:45 2020)
final/bestever f-value = 1.003919e+05 1.003914e

# Performing Safety Test

In [10]:
# Evaluate the candidate solution on the safety dataset with the estimator held by QSA.
# The call returns the overall estimate (`result`) together with the `estimates`
# that the safety test consumes.
result, estimates = qsa.fHat(solution, safetyDataset, int(episodes_of_data*split_ratio), env)

# Run the safety test with the same `delta` that QSA was configured with, rather than
# repeating the literal 0.01, so both stay in sync if the failure rate is changed.
# `lb` is the lower bound that is reported next to env.threshold in the results cell.
passedSafetyTest, lb = qsa.safety_test(estimates, int(episodes_of_data*split_ratio), delta=delta, factor=1)

# Results

In [11]:
# Report the threshold, the lower bound, the estimate and the outcome of the safety test
print(
    "threshold performance = ", env.threshold,
    "lower bound = ", lb,
    "DR_hat = ", result,
    "passed safety test = ", passedSafetyTest,
)

threshold performance =  -390 lower bound =  -645.2086476153451 DR_hat =  -480.15408762611435 passed safety test =  False


 # The candidate solution fails the safety test on the safety set: the lower bound is below the threshold performance (due to the small amount of data)
 # No Solution Found