In [1]:
import numpy as np
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.autograd import Variable
import copy
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import KFold
from scipy.stats import norm
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
from torch.distributions import Categorical
from collections import deque
from torch.optim.lr_scheduler import SequentialLR, LinearLR, CosineAnnealingLR
from sklearn.linear_model import LogisticRegression

In [2]:
torch.cuda.is_available()

True

In [3]:
# Compute device shared by every network below: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
## parameter setting
# Mixing weights of the behavior policy read by `step`:
#   P(a = 1 | s) = policy[0] * sigmoid(s[0]) + policy[1] * sigmoid(s[1])
policy = torch.tensor([0.5, 0.5], dtype=torch.float32)
# Mixing weights of the target policy read by `step_new`, `get_new_probs`
# and `QTD_Agent.get_target_policy_probs` (same functional form as above).
policy_new = torch.tensor([0.4, 0.6], dtype=torch.float32)

In [5]:
#################################
###### move one step forward ####
#################################
def sigmoid(t):
    """Logistic function 1 / (1 + exp(-t)), evaluated with NumPy."""
    denominator = 1 + np.exp(-t)
    return 1 / denominator

def step(last_obs):
    """Simulate one transition under the behavior policy (`policy`).

    last_obs: current two-dimensional state.
    Returns the tuple (a, r, s_next): the sampled binary action, the reward
    and the next state.
    """
    # Probability of action 1: `policy`-weighted mix of the sigmoids of both coordinates.
    p_one = policy[0]*sigmoid(last_obs[0]) + policy[1]*sigmoid(last_obs[1])
    a = np.random.binomial(1, p=p_one)

    # The action flips the sign pattern of the diagonal transition matrix.
    coef = 0.75*(1-2*a)
    mat = np.array([[-coef, 0], [0, coef]])
    z = np.random.multivariate_normal([0,0], [[0.25,0],[0,0.25]])
    s_next = mat@last_obs + z

    # Reward is linear in the next state, shifted by -0.25 for a=1 and +0.25 for a=0.
    r = np.dot(s_next, [2,1]) - 0.25*(2*a-1)
    return a, r, s_next


def step_new(last_obs):
    """Simulate one transition under the target policy (`policy_new`).

    last_obs: current two-dimensional state.
    Returns the tuple (a, r, s_next): the sampled binary action, the reward
    and the next state. The dynamics are the same as in `step`; only the
    action probabilities differ.
    """
    prob_action_one = (policy_new[0]*sigmoid(last_obs[0])
                       + policy_new[1]*sigmoid(last_obs[1]))
    a = np.random.binomial(1, p=prob_action_one)

    sign = 1-2*a  # +1 when a == 0, -1 when a == 1
    mat = np.array([[-0.75*sign, 0], [0, 0.75*sign]])
    noise = np.random.multivariate_normal([0,0], [[0.25,0],[0,0.25]])
    s_next = mat@last_obs + noise

    r = np.dot(s_next, [2,1]) - 0.25*(2*a-1)
    return a, r, s_next

def get_new_probs(state,action):
    """Probability that the target policy (`policy_new`) picks `action` in `state`.

    `action` is used arithmetically as 0 or 1: the result is p for action 1
    and 1 - p for action 0, where p is the `policy_new`-weighted mix of the
    sigmoids of the two state coordinates.
    """
    p_one = policy_new[0].item()*sigmoid(state[0])+policy_new[1].item()*sigmoid(state[1])
    return action*p_one + (1-action)*(1-p_one)

In [6]:
#################################
#### generate one trajectory ####
#################################


def gen_traj(T, gam, seed=None, s_init=None):
    """Roll out one trajectory of length T under the behavior policy (`step`).

    T: number of transitions to simulate
    gam: discount factor used for the accumulated return
    seed: if given, NumPy's global RNG is re-seeded with it first
    s_init: if given, used as the initial state

    Returns [s_traj, a_traj, r_traj, ret]: the T+1 visited states, the T
    actions, the T rewards and the discounted sum of the rewards.
    """
    if seed is not None:
        np.random.seed(seed)
    # A standard-normal initial state is drawn whenever a seed is supplied or
    # no explicit start is given; an explicit s_init then overrides the draw.
    if seed is not None or s_init is None:
        s = np.random.multivariate_normal([0,0],[[1,0],[0,1]])
    if s_init is not None:
        s = s_init

    s_traj, a_traj, r_traj = [s], [], []
    ret = 0
    for t in range(T):
        a, r, s = step(s)  # `s` now holds the next state
        s_traj.append(s)
        a_traj.append(a)
        r_traj.append(r)
        ret += r * gam**t

    return [s_traj, a_traj, r_traj, ret]


def gen_traj_new(T, gam, seed=None, s_init=None):
    """Roll out one trajectory of length T under the target policy (`step_new`).

    T: number of transitions to simulate
    gam: discount factor used for the accumulated return
    seed: if given, NumPy's global RNG is re-seeded with it first
    s_init: if given, used as the initial state

    Returns [s_traj, a_traj, r_traj, ret]: the T+1 visited states, the T
    actions, the T rewards and the discounted sum of the rewards.
    """
    if seed is not None:
        np.random.seed(seed)
    # The random initial state is drawn unless only s_init (and no seed) was given.
    draw_start = seed is not None or s_init is None
    if draw_start:
        s = np.random.multivariate_normal([0,0],[[1,0],[0,1]])
    if s_init is not None:
        s = s_init

    s_traj = [s]
    a_traj = []
    r_traj = []
    ret = 0
    for t in range(T):
        a, r, s = step_new(s)  # `s` now holds the next state
        s_traj.append(s)
        a_traj.append(a)
        r_traj.append(r)
        ret += r * gam**t

    return [s_traj, a_traj, r_traj, ret]

In [7]:
#######################
#### generate data ####
#######################


def data_gen(N, T_obs, T, gam, seed=None, s_init=None):
    """Simulate N behavior-policy trajectories and keep the first T_obs steps.

    N: number of trajectories
    T_obs: number of observed stages kept per trajectory
    T: number of transitions simulated per trajectory (its return uses all T)
    gam: discount factor
    seed: base seed; trajectory i (0-based) is generated with seed + i + 1
    s_init: optional fixed initial state passed on to `gen_traj`

    Returns [s_data, a_data, r_data, ret_data] with shapes (N, T_obs, 2),
    (N, T_obs), (N, T_obs) and a list of N discounted returns.
    """
    s_data = np.zeros((N, T_obs, 2))
    a_data = np.zeros((N, T_obs), dtype=int)
    r_data = np.zeros((N, T_obs))
    ret_data = []

    for i in range(N):
        traj_seed = None if seed is None else seed + i + 1
        states, actions, rewards, ret = gen_traj(T, gam, traj_seed, s_init)
        s_data[i] = states[:T_obs]
        a_data[i] = actions[:T_obs]
        r_data[i] = rewards[:T_obs]
        ret_data.append(ret)

    return [s_data, a_data, r_data, ret_data]

def data_gen_new(N, T_obs, T, gam, seed=None, s_init=None):
    """Simulate N target-policy trajectories and keep the first T_obs steps.

    N: number of trajectories
    T_obs: number of observed stages kept per trajectory
    T: number of transitions simulated per trajectory (its return uses all T)
    gam: discount factor
    seed: base seed; trajectory i (0-based) is generated with seed + i + 1
    s_init: optional fixed initial state passed on to `gen_traj_new`

    Returns [s_data, a_data, r_data, ret_data] with shapes (N, T_obs, 2),
    (N, T_obs), (N, T_obs) and a list of N discounted returns.
    """
    s_data = np.zeros((N, T_obs, 2))
    a_data = np.zeros((N, T_obs), dtype=int)
    r_data = np.zeros((N, T_obs))
    ret_data = []

    for i in range(N):
        if seed is None:
            traj_seed = None
        else:
            traj_seed = seed + i + 1
        traj = gen_traj_new(T, gam, traj_seed, s_init)
        s_data[i], a_data[i], r_data[i] = traj[0][:T_obs], traj[1][:T_obs], traj[2][:T_obs]
        ret_data.append(traj[3])

    return [s_data, a_data, r_data, ret_data]

In [None]:
class QuantileNetwork(nn.Module):
    """MLP that maps a state to `num_quantiles` values for each action.

    The output of `forward` has shape [batch, action_dim, num_quantiles].
    All layers are placed on the notebook-level `device`.
    """

    def __init__(self, state_dim, action_dim, num_quantiles, hidden_size=32):
        super().__init__()
        self.num_quantiles = num_quantiles
        self.action_dim = action_dim

        # Shared trunk: two hidden layers with ReLU activations.
        trunk = [
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        ]
        self.feature = nn.Sequential(*trunk).to(device)

        # Linear head emitting one value per (action, quantile) pair.
        self.quantiles = nn.Linear(hidden_size, action_dim * num_quantiles).to(device)

    def forward(self, x):
        """Return the quantile estimates for a batch of states `x`."""
        # Non-tensor input is converted to a float tensor; tensors are moved only if needed.
        if isinstance(x, torch.Tensor):
            if x.device != device:
                x = x.to(device)
        else:
            x = torch.FloatTensor(x).to(device)

        flat = self.quantiles(self.feature(x))
        return flat.view(-1, self.action_dim, self.num_quantiles)

class BehaviorPolicyNetwork(nn.Module):
    """MLP producing one logit per action; `get_probs` applies a softmax.

    All layers are placed on the notebook-level `device`.
    """

    def __init__(self, state_dim, action_dim, hidden_size=32):
        super().__init__()
        # Two hidden ReLU layers followed by a linear logit layer.
        layers = [
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_dim),
        ]
        self.net = nn.Sequential(*layers).to(device)

    def forward(self, x):
        """Return the action logits for a batch of states `x`."""
        # Non-tensor input is converted to a float tensor; tensors are moved only if needed.
        if isinstance(x, torch.Tensor):
            if x.device != device:
                x = x.to(device)
        else:
            x = torch.FloatTensor(x).to(device)
        return self.net(x)

    def get_probs(self, x):
        """Return the action probabilities (softmax over the last dimension)."""
        return torch.softmax(self.forward(x), dim=-1)

class QTD_Agent:
    """Quantile-TD learner for a fixed target policy, trained from a replay buffer.

    Holds an online and a target `QuantileNetwork`, a `BehaviorPolicyNetwork`
    fitted by maximum likelihood on the buffered (state, action) pairs, and a
    replay buffer of transitions. The target policy is the `policy_new`
    mixture defined at notebook level.
    """

    def __init__(self, state_dim, action_dim, gamma, lr, num_quantiles, 
                 behavior_lr, beta, batch_size):
        """
        state_dim / action_dim: input size and number of actions of the networks
        gamma: discount factor used in the TD target
        lr / behavior_lr: Adam learning rates for the quantile / behavior networks
        num_quantiles: number of quantiles predicted per action
        beta: importance weights are clamped to the range [0, 1/beta]
        batch_size: number of transitions sampled per call to `train`
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.num_quantiles = num_quantiles
        self.beta = beta
        self.device = device
        
        # Online network and its target copy (initialised with identical weights).
        self.net = QuantileNetwork(state_dim, action_dim, num_quantiles).to(device)
        self.target_net = QuantileNetwork(state_dim, action_dim, num_quantiles).to(device)
        self.target_net.load_state_dict(self.net.state_dict())
        
        self.behavior_policy = BehaviorPolicyNetwork(state_dim, action_dim).to(device)
        
        # Optimizers
        self.optimizer = optim.Adam(self.net.parameters(), lr=lr)
        self.behavior_optimizer = optim.Adam(self.behavior_policy.parameters(), lr=behavior_lr)
        
        # Replay buffer
        self.buffer = deque(maxlen=100000)
        self.batch_size = batch_size
        # Quantile midpoints (i + 0.5) / num_quantiles for i = 0 .. num_quantiles - 1.
        self.quantile_fractions = torch.linspace(0.5/num_quantiles, 1-0.5/num_quantiles, num_quantiles).to(device)

        # Learning-rate schedule of the quantile network: 300 warm-up steps
        # followed by cosine annealing.
        self.scheduler=SequentialLR(self.optimizer,
            schedulers=[
                LinearLR(self.optimizer, start_factor=0.2, end_factor=1.0, total_iters=300),  # Warm-up phase
                CosineAnnealingLR(self.optimizer, T_max=700, eta_min=1e-5)                  # Main decay phase
            ],
            milestones=[300]  # Switch after 300 steps
                                   )                
        # NOTE(review): this scheduler is created but no method of this class
        # ever calls .step() on it, so the behavior learning rate stays at
        # behavior_lr. The attribute name (double underscore) is kept as is.
        self.behavior__scheduler = LinearLR(self.behavior_optimizer, start_factor=1, end_factor=0.2, total_iters=3000)
 
    def get_target_policy_probs(self, states, actions=None):
        """Target-policy action probabilities for a batch of states.

        states: tensor indexed as [batch, 2]
        actions: optional integer tensor of shape [batch]

        Returns a [batch, 2] tensor (column 0: action 0, column 1: action 1),
        or a [batch, 1] tensor with the probability of each given action.
        """
        with torch.no_grad():
            sigmoid = torch.nn.Sigmoid()
            p = policy_new[0] * sigmoid(states[:, 0]) + policy_new[1] * sigmoid(states[:, 1])
            
            # Clamp probabilities to [0,1] range
            p = torch.clamp(p, 0.0, 1.0)
            
            # probs[:, 1] = p (probability of action 1), probs[:, 0] = 1 - p
            probs = torch.stack([1-p, p], dim=1)
            
            if actions is not None:
                actions = actions.to(device)
                return probs.gather(1, actions.unsqueeze(1))  # Return probability of selected action
        return probs  # Return full action probability distribution
    
    def update_target(self, tau):
        """Polyak-average the online weights into the target network with rate `tau`."""
        for target_param, param in zip(self.target_net.parameters(), self.net.parameters()):
            target_param.data.copy_(tau * param.data + (1-tau) * target_param.data)
    
    def store_transition(self, state, action, reward, next_state, done, behavior_prob):
        """Append one transition to the replay buffer (tensor states are stored as NumPy arrays)."""
        if isinstance(state, torch.Tensor):
            state = state.cpu().numpy()
        if isinstance(next_state, torch.Tensor):
            next_state = next_state.cpu().numpy()
        self.buffer.append((state, action, reward, next_state, done, behavior_prob))
    
    def train_behavior_policy(self, states, actions):
        """One maximum-likelihood step of the behavior policy; returns the loss value."""
        probs = self.behavior_policy.get_probs(states)
        action_probs = probs.gather(1, actions.unsqueeze(1))
        loss = -torch.log(action_probs).mean()
        
        self.behavior_optimizer.zero_grad()
        loss.backward()
        self.behavior_optimizer.step()
        return loss.item()
    
    def pseudo_sample_next_actions(self, next_states):
        """Sample one action per next state from the target policy."""
        probs = self.get_target_policy_probs(next_states)
        # Keep probabilities strictly inside (0, 1) and renormalise before sampling.
        probs = torch.clamp(probs, min=1e-5, max=1.0-1e-5)
        probs = probs / probs.sum(dim=-1, keepdim=True)
        actions = torch.multinomial(probs, num_samples=1)
        return actions.squeeze(-1)
        
    def get_quantiles_for_state(self, state, action=None):
        """Quantile estimates of the online network for a single state.

        Returns a NumPy array of shape [action_dim, num_quantiles], or the
        row for `action` when one is given.
        """
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
            quantiles = self.net(state_tensor).squeeze(0)  # [action_dim, num_quantiles]
            
            if action is not None:
                return quantiles[action].cpu().numpy()
            return quantiles.cpu().numpy()

    def train(self):
        """Run one optimisation step on a random mini-batch.

        Returns (quantile_loss, behavior_loss), or (0, 0) while the buffer
        holds fewer than `batch_size` transitions.
        """
        if len(self.buffer) < self.batch_size:
            return 0, 0
        
        # Sample batch from replay buffer
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, dones, behavior_probs = zip(*batch)
        
        # Convert to tensors and move to the compute device
        states = torch.FloatTensor(np.array(states)).to(device)
        actions = torch.LongTensor(np.array(actions)).to(device)
        rewards = torch.FloatTensor(np.array(rewards)).unsqueeze(-1).to(device)
        next_states = torch.FloatTensor(np.array(next_states)).to(device)
        dones = torch.FloatTensor(np.array(dones)).unsqueeze(-1).to(device)
        behavior_probs = torch.FloatTensor(np.array(behavior_probs)).unsqueeze(-1).to(device)
        
        # 1. Train behavior policy
        behavior_loss = self.train_behavior_policy(states, actions)
        
        # 2. Calculate importance weights
        # NOTE(review): `behavior_probs` are the values stored with each
        # transition by the caller; they are not recomputed from the
        # behavior policy trained in step 1.
        with torch.no_grad():
            current_action_probs = self.get_target_policy_probs(states, actions)
            # The epsilon guards the denominator against division by zero.
            importance_weights = (current_action_probs / (behavior_probs + 1e-5)).clamp(0, 1/self.beta)
        
        # 3. Get current quantile estimates of the taken actions: [batch, num_quantiles]
        current_quantiles = self.net(states)
        actions = actions.view(-1, 1, 1).expand(-1, -1, self.num_quantiles)
        current_quantiles = current_quantiles.gather(1, actions).squeeze(1)
        
        # 4. Compute target quantiles from the target network at actions
        #    sampled from the target policy in the next state
        with torch.no_grad():
            next_actions = self.pseudo_sample_next_actions(next_states)
            next_actions = next_actions.view(-1, 1, 1).expand(-1, -1, self.num_quantiles)
            
            target_quantiles = self.target_net(next_states)
            target_quantiles = target_quantiles.gather(1, next_actions).squeeze(1)
            target_quantiles = rewards + self.gamma * target_quantiles * (1 - dones)
        
        # 5. Compute quantile regression loss
        # diff[b, i, j] = target quantile i - current quantile j
        diff = target_quantiles.unsqueeze(-1) - current_quantiles.unsqueeze(1)
        # Asymmetric weight |tau_j - 1{diff < 0}| (fractions broadcast over the last axis).
        weight = torch.abs(self.quantile_fractions - (diff.detach() < 0).float())
        
        # Huber-type penalty with threshold 1
        loss = torch.where(
            diff.abs() < 1,
            0.5 * diff.pow(2) * weight,
            (diff.abs() - 0.5) * weight
        )
        loss = (loss * importance_weights.unsqueeze(-1)).mean()
    
        # 6. Optimize model
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 2.0)
        self.optimizer.step()
        self.scheduler.step()
        
        return loss.item(), behavior_loss

In [9]:
def QTD_new(state_traj,
        action_traj,
        reward_traj,
        state_dim,
        action_card,
        quantile_num,
        gam,
        seed,
        lr,
        behavior_lr,
        beta,
        batch_size,
        tau,
        n_steps=2001,
        target_update_every=5):
    """Fit a `QTD_Agent` on observed trajectories and return it.

    state_traj: array indexed as [trajectory, time, state_dim]
    action_traj / reward_traj: arrays indexed as [trajectory, time]
    state_dim, action_card, quantile_num, gam, lr, behavior_lr, beta,
        batch_size: forwarded to the `QTD_Agent` constructor
    seed: when not None, seeds Python's `random` (mini-batch sampling) and
        torch (network initialisation, action sampling) so that training is
        reproducible; NumPy's RNG is left untouched
    tau: Polyak rate passed to `agent.update_target`
    n_steps: number of calls to `agent.train()`
    target_update_every: the target network is updated on every step whose
        index is a multiple of this value
    """
    if seed is not None:
        random.seed(int(seed))
        torch.manual_seed(int(seed))

    agent = QTD_Agent(state_dim = state_dim, action_dim = action_card,gamma=gam,lr=lr,num_quantiles=quantile_num,behavior_lr=behavior_lr,
                      beta=beta,batch_size=batch_size)
    n_tr=np.shape(state_traj)[0]
    T_obs=np.shape(state_traj)[1]

    # Fill the replay buffer with every observed transition (the last state of
    # each trajectory has no successor and only appears as a next_state).
    # NOTE(review): the stored behavior probability comes from the behavior
    # network as it is at this point, i.e. before any call to agent.train().
    with torch.no_grad():
        for i in range(n_tr):
            for j in range(T_obs-1):
                state = state_traj[i,j]
                action = action_traj[i,j]
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                behavior_probs = agent.behavior_policy.get_probs(state_tensor)
                behavior_prob = behavior_probs[0, action].item()
                agent.store_transition(state, action, reward_traj[i,j],
                                       state_traj[i,j+1], False, behavior_prob)

    for it in range(n_steps):
        agent.train()
        if it % target_update_every == 0:
            agent.update_target(tau=tau)
    return agent

In [10]:
## calculate V estimator based on QTD output
def v_hat_f(state):
    """Estimated value of `state` under the target policy.

    Averages the quantiles of each action to get an action value, then mixes
    the two action values with the target-policy probabilities.
    Reads the notebook-level name `agent`; a function of the same name is
    also defined inside `new_rb_res`, where `agent` is a local variable.
    """
    action_values = agent.get_quantiles_for_state(state).mean(axis=1)
    p0 = get_new_probs(state,0)
    p1 = get_new_probs(state,1)
    return (action_values[0]*p0 + action_values[1]*p1).item()



In [11]:
def weight_calculation_clip(states, actions, clip):
    """Clipped cumulative importance ratio along one trajectory segment.

    states / actions: the segment; ratios are multiplied over all steps except
        the last state
    clip: pair (lower, upper) bounding the final product

    Each ratio is target-policy probability over the probability given by
    the behavior network of the notebook-level `agent`.
    """
    n_steps = np.shape(states)[0]-1
    weight = 1
    for t in range(n_steps):
        state_tensor = torch.FloatTensor(states[t]).unsqueeze(0)
        behavior_prob = agent.behavior_policy.get_probs(state_tensor)[0, actions[t]].item()
        weight *= get_new_probs(states[t],actions[t])/behavior_prob
    # Clip the product from above first, then from below.
    return max(min(clip[1],weight),clip[0])

In [12]:
## replay buffer+weight estimation
def replay_buffer(s_traj, s_traj_train, a_traj, r_traj,step_forward,clip):
    """Cut trajectories into overlapping segments and weight each segment.

    s_traj, a_traj, r_traj: trajectories to cut, indexed [trajectory, time, ...]
    s_traj_train: states used to fit the logistic regression that separates
        later states (label 1) from initial states (label 0)
    step_forward: every segment spans step_forward + 1 consecutive time points
    clip: (lower, upper) bounds passed to `weight_calculation_clip`

    Returns [Mem_state, Mem_action, Mem_reward, idx_weight_final]; the weights
    are normalised to sum to one.
    """
    n = np.shape(s_traj)[0]
    p = np.shape(s_traj)[1] - step_forward  # segments per trajectory
    state_dim = np.shape(s_traj)[2]
    window = step_forward + 1

    # logistic regression: later states vs. initial states
    later_states = s_traj_train[:,1:p].reshape(-1,state_dim)
    first_states = s_traj_train[:,0]
    X = np.vstack([later_states, first_states])
    y = np.concatenate([np.ones(len(later_states)), np.zeros(len(first_states))])
    rt = 1/(p-1)
    model = LogisticRegression().fit(X, y)

    Mem_state = np.zeros((n*p, window, state_dim))
    Mem_action = np.zeros((n*p, window), dtype=int)
    Mem_reward = np.zeros((n*p, window))
    idx_weight = []
    for i in range(n):
        for j in range(p):
            row = i*p + j
            Mem_state[row,:] = s_traj[i,j:(j+window)]
            Mem_action[row,:] = a_traj[i,j:(j+window)]
            Mem_reward[row,:] = r_traj[i,j:(j+window)]
            is_weight = weight_calculation_clip(Mem_state[row, :], Mem_action[row, :],clip)
            # Class probabilities of the segment's first state.
            proba = model.predict_proba(np.expand_dims(Mem_state[row, 0], axis=0))
            idx_weight.append(is_weight * rt*proba[0, 1]/proba[0, 0])

    raw = np.array(idx_weight)
    idx_weight_final = raw/raw.sum()
    return([Mem_state,Mem_action,Mem_reward,idx_weight_final])




In [13]:

def weighted_percentile(data, weights, perc,method):
    """Weighted empirical percentile of `data`.

    data: one-dimensional sequence of values (may contain +inf)
    weights: weights of the same length as `data`
    perc: target level on the normalised cumulative-weight scale
    method: "min" returns the largest value whose cumulative weight is
        <= perc; any other value returns the next larger value

    When the selected value is +inf, the largest finite predecessor is
    returned instead. The index is kept inside the data range, so
    `perc >= 1` returns the last admissible value and `method="min"` with
    `perc` below the first cumulative weight returns the smallest value.

    Raises ValueError when `data` is empty.
    """
    data = np.asarray(data)
    weights = np.asarray(weights)
    if data.shape[0] == 0:
        raise ValueError("weighted_percentile requires at least one data point")

    order = np.argsort(data)
    data = data[order]        # sort data
    weights = weights[order]  # sort weights
    cdf = np.cumsum(weights) / np.sum(weights)
    # Number of sorted values whose cumulative weight does not exceed perc.
    count = int(np.count_nonzero(cdf <= perc))

    last = data.shape[0] - 1
    if count > last:
        # All of the mass lies at or below perc: use the largest value,
        # stepping back once if that value is +inf.
        idx = last
        if data[idx] == float('inf'):
            idx -= 1
    elif data[count] == float('inf') or method == "min":
        idx = count - 1
    else:
        idx = count

    # Never wrap around to the end of the array through a negative index.
    return data[max(idx, 0)]
    

In [14]:
def scoring(s_traj, r_traj,step_forward,gam,quan_num):
    """Nonconformity scores of trajectory segments.

    s_traj: state segments, indexed [segment, time, state_dim]
    r_traj: reward segments, indexed [segment, time]
    step_forward: number of observed rewards used in approximating the return
    gam: discount
    quan_num: number of quantiles predicted by the agent

    For each segment the score is
        | sum_{i < step_forward} gam**i * r_i + gam**step_forward * q - v_hat_f(s_0) |
    where q is a uniformly drawn quantile of the agent's estimate at the
    segment's last state, for an action sampled from the target policy.
    Reads the notebook-level names `agent`, `step_new` and `v_hat_f`.
    Returns a list with one score per segment.
    """
    if np.shape(s_traj)[1]!=step_forward+1:
        print("length dismatch")
    if np.shape(s_traj)[0]!=np.shape(r_traj)[0]:
        print("height dismatch")
    
    n = np.shape(s_traj)[0]
    # np.random.randint excludes `high`, so the bound must be quan_num for
    # every quantile index 0 .. quan_num - 1 (including the last) to be drawn.
    u = np.random.randint(0, quan_num, size=n)

    partial_return = np.sum([gam**i * r_traj[:, i] for i in range(step_forward)],
                            axis=0)
    # step_new is called only for the action it samples at the last state.
    tail = [
        gam**step_forward * agent.get_quantiles_for_state(s_traj[i, step_forward])[step_new(s_traj[i, step_forward])[0],u[i]] -
        v_hat_f(s_traj[i, 0] ) for i in range(n)
    ]
    sc = list(map(abs, partial_return + tail))
    return (sc)

In [15]:
import itertools

In [16]:
def new_rb_res(data_train, data_test, gam, alp, step_forward, num_quantiles,B,eta,seed,sample_size,clip,action_card,lr,behavior_lr,beta,batch_size,tau):
    """Replay-buffer conformal intervals: coverage and length on a test set.

    data_train / data_test: [states, actions, rewards, returns] as produced by
        `data_gen` / `data_gen_new`
    gam: discount factor
    alp: miscoverage level; scores are calibrated at level 1 - alp * eta
    step_forward: horizon or sequence of horizons (observed rewards used per
        calibration segment)
    num_quantiles: number of quantiles of the QTD network
    B: number of resampling rounds per horizon
    eta: value or sequence of values; each eta sets the calibration level
        1 - alp * eta and the percentile (eta * 100) taken over the B rounds
    seed: forwarded to `QTD_new`
    sample_size: number of calibration segments drawn per round
    clip: (lower, upper) bounds for the segment importance weights
    action_card, lr, behavior_lr, beta, batch_size, tau: forwarded to `QTD_new`

    Returns [PI_cov_e, PI_len_e], two arrays of shape
    (len(eta), len(step_forward)) holding the empirical coverage and the
    interval length (twice the critical value).
    """
    n_tr, n_te = np.shape(data_train[0])[0], np.shape(data_test[0])[0]
    s_init_te = data_test[0]
    ret_te = data_test[3]
    state_dim=np.shape(data_test[0])[2]

    # Accept a single horizon / eta as well as a sequence of them.
    step_forward = [int(h) for h in np.atleast_1d(step_forward)]
    eta = [float(e) for e in np.atleast_1d(eta)]
    
    ## split training data: first half fits the model, second half calibrates
    idx_perm = np.random.permutation(list(range(0, n_tr)))
    idx_tr, idx_cal = [idx_perm[0:int(n_tr / 2)], idx_perm[int(n_tr / 2):n_tr]]
    s_train_fold = data_train[0][idx_tr]
    a_train_fold = data_train[1][idx_tr]
    r_train_fold = data_train[2][idx_tr]
    
    ## train return distribution using QTD
    agent=QTD_new(state_traj=s_train_fold,
        action_traj=a_train_fold,
        reward_traj=r_train_fold,
        state_dim=state_dim,
        action_card=action_card,
        quantile_num=num_quantiles,
        gam=gam,
        seed=seed,
        lr=lr,
        behavior_lr=behavior_lr,
        beta=beta,
        batch_size=batch_size,
        tau=tau)

    def v_hat_f(state):
        # State value under the target policy: per-action mean of the
        # quantiles, mixed with the target-policy probabilities.
        q_hat = (np.mean(agent.get_quantiles_for_state(state), axis=1))
        v = (q_hat[0]*get_new_probs(state,0)+q_hat[1]*get_new_probs(state,1)).item()
        return v

    def weight_calculation_clip(states, actions, clip):
        # Product of target/behavior probability ratios over all steps of a
        # segment except its last state, clipped to [clip[0], clip[1]].
        with torch.no_grad():
            weight=1
            n_steps=np.shape(states)[0]-1
            for i in range(n_steps):
                state_tensor = torch.FloatTensor(states[i]).unsqueeze(0)
                behavior_probs = agent.behavior_policy.get_probs(state_tensor)
                behavior_prob = behavior_probs[0, actions[i]].item()
                weight*=get_new_probs(states[i],actions[i])/behavior_prob
            weight=max(min(clip[1],weight),clip[0])
        return(weight)

    def replay_buffer(s_traj, s_traj_train, a_traj, r_traj,step_forward,clip):
        # Cut every trajectory into overlapping segments of step_forward + 1
        # time points and attach a normalised sampling weight to each segment.
        with torch.no_grad():
            n = np.shape(s_traj)[0]
            p = np.shape(s_traj)[1] - step_forward
            state_dim=np.shape(s_traj)[2]
            # logistic regression: later states (label 1) vs initial states (label 0)
            s_b = s_traj_train[:,1:p].reshape(-1,state_dim)
            s_0 = s_traj_train[:,0]
            X = np.vstack([s_b,s_0])
            y = np.concatenate([np.ones(np.shape(s_b)[0]), np.zeros(np.shape(s_0)[0])])
            rt = 1/(p-1)
            model = LogisticRegression().fit(X, y)
            
            Mem_state = np.zeros((n*p,step_forward+1,state_dim))
            Mem_action = np.zeros((n*p,step_forward+1),dtype=int)
            Mem_reward = np.zeros((n*p,step_forward+1))
            for i in range(n):
                for j in range(p):
                    Mem_state[(i*p+j),:] = s_traj[i,j:(j+step_forward+1)]
                    Mem_action[(i*p+j),:] = a_traj[i,j:(j+step_forward+1)]
                    Mem_reward[(i*p+j),:] = r_traj[i,j:(j+step_forward+1)]

            is_weight = np.array([
                weight_calculation_clip(Mem_state[row, :], Mem_action[row, :], clip)
                for row in range(n*p)
            ])
            # One batched predict_proba call on the first state of every
            # segment replaces two single-row calls per segment.
            proba = model.predict_proba(Mem_state[:, 0])
            idx_weight = is_weight * rt * proba[:, 1] / proba[:, 0]
        
            idx_weight_final = idx_weight/idx_weight.sum()
        return([Mem_state,Mem_action,Mem_reward,idx_weight_final])
    
    def scoring(s_traj, r_traj,step_forward,gam,quan_num):
        # s_traj: state segments
        # r_traj: reward segments
        # step_forward: number of steps used in approximating return
        # gam: discount
        with torch.no_grad():
            if np.shape(s_traj)[1]!=step_forward+1:
                print("length dismatch")
            if np.shape(s_traj)[0]!=np.shape(r_traj)[0]:
                print("height dismatch")
            
            n = np.shape(s_traj)[0]
            # randint excludes `high`: use quan_num so that the last quantile
            # index can be drawn as well.
            u = np.random.randint(0, quan_num, size=n)
            sc = list(
                map(
                    abs,
                    np.sum([gam**i * r_traj[:, i] for i in range(step_forward)],
                           axis=0) +
                    [
                        gam**step_forward * agent.get_quantiles_for_state(s_traj[i, step_forward])[step_new(s_traj[i, step_forward])[0],u[i]] -
                        v_hat_f(s_traj[i, 0] ) for i in range(n)
                    ]))
        return (sc)
        
    with torch.no_grad():
        ## calculate nonconformity scores based on test set
        sc_te = np.array([abs(ret_te[i] - v_hat_f(s_init_te[i, 0])) for i in range(n_te)])
        
        l = len(step_forward)
        m = len(eta)
        PI_cov_e = np.zeros((m,l))
        PI_len_e = np.zeros((m,l))

        for k in range(l):
            ## replay buffer
            # The segments and their weights depend only on the fitted agent
            # and the calibration data, not on the resampling round, so they
            # are built once per horizon.
            Mem=replay_buffer(s_traj=data_train[0][idx_cal, :], s_traj_train=s_train_fold, a_traj=data_train[1][idx_cal, :], 
                         r_traj=data_train[2][idx_cal, :],step_forward=step_forward[k],clip=clip)
            weight_is=Mem[-1]

            # quan_B_e[z, b]: calibrated score quantile for eta[z] in round b.
            # It does not depend on the test point.
            quan_B_e = np.zeros((m,B))
            for b in range(B):
                n_cal = np.random.choice(np.shape(weight_is)[0],size=sample_size, p=weight_is)
                ## calculate nonconformity scores based on calibration set
                sc_rb = scoring(s_traj=Mem[0][n_cal,],
                                r_traj=Mem[2][n_cal,],
                                step_forward=step_forward[k],
                                gam=gam,
                                quan_num=num_quantiles)
                sc_rb.append(float('inf')) 
                for z in range(m):
                    quan_B_e[z,b] = weighted_percentile(data=sc_rb,weights=np.ones(sample_size+1),
                                                        perc=1-alp*eta[z],method="max")

            for z in range(m):
                # Same critical value for every test point.
                critical_value = np.full(n_te, np.percentile(a=quan_B_e[z], q=eta[z]*100))
                PI_cov_e[z,k] = np.mean(sc_te <= critical_value)
                PI_len_e[z,k] = 2 * np.mean(critical_value)
                
    return([PI_cov_e,PI_len_e])


In [17]:
def quantile_region_res(data_train, data_test, gam, alp, num_quantiles,seed,action_card,lr,behavior_lr,beta,batch_size,tau):
    """Prediction intervals read directly off the fitted quantiles.

    data_train / data_test: [states, actions, rewards, returns] as produced by
        `data_gen` / `data_gen_new`
    gam: discount factor
    alp: the interval spans the alp/2 and 1 - alp/2 weighted quantiles
    num_quantiles: number of quantiles of the QTD network
    seed, action_card, lr, behavior_lr, beta, batch_size, tau: forwarded to
        `QTD_new`

    For every test trajectory the quantiles of both actions at its initial
    state are pooled, each action weighted by its target-policy probability.
    Returns [coverage, mean interval length].
    """
    n_te = np.shape(data_test[0])[0]
    state_dim = np.shape(data_train[0])[2]
    ret_te = np.asarray(data_test[3])
    quant_num = num_quantiles
    ## train QTD using full training data
    agent=QTD_new(state_traj=data_train[0],
        action_traj=data_train[1],
        reward_traj=data_train[2],
        state_dim=state_dim,
        action_card=action_card,
        quantile_num=num_quantiles,
        gam=gam,
        seed=seed,
        lr=lr,
        behavior_lr=behavior_lr,
        beta=beta,
        batch_size=batch_size,
        tau=tau)

    # Initial state of each test trajectory. Indexing time step 0 (rather
    # than flattening all time steps) keeps state i aligned with return i
    # when the test data hold more than one observed state per trajectory.
    s0_te = np.asarray(data_test[0])[:, 0]

    with torch.no_grad():
        quant_interval_lower=np.zeros(n_te)
        quant_interval_upper=np.zeros(n_te)
        ## lower and upper quantiles for each state
        for i in range(n_te):
            q = agent.get_quantiles_for_state(s0_te[i])
            data_aug=np.hstack((q[0,:], q[1,:]))
            weight_aug=np.hstack((np.ones(quant_num)*get_new_probs(s0_te[i],0)/quant_num,
                                  np.ones(quant_num)*get_new_probs(s0_te[i],1)/quant_num))
            quant_interval_lower[i]=weighted_percentile(data_aug,weight_aug,alp/2,method="min")
            quant_interval_upper[i]=weighted_percentile(data_aug,weight_aug,1-alp/2,method="max")

        ## calculate coverage
        covered = (ret_te >= quant_interval_lower) & (ret_te <= quant_interval_upper)
        quan_PI_cov = np.mean(covered)
        quan_PI_len = np.mean(quant_interval_upper - quant_interval_lower)

    return ([quan_PI_cov, quan_PI_len])

In [18]:
#Parallel Calculation
def run_single_experiment(i, n_tr, gam, T_obs, seed, n_te, T, num_quantiles, B, alp, eta, step_forward,sample_size,clip,action_card,lr,behavior_lr,beta,batch_size,tau):
    """Run replication `i`: simulate data, then evaluate `new_rb_res`.

    i: replication index
    n_tr / n_te: number of training (behavior-policy) / test (target-policy)
        trajectories
    seed: base seed of the whole experiment
    T_obs: observed stages per training trajectory; T: simulated transitions
    all remaining arguments are forwarded unchanged to `new_rb_res`

    Returns the result of `new_rb_res`, i.e. [PI_cov_e, PI_len_e].
    """
    # data_gen / data_gen_new generate trajectory j with seed base + j + 1,
    # so one call consumes the seeds base+1 .. base+N. Every replication gets
    # its own block of n_tr + n_te seeds so that no trajectory is shared
    # between replications or between the training and the test set.
    seeds_per_rep = n_tr + n_te
    train_seed = seed + i * seeds_per_rep
    test_seed = train_seed + n_tr

    data_train = data_gen(N=n_tr,
                          T_obs=T_obs,
                          T=T,
                          gam=gam,
                          seed=train_seed,
                          s_init=None)

    # Test data: only the initial state is observed (T_obs=1).
    data_test = data_gen_new(N=n_te,
                             T_obs=1,
                             T=T,
                             gam=gam,
                             seed=test_seed,
                             s_init=None)

    result = new_rb_res(data_train=data_train,
                        data_test=data_test,
                        gam=gam,
                        alp=alp,
                        step_forward=step_forward,
                        num_quantiles=num_quantiles,
                        B=B,
                        eta=eta,
                        seed=seed + i,
                        sample_size=sample_size,
                        clip=clip,
                        action_card=action_card,
                        lr=lr,
                        behavior_lr=behavior_lr,
                        beta=beta,
                        batch_size=batch_size,
                        tau=tau)

    return result  # return [PI_cov_e, PI_len_e]




In [None]:
# Parameter setting
rep = 100            # number of independent replications (one job each)
n_tr = 200           # training trajectories per replication (behavior policy)
gam = 0.8            # discount factor
T_obs = 30           # observed stages kept per training trajectory
seed = 2025          # base random seed
n_te = 310           # test trajectories per replication (target policy)
T = 70               # simulated transitions per trajectory (used for the return)
num_quantiles = 30   # quantiles predicted by the QTD network
B = 50               # resampling rounds per horizon in new_rb_res
alp = 0.1            # miscoverage level; calibration level is 1 - alp * eta
eta = [0.2,0.3,0.4,0.5,0.6,0.7,0.8]   # grid of eta values (rows of the result matrices)
clip=np.array([0.2,5.0])              # (lower, upper) bounds of the segment importance weight
step_forward = [1, 2, 3,4,5]          # grid of horizons (columns of the result matrices)
sample_size=200      # calibration segments drawn per resampling round
action_card=2        # number of actions
lr=0.006             # Adam learning rate of the quantile network
behavior_lr=0.01     # Adam learning rate of the behavior-policy network
beta=0.5             # importance weights in QTD training are clamped at 1/beta
batch_size=64        # mini-batch size of QTD training
tau=0.1              # Polyak rate of the target-network update



# Run all replications in parallel; results[i] is the [PI_cov_e, PI_len_e]
# pair returned by run_single_experiment for replication i.
results = Parallel(n_jobs=18, verbose=1)(
    delayed(run_single_experiment)(
        i, n_tr, gam, T_obs, seed, n_te, T, num_quantiles, B, alp, eta, step_forward,sample_size,clip,action_card,lr,behavior_lr,beta,batch_size,tau
    ) for i in range(rep)
)


In [18]:
# One (rep x number of horizons) array per eta value. The "tauXY" part of each
# name refers to the eta value 0.XY (eta = 0.2 ... 0.8), not to the Polyak
# rate `tau`. "cov" arrays hold coverage, "len" arrays hold interval length.
rb_new_cov_tau02_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_cov_tau03_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_cov_tau04_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_cov_tau05_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_cov_tau06_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_cov_tau07_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_cov_tau08_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau02_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau03_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau04_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau05_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau06_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau07_e3 = np.zeros((rep, np.shape(step_forward)[0]))
rb_new_len_tau08_e3 = np.zeros((rep, np.shape(step_forward)[0]))
#Restore data
# results[i][0] is the coverage matrix and results[i][1] the length matrix of
# replication i; the hard-coded row index 0..6 selects the position in `eta`.
for i in range(np.shape(results)[0]):
    rb_new_cov_tau02_e3[i, :] = results [i][0][0]
    rb_new_cov_tau03_e3[i, :] = results [i][0][1]
    rb_new_cov_tau04_e3[i, :] = results [i][0][2]
    rb_new_cov_tau05_e3[i, :] = results [i][0][3]
    rb_new_cov_tau06_e3[i, :] = results [i][0][4]
    rb_new_cov_tau07_e3[i, :] = results [i][0][5]
    rb_new_cov_tau08_e3[i, :] = results [i][0][6]


    rb_new_len_tau02_e3[i, :] = results [i][1][0]
    rb_new_len_tau03_e3[i, :] = results [i][1][1]
    rb_new_len_tau04_e3[i, :] = results [i][1][2]
    rb_new_len_tau05_e3[i, :] = results [i][1][3]
    rb_new_len_tau06_e3[i, :] = results [i][1][4]
    rb_new_len_tau07_e3[i, :] = results [i][1][5]
    rb_new_len_tau08_e3[i, :] = results [i][1][6]
    

# Raw per-replication matrices, kept in addition to the per-eta arrays above.
PI_cov_all = [res[0] for res in results]
PI_len_all = [res[1] for res in results]

In [22]:
import pandas as pd

# Save simulation results: export the first five columns of the `tau02`
# coverage and length arrays to Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau02_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_02.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau02_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_02.xlsx', index=False)


In [21]:
# Export the first five columns of the `tau03` coverage and length arrays to
# Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau03_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_03.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau03_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_03.xlsx', index=False)

In [None]:
# Export the first five columns of the `tau04` coverage and length arrays to
# Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau04_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_04.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau04_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_04.xlsx', index=False)

In [None]:
# Export the first five columns of the `tau05` coverage and length arrays to
# Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau05_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_05.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau05_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_05.xlsx', index=False)

In [None]:
# Export the first five columns of the `tau06` coverage and length arrays to
# Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau06_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_06.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau06_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_06.xlsx', index=False)

In [None]:
# Export the first five columns of the `tau07` coverage and length arrays to
# Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau07_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_07.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau07_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_07.xlsx', index=False)

In [None]:
# Export the first five columns of the `tau08` coverage and length arrays to
# Excel, one column per entry.
data_new_rb_cov = {
    f'new-rb-{k + 1}': rb_new_cov_tau08_e3[:, k] for k in range(5)
}
data_cov = pd.DataFrame(data_new_rb_cov)
data_cov.to_excel('cov_shi_off_08.xlsx', index=False)

data_new_rb_len = {
    f'new-rb-{k + 1}': rb_new_len_tau08_e3[:, k] for k in range(5)
}
data_len = pd.DataFrame(data_new_rb_len)
data_len.to_excel('len_shi_off_08.xlsx', index=False)

In [None]:

# Build the tables used for plotting by taking column k from a different
# source frame per column: columns 0, 1, 2 come from the *_5, *_6, *_7 frames
# and columns 3 and 4 both come from the *_8 frame.
# NOTE(review): data_new_rb_cov_5..8 / data_new_rb_len_5..8 are not defined in
# the visible cells -- confirm where they are loaded before a fresh run.
data_new_rb_cov = pd.DataFrame({
    f'new-rb-{k + 1}': frame.iloc[:, k]
    for k, frame in enumerate((
        data_new_rb_cov_5,
        data_new_rb_cov_6,
        data_new_rb_cov_7,
        data_new_rb_cov_8,
        data_new_rb_cov_8,
    ))
})

data_new_rb_len = pd.DataFrame({
    f'new-rb-{k + 1}': frame.iloc[:, k]
    for k, frame in enumerate((
        data_new_rb_len_5,
        data_new_rb_len_6,
        data_new_rb_len_7,
        data_new_rb_len_8,
        data_new_rb_len_8,
    ))
})

In [32]:
# ---- Settings for the quantile-region (QR) experiment ----------------------

# Replication / data-generation settings.
rep = 100                 # number of replications run in parallel
seed = 2025               # base seed; replication i uses seed + i
n_tr = 200                # passed as N to data_gen (training data)
n_te = 310                # passed as N to data_gen_new (test data)
T_obs = 30                # passed as T_obs to data_gen
T = 70                    # passed as T to both data generators
gam = 0.8                 # passed as gam to the generators and quantile_region_res

# Settings forwarded to quantile_region_res.
num_quantiles = 30
alp = 0.1
action_card = 2
lr = 0.003
behavior_lr = 0.005
beta = 0.5
batch_size = 64
tau = 0.1

# Not passed to process_iteration; kept for the other cells that read them.
clip = np.array([0.2, 5.0])
step_forward = [1, 2, 3, 4, 5]

# One row per replication, filled from the two values each replication returns.
res_quan = np.zeros((rep, 2))

# Parallel Calculation
def process_iteration(i, n_tr, gam, T_obs, n_te, T, num_quantiles, alp, seed,
                      action_card, lr, behavior_lr, beta, batch_size, tau):
    """Run one replication of the quantile-region experiment.

    Training data are drawn with ``data_gen`` and test data with
    ``data_gen_new`` (using ``T_obs=1``); both are then handed to
    ``quantile_region_res``.

    Parameters
    ----------
    i : int
        Replication index. Offsets the seed so each replication differs.
    n_tr, n_te : int
        ``N`` passed to ``data_gen`` and ``data_gen_new`` respectively.
    gam, T_obs, T :
        Forwarded to the data generators (``gam`` also to
        ``quantile_region_res``).
    num_quantiles, alp, action_card, lr, behavior_lr, beta, batch_size, tau :
        Forwarded unchanged to ``quantile_region_res``.
    seed : int
        Base seed. Training and fitting use ``seed + i``; the test data use
        ``seed + i + 10000``.

    Returns
    -------
    The object returned by ``quantile_region_res``; its elements 0 and 1 are
    printed as coverage and length.
    """
    rep_seed = seed + i

    train_set = data_gen(
        N=n_tr, T_obs=T_obs, T=T, gam=gam, seed=rep_seed, s_init=None
    )
    # The test set uses a seed shifted by 10000 relative to the training seed.
    test_set = data_gen_new(
        N=n_te, T_obs=1, T=T, gam=gam, seed=rep_seed + 10000, s_init=None
    )

    qr_result = quantile_region_res(
        data_train=train_set,
        data_test=test_set,
        gam=gam,
        alp=alp,
        num_quantiles=num_quantiles,
        seed=rep_seed,
        action_card=action_card,
        lr=lr,
        behavior_lr=behavior_lr,
        beta=beta,
        batch_size=batch_size,
        tau=tau,
    )

    print(f"test num: {i}")
    print("quantile region: ")
    print(f"cov: {qr_result[0]} | length: {qr_result[1]}")

    return qr_result



# Run every replication of process_iteration on 18 joblib workers.
# NOTE(review): the saved output under this cell reports n_jobs=4 and 10
# tasks, so it appears to come from a run with different settings -- re-run
# to refresh it.
results_qr = Parallel(n_jobs=18, verbose=1)(
    delayed(process_iteration)(
        rep_idx,
        n_tr, gam, T_obs, n_te, T,
        num_quantiles, alp, seed,
        action_card, lr, behavior_lr, beta, batch_size, tau,
    )
    for rep_idx in range(rep)
)

# restore data: one row of res_quan per replication
for i, qr_result in enumerate(results_qr):
    res_quan[i, :] = qr_result

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  5.7min finished


In [33]:
import pandas as pd

# Save the QR simulation results: column 0 of res_quan goes to the coverage
# workbook and column 1 to the length workbook.
data_quan_cov = {'quantile region': res_quan[:, 0]}
data_quan_len = {'quantile region': res_quan[:, 1]}

df_cov = pd.DataFrame(data_quan_cov)
df_len = pd.DataFrame(data_quan_len)

df_cov.to_excel('QR_cov_shi_off.xlsx', index=False, engine='openpyxl')
df_len.to_excel('QR_len_shi_off.xlsx', index=False, engine='openpyxl')

# Quick look at the first rows of each table.
print(df_cov.head())
print(df_len.head())

   quantile region
0         0.832258
1         0.845161
2         0.829032
3         0.854839
4         0.822581
   quantile region
0         8.165129
1         8.385649
2         8.284308
3         8.543129
4         8.108450


In [34]:
# Report the mean over replications of each res_quan column
# (column 0: coverage, column 1: length).
print("quantile region: ")
print(
    "coverage probability: ",
    res_quan[:, 0].mean(),
    "|  average length: ",
    res_quan[:, 1].mean(),
)

quantile region: 
coverage probability:  0.8309677419354837 |  average length:  8.191806151636186


# Plotting simulation results

In [23]:
import pandas as pd


# Relabel the five per-step columns as k=1..k=5 and append the quantile-region
# results read back from Excel as a 'DRL-QR' column.
#
# The first five columns are selected by position before relabelling so the
# cell can be re-run: once 'DRL-QR' has been appended the frame has six
# columns, and assigning five labels to `.columns` directly would raise a
# length-mismatch error on the second run.
k_labels = ['k=1', 'k=2', 'k=3', 'k=4', 'k=5']

data_new_rb_cov = data_new_rb_cov.iloc[:, :len(k_labels)].copy()
data_new_rb_cov.columns = k_labels
data_new_rb_len = data_new_rb_len.iloc[:, :len(k_labels)].copy()
data_new_rb_len.columns = k_labels

data_QR_cov = pd.read_excel('QR_cov_shi_off.xlsx')
data_QR_len = pd.read_excel('QR_len_shi_off.xlsx')

data_new_rb_cov['DRL-QR'] = data_QR_cov['quantile region']
data_new_rb_len['DRL-QR'] = data_QR_len['quantile region']

print(data_new_rb_cov.head())
print(data_new_rb_len.head())

   new-rb-1  new-rb-2  new-rb-3  new-rb-4  new-rb-5        QR
0  0.877419  0.877419  0.877419  0.877419  0.877419  0.887097
1  0.880645  0.880645  0.880645  0.883871  0.890323  0.864516
2  0.883871  0.887097  0.887097  0.883871  0.883871  0.887097
3  0.877419  0.877419  0.877419  0.877419  0.883871  0.887097
4  0.877419  0.880645  0.880645  0.880645  0.887097  0.887097
   new-rb-1  new-rb-2  new-rb-3  new-rb-4  new-rb-5        QR
0  7.184103  7.300162  7.227494  7.234046  7.354492  7.398871
1  7.163068  7.181728  7.286160  7.319855  7.422460  7.187608
2  7.292234  7.425412  7.465158  7.339162  7.246481  7.400118
3  7.208743  7.352818  7.360902  7.363134  7.422255  7.383613
4  7.155576  7.290999  7.305147  7.293528  7.457363  7.438672


In [None]:
import matplotlib.pyplot as plt

# Box plot of the coverage table, one box per column of data_new_rb_cov.
plt.figure(figsize=(8, 6))

# All line elements are drawn in black; patch_artist=True makes the boxes
# fillable so they can be coloured below.
bplot_new_cov = data_new_rb_cov.boxplot(
    patch_artist=True,
    medianprops={'linestyle': '-', 'color': 'black', 'linewidth': 1.5},
    whiskerprops={'linestyle': '--', 'color': 'black'},
    capprops={'linestyle': '-', 'color': 'black'},
    boxprops={'linestyle': '-', 'color': 'black'},
)

# `colors` is also read by the length-plot cell; zip() stops at the shorter
# sequence, so surplus colours are simply unused.
colors = [
    'goldenrod', 'orange', 'gold', 'khaki', 'wheat', 'lightyellow', 'skyblue'
]

colors2 = [
    'darkseagreen', 'limegreen', 'greenyellow', 'yellowgreen', 'lightgreen',
    'honeydew', 'skyblue'
]
for patch, color in zip(bplot_new_cov.patches, colors):
    patch.set_facecolor(color)
    patch.set_linewidth(1)

bplot_new_cov.yaxis.grid(False)
bplot_new_cov.xaxis.grid(False)
bplot_new_cov.set_xlabel("Method")
bplot_new_cov.set_ylabel("Coverage Probability")

# Horizontal reference line at 0.90 (presumably the nominal level 1 - alp;
# confirm).
plt.axhline(y=0.90, color='red', linestyle='-', linewidth=1)
plt.ylim(0.75, 1)

# Save BEFORE showing: plt.show() can close or clear the current figure (the
# notebook inline backend does), so saving afterwards writes a blank image.
plt.savefig('Ex2_off_policy_cp.png')
plt.show()

In [None]:
# Box plot of the length table, one box per column of data_new_rb_len.
plt.figure(figsize=(8, 6))

# All line elements are drawn in black; patch_artist=True makes the boxes
# fillable so they can be coloured below.
bplot_new_len = data_new_rb_len.boxplot(
    patch_artist=True,
    medianprops={'linestyle': '-', 'color': 'black', 'linewidth': 1.5},
    whiskerprops={'linestyle': '--', 'color': 'black'},
    capprops={'linestyle': '-', 'color': 'black'},
    boxprops={'linestyle': '-', 'color': 'black'},
)

# `colors` is the palette defined in the coverage-plot cell.
for patch, color in zip(bplot_new_len.patches, colors):
    patch.set_facecolor(color)
    patch.set_linewidth(1)

bplot_new_len.yaxis.grid(False)
bplot_new_len.xaxis.grid(False)
bplot_new_len.set_xlabel("Method")
bplot_new_len.set_ylabel("Empirical Length")

# Save BEFORE showing: plt.show() can close or clear the current figure (the
# notebook inline backend does), so saving afterwards writes a blank image.
plt.savefig('Ex2_off_policy_al.png')
plt.show()