In [None]:
import os
import time

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from matplotlib import lines as mlines
from matplotlib import pyplot as plt
from scipy.optimize import minimize
from sklearn.decomposition import NMF
# Seed NumPy's global (legacy) RNG; every np.random.* draw in the cells below
# comes from this single stream, so results depend on cell execution order.
np.random.seed(9000000)

In [None]:
def generate_data(num_context):
    """Draw a synthetic contextual linear-bandit data set.

    For every context, 40 unit-norm feature vectors are produced by rejection
    sampling: the action stored at position ``g`` has a true reward
    ``vec . theta_true`` inside ``[0.975 - 0.025 * g, 1.0 - 0.025 * g]``.
    A noisy copy of every feature vector is produced by adding independent
    Gaussian noise with standard deviation 2.5e-3 to each coordinate.

    Parameters
    ----------
    num_context : int
        Number of contexts to generate.

    Returns
    -------
    phi_true : ndarray, shape (num_context, 40, 2)
        Exact (unit-norm) feature vectors.
    psi_true : ndarray, shape (num_context, 40, 2)
        Noisy feature vectors.
    theta_true : ndarray, shape (2,)
        The reward parameter ``[0.9, 0.4]``.
    rewards : ndarray, shape (num_context, 40)
        ``phi_true . theta_true`` for every context/action pair.
    d : int
        Feature dimension (2).
    """
    dim = 2
    n_groups = 40
    num_action = 40
    theta_true = np.array([0.9, 0.4])
    per_group = num_action // n_groups

    phi_true = np.zeros((num_context, num_action, dim))
    for ctx in range(num_context):
        for group in range(n_groups):
            # Reward band that every action of this group must fall into.
            band_low = 0.975 - group * 0.025
            band_high = 1.0 - group * 0.025
            for slot in range(per_group):
                # Rejection sampling: draw random unit vectors until one
                # lands inside the band.
                accepted = False
                while not accepted:
                    candidate = np.random.randn(dim)
                    candidate = candidate / np.linalg.norm(candidate)
                    accepted = band_low <= np.dot(candidate, theta_true) <= band_high
                phi_true[ctx][group * per_group + slot] = candidate

    # Noisy features: one independent Gaussian perturbation per action,
    # drawn context by context, action by action.
    psi_true = np.zeros((num_context, num_action, dim))
    for ctx, act in np.ndindex(num_context, num_action):
        psi_true[ctx, act] = phi_true[ctx, act] + np.random.normal(0, 2.5 * 1e-3, dim)

    rewards = np.dot(phi_true, theta_true)

    return phi_true, psi_true, theta_true, rewards, dim

In [None]:
# calculate the beta
def get_beta(rho, delta, V_bar, theta_true):
    """Radius of the confidence ellipsoid around the regularised estimate.

    Computes
    ``rho * sqrt(2 * log(sqrt(det(V_bar)) * det(ld * I)^(-1/2) / delta))
    + sqrt(ld) * ||theta_true||`` with the regulariser ``ld`` fixed to 1.

    Parameters
    ----------
    rho : float
        Multiplier of the square-root term.
    delta : float
        Failure probability; it divides the argument of the logarithm.
    V_bar : array_like, shape (dim, dim)
        Regularised design matrix.
    theta_true : array_like
        Parameter vector whose Euclidean norm enters the second term.

    Returns
    -------
    float
        The confidence radius ``beta``.
    """
    ld = 1
    # Take the dimension from V_bar itself instead of a module-level `d`,
    # so the function does not depend on hidden notebook state.
    dim = np.shape(V_bar)[0]
    det_ratio = np.sqrt(np.linalg.det(V_bar)) * (np.linalg.det(ld * np.eye(dim)) ** (-1/2))
    beta = rho * np.sqrt(2 * np.log(det_ratio / delta)) + np.sqrt(ld) * np.linalg.norm(theta_true)

    return beta

In [None]:
# optimistic action selection (closed form; replaces the per-action SLSQP solve)
def get_decision(psi_action, theta_hat, V_bar, beta, d):
    """Pick the action with the largest optimistic reward.

    For every candidate feature vector ``psi`` the problem
    ``max_theta psi . theta  s.t.  ||theta - theta_hat||_{V_bar} <= beta``
    is a linear objective over an ellipsoid, whose maximiser is
    ``theta_hat + beta * V_bar^{-1} psi / sqrt(psi^T V_bar^{-1} psi)`` with
    value ``psi . theta_hat + beta * sqrt(psi^T V_bar^{-1} psi)``.
    Evaluating this directly is exact, whereas the previous numerical solve
    used 1e-10 finite-difference steps, a loose ``ftol`` and a start point
    that could lie outside the ellipsoid.

    Parameters
    ----------
    psi_action : array_like, shape (n_actions, dim)
        Candidate feature vectors, one per row.
    theta_hat : array_like, shape (dim,) or (dim, 1)
        Centre of the confidence ellipsoid.
    V_bar : array_like, shape (dim, dim)
        Symmetric positive-definite matrix defining the ellipsoid.
    beta : float
        Radius of the ellipsoid in the ``V_bar`` norm.
    d : int
        Kept for backward compatibility; the dimension is taken from the inputs.

    Returns
    -------
    decision : int
        Row index (into ``psi_action``) of the chosen action; ties resolve to
        the first maximiser.
    theta_tilde : ndarray, shape (dim,)
        The optimistic parameter associated with the chosen action.

    Raises
    ------
    ValueError
        If ``psi_action`` contains no action.
    """
    features = np.atleast_2d(np.asarray(psi_action, dtype=float))
    if features.size == 0:
        raise ValueError('psi_action must contain at least one action')

    center = np.asarray(theta_hat, dtype=float).reshape(-1)

    # Row k holds V_bar^{-1} psi_k; solve() avoids forming the explicit inverse.
    whitened = np.linalg.solve(np.asarray(V_bar, dtype=float), features.T).T
    # ||psi_k||_{V_bar^{-1}}, clipped at zero to absorb round-off.
    widths = np.sqrt(np.maximum(np.sum(features * whitened, axis=1), 0.0))

    optimistic_reward = features.dot(center) + beta * widths
    decision = np.argmax(optimistic_reward)

    if widths[decision] > 0:
        theta_tilde = center + beta * whitened[decision] / widths[decision]
    else:
        # A zero feature vector gives the same value for every theta.
        theta_tilde = center.copy()

    return decision, theta_tilde

In [None]:
def prepare_plot_data(data_list, trials, iterations, num_agent, every_num_point):
    """Average per-trial series and sub-sample them for plotting.

    Parameters
    ----------
    data_list : list of (trial_id, series)
        One entry per trial; ``series`` has ``iterations`` values.
    trials : int
        Number of trials; trial ids ``0 .. trials - 1`` are looked up.
    iterations : int
        Length of every series.
    num_agent : sequence
        Only ``num_agent[0]`` is used; the averaged values are divided by it.
    every_num_point : int
        Sampling stride. Index 0 is always kept, followed by indices
        ``every_num_point - 1, 2 * every_num_point - 1, ...``.

    Returns
    -------
    x_value : list of int
        Sampled iteration indices, strictly increasing.
    y_value : list of float
        Trial-averaged values (divided by ``num_agent[0]``) at ``x_value``.

    Raises
    ------
    ValueError
        If ``every_num_point`` is not positive, or if the collected series
        cannot be arranged as a ``(trials, iterations)`` array (for example
        when a trial id occurs more than once in ``data_list``).
    """
    if every_num_point < 1:
        raise ValueError('every_num_point must be a positive integer')

    per_trial = [[item[1] for item in data_list if item[0] == T] for T in range(trials)]
    stacked = np.array(per_trial, dtype=float).reshape(trials, iterations)
    data_value = stacked.sum(axis=0) / trials

    # Using a set removes the duplicated index 0 that the stride rule would
    # otherwise emit a second time when every_num_point == 1.
    sample_idx = sorted({0, *range(every_num_point - 1, iterations, every_num_point)})

    x_value = list(sample_idx)
    y_value = [data_value[k] / num_agent[0] for k in sample_idx]

    return x_value, y_value

In [None]:
# ---- problem constants -------------------------------------------------
d, R, ld = 2, 1, 1                # feature dimension, R (enters rho and the TS beta), regulariser
alpha = 0.01                      # learner must reach (1 - alpha) * baseline reward
delta_value = 1e-3                # confidence level handed to the beta computations
baseline_idx = 10                 # position of the baseline action in the reward ranking

# ---- experiment size ---------------------------------------------------
trials = 100
iterations = 20000
num_agent = [1]
every_num_point = 25              # sampling stride of the regret curves

# ---- per-round optimal / obtained / baseline reward logs, one list per algorithm
optimal_alg, optimal_ECC, optimal_sw, optimal_sw_UCB = [], [], [], []
reward_alg, reward_ECC, reward_sw, reward_sw_UCB = [], [], [], []
baseline_alg, baseline_ECC, baseline_sw, baseline_sw_UCB = [], [], [], []

# ---- cumulative regret / violation / baseline-play logs, one triple per algorithm
cummulative_regret_alg, cumulative_violate_alg, cumulative_baseline_alg = [], [], []
cummulative_regret_ECC, cumulative_violate_ECC, cumulative_baseline_ECC = [], [], []
cummulative_regret_sw, cumulative_violate_sw, cumulative_baseline_sw = [], [], []
cummulative_regret_sw_UCB, cumulative_violate_sw_UCB, cumulative_baseline_sw_UCB = [], [], []

# Placeholder value; the data-generation cell rebinds theta_true to the
# vector returned by generate_data.
theta_true = np.array([1, 1])

In [None]:
# Pre-generate, for every trial, all random inputs shared by the three
# experiment cells below: features, context order, reward noise, the
# perturbation weights rho_bar and the perturbation directions zeta.
# The statement order matters: every draw comes from the global np.random stream.
phi_true_group_T = []
psi_true_group_T = []
sample_id_T = []
noise_T = []
r_h_T = []
r_l_T = []
rho_bar_T = []
zeta_T = []

for T in range(trials):
    
    # NOTE: this rebinds the module-level theta_true and d to the values
    # returned by generate_data.
    phi_true, psi_true, theta_true, rewards, d = generate_data(int(1e2))
    # L is not used again in this cell; the experiment cells recompute it.
    L = np.max(np.linalg.norm(phi_true, axis = 2))
    
    # Variants of the exact features with the second (resp. first) coordinate zeroed.
    phi_true_1 = phi_true.copy()
    phi_true_1[:, :, 1] = 0

    phi_true_2 = phi_true.copy()
    phi_true_2[:, :, 0] = 0
    
    # Same two variants for the noisy features.
    psi_true_1 = psi_true.copy()
    psi_true_1[:, :, 1] = 0

    psi_true_2 = psi_true.copy()
    psi_true_2[:, :, 0] = 0
    
    # Nine feature sets (the three variants repeated three times); the
    # experiment cells index this array by agent number.
    phi_true_group = np.array([phi_true, phi_true_1, phi_true_2] * 3)
    psi_true_group = np.array([psi_true, psi_true_1, psi_true_2] * 3)
    
    # generate the index set for context
    sample_id = np.random.randint(0, np.shape(phi_true)[0], size = iterations)

    # generate the noise of reward for each iteration
    # NOTE(review): a full (agents, trials, iterations) array is drawn for every
    # trial, but the experiment cells only read noise[i][T][t-1] from it, so
    # most of it is never used. The same holds for rho_bar and zeta below.
    # Shrinking these requires changing the indexing in the experiment cells too.
    noise = np.random.normal(0, 1e-3, size = (num_agent[-1], trials, iterations))

    # calculate r_l and r_h
    # For every feature set and context, take the baseline_idx-th largest true
    # reward; r_h / r_l are the max / min of those values.
    # NOTE(review): the experiment cells pick the baseline action with
    # argsort(...)[::-1][baseline_idx], i.e. one rank lower than the
    # -baseline_idx used here -- confirm which one is intended.
    product_results = np.dot(phi_true_group, theta_true)
    sorted_values = np.sort(product_results, axis=2)
    all_baseline_values = sorted_values[:, :, -baseline_idx]
    r_h = np.max(all_baseline_values)
    r_l = np.min(all_baseline_values)

    # generate rho_bar
    # Uniform weights in [1e-10, alpha * r_l / (||theta_true|| + r_h)).
    rho_bar = np.random.uniform(1e-10, np.min(alpha) * r_l / (np.linalg.norm(theta_true) + r_h), size = (trials, iterations))

    # generate zeta
    # Gaussian draws, centred across the coordinate axis and scaled to unit norm.
    zeta_data = np.random.normal(0, 1e-3, (trials, iterations, len(theta_true)))
    zeta_zero = zeta_data - np.mean(zeta_data, axis = 2, keepdims = True)
    zeta = zeta_zero / np.linalg.norm(zeta_zero, axis = 2, keepdims = True)
    
    phi_true_group_T.append(phi_true_group)
    psi_true_group_T.append(psi_true_group)
    sample_id_T.append(sample_id)
    noise_T.append(noise)
    r_h_T.append(r_h)
    r_l_T.append(r_l)
    rho_bar_T.append(rho_bar)
    zeta_T.append(zeta)

In [None]:
# our setting
# Conservative distributed UCB run (plotted later under the label 'DiSC-UCB').
# Each round an agent either plays the optimistic action from a trimmed action
# set or falls back to a perturbed baseline action.
# NOTE(review): results are appended to module-level lists, so re-running this
# cell without re-running the configuration cell duplicates every trial entry.
start = time.time()

for M in num_agent:
    
    for T in range(trials):
        
        # Pre-generated inputs of trial T.
        phi_true_group = phi_true_group_T[T]
        psi_true_group = psi_true_group_T[T]
        sample_id = sample_id_T[T]
        noise = noise_T[T]
        r_h = r_h_T[T]
        r_l = r_l_T[T]
        rho_bar = rho_bar_T[T]
        zeta = zeta_T[T]
        # Largest feature norm of the first (unmodified) feature set.
        L = np.max(np.linalg.norm(phi_true_group[0], axis = 2))
        
        total_regret = 0
        total_reward = 0
        total_violate = 0
        total_baseline = 0
        cummulative_regret = []
        cummulative_violate = []
        cummulative_baseline = []
        optimal_list = []
        reward_list = []
        baseline_list = []
        
        # Shared statistics as of the last synchronisation.
        t_last = 0
        V_last = ld * np.eye(d)
        W_syn = np.zeros((d, d))
        U_syn = np.zeros((d, 1))
        
        # Per-agent statistics gathered since the last synchronisation.
        W_new_list = [np.zeros((d, d)) for i in range(M)]
        U_new_list = [np.zeros((d, 1)) for i in range(M)]
        
        # Synchronisation threshold compared against LHS_condition below.
        B = (iterations * np.log(M * iterations)) / (d * M)
        print('B is equal to', B)
        
        syn = 0

        for t in range(1, iterations + 1): 
            
            # NOTE(review): several lines are printed per agent and round
            # (iterations * trials rounds in total); consider a progress bar.
            print('')
            print('Trial: {} t: {}:'.format(T,t))
            
            # Context presented in round t.
            index = sample_id[t-1]
            
            for i in range(M):
                
                # Exact (phi) and noisy (psi) features of agent i for this context.
                phi = phi_true_group[i][index]
                psi = psi_true_group[i][index]

                # The reference action is the argmax under the noisy features,
                # evaluated with the exact ones; the printed "best decision"
                # is the argmax under the exact features.
                x_star = np.argmax(np.dot(np.array(psi), theta_true))
                optimal = np.dot(np.array(phi[x_star]), theta_true)
                print('best decision is:', np.argmax(np.dot(phi, theta_true)))

                # Baseline action: position baseline_idx in the descending
                # ranking of true rewards.
                x_b = np.argsort(phi.dot(theta_true))[::-1][baseline_idx]
                r_b = phi[x_b].dot(theta_true)
                print('baseline decision is:', x_b)
                
                # Regularised least-squares estimate from shared + local statistics.
                V_bar = ld * np.eye(d) + W_syn + W_new_list[i]
                theta_hat = np.dot(np.linalg.inv(V_bar), (U_syn + U_new_list[i]))

                # construct the confidence ellipsoid beta
                beta = get_beta(rho = np.sqrt(1 + R ** 2), delta = delta_value / 2, V_bar = V_bar, theta_true = theta_true)
                
                #construct the trimmed action set
                # Keep actions whose estimated reward exceeds
                # beta * L / sqrt(lambda_min(V_bar)) + (1 - alpha) * r_b.
                tas = (psi.dot(theta_hat) >= beta * L / np.sqrt(np.min(np.linalg.eigvals(V_bar))) + (1 - alpha) * r_b)
                phi_set = phi[tas.ravel()]
                psi_set = psi[tas.ravel()]
                
                # get the best action
                # The learner only acts when the trimmed set is non-empty and
                # lambda_min(V_bar) passes the threshold on the right-hand side.
                if (psi_set.size != 0) and (np.min(np.linalg.eigvals(V_bar)) >= np.square(2 * L * beta / ((optimal - r_h) + alpha * r_b))):
                
                    # `decision` indexes the trimmed set, not the full action list.
                    decision, theta_tilde = get_decision(psi_set, theta_hat, V_bar, beta, d)
                    psi_new = psi_set[decision]
                    y = np.dot(phi_set[decision], theta_true)
                    print("play learner's decision:", decision)
                    
                else:
                    
                    # Conservative play: baseline action mixed with the
                    # direction zeta using weight rho_bar.
                    decision = x_b
                    total_baseline += 1
                    psi_new = (1 - rho_bar[T][t-1]) * psi[decision] + rho_bar[T][t-1] * zeta[T][t-1]
                    y = (1 - rho_bar[T][t-1]) * np.dot(phi[decision], theta_true) + rho_bar[T][t-1] * np.dot(zeta[T][t-1], theta_true)
                    print("play conservative decision:", decision)
                    
                # Regret and reward are tracked without the observation noise.
                regret = optimal - y
                total_regret = total_regret + regret
                total_reward = total_reward + y

                # update W_new and U_new
                # The observation noise only enters the regression target here.
                W_new_list[i] = W_new_list[i] + np.outer(psi_new, psi_new)
                U_new_list[i] = U_new_list[i] + psi_new.reshape(-1, 1) * (y + noise[i][T][t-1])
                V = ld * np.eye(d) + W_syn + W_new_list[i]
                
                # Synchronisation test: log-determinant growth since the last
                # synchronisation times the number of rounds elapsed.
                LHS_condition = np.log(np.linalg.det(V) / np.linalg.det(V_last)) * (t - t_last)
                
                if LHS_condition >= B:
                    
                    print('synchronization start for agent', i)
                    print('LHS condition is:', LHS_condition)
                    
                    syn = 1
                    
                print('----------')
                
                # Count rounds whose reward falls below (1 - alpha) * baseline reward.
                if y < (1 - alpha) * r_b:
                    
                    total_violate += 1
                    print('violate the constraint1111111111111111111111111111111111111')
                    
            # After all agents acted: merge local statistics into the shared
            # ones if any agent requested a synchronisation.
            if syn == 1:
                
                W_syn = W_syn + np.sum(W_new_list, axis=0)
                U_syn = U_syn + np.sum(U_new_list, axis=0)
                
                W_new_list = [np.zeros((d, d)) for i in range(M)]
                U_new_list = [np.zeros((d, 1)) for i in range(M)]
                t_last = t
                V_last = ld * np.eye(d) + W_syn
                
                syn = 0
                
            # Per-round logs; phi, y and r_b are those of the last agent in the loop above.
            print('cummulative_regret is: ', total_regret)
            cummulative_regret.append(total_regret)
            cummulative_violate.append(total_violate)
            cummulative_baseline.append(total_baseline)
            optimal_list.append(np.max(np.dot(phi, theta_true)))
            reward_list.append(y)
            baseline_list.append((1 - alpha) * r_b)
            
        # Store the trial's series keyed by trial id.
        cummulative_regret_alg.append((T, cummulative_regret))
        cumulative_violate_alg.append((T, cummulative_violate))
        cumulative_baseline_alg.append((T, cummulative_baseline))
        optimal_alg.append((T, optimal_list))
        reward_alg.append((T, reward_list))
        baseline_alg.append((T, baseline_list))
        
end = time.time()
print('Finished! The total time we use is: ', end - start)

In [None]:
# ECC setting
# Unconstrained distributed UCB run (plotted later under the label 'DisLinUCB'):
# every round the agent plays the optimistic action over the full action set;
# constraint violations are only counted, never prevented.
# NOTE(review): results are appended to module-level lists, so re-running this
# cell without re-running the configuration cell duplicates every trial entry.
start = time.time()

for M in num_agent:
    
    for T in range(trials):
        
        # Pre-generated inputs of trial T (same data as the other experiments).
        phi_true_group = phi_true_group_T[T]
        psi_true_group = psi_true_group_T[T]
        sample_id = sample_id_T[T]
        noise = noise_T[T]
        r_h = r_h_T[T]
        r_l = r_l_T[T]
        rho_bar = rho_bar_T[T]
        zeta = zeta_T[T]
        L = np.max(np.linalg.norm(phi_true_group[0], axis = 2))
        
        total_regret = 0
        total_reward = 0
        total_violate = 0
        # Never incremented in this cell; logged as a constant zero series.
        total_baseline = 0
        cummulative_regret = []
        cummulative_violate = []
        cummulative_baseline = []
        optimal_list = []
        reward_list = []
        baseline_list = []
        
        # Shared statistics as of the last synchronisation.
        t_last = 0
        V_last = ld * np.eye(d)
        W_syn = np.zeros((d, d))
        U_syn = np.zeros((d, 1))
        
        # Per-agent statistics gathered since the last synchronisation.
        W_new_list = [np.zeros((d, d)) for i in range(M)]
        U_new_list = [np.zeros((d, 1)) for i in range(M)]
        
        # Synchronisation threshold compared against LHS_condition below.
        B = (iterations * np.log(M * iterations)) / (d * M)
        print('B is equal to', B)
        
        syn = 0

        for t in range(1, iterations + 1): 
            
            print('')
            print('Trial: {} t: {}:'.format(T,t))
            
            # Context presented in round t.
            index = sample_id[t-1]
            
            for i in range(M):
                
                # Exact (phi) and noisy (psi) features of agent i for this context.
                phi = phi_true_group[i][index]
                psi = psi_true_group[i][index]

                # Reference action: argmax under the noisy features, evaluated
                # with the exact ones.
                x_star = np.argmax(np.dot(np.array(psi), theta_true))
                optimal = np.dot(np.array(phi[x_star]), theta_true)
                print('best decision is:', np.argmax(np.dot(phi, theta_true)))

                # The baseline action is only used to count violations here.
                x_b = np.argsort(phi.dot(theta_true))[::-1][baseline_idx]
                r_b = phi[x_b].dot(theta_true)
                print('baseline decision is:', x_b)

                # Regularised least-squares estimate from shared + local statistics.
                V_bar = ld * np.eye(d) + W_syn + W_new_list[i]
                theta_hat = np.dot(np.linalg.inv(V_bar), (U_syn + U_new_list[i]))

                # construct the confidence ellipsoid beta
                beta = get_beta(rho = np.sqrt(1 + R ** 2), delta = delta_value / 2, V_bar = V_bar, theta_true = theta_true)
                
                # get the best combination of action and theta_tilde
                decision, theta_tilde = get_decision(psi, theta_hat, V_bar, beta, d)
                print('Decision:', decision)
                
                # Reward and regret are tracked without the observation noise.
                y = np.dot(phi[decision], theta_true)
                regret = optimal - y
                total_regret = total_regret + regret
                total_reward = total_reward + y

                # update W_new and U_new
                # The observation noise only enters the regression target here.
                W_new_list[i] = W_new_list[i] + np.outer(psi[decision], psi[decision])
                U_new_list[i] = U_new_list[i] + psi[decision].reshape(-1, 1) * (y + noise[i][T][t-1])
                V = ld * np.eye(d) + W_syn + W_new_list[i]
                
                # Synchronisation test: log-determinant growth since the last
                # synchronisation times the number of rounds elapsed.
                LHS_condition = np.log(np.linalg.det(V) / np.linalg.det(V_last)) * (t - t_last)
                
                if LHS_condition >= B:
                    
                    print('synchronization start for agent', i)
                    print('LHS condition is:', LHS_condition)
                    
                    syn = 1
                    
                print('----------')
                
                # Count rounds whose reward falls below (1 - alpha) * baseline reward.
                if y < (1 - alpha) * r_b:
                    
                    total_violate += 1
                    print('violate the constraint')
                
            # After all agents acted: merge local statistics into the shared
            # ones if any agent requested a synchronisation.
            if syn == 1:
                
                W_syn = W_syn + np.sum(W_new_list, axis=0)
                U_syn = U_syn + np.sum(U_new_list, axis=0)
                
                W_new_list = [np.zeros((d, d)) for i in range(M)]
                U_new_list = [np.zeros((d, 1)) for i in range(M)]
                t_last = t
                V_last = ld * np.eye(d) + W_syn
                
                syn = 0
                    
            # Per-round logs; phi, y and r_b are those of the last agent in the loop above.
            print('cummulative_regret is: ', total_regret)
            cummulative_regret.append(total_regret)
            cummulative_violate.append(total_violate)
            cummulative_baseline.append(total_baseline)
            optimal_list.append(np.max(np.dot(phi, theta_true)))
            reward_list.append(y)
            baseline_list.append((1 - alpha) * r_b)
            
        # Store the trial's series keyed by trial id.
        cummulative_regret_ECC.append((T, cummulative_regret))
        cumulative_violate_ECC.append((T, cummulative_violate))
        cumulative_baseline_ECC.append((T, cummulative_baseline))
        optimal_ECC.append((T, optimal_list))
        reward_ECC.append((T, reward_list))
        baseline_ECC.append((T, baseline_list))
        
end = time.time()
print('Finished! The total time we use is: ', end - start)

In [None]:
# stage-wise TS setting
# Single-agent Thompson-sampling run with a stage-wise safety constraint
# (plotted later under the label 'SCLTS'). Only the first feature set and
# the noise of agent 0 are used.
# NOTE(review): results are appended to module-level lists, so re-running this
# cell without re-running the configuration cell duplicates every trial entry.
start = time.time()

for T in range(trials):
        
    # Pre-generated inputs of trial T (same data as the other experiments).
    phi_true_group = phi_true_group_T[T]
    psi_true_group = psi_true_group_T[T]
    sample_id = sample_id_T[T]
    noise = noise_T[T]
    r_h = r_h_T[T]
    r_l = r_l_T[T]
    rho_bar = rho_bar_T[T]
    zeta = zeta_T[T]
    L = np.max(np.linalg.norm(phi_true_group[0], axis = 2))
        
    total_regret = 0
    total_reward = 0
    total_violate = 0
    total_baseline = 0
    cummulative_regret = []
    cummulative_violate = []
    cummulative_baseline = []
    optimal_list = []
    reward_list = []
    baseline_list = []
    
    # Confidence level used inside the beta formula below.
    delta_ = delta_value / (10 * iterations)
    
    # Accumulated outer products and reward-weighted features.
    W = np.zeros((d, d))
    U = np.zeros((d, 1))

    for t in range(1, iterations + 1): 

        print('')
        print('Trial: {} t: {}:'.format(T,t))
        
        # Context presented in round t.
        index = sample_id[t-1]
        phi = phi_true_group[0][index]
        psi = psi_true_group[0][index]

        # Reference action: argmax under the noisy features, evaluated with
        # the exact ones.
        x_star = np.argmax(np.dot(np.array(psi), theta_true))
        optimal = np.dot(np.array(phi[x_star]), theta_true)
        print('best decision is:', np.argmax(np.dot(phi, theta_true)))

        # Baseline action: position baseline_idx in the descending ranking of
        # true rewards.
        x_b = np.argsort(phi.dot(theta_true))[::-1][baseline_idx]
        r_b = phi[x_b].dot(theta_true)
        print('baseline decision is:', x_b)
        
        # sample eta
        # One standard-normal draw per round from the global RNG stream.
        eta = np.random.multivariate_normal(mean = np.zeros(d), cov = np.eye(d))
        
        # compute RLS-estimate theta_hat and V
        V = ld * np.eye(d) + W
        theta_hat = np.dot(np.linalg.inv(V), U)
        
        # compute the beta
        beta = R * np.sqrt(d * np.log((1 + t * L ** 2 / ld) / delta_)) + np.sqrt(ld) * np.linalg.norm(theta_true)
        
        # calculate V^{-1/2}
        # Built from the eigen-decomposition of the symmetric matrix V.
        eigvals, eigvecs = np.linalg.eigh(V)
        temp = eigvecs.dot(np.diag(1.0 / np.sqrt(eigvals))).dot(eigvecs.T)
        
        # compute theta_tilde
        # Perturbed estimate: theta_hat + beta * V^{-1/2} eta.
        theta_tilde = theta_hat + beta * (temp.dot(eta)).reshape(-1, 1)
        
        # compute the estimated safe set Xi
        # An action is kept when its lower confidence bound
        # psi . theta_hat - beta * ||psi||_{V^{-1}} reaches (1 - alpha) * r_b.
        # NOTE(review): np.linalg.inv(V) is re-evaluated for every action in the
        # comprehension although V does not change within the round.
        Xi = (psi.dot(theta_hat) - (beta * np.array([np.sqrt(psi[i].dot(np.linalg.inv(V)).dot(psi[i].T)) for i in range(psi.shape[0])]).reshape(-1, 1)) >= (1 - alpha) * r_b)
        phi_set = phi[Xi.ravel()]
        psi_set = psi[Xi.ravel()]
        
        # get the best action
        # The learner only acts when the safe set is non-empty and
        # lambda_min(V) passes the threshold on the right-hand side.
        if (psi_set.size != 0) and (np.min(np.linalg.eigvals(V)) >= np.square(2 * L * beta / ((optimal - r_h) + alpha * r_b))):

            # `decision` indexes the safe set, not the full action list.
            decision = np.argmax(psi_set.dot(theta_tilde))
            psi_new = psi_set[decision]
            y = np.dot(phi_set[decision], theta_true)
            print("play learner's decision:", decision)

        else:
            
            # Conservative play: baseline action mixed with the direction
            # zeta using weight rho_bar.
            decision = x_b
            total_baseline += 1
            psi_new = (1 - rho_bar[T][t-1]) * psi[decision] + rho_bar[T][t-1] * zeta[T][t-1]
            y = (1 - rho_bar[T][t-1]) * np.dot(phi[decision], theta_true) + rho_bar[T][t-1] * np.dot(zeta[T][t-1], theta_true)
            print("play conservative decision:", decision)
            
        # Regret and reward are tracked without the observation noise.
        regret = optimal - y
        total_regret = total_regret + regret
        total_reward = total_reward + y
        
        # The observation noise only enters the regression target here.
        W += np.outer(psi_new, psi_new)
        U += psi_new.reshape(-1, 1) * (y + noise[0][T][t-1])
                
        # Count rounds whose reward falls below (1 - alpha) * baseline reward.
        if y < (1 - alpha) * r_b:

            total_violate += 1
            print('violate the constraint')
        
        print('cummulative_regret is: ', total_regret)
        cummulative_regret.append(total_regret)
        cummulative_violate.append(total_violate)
        cummulative_baseline.append(total_baseline)
        optimal_list.append(np.max(np.dot(phi, theta_true)))
        reward_list.append(y)
        baseline_list.append((1 - alpha) * r_b)
        
    # Store the trial's series keyed by trial id.
    cummulative_regret_sw.append((T, cummulative_regret))
    cumulative_violate_sw.append((T, cummulative_violate))
    cumulative_baseline_sw.append((T, cummulative_baseline))
    optimal_sw.append((T, optimal_list))
    reward_sw.append((T, reward_list))
    baseline_sw.append((T, baseline_list))
    
end = time.time()
print('Finished! The total time we use is: ', end - start)

In [None]:
def _curves_for(regret_log, optimal_log, reward_log, baseline_log):
    """Trial-averaged curves of one algorithm.

    The regret curve is sampled every `every_num_point` rounds; the optimal,
    obtained and baseline reward curves are sampled with stride 1.
    """
    x_coarse, regret_curve = prepare_plot_data(regret_log, trials, iterations, num_agent, every_num_point)
    x_fine, optimal_curve = prepare_plot_data(optimal_log, trials, iterations, num_agent, 1)
    x_fine, reward_curve = prepare_plot_data(reward_log, trials, iterations, num_agent, 1)
    x_fine, baseline_curve = prepare_plot_data(baseline_log, trials, iterations, num_agent, 1)
    return x_coarse, x_fine, regret_curve, optimal_curve, reward_curve, baseline_curve


# The x axes are identical for all three algorithms, so they are simply rebound.
x_value, x_value_r, y_alg, y_optimal_alg, y_reward_alg, y_baseline_alg = _curves_for(
    cummulative_regret_alg, optimal_alg, reward_alg, baseline_alg)

x_value, x_value_r, y_ECC, y_optimal_ECC, y_reward_ECC, y_baseline_ECC = _curves_for(
    cummulative_regret_ECC, optimal_ECC, reward_ECC, baseline_ECC)

x_value, x_value_r, y_sw, y_optimal_sw, y_reward_sw, y_baseline_sw = _curves_for(
    cummulative_regret_sw, optimal_sw, reward_sw, baseline_sw)

In [None]:
# plot the figure: trial-averaged cumulative regret of the three algorithms
colors = ['black', 'blue', 'darkgreen', 'purple', 'darkred', 'grey']
markers = ['*', 's', 'o', 'X', '^', 'P']

# Tick and legend font sizes must be set before the axes are created.
plt.rcParams['xtick.labelsize'] = 25
plt.rcParams['ytick.labelsize'] = 25
plt.rc('legend', fontsize = 25)

fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
ax.ticklabel_format(style='sci', axis='both', scilimits=(0, 0), useOffset=False)

# (legend label, regret curve, index into colors/markers)
regret_curves = [('DiSC-UCB', y_alg, 1),
                 ('DisLinUCB', y_ECC, 2),
                 ('SCLTS', y_sw, 3)]

# One combined line+marker legend entry per algorithm.
# `mlines` is matplotlib.lines, imported with the other dependencies at the
# top of the notebook.
legend_elements = []

for curve_label, curve, style_idx in regret_curves:

    ax.plot(x_value, curve, color = colors[style_idx], linewidth=3)
    # Draw a marker on every 100th sampled point only.
    ax.scatter(x_value[::100], curve[::100], marker = markers[style_idx], color = colors[style_idx], s=300)
    legend_elements.append(mlines.Line2D([0], [0], color=colors[style_idx], lw = 5, label = curve_label,
                                         marker = markers[style_idx], markersize = 15))

ax.grid(True)
ax.set_xlabel('round,t', fontsize=25)
ax.set_ylabel('cumulative regret Rt', fontsize=25)
ax.set_title('synthetic data', fontsize=25)
ax.legend(handles=legend_elements)
# fig.savefig('plot_1.pdf', dpi=600, bbox_inches = 'tight')
plt.show()
plt.close(fig)

In [None]:
# plot the figure: per-round reward of DiSC-UCB against the optimal reward
# and the conservative threshold
colors = ['black', 'blue', 'darkgreen', 'purple', 'darkred', 'grey']
markers = ['*', 's', 'o', 'X', '^', 'P']

plt.rcParams.update({'xtick.labelsize': 25,
                     'ytick.labelsize': 25,
                     'legend.fontsize': 25})

fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
ax.ticklabel_format(style='sci', axis='both', scilimits=(0, 0), useOffset=False)

# The two reference curves are thinned to every 100th round; the learner's
# reward is drawn at full resolution.
ax.plot(x_value_r[::100], y_optimal_alg[::100], label = 'Optimal-Reward', linestyle = '--', color = colors[0], linewidth=3)
ax.plot(x_value_r, y_reward_alg, label = 'DiSC-UCB', color = colors[1], linewidth=3)
ax.plot(x_value_r[::100], y_baseline_sw[::100], label = 'Conservative-Reward', linestyle = '--', color = colors[5], linewidth=3)

ax.set_ylim(0.5, 1.1)

ax.grid(True)
ax.set_xlabel('round,t', fontsize=25)
ax.set_ylabel('reward,r', fontsize=25)
ax.set_title('synthetic data', fontsize=25)
ax.legend()
# fig.savefig('plot_2.pdf', dpi=600, bbox_inches = 'tight')
plt.show()
plt.close(fig)

In [None]:
def preprocessing_plot(data):
    """Build the violation curve that is plotted across trials.

    Parameters
    ----------
    data : sequence of (trial_id, series)
        One entry per trial; ``series`` is that trial's cumulative violation
        count per round. Every entry is processed, so the function no longer
        depends on a module-level ``trials`` variable.

    Returns
    -------
    list of int
        A list as long as the first trial's series. It starts at 0 and grows
        by one at every round ``i >= 1`` for which at least one trial has a
        non-zero running change count. Returns ``[0]`` for empty ``data``.

    Notes
    -----
    NOTE(review): the per-trial change count starts at 0, so a violation in
    the very first round is not counted. Because the test is on the running
    count, the result keeps growing every round once any trial has recorded a
    change -- confirm that this is the intended definition.
    """
    if len(data) == 0:
        return [0]

    # Per trial: running number of rounds at which the cumulative series changed.
    change_counts = []

    for entry in data:

        series = entry[1]
        running = [0]

        for previous, current in zip(series, series[1:]):

            running.append(running[-1] + (1 if current != previous else 0))

        change_counts.append(running)

    list_vio = [0]

    for i in range(1, len(change_counts[0])):

        any_trial_nonzero = any(counts[i] != 0 for counts in change_counts)
        list_vio.append(list_vio[-1] + (1 if any_trial_nonzero else 0))

    return list_vio

In [None]:
# One x position per round, and the violation curve of each algorithm.
x_value_r = list(range(iterations))
num_violation_alg = preprocessing_plot(cumulative_violate_alg)
# The ECC experiment stores its violation series in cumulative_violate_ECC;
# the previously referenced name `ECC_violation` is never defined.
num_violation_ECC = preprocessing_plot(cumulative_violate_ECC)
num_violation_sw = preprocessing_plot(cumulative_violate_sw)

In [None]:
# plot the figure: violation curves of the three algorithms
colors = ['black', 'blue', 'darkgreen', 'purple', 'darkred', 'grey']
markers = ['*', 's', 'o', 'X', '^', 'P']

plt.rcParams.update({'xtick.labelsize': 25,
                     'ytick.labelsize': 25,
                     'legend.fontsize': 25})

fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
ax.ticklabel_format(style='sci', axis='both', scilimits=(0, 0), useOffset=False)

# (curve, legend label, colour index, line style)
violation_curves = [(num_violation_alg, 'DiSC-UCB', 1, '-'),
                    (num_violation_ECC, 'DisLinUCB', 2, '--'),
                    (num_violation_sw, 'SCLTS', 3, '-.')]

for curve, curve_label, color_idx, line_style in violation_curves:

    ax.plot(x_value_r, curve, label = curve_label, color = colors[color_idx], linewidth = 3, linestyle = line_style)

ax.grid(True)
ax.set_xlabel('round,t', fontsize=25)
ax.set_ylabel('cumulative violation', fontsize=25)
ax.set_title('synthetic data', fontsize=25)
ax.legend()
# fig.savefig('plot_3.pdf', dpi=600, bbox_inches = 'tight')
plt.show()
plt.close(fig)