In [None]:
# Mount Google Drive at /content/drive so that the project folder referenced
# by the later path-setup cell is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Directory (on the mounted Drive) that holds the project's helper modules
# imported by the following cell (utils, implementations, optimization_grad, ...).
PATH = "/content/drive/My Drive/Columbia_Sept_19_/Class/IEOR research/infinite_horizon_policy_gradient/NN_model_implementation"
import sys

# Register the directory on the import path only once, so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if PATH not in sys.path:
    sys.path.append(PATH)

In [5]:
from utils import *
from implementations import *
from optimization_grad import *
from variance_grad import *
from analysis import *
import copy

TensorBoard:

In [None]:
%load_ext tensorboard
import os
# Directory that the `%tensorboard --logdir "runs"` cell points at. It is created
# up front so TensorBoard can be launched before any run has written events.
# NOTE(review): the later cells call SummaryWriter() without arguments, which per
# the PyTorch docs writes under "runs/" by default -- confirm this stays in sync.
logs_base_dir = "runs"
os.makedirs(logs_base_dir, exist_ok=True)
from torch.utils.tensorboard import SummaryWriter

In [None]:
%tensorboard --logdir "runs"

# In-policy Optimization

In [None]:
# In-policy optimization: every estimator is handed the same policy twice
# (first and second argument) and then updates that very policy.

# define environment
environment = riverswim()

# policies: all four methods start from identical copies of one initial network
policy_benchmark = Policy().cuda()
policy_causality = copy.deepcopy(policy_benchmark).cuda()
policy_baseline  = copy.deepcopy(policy_benchmark).cuda()
policy_ihp1      = copy.deepcopy(policy_benchmark).cuda()

# configuration
# NOTE(review): presumably start_recording is a burn-in length and rolling_out_step
# the trajectory length; confirm against the estimator implementations.
rolling_out_start_recording = 1000
rolling_out_step = 15000
n_rolling_out = 5
optimization_step = 50
lr = 0.25

# One row per method: (TensorBoard tag, policy, gradient estimator, extra kwargs).
# The row order is the execution order inside each optimization step and also the
# column order of reward_list_in_policy.
in_policy_methods = [
    ("Benchmark", policy_benchmark, gradient_estimate_benchmark, {}),
    ("Causality", policy_causality, gradient_estimate_causality, {}),
    ("Baseline",  policy_baseline,  gradient_estimate_baseline,  {}),
    ("ihp1",      policy_ihp1,      gradient_estimate_ihp1,      {"in_policy": True}),
]

#visualization
tensorboard = SummaryWriter()
reward_list_in_policy = []
try:
    for time_step in tqdm(range(optimization_step), desc = "Optimization step", leave = True):
        step_rewards = {}
        for tag, policy, estimate_gradient, extra_kwargs in in_policy_methods:
            # Estimate the gradient and average reward for this method, then
            # immediately apply the update before moving on to the next method.
            policy_grad, reward = estimate_gradient(policy,
                                                    policy,
                                                    environment,
                                                    rolling_out_start_recording,
                                                    rolling_out_step,
                                                    n_rolling_out,
                                                    **extra_kwargs)
            update_parameter(policy = policy,
                             policy_gradient = policy_grad,
                             learning_rate = lr,
                             normalize = True)
            step_rewards[tag] = reward

        tensorboard.add_scalars(main_tag="Average Reward Per Step: In-Policy",
                                tag_scalar_dict=step_rewards,
                                global_step = time_step)
        # dicts keep insertion order, so this row is [benchmark, causality, baseline, ihp1]
        reward_list_in_policy.append(list(step_rewards.values()))
finally:
    # Close the writer even if an iteration fails, so its event file is not left open.
    tensorboard.close()

In [None]:
# Turn the per-iteration reward rows collected by the in-policy loop into a
# labelled table: one column per method, one row per optimization step.
reward_array_np = np.array(reward_list_in_policy)
df = pd.DataFrame(reward_array_np, columns= ["Benchmark", "Causality", "Baseline", "IHP"])

# sns.lineplot returns the Axes it drew on; label and save through that handle.
ax = sns.lineplot(data = df, dashes=False)
ax.set_title("Optimization Procedure: In-policy setting")
ax.set_xlabel("Iteration")
ax.set_ylabel("Average Reward Per Step")
ax.figure.savefig("in-policy_optimization.png")

# Off-policy Optimization

In [None]:
# Off-policy optimization: gradients for all four learners come from a single
# gradient_estimate_aggregate call that also receives policy_0.

# define environment
environment = riverswim()

# policies
# policy_0 is passed as the first argument of gradient_estimate_aggregate and is
# never handed to update_parameter in this cell; the learners start as copies of it.
policy_0 = Policy().cuda()
policy_benchmark = copy.deepcopy(policy_0).cuda()
policy_causality = copy.deepcopy(policy_0).cuda()
policy_baseline  = copy.deepcopy(policy_0).cuda()
policy_ihp1      = copy.deepcopy(policy_0).cuda()

# configuration
rolling_out_start_recording = 1000
rolling_out_step = 15000
n_rolling_out = 5
optimization_step = 100
lr = 0.15

# Learners keyed by TensorBoard tag. The order here fixes both the update /
# evaluation order and the column order of reward_list_off_policy.
off_policy_learners = {
    "Benchmark": policy_benchmark,
    "Causality": policy_causality,
    "Baseline":  policy_baseline,
    "ihp1":      policy_ihp1,
}

reward_list_off_policy = []

#visualization
tensorboard = SummaryWriter()

try:
    for time_step in tqdm(range(optimization_step), desc = "Optimization step", leave = True):

        # One aggregate call returns a gradient for each of the four learners.
        (policy_grad_benchmark,
         policy_grad_causality,
         policy_grad_baseline,
         policy_grad_ihp1) = gradient_estimate_aggregate(policy_0,
                                                         policy_benchmark,
                                                         policy_causality,
                                                         policy_baseline,
                                                         policy_ihp1,
                                                         environment,
                                                         rolling_out_start_recording,
                                                         rolling_out_step,
                                                         n_rolling_out)
        step_gradients = (policy_grad_benchmark,
                          policy_grad_causality,
                          policy_grad_baseline,
                          policy_grad_ihp1)

        # Apply all four updates first ...
        for learner, learner_gradient in zip(off_policy_learners.values(), step_gradients):
            update_parameter(policy = learner,
                             policy_gradient = learner_gradient,
                             learning_rate = lr,
                             normalize = True)

        # ... then evaluate each updated learner with its own roll-out.
        step_rewards = {
            tag: roll_out_evaluate_average(learner, environment, rolling_out_start_recording, rolling_out_step)
            for tag, learner in off_policy_learners.items()
        }

        tensorboard.add_scalars(main_tag="Average reward per step: off policy",
                                tag_scalar_dict=step_rewards,
                                global_step = time_step)
        # row layout: [benchmark, causality, baseline, ihp1]
        reward_list_off_policy.append(list(step_rewards.values()))
finally:
    # Close the writer even if an iteration fails, so its event file is not left open.
    tensorboard.close()

In [None]:
# Plot the off-policy learning curves.
# Fix: this cell previously read reward_list_in_policy, so the figure titled and
# saved as "Off-policy" actually showed the in-policy results again.
reward_array_np = np.array(reward_list_off_policy)
# One column per method, one row per optimization step.
df = pd.DataFrame(reward_array_np, columns= ["Benchmark", "Causality", "Baseline", "IHP"])
sns.lineplot(data = df, dashes=False)
plt.title("Optimization Procedure: Off-policy setting")
plt.xlabel("Iteration")
plt.ylabel("Average Reward Per Step")
plt.savefig("Off-policy_optimization.png")

# Variance Estimate with Individual Rollouts

In [None]:
# define environment
env = riverswim()

# policies: policy_1 starts as an exact copy of policy_0
# (alternative setup: use an independently initialised `Policy().cuda()` for policy_1)
policy_0 = Policy().cuda()
policy_1 = copy.deepcopy(policy_0).cuda()

# configuration
rolling_out_start_recording = 1000
rolling_out_step = 15000
n_rolling_out = 100

# Every estimator below receives exactly the same keyword arguments.
variance_kwargs = dict(
    policy_0 = policy_0,
    policy_1 = policy_1,
    env = env,
    rolling_out_start_recording = rolling_out_start_recording,
    rolling_out_step = rolling_out_step,
    n_rolling_out = n_rolling_out,
)

# Each estimator is invoked on its own, in this order: ihp1, benchmark, baseline, causality.
ihp1_record = variance_estimate_ihp1(**variance_kwargs)
benchmark_record = variance_estimate_benchmark(**variance_kwargs)
baseline_record = variance_estimate_baseline(**variance_kwargs)
causality_record = variance_estimate_causality(**variance_kwargs)


In [None]:
# Bundle the four records once so both reports receive them in the same order.
individual_records = (benchmark_record, causality_record, baseline_record, ihp1_record)
analysis(*individual_records)
# The final argument is a random integer drawn from [0, 20).
boxplot_analysis(*individual_records, np.random.randint(20))

# Common Random Number Variance Estimate

In [None]:
# define environment
env = riverswim()

# policies: policy_1 starts as an exact copy of policy_0
# (alternative setup: use an independently initialised `Policy().cuda()` for policy_1)
policy_0 = Policy().cuda()
policy_1 = copy.deepcopy(policy_0).cuda()

# configuration
rolling_out_start_recording = 1000
rolling_out_step = 15000
n_rolling_out = 100

# A single aggregate call produces the records of all four estimators.
aggregate_records = variance_estimate_aggregate(
    policy_0 = policy_0,
    policy_1 = policy_1,
    env = env,
    rolling_out_start_recording = rolling_out_start_recording,
    rolling_out_step = rolling_out_step,
    n_rolling_out = n_rolling_out,
)
benchmark_record, causality_record, baseline_record, ihp1_record = aggregate_records

In [None]:
# Bundle the four records once so both reports receive them in the same order.
crn_records = (benchmark_record, causality_record, baseline_record, ihp1_record)
analysis(*crn_records)
# The final argument is a random integer drawn from [0, 20).
boxplot_analysis(*crn_records, np.random.randint(20))