<h1><center>
    DSOR 646 - Reinforcement Learning <br>
    Brandon Hosley
</center></h1>

# On-policy Monte Carlo Control (every-visit):

- [ ] Tune for $Q_1$, $\varepsilon_a$, and $\varepsilon_b$.
- [ ] Execute at least 40 runs
- [ ] At least 10 reps per run
- [ ] Report Mean Time-Avg EETDR and Mean Max EETDR
- [ ] When milestone testing a policy, use at least 30 episodes
- [ ] Use the lower bound of the confidence interval, i.e., 95CILB, to identify a superlative policy
- [ ] Show a scatter plot (See the Lesson 9 Handout for an example of the desired scatter plot)

In [None]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

# Build the CartPole-v1 environment used by the training loop below.
env = gym.make('CartPole-v1')

# Hyperparameters
gamma = 0.99          # discount factor applied when accumulating returns
num_episodes = 5000   # number of training episodes
epsilon = 0.1         # initial exploration probability for the epsilon-greedy policy
decay_rate = 0.999    # epsilon is multiplied by this after every episode
min_epsilon = 0.01    # epsilon is never decayed below this floor

# Observed returns, keyed by (state, action)
returns = defaultdict(list)

# Action-value table Q(s, a): an unseen state lazily gets a zero vector
# with one entry per available action.
Q = defaultdict(lambda: np.zeros(env.action_space.n))

def discretize_state(state, bins):
    """Map a continuous state onto a tuple of bin indices.

    Args:
        state: indexable sequence of state components; component ``i`` is
            binned with ``bins[i]``.
        bins: sequence of bin-edge arrays, one per state component.

    Returns:
        A tuple with one ``np.digitize`` index per entry of ``bins``, which
        is hashable and can therefore be used as a dictionary key.
    """
    indices = []
    for dim, edges in enumerate(bins):
        # np.digitize yields 0 below the first edge and len(edges) at or
        # above the last one, so out-of-range values are still assigned a bin.
        indices.append(np.digitize(state[dim], edges))
    return tuple(indices)

# Bin edges used to discretize the continuous state space.
# Each (low, high) range is split into 10 evenly spaced edges.
_bin_ranges = [
    (-4.8, 4.8),      # Cart position
    (-5, 5),          # Cart velocity
    (-0.418, 0.418),  # Pole angle
    (-5, 5),          # Pole angular velocity
]
state_bins = [np.linspace(low, high, 10) for low, high in _bin_ranges]

# Epsilon-greedy policy
def epsilon_greedy_policy(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: discretized state used as a key into the global ``Q`` table.
        epsilon: probability of picking a uniformly random action.

    Returns:
        A random action index with probability ``epsilon``; otherwise the
        index of the largest entry of ``Q[state]``.
    """
    exploit = np.random.rand() >= epsilon
    if exploit:
        return np.argmax(Q[state])
    return np.random.choice(env.action_space.n)

# Monte Carlo Control Loop (on-policy, every-visit)
for episode in range(num_episodes):
    # Gymnasium's reset() returns an (observation, info) pair.
    observation, _ = env.reset()
    state = discretize_state(observation, state_bins)
    episode_data = []

    # Generate an episode following the current epsilon-greedy policy
    done = False
    while not done:
        action = epsilon_greedy_policy(state, epsilon)
        # Gymnasium's step() returns five values; the episode is over when it
        # either terminates (failure) or is truncated (time limit).
        next_observation, reward, terminated, truncated, _ = env.step(action)
        episode_data.append((state, action, reward))
        state = discretize_state(next_observation, state_bins)
        done = terminated or truncated

    # Walk the episode backwards, accumulating the discounted return G and
    # updating Q at EVERY visit of each (state, action) pair.
    G = 0.0
    for state, action, reward in reversed(episode_data):
        G = reward + gamma * G
        samples = returns[(state, action)]
        samples.append(G)
        # Incremental mean: equal to np.mean(samples) without re-summing the
        # whole list on every update.
        Q[state][action] += (G - Q[state][action]) / len(samples)

    # Decay epsilon
    epsilon = max(min_epsilon, epsilon * decay_rate)

# Demonstrate the learned policy
num_demo_episodes = 10
# `env` was created without a render_mode, so env.render() would display
# nothing. Use a separate environment in 'human' mode, which renders each
# frame automatically during reset()/step().
demo_env = gym.make('CartPole-v1', render_mode='human')
for _ in range(num_demo_episodes):
    # Gymnasium's reset() returns an (observation, info) pair.
    observation, _info = demo_env.reset()
    state = discretize_state(observation, state_bins)
    done = False
    while not done:
        # Act greedily with respect to the learned action values.
        action = int(np.argmax(Q[state]))
        observation, _reward, terminated, truncated, _info = demo_env.step(action)
        state = discretize_state(observation, state_bins)
        done = terminated or truncated

demo_env.close()
env.close()


# Off-policy Monte Carlo Control (every-visit, weighted importance sampling):

# Compare

In [1]:
import numpy as np
import time
from scipy.stats.qmc import LatinHypercube
from joblib import Parallel, delayed
from typing import Tuple
from datetime import datetime
#from CPv1_MCC_onpoloicy_algorithm_for_parDOE import MCC_onpolicy_DOE

def run_experiment(
    run_index: int, factors: np.ndarray
) -> Tuple[int, float, float, float, float, float, float, float]:
    """Execute one design run of the on-policy MCC algorithm and report it.

    NOTE: ``MCC_onpolicy_DOE`` must be in scope when this is called; the
    import that would provide it is commented out near the top of this cell.

    Args:
        run_index: index of this run (row of the factor table).
        factors: the three algorithm factors for this run; they are passed
            positionally to ``MCC_onpolicy_DOE``.

    Returns:
        ``(run_index, maxETDR, maxETDRhw, meanMaxTestEETDR, maxTestHW,
        meanAULC, hwAULC, secs_taken)`` -- the run index followed by the
        seven values returned by ``MCC_onpolicy_DOE``, unchanged.
    """
    run_start_time = time.time()
    (maxETDR, maxETDRhw, meanMaxTestEETDR, maxTestHW,
     meanAULC, hwAULC, secs_taken) = MCC_onpolicy_DOE(factors[0], factors[1], factors[2])
    # Weighted score used only for the progress message; the "hw" terms are
    # presumably confidence-interval half-widths (TODO confirm).
    alg_score = 0.6*(meanMaxTestEETDR-maxTestHW) + 0.4*(meanAULC-hwAULC)
    print(f"Complete experiment run {run_index} with a score of {alg_score:.2f} ({time.time() - run_start_time:.1f}s)")
    return run_index, maxETDR, maxETDRhw, meanMaxTestEETDR, maxTestHW, meanAULC, hwAULC, secs_taken

In [2]:
# Number of worker processes used to run design points in parallel
NUM_CPU_CORE_PROCS = 6
# Number of design runs (rows of the factor table)
num_runs = 30

# Latin hypercube design over the algorithm factors
num_alg_feats = 3
rng_seed = 0
sampler = LatinHypercube(num_alg_feats, scramble=False, optimization="lloyd", seed=rng_seed)
factor_table = sampler.random(n=num_runs)

print(f"\nInitializing LHS experiment with {num_runs} runs...")
experiment_start_time = time.time()
# create an instance of the Parallel object to manage execution of our processes
parallel_manager = Parallel(n_jobs=NUM_CPU_CORE_PROCS)
# lazily build one delayed run_experiment() call per row of the factor table;
# each row is one algorithm design run, and delayed() defers the actual call
run_list = (delayed(run_experiment)(run_index, factor_table[run_index]) for run_index in range(num_runs))
# execute the run_experiment() calls in parallel
print("\nExecuting experiment...")
results_table = parallel_manager(run_list)
results_table = np.array(results_table)
print(f"\n\nCompleted experiment ({time.time () - experiment_start_time:.3f}s)")

# combine the factor table with the results table, add column headers, and save the data to a CSV file
# compute the algorithm run score: a 0.6/0.4 weighted sum of the 95% CI
# lower bounds (value minus half-width) for maximum and mean performance
maxEETDR_95CI_LB = results_table[:,3] - results_table[:,4]
meanEETDR_95CI_LB = results_table[:,5] - results_table[:,6]
score = 0.6*maxEETDR_95CI_LB + 0.4*meanEETDR_95CI_LB
results_table = np.column_stack((results_table[:,0], factor_table, results_table[:,1:], score))
# grab data for performance scatter plot (columns "Mean Max EETDR" and "Time-Avg EETDR")
x = results_table[:,6]
y = results_table[:,8]
column_names = ["Run Index", "eps_a", "eps_b", "Init Qbar", "Sup EETDR", "Sup EETDR hw", "Mean Max EETDR", 
                "Mean Max EETDR hw", "Time-Avg EETDR", "Time-Avg EETDR hw", "Secs per run", "Score"]
# np.vstack replaces the deprecated alias np.row_stack; prepending the header
# row turns the whole table into strings, hence the "%s" format below.
results_table = np.vstack((column_names, results_table))
filename_DOE = "MCC_onpolicy_results_DOE_" + datetime.now().strftime('%Y%m%d_%H%M%S') + ".csv"
# "%s" (not "&s"): savetxt requires a %-style format specifier
np.savetxt(filename_DOE, results_table, delimiter=",", fmt="%s")


"""
Plot Performance Results

For example, we can compare different algorithms or same algorithm with large, 
nominal structural differences using these visual aids
"""

# plotting library
import matplotlib.pyplot as plt

# one point per design run: x = mean maximum EETDR, y = mean time-average EETDR
fig, ax = plt.subplots()
ax.scatter(x, y, label="MCC (on-policy) -- 40 reps per run, 10k episodes per rep")

# axis limits
# NOTE(review): these limits only show points whose values lie in [0, 1] -- confirm
# that the plotted EETDR values are scaled to that range
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

# title and axis labels
ax.set_title("MCC (on-policy) LHS DOE Performance Results")
ax.set_xlabel("Mean Maximum EETDR")
ax.set_ylabel("Mean Time-Average EETDR")

# grid and legend
ax.grid()
ax.legend(loc='upper left', fontsize=7)

# display the figure
plt.show()

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.formula.api import ols

# Design matrix: one row per run, one column per algorithm factor
X = factor_table

# Expand the factors into a full second-order model
# (intercept, linear, interaction and squared terms)
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)

# Factor names are the columns that follow "Run Index" in the results header
input_features = column_names[1:num_alg_feats+1]

# Rewrite the generated term names so they are usable in a formula string
feature_names = []
for raw_name in poly.get_feature_names_out(input_features=input_features):
    clean_name = raw_name
    for old, new in ((' ', '_'), ('^', '_pow_'), ('*', '_times_')):
        clean_name = clean_name.replace(old, new)
    feature_names.append(clean_name)
df = pd.DataFrame(X_poly, columns=feature_names)

# response variable: the per-run algorithm score
df['AlgScore'] = score

# Build the formula from every column except the first (the constant term
# '1') and the last (the response 'AlgScore')
predictor_columns = df.columns[1:-1]
predictors = '+'.join(predictor_columns)
formula = f'AlgScore ~ {predictors}'

# Fit the ordinary least squares model
model = ols(formula, data=df)
results = model.fit()

# Regression summary
print("\n\n")
print(results.summary())

# Type-2 ANOVA table for the fitted model
anova_results = sm.stats.anova_lm(results, typ=2)
print(anova_results)