## Generic Test

In [None]:
from LambdaSARSA import LambdaSARSA
import gymnasium as gym

# Smoke test: train a LambdaSARSA agent on LunarLander with a small budget.
env = gym.make('LunarLander-v2')
lam_sarsa = LambdaSARSA(env)
lam_sarsa.train(num_episodes=200, num_replications=5)

# Post-training calls, executed in the same order as before
# (get_results, then show_results, then display_best_policy).
for report_step in ("get_results", "show_results", "display_best_policy"):
    getattr(lam_sarsa, report_step)()

## WANDB Sweep

In [None]:
!rm -r __pycache__/
import wandb
from LSPI import LSPI

# Random-search sweep definition. W&B maximises the "Score" metric, so every
# trial must log a value under exactly that key.
sweep_config = {
    "method": "random",
    "name": "sweep",
    "metric": {"goal": "maximize", "name": "Score"},
    "parameters": {
        "alpha_a": {"distribution": "uniform", "min": 0, "max": 1},
        "alpha_b": {"distribution": "uniform", "min": 0, "max": 1},
        "eps_a": {"distribution": "uniform", "min": 0, "max": 1},
        "eps_b": {"distribution": "uniform", "min": 0, "max": 1},
        # `log_uniform` interprets min/max as natural-log exponents, i.e. it
        # samples in [exp(min), exp(max)]. The bounds here are literal values,
        # which is what the `*_values` variants expect.
        "eta": {"distribution": "log_uniform_values", "min": 1e-7, "max": 1e-5},
        # Quantised variant (step q=1) so the sampled batch size is a whole number.
        "n_batch_size": {
            "distribution": "q_log_uniform_values",
            "q": 1,
            "min": 2**8,
            "max": 2**10,
        },
    },
}

# Register the sweep and keep its id for the agent call further down.
# A direct run against the course project remains disabled:
#wandb.init(project="DSOR 646 - Final")
sweep_id = wandb.sweep(project="my-first-sweep", sweep=sweep_config)

import gymnasium as gym
# One environment object, read as a global by main() on every trial.
env = gym.make('MountainCar-v0')

def main():
    """Run one sweep trial: train LSPI with the sampled hyperparameters and log its score.

    Reads the module-level ``env`` and the hyperparameters W&B assigned to this
    run, trains for 10 replications of 300 episodes, and logs the seven values
    returned by ``model.get_results()`` plus the combined ``"Score"`` that the
    sweep maximises.
    """
    # The context manager closes the run even if training raises.
    with wandb.init() as run:
        config = run.config
        model = LSPI(
            env,
            alpha_a=config.alpha_a,
            alpha_b=config.alpha_b,
            eps_a=config.eps_a,
            eps_b=config.eps_b,
            eta=config.eta,
            # Sampled from a log-scale distribution, so it may arrive as a float.
            # NOTE(review): assumes LSPI expects a whole-number batch size - confirm.
            n_batch_size=int(round(config.n_batch_size)),
        )
        model.train(num_replications=10, num_episodes=300)
        (maxETDR, maxETDRhw, meanMaxTestEETDR, maxTestHW,
         meanAULC, hwAULC, elapsed) = model.get_results()

        # 60/40 blend of (meanMaxTestEETDR - maxTestHW) and (meanAULC - hwAULC).
        # The "HW"/"hw" terms are presumably confidence half-widths - verify in LSPI.
        alg_score = 0.6*(meanMaxTestEETDR-maxTestHW) + 0.4*(meanAULC-hwAULC)
        run.log({
            "maxETDR": maxETDR,
            "maxETDRhw": maxETDRhw,
            "meanMaxTestEETDR": meanMaxTestEETDR,
            "maxTestHW": maxTestHW,
            "meanAULC": meanAULC,
            "hwAULC": hwAULC,
            "time": elapsed,
            "Score": alg_score,
        })

# Launch a local agent that executes main() for five trials of the sweep.
wandb.agent(sweep_id, count=5, function=main)

## Additions Test

In [None]:
from MDP import MDP_Tiled, FourierBasis
from SemiGradSARSA import SemiGradSARSA

import numpy as np
from time import perf_counter
from tiles3 import tiles

!rm -r __pycache__/

class SemiGradSARSA_fourier(SemiGradSARSA):
    """SemiGradSARSA variant that passes states through a FourierBasis before tile coding."""

    name = "Semi-gradient n-step SARSA"

    def __init__(self, env, n=1, basis_order=2, **kwargs) -> None:
        """Initialise the parent agent, then attach a Fourier basis.

        Args:
            env: environment forwarded to ``SemiGradSARSA.__init__``.
            n: step count; stored as ``self.nm1 = n - 1`` and shown in the label.
            basis_order: order handed to ``FourierBasis``.
            **kwargs: forwarded unchanged to ``SemiGradSARSA.__init__``.
        """
        super().__init__(env, **kwargs)
        self.algorithm_name = f"Semi-gradient {n}-step SARSA with {basis_order} order Fourier Basis"
        self.nm1 = n - 1

        basis = FourierBasis(self.Ssize, basis_order)
        self.fourier_basis = basis
        self.max_size = len(basis.coefficients)
        # The scale factor goes through the same transform that phi() applies
        # to states, and the two are multiplied together there.
        self.scale_factor = basis.transform(self.scale_factor)

    # Marked in the original as an override of SemiGradSARSA.get_action.
    def get_action(self, state, policy, epsilon=0) -> int:
        """Epsilon-greedy choice: greedy when a uniform draw exceeds ``epsilon``, otherwise random.

        ``policy`` is unpacked as ``(w, iht)``. The state is passed on untransformed.
        """
        w, iht = policy
        greedy = np.random.rand() > epsilon
        if not greedy:
            return self.env.action_space.sample()
        return self.argmaxQbar(state, w, iht)

    # Marked in the original as an override of MDPBase.phi.
    def phi(self,s,a,iht):
        """Return ``tiles(...)`` for the Fourier-transformed, scaled state and action ``a``."""
        transformed = self.fourier_basis.transform(s)
        scaled = list(transformed * self.scale_factor)
        return tiles(iht, self.num_tiles, scaled, [a])


import gymnasium as gym
# Short training run of the Fourier-basis agent on LunarLander.
env = gym.make('LunarLander-v2')
sgsf = SemiGradSARSA_fourier(env=env)
sgsf.train(num_episodes=100, num_replications=5)

# Reporting calls are currently switched off:
#sgsf.get_results()
#sgsf.show_results()
#sgsf.display_best_policy()

In [None]:
from MDP import MDP_Tiled, FourierBasis
from SemiGradSARSA import SemiGradSARSA

import numpy as np
from time import perf_counter
from tiles3 import tiles

!rm -r __pycache__/

class SemiGradSARSA_boltzmann(SemiGradSARSA):
    """SemiGradSARSA variant that passes states through a FourierBasis before tile coding.

    NOTE(review): despite the class name, ``get_action`` below is still
    epsilon-greedy; no Boltzmann/softmax selection is implemented yet.
    """

    name = "Semi-gradient n-step SARSA"

    def __init__(self, env, n=1, basis_order=4, **kwargs) -> None:
        """Initialise the parent agent and make sure a Fourier basis is available.

        Args:
            env: environment forwarded to ``SemiGradSARSA.__init__``.
            n: step count; stored as ``self.nm1 = n - 1`` and shown in the label.
            basis_order: order handed to ``FourierBasis``.
            **kwargs: forwarded unchanged to ``SemiGradSARSA.__init__``.
        """
        super().__init__(env, **kwargs)
        self.algorithm_name = f"Semi-gradient {n}-step SARSA with {basis_order} order Fourier Basis"
        self.nm1 = n-1

        # phi() calls self.fourier_basis.transform(...), but this class never
        # created that attribute, so the first phi() call would raise
        # AttributeError. Build it as SemiGradSARSA_fourier does; the guard
        # leaves a basis already set up by the parent untouched.
        if not hasattr(self, "fourier_basis"):
            self.fourier_basis = FourierBasis(self.Ssize, basis_order)
            self.max_size = len(self.fourier_basis.coefficients)
            # Keep the scale factor in the same transformed space as the
            # states it is multiplied with in phi().
            self.scale_factor = self.fourier_basis.transform(self.scale_factor)

    #@override(SemiGradSARSA) # Needs Python >3.12
    def get_action(self, state, policy, epsilon=0) -> int:
        """Epsilon-greedy choice: greedy when a uniform draw exceeds ``epsilon``, otherwise random.

        ``policy`` is unpacked as ``(w, iht)``. The state is passed on untransformed.
        """
        w, iht = policy
        if np.random.rand() > epsilon:
            return self.argmaxQbar(state,w,iht)
        else:
            return self.env.action_space.sample()

    #@override(MDPBase) # Needs Python >3.12
    def phi(self,s,a,iht):
        """Return ``tiles(...)`` for the Fourier-transformed, scaled state and action ``a``."""
        s = self.fourier_basis.transform(s)
        return tiles(iht, self.num_tiles,list(s*self.scale_factor),[a])


import gymnasium as gym
# Short training run of the SemiGradSARSA_boltzmann agent on LunarLander.
env = gym.make('LunarLander-v2')
sgsb = SemiGradSARSA_boltzmann(env=env)
sgsb.train(num_episodes=100, num_replications=5)

# Reporting calls are currently switched off:
#sgsb.get_results()
#sgsb.show_results()
#sgsb.display_best_policy()