Neuroevolution: Exercise 4
=========
###### Artur Ganzha 10019651
---------	
###### Raul Gorek 10061333
---------	

In [157]:
import gymnasium as gym
import numpy as np
import random as random
from collections import deque, namedtuple
import matplotlib as plt

In [158]:
# Create the CartPole-v1 environment and read the sizes the Q-network needs.
env = gym.make("CartPole-v1")
# Number of discrete actions; used below as the width of the network's output layer.
num_actions = env.action_space.n
# Length of the observation vector; used below as the width of the network's input layer.
obs_shape = env.observation_space.shape[0]


Nutzen Sie den Code Ihres neuronalen Netzes, um ein Q-Netzwerk zu implementieren, welches als Eingabe die aktuelle Wagenposition, die Beschleunigung des Wagens, den Winkel des Stabes sowie die Beschleunigung des Stabwinkels erhält, um daraus die Q-Values zu berechnen. Es wird empfohlen, sich dabei an der Architektur aus dem Notebook error_calculation.ipynb zu orientieren.


In [159]:
### NeuralNet
def derivative_bcel(prediction, ground_truth):
    """Return the derivative of binary cross-entropy w.r.t. the prediction.

    Args:
        prediction: Predicted probability (scalar or array).
        ground_truth: Target label(s); entries equal to 0 are treated as the
            negative class, every other value as the positive class.

    Returns:
        ``1 / (1 - prediction)`` where ``ground_truth == 0`` and
        ``-1 / prediction`` everywhere else.
    """
    is_negative = ground_truth == 0
    grad_for_negative = 1.0 / (1.0 - prediction)
    grad_for_positive = -1.0 / prediction
    # Both branches are evaluated in full; np.where only selects per element.
    return np.where(is_negative, grad_for_negative, grad_for_positive)

class Linear:
    """Fully connected layer computing ``x @ W + B``.

    Weights are drawn uniformly from [-1, 1); the bias starts at zero.
    The layer updates its own parameters during ``backward``.
    """

    def __init__(self, input_size, output_size):
        """Create a layer mapping ``input_size`` features to ``output_size``."""
        self.input_size = input_size
        self.output_size = output_size
        self.W = np.random.uniform(-1, 1, (self.input_size, self.output_size))
        self.B = np.zeros((1, self.output_size))

    def forward(self, x):
        """Apply the affine map and cache the input for ``backward``.

        Args:
            x: Input of shape ``(batch, input_size)`` or a single sample of
                shape ``(input_size,)``.

        Returns:
            Array of shape ``(batch, output_size)``; a single sample yields
            a batch of one.
        """
        # Promote a single 1-D sample to a one-row batch. Without this the
        # cached input is 1-D, and backward() can neither form the weight
        # gradient nor read the batch size from shape[0].
        self.fw = np.atleast_2d(x)
        return np.dot(self.fw, self.W) + self.B

    def backward(self, d, lr):
        """Take one gradient-descent step and return the input gradient.

        Args:
            d: Gradient of the loss w.r.t. this layer's output, shaped like
                the output of ``forward``.
            lr: Learning rate for the parameter update.

        Returns:
            Gradient of the loss w.r.t. the layer input, computed with the
            weights as they were before this update.
        """
        d = np.atleast_2d(d)
        batch_size = self.fw.shape[0]
        d_w = np.dot(self.fw.T, d)
        # Must be computed before W is modified below.
        d_e = np.dot(d, self.W.T)
        d_b = np.sum(d, axis=0, keepdims=True)
        # Gradients are averaged over the batch.
        self.W -= lr * d_w / batch_size
        self.B -= lr * d_b / batch_size
        return d_e


class ReLU:
    """Rectified linear unit activation layer (no trainable parameters)."""

    def __init__(self):
        """Nothing to set up; the input is cached on the first forward call."""

    def forward(self, x):
        """Zero out non-positive entries of ``x`` and remember ``x``."""
        self.fw = x
        positive_mask = x > 0
        return x * positive_mask

    def backward(self, d, lr):
        """Pass ``d`` through where the cached input was positive.

        ``lr`` is accepted for interface compatibility and is not used.
        """
        local_grad = np.where(self.fw > 0, 1.0, 0.0)
        return d * local_grad
    

class Sigmoid:
    """Logistic sigmoid activation layer (no trainable parameters)."""

    def __init__(self):
        """Nothing to set up; caches are filled by the first forward call."""

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and cache both input and output."""
        self.fw = x
        denominator = 1.0 + np.exp(-x)
        self.out = 1.0 / denominator
        return self.out

    def backward(self, d, lr):
        """Scale ``d`` by the sigmoid derivative ``out * (1 - out)``.

        ``lr`` is accepted for interface compatibility and is not used.
        """
        local_grad = self.out * (1.0 - self.out)
        return d * local_grad
    

class NeuralNetwork:
    """Sequential container that chains layers for forward and backward passes.

    Every layer must provide ``forward(x)`` and ``backward(d, lr)``.
    """

    def __init__(self, layers: list):
        """Store the layers in the order they are applied."""
        self.layers = layers

    def forward_pass(self, x):
        """Feed ``x`` through all layers in order and return the final output."""
        activation = x
        for stage in self.layers:
            activation = stage.forward(activation)
        return activation

    def backward_pass(self, deriv, lr):
        """Propagate ``deriv`` from the last layer back to the first.

        Each layer receives the gradient returned by the layer after it,
        together with the learning rate ``lr``. Nothing is returned.
        """
        gradient = deriv
        for stage in reversed(self.layers):
            gradient = stage.backward(gradient, lr)


In [160]:
# Q-network: maps an observation vector to one Q-value per action.
# Two hidden layers of 32 ReLU units; the output layer stays linear
# (no activation after it).
nn = NeuralNetwork([
    Linear(obs_shape,32), ReLU(),
    Linear(32,32), ReLU(),
    Linear(32, num_actions),
])

In [161]:
# Reset the environment and take one randomly sampled action to obtain a
# single example transition (state, action, next_state, reward, done).
state, info = env.reset()
action = env.action_space.sample()
next_state, reward, terminated, truncated, _ = env.step(action)
# The episode counts as finished if it either terminated or was truncated.
done = terminated or truncated

# Compute the Q-values for all possible actions
q_values = nn.forward_pass(state)
print("#### q_values")
print(q_values)
print("#### next state")
print(next_state)
print("#### action")
print(action)
print("#### reward")
print(reward)

#### q_values
[[-0.09657385  0.20543849]]
#### next state
[-0.01413328  0.17400022 -0.03787411 -0.3036097 ]
#### action
1
#### reward
1.0


In [162]:
# One environment step as stored in the replay buffer.
Transition = namedtuple(
    "Transition",
    ["state", "action", "next_state", "reward", "done"],
)


class ReplayMemory:
    """Fixed-capacity buffer of transitions; the oldest entries are dropped first."""

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Save a transition built from the positional ``Transition`` fields."""
        transition = Transition(*args)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [163]:
# Berechne Bellman-Gleichung
GAMMA = 0.95

# Rechte Seite des Temporal Difference Errors
q_target = reward + GAMMA * np.max(nn.forward_pass(next_state), axis=1) * (1 - done)

print(q_target)

[0.31712957]


Abschließend soll der Agent implementiert werden. Für das Training des neuronalen Netzes soll die Bellman-Gleichung (Q(s, a) = r + γQ(s′, π(s′))) verwendet werden. Verwenden Sie den Mean Squared Error Loss, um den Fehler zu bestimmen. Überlegen Sie sich eine Möglichkeit, wie Sie einen guten Tradeoff zwischen Exploration und Exploitation implementieren können. Stellen Sie den Trainingsverlauf in einer Grafik dar. An der y-Achse soll dabei der akkumulierte Reward am Ende einer Episode abzulesen sein.

In [164]:
# Element-wise squared difference between the predicted Q-values and the TD target.
# NOTE(review): q_target is broadcast against every action column of q_values
# and no mean is taken -- confirm whether only the Q-value of the action that
# was actually taken should be compared with the target.
MSE = (q_values - q_target) ** 2

In [165]:
# Training hyperparameters. Their use is not visible in this part of the
# notebook, so the descriptions below are presumed from the names --
# TODO confirm against the training loop.
LR = 0.001  # presumably the learning rate passed to backward_pass
BATCH_SIZE = 64  # presumably the number of transitions sampled per update
MEMORY_CAPACITY = 10000  # presumably the ReplayMemory capacity
EPSILON_DECAY = 0.995  # presumably the multiplicative decay applied to epsilon
MIN_EPSILON = 0.01  # presumably the lower bound for epsilon
GAMMA = 0.95  # discount factor; same value as assigned in the Bellman cell above
TARGET_UPDATE = 10  # presumably the interval between target-network syncs

In [166]:
def epsilon_greedy(q_values, epsilon):
    """Choose an action with an epsilon-greedy policy.

    Args:
        q_values: Array of Q-values whose last axis indexes the actions,
            e.g. the ``(1, n_actions)`` output of the Q-network.
        epsilon: Probability of exploring, i.e. of returning a uniformly
            random action instead of the greedy one.

    Returns:
        int: Index of the chosen action.
    """
    if random.random() < epsilon:
        # Take the action count from the Q-values themselves so the function
        # does not depend on a module-level global.
        n_actions = np.shape(q_values)[-1]
        return random.randrange(n_actions)
    # Cast so both branches return a plain Python int rather than np.int64.
    return int(np.argmax(q_values))

In [167]:
# Initialize Neural Network and Memory
nn = NeuralNetwork([
    Linear(obs_shape, 32), ReLU(),
    Linear(32, 32), ReLU(),
    Linear(32, num_actions),
])

target_nn = NeuralNetwork([
    Linear(obs_shape, 32), ReLU(),
    Linear(32, 32), ReLU(),
    Linear(32, num_actions),
])