<a href="https://colab.research.google.com/github/achanhon/coursdeeplearningcolab/blob/master/rl_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup cell: installs Gymnasium, swig, and the Gymnasium "box2d"
# extra (presumably what the LunarLander environment used below needs, with
# swig assumed to be required to build the Box2D bindings - TODO confirm).
!pip install gymnasium
!pip install swig
!pip install "gymnasium[box2d]"

In [None]:
import torch
import gymnasium

# Discount factor: train() multiplies the bootstrapped next-state value by it.
GAMMA=0.99

class MemoryBuffer:
    """Fixed-capacity circular replay buffer of transitions.

    Each slot stores the state ``s``, a one-hot encoding of the action ``a``,
    the reward ``r``, the next state ``s_`` and a continuation mask ``f``
    (1.0 when the pushed done-flag was false, 0.0 when it was true).

    Args:
        capacity: Number of transitions kept before the oldest are overwritten.
        state_dim: Length of a state vector.
        n_actions: Number of discrete actions (width of the one-hot rows).

    Raises:
        ValueError: If ``capacity`` is not strictly positive.
    """

    def __init__(self, capacity=50000, state_dim=8, n_actions=4):
        if capacity <= 0:
            raise ValueError("capacity must be strictly positive")

        self.i = 0  # next slot to write
        self.full = False  # True once every slot has been written at least once

        self.s = torch.zeros(capacity, state_dim)
        self.a = torch.zeros(capacity, n_actions)
        self.r = torch.zeros(capacity)
        self.s_ = torch.zeros(capacity, state_dim)
        self.f = torch.zeros(capacity)

    def push(self, s, a, r, s_, f):
        """Store one transition, overwriting the oldest one when full.

        Args:
            s: State tensor before the action.
            a: Integer index of the action taken.
            r: Scalar reward.
            s_: State tensor after the action.
            f: Truthy when the next state's value must not be bootstrapped.
        """
        slot = self.i
        self.s[slot] = s
        # After wrap-around the slot still holds the previous action's 1;
        # clear the row first so it stays a valid one-hot vector.
        self.a[slot].zero_()
        self.a[slot, a] = 1
        self.r[slot] = r
        self.s_[slot] = s_
        self.f[slot] = 0.0 if f else 1.0

        self.i += 1
        if self.i >= self.r.shape[0]:
            self.full = True
            self.i = 0

    def getBatch(self, B=64):
        """Sample ``B`` stored transitions uniformly with replacement.

        Returns:
            Tuple ``(s, a, r, s_, f)`` of tensors whose first dimension is ``B``.

        Raises:
            ValueError: If nothing has been pushed yet.
        """
        stored = self.r.shape[0] if self.full else self.i
        if stored == 0:
            # Without this guard every index would be 0 and the batch would be
            # made of never-written, all-zero rows.
            raise ValueError("cannot sample from an empty MemoryBuffer")

        idx = torch.randint(stored, (B,))
        return (self.s[idx], self.a[idx], self.r[idx], self.s_[idx], self.f[idx])

def trial(env, agent, T, memory, max_steps=1000):
    """Play one episode, record every transition, and return the total reward.

    Args:
        env: Environment exposing ``reset()`` -> ``(obs, info)`` and
            ``step(a)`` -> ``(obs, reward, terminated, truncated, info)``.
        agent: Object exposing ``sample(state, T)`` returning an action index.
        T: Value forwarded unchanged to ``agent.sample``.
        memory: Object exposing ``push(s, a, r, s_, f)``.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        Sum of the rewards collected. It is returned even when the step
        budget runs out before the environment reports the end of the episode.
    """
    totalR = 0
    s, info = env.reset()
    s = torch.Tensor(s)
    for _ in range(max_steps):
        a = agent.sample(s, T)
        s_, r, terminated, truncated, info = env.step(a)
        s_ = torch.Tensor(s_)

        # Only a genuine termination is stored as "do not bootstrap"; a
        # truncated episode was merely cut short, so its next state still
        # has value.
        memory.push(s, a, r, s_, terminated)
        totalR += r
        if terminated or truncated:
            break
        s = s_
    return totalR

def train(agent, T, memory, optimizer=None):
    """Run 50 gradient steps of TD learning on batches drawn from ``memory``.

    For each batch the loss is the mean squared difference between the
    Q-value of the action actually taken and the target
    ``R + GAMMA * F * agent.V(S_, T)``.

    Args:
        agent: Module exposing ``Q(states)``, ``V(states, T)`` and
            ``parameters()``.
        T: Value forwarded unchanged to ``agent.V``.
        memory: Object whose ``getBatch()`` returns ``(S, A, R, S_, F)``,
            with ``A`` one-hot over actions and ``F`` a 0/1 mask that
            disables bootstrapping where it is 0.
        optimizer: Optional optimizer to reuse across calls. When omitted a
            new Adam optimizer is created on every call, so its moment
            estimates restart each time.

    Returns:
        Mean of the 50 batch losses, as a Python float.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(
            agent.parameters(), lr=0.0001, weight_decay=0.001
        )

    losses = torch.zeros(50)
    for step in range(50):
        S, A, R, S_, F = memory.getBatch()

        # A is one-hot, so this picks the Q-value of the action taken.
        QA = (agent.Q(S) * A).sum(1)

        # The target is treated as a constant: gradients must only flow
        # through the prediction QA, not through the bootstrapped value.
        with torch.no_grad():
            target = R + GAMMA * F * agent.V(S_, T)

        loss = ((target - QA) ** 2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses[step] = loss.detach()
    return float(losses.mean())


In [None]:
def leakyRelu(x):
    """Leaky ReLU with negative slope 0.2.

    Returns ``x`` where ``x >= 0`` and ``0.2 * x`` elsewhere. Taking the
    element-wise maximum of the two candidates selects the right branch.
    """
    return torch.maximum(x, x * 0.2)

class Block(torch.nn.Module):
    """Residual block over 24-feature rows that leaves the first 8 untouched.

    Three linear layers (24 -> 8 -> 24 -> 16), each followed by the file's
    ``leakyRelu`` helper, produce a 16-feature update. The update is
    left-padded with 8 zeros and added to the input, so only the last 16
    features of each row change.
    """

    def __init__(self):
        super(Block, self).__init__()

        self.f1 = torch.nn.Linear(24, 8)
        self.f2 = torch.nn.Linear(8, 24)
        self.f3 = torch.nn.Linear(24, 16)

    def forward(self, x):
        """Return ``x`` plus the zero-padded update; ``x`` is (batch, 24)."""
        f = leakyRelu(self.f1(x))
        f = leakyRelu(self.f2(f))
        f = leakyRelu(self.f3(f))

        # new_zeros inherits device and dtype from f, so the block keeps
        # working when the module and its input live on a non-default device.
        pad = f.new_zeros(x.shape[0], 8)
        f = torch.cat([pad, f], dim=1)
        return x + f


class LunarAgent(torch.nn.Module):
    """Q-network: 8 state features in, one Q-value per each of 4 actions out.

    The state is extended with 16 zero features to form a 24-feature row,
    passed through five ``Block`` modules, and projected to 4 outputs by a
    final linear layer.
    """

    def __init__(self):
        super(LunarAgent, self).__init__()

        self.b1 = Block()
        self.b2 = Block()
        self.b3 = Block()
        self.b4 = Block()
        self.b5 = Block()

        self.A = torch.nn.Linear(24, 4)

    def forward(self, x):
        """Return Q-values of shape (batch, 4) for states ``x`` of shape (batch, 8)."""
        # new_zeros inherits device and dtype from x, so the network keeps
        # working when the module and its input live on a non-default device.
        code = x.new_zeros(x.shape[0], 16)
        x = torch.cat([x, code], dim=1)

        x = self.b1(x)
        x = self.b2(x)
        x = self.b3(x)
        x = self.b4(x)
        x = self.b5(x)

        return self.A(x)

    def Q(self, x):
        """Alias of ``forward``: Q-values for a batch of states."""
        return self.forward(x)

    def pi(self, Q, T):
        """Softmax policy over actions.

        ``T`` multiplies the Q-values before the softmax, so a larger ``T``
        gives a more peaked (greedier) distribution.
        """
        return torch.nn.functional.softmax(T * Q, dim=1)

    def V(self, x, T):
        """Expected Q-value of each state under the policy ``pi(Q, T)``."""
        Q = self.Q(x)
        pi = self.pi(Q, T)
        return (Q * pi).sum(1)

    def sample(self, x, T):
        """Draw one action index for a single state ``x`` from ``pi``.

        Runs without gradient tracking and returns a plain Python int.
        """
        with torch.no_grad():
            pi = self.pi(self.Q(x.view(1, -1)), T)
            return int(torch.multinomial(pi, num_samples=1))

In [None]:
# Training driver: fill the replay memory, then alternate episodes and
# training while T is doubled at the start of each of the 10 rounds.
env = gymnasium.make("LunarLander-v3")

memory = MemoryBuffer()
agent = LunarAgent()

try:
    # Warm-up: 10 episodes so the memory is not empty when training starts.
    T = 0.25
    for _ in range(10):
        trial(env,agent,T,memory)

    for _ in range(10):
        T = 2*T
        # 50 rounds of "play one episode, then train"; prints the episode
        # return and the mean training loss.
        for _ in range(50):
            v = trial(env,agent,T,memory)
            l = train(agent,T,memory)
            print("\t",v,l)
        # Average return over 10 further episodes at the current T.
        tot = 0
        for _ in range(10):
            tot+=trial(env,agent,T,memory)
        print(T,tot/10)
finally:
    # Release the environment's resources even if a run is interrupted.
    env.close()
