<a href="https://colab.research.google.com/github/achanhon/coursdeeplearningcolab/blob/master/Untitled26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
class Game:
  """Small obstacle-avoidance game on a 3-row, 100-column road.

  Attributes:
    A: the five moves as (row delta, column delta) pairs, indexed by action id.
    p: (row, column) position of the player in absolute road coordinates.
    t: number of successful updates so far; also the left edge of the
      visible window.
    f: True once the game is over (crash, or t reached 90).
    road: 3x100 half tensor; a cell equal to 1 is an obstacle.
  """

  def __init__(self,other=None):
    """Start a fresh random game, or rebuild one from a (p, t, f, road) tuple."""
    self.A = [(0,1),(-1,0),(1,0),(0,0),(0,2)]
    if other is None:
      self.t = 0
      self.f = False
      self.p = (1,0)
      # each cell is an obstacle with probability 0.05 ...
      self.road = (torch.rand(3,100)<0.05).half()
      # ... except the first two columns, which are always free
      self.road[:,0:2] = 0
    else:
      self.p,self.t,self.f,self.road = other

  def copy(self):
    """Return a new Game with the same p, t and f.

    The road tensor is shared with the original, not cloned.
    """
    return Game((self.p,self.t,self.f,self.road))

  def getVisibleState(self):
    """Return ((row, column - t), road[:, t:t+7]).

    Raises:
      AssertionError: if the game is already final.
    """
    assert not self.f, "get state of a final state"
    return (self.p[0],self.p[1]-self.t), self.road[:,self.t:self.t+7]

  def getVisibleStateString(self):
    """Render the visible window as text: 'o' obstacle, 'x' player, ' ' free."""
    p,road = self.getVisibleState()
    cells = road.tolist()  # one conversion instead of 21 tensor comparisons
    lines = []
    for r in range(3):
      row = ["o" if cells[r][c]==1 else " " for c in range(7)]
      # the player overrides whatever is drawn in its cell
      if r==p[0] and 0<=p[1]<7:
        row[p[1]] = "x"
      lines.append("".join(row)+"\n")
    return "----------\n"+"".join(lines)+"----------"

  def possibleAction(self,dr,dc):
    """Tell whether moving by (dr, dc) keeps the player inside the allowed area.

    The target row must be in 0..2 and the target column in t+1..t+5.
    """
    row = self.p[0]+dr
    col = self.p[1]+dc
    return 0<=row<=2 and self.t+1<=col<=self.t+5

  def listPossibleActions(self):
    """Return the indices into self.A of the moves accepted by possibleAction."""
    return [i for i,(dr,dc) in enumerate(self.A) if self.possibleAction(dr,dc)]

  def update(self,a):
    """Play action index `a` and return the reward.

    Returns:
      -100 if the target cell is an obstacle, or if a two-column move passes
      over an obstacle in the next column of the current row (the game then
      becomes final and the position is left unchanged); 1 otherwise.

    Raises:
      AssertionError: if the game is already final or the move is not possible.
    """
    assert not self.f, "update a final state"
    dr,dc = self.A[a]
    assert self.possibleAction(dr,dc), "unacceptable action"
    p = self.p
    if self.road[p[0]+dr][p[1]+dc]==1:
      self.f = True
      return -100
    # a two-column move also crashes on an obstacle in the skipped cell
    if dc==2 and self.road[p[0]][p[1]+1]==1:
      self.f = True
      return -100
    self.p = (p[0]+dr,p[1]+dc)
    self.t = self.t+1
    if self.t==90:
      self.f = True
    return 1


In [2]:
import random

# Sanity check: play at most 10 uniformly random legal moves on a fresh game,
# printing the visible board before each move and (step, action) after it.
print("check")
game = Game()
for i in range(10):
  if game.f:
    # the game became final on the previous move: nothing left to play
    break
  print(game.getVisibleStateString())
  legal_moves = game.listPossibleActions()
  a = random.choice(legal_moves)
  game.update(a)
  print(i, a)

check
----------
       
x      
       
----------
0 0
----------
      o
x      
       
----------
1 0
----------
     o 
x      
       
----------
2 0
----------
    o  
x      
       
----------
3 4
----------
   o   
 x     
       
----------
4 1
----------
x o    
       
       
----------
5 4


In [3]:
class RL(torch.nn.Module):
  """Convolutional network giving one value per action for a visible state.

  The input is a 2x3x7 tensor (see toTensor); the output has 5 entries, one
  per action index. Tensor-building helpers work on CPU; Q, Qa and policy
  move their inputs to whatever device the parameters currently live on.
  """

  def __init__(self):
    super(RL,self).__init__()

    # l2 and l3 take the previous features concatenated with the 2 input planes
    self.l1 = torch.nn.Conv2d(2,8,kernel_size=3,padding=1)
    self.l2 = torch.nn.Conv2d(10,32,kernel_size=3,padding=1)
    # a (3,7) kernel without padding collapses the 3x7 grid to a single cell
    self.l3 = torch.nn.Conv2d(34,64,kernel_size=(3,7))
    self.f = torch.nn.Linear(64,5)

  def _device(self):
    """Return the device holding the parameters of this module."""
    return next(self.parameters()).device

  def forward(self,x):
    """Map a (batch, 2, 3, 7) tensor to a (batch, 5) tensor of action values."""
    z = torch.nn.functional.leaky_relu(self.l1(x))
    z = torch.cat([z,x],dim=1)
    z = torch.nn.functional.leaky_relu(self.l2(z))
    z = torch.cat([z,x],dim=1)
    z = torch.nn.functional.leaky_relu(self.l3(z))
    z = z.view(z.shape[0],64)
    return self.f(z)

  def toTensor(self,x):
    """Encode a (position, road window) pair as a 2x3x7 CPU tensor.

    Plane 0 is the road window, plane 1 is one-hot at the player position.
    """
    z = torch.zeros(2,3,7)
    p,road = x
    z[0] = road
    z[1][p[0]][p[1]] = 1
    return z

  def toTensorS(self,X):
    """Encode a list of states; None entries stay all-zero.

    Returns:
      (Z, F): Z is (len(X), 2, 3, 7); F is 1 for real states, 0 for None.
    """
    Z = torch.zeros(len(X),2,3,7)
    F = torch.ones(len(X))
    for i,x in enumerate(X):
      if x is not None:
        Z[i] = self.toTensor(x)
      else:
        F[i] = 0
    return Z,F

  def Q(self,X):
    """Return the maximum action value of each state, 0 for None entries."""
    Z,F = self.toTensorS(X)
    device = self._device()
    q,_ = self.forward(Z.to(device)).max(1)
    return q*F.to(device)

  def Qa(self,X,a):
    """Return, for each state X[i], the value of action a[i]."""
    Z,_ = self.toTensorS(X)
    device = self._device()
    q = self.forward(Z.to(device))
    idx = torch.tensor(list(a),dtype=torch.long,device=device)
    # pick column a[i] of row i
    return q.gather(1,idx.unsqueeze(1)).squeeze(1)

  def policy(self,x,allowed):
    """Sample an action from a softmax over the network outputs.

    Outputs of actions outside `allowed` are lowered by 50 before the
    softmax, which makes them very unlikely but not impossible; if one is
    drawn anyway, a uniformly random allowed action is returned instead.
    """
    # acting needs no gradient
    with torch.no_grad():
      z = self.toTensor(x).unsqueeze(0).to(self._device())
      p = self.forward(z)[0].cpu()
    assert p.shape[0]==5
    f = torch.ones(5)
    for i in allowed:
      f[i] = 0
    p = torch.nn.functional.softmax(p - 50*f,0)
    a = torch.multinomial(p, 1).item()
    if a in allowed:
      return a
    print("???")
    # fall back on the actions legal for THIS state, not on any global game
    return random.choice(list(allowed))


In [4]:
def explore(agent, buffer, nbruns):
    """Play `nbruns` fresh games with the agent's policy and record transitions.

    The agent is moved to CPU first. Each step appends
    (state, action, reward, next_state) to `buffer`, with next_state set to
    None when the step ended the game.

    Returns:
        The total reward per game, averaged over the `nbruns` games.

    Raises:
        ZeroDivisionError: if `nbruns` is 0.
    """
    agent.cpu()
    cumulated = 0
    for _run in range(nbruns):
        game = Game()
        episode_return = 0
        for _step in range(1000):
            state = game.getVisibleState()
            action = agent.policy(state, game.listPossibleActions())
            reward = game.update(action)
            episode_return += reward
            # a final game has no observable successor state
            successor = None if game.f else game.getVisibleState()
            buffer.append((state, action, reward, successor))
            if game.f:
                break
        cumulated += episode_return
    return cumulated / nbruns


def training(agent, buffer, nbsteps,verbose):
    """Run up to `nbsteps` Q-learning updates on batches of 64 transitions.

    `buffer` is shuffled in place and consumed from its end; training stops
    early once fewer than 64 transitions remain. The agent is moved to the
    GPU when one is available, otherwise it stays on CPU. A new Adam
    optimizer is created on every call.

    Args:
        agent: module exposing Q(states) and Qa(states, actions).
        buffer: list of (state, action, reward, next_state) tuples, where
            next_state is None for a terminal transition.
        nbsteps: maximum number of gradient steps.
        verbose: when true, print the loss every 20 steps.

    Returns:
        A list holding every transition of the input buffer: the consumed
        ones first, followed by the ones left untouched.
    """
    lr = 0.0001
    gamma = 0.9
    batchsize = 64
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent.to(device)
    optimizer = torch.optim.Adam(agent.parameters(), lr=lr)

    buffercopy = []
    random.shuffle(buffer)
    for step in range(nbsteps):
        if len(buffer) < batchsize:
            break

        batch = [buffer.pop() for _ in range(batchsize)]
        buffercopy.extend(batch)
        X = [x for x, _, _, _ in batch]
        A = [a for _, a, _, _ in batch]
        R = torch.tensor([float(r) for _, _, r, _ in batch], device=device)
        XX = [xx for _, _, _, xx in batch]

        Qa = agent.Qa(X, A)
        # the bootstrap target is treated as a constant: no gradient through it
        with torch.no_grad():
            Q = agent.Q(XX)
        assert Q.shape == Qa.shape
        assert Q.shape == R.shape

        # temporal-difference error, penalised by min(d^2, |d|)
        tmp = gamma * Q + R - Qa
        tmp = torch.min(tmp * tmp, tmp.abs())
        loss = tmp.sum()

        if step % 20 == 19 and verbose:
            print("\t", step, loss)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.parameters(), 10)
        optimizer.step()

    return buffercopy + buffer

In [5]:
# Main loop: alternate exploration (10 games) and training (up to 100 steps),
# reporting the exploration score on every third iteration.
buffer = []
agent = RL()
score = explore(agent, buffer, 100)  # warmup: 100 games to seed the buffer
for i in range(1000):
  score = explore(agent, buffer, 10)
  if i % 3 == 0:
    print(i, "score", score, len(buffer))
  buffer = training(agent, buffer, 100, i % 3 == 0)
  # cap the replay buffer at 100000 transitions, dropping a random subset
  if len(buffer) > 100000:
    random.shuffle(buffer)
    buffer = buffer[:100000]

0 score -85.7 1729
	 19 tensor(658.6105, device='cuda:0', grad_fn=<SumBackward0>)
3 score -79.7 2236
	 19 tensor(261.3740, device='cuda:0', grad_fn=<SumBackward0>)
6 score -81.8 2726
	 19 tensor(656.5745, device='cuda:0', grad_fn=<SumBackward0>)
	 39 tensor(457.0306, device='cuda:0', grad_fn=<SumBackward0>)
9 score -89.5 3282
	 19 tensor(255.1986, device='cuda:0', grad_fn=<SumBackward0>)
	 39 tensor(456.6625, device='cuda:0', grad_fn=<SumBackward0>)
12 score -75.4 3838
	 19 tensor(348.7864, device='cuda:0', grad_fn=<SumBackward0>)
	 39 tensor(254.1586, device='cuda:0', grad_fn=<SumBackward0>)
15 score -92.4 4177
	 19 tensor(553.7122, device='cuda:0', grad_fn=<SumBackward0>)
	 39 tensor(455.9207, device='cuda:0', grad_fn=<SumBackward0>)
	 59 tensor(346.2603, device='cuda:0', grad_fn=<SumBackward0>)
18 score -85.0 4670
	 19 tensor(643.2094, device='cuda:0', grad_fn=<SumBackward0>)
	 39 tensor(250.0684, device='cuda:0', grad_fn=<SumBackward0>)
	 59 tensor(541.6766, device='cuda:0', grad_f

KeyboardInterrupt: 