<a href="https://colab.research.google.com/github/caffein1371/colab/blob/master/20201220_dqn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

In [60]:
! pip install JSAnimation



In [74]:
xvfb-run -a jupyter notebook

SyntaxError: ignored

In [61]:
! pip install torch-summary



In [62]:
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

def display_frames_as_gif(frames):
  plt.figure(figsize=(frames[0].shape[1]/72.0,frames[0].shape[0]/72.0),dpi=72)
  patch = plt.imshow(frames[0])
  plt.axis('off')

  def animate(i):
    patch.set_data(frames[i])

  anim = animation.FuncAnimation(plt.gcf(),animate,frames=len(frames),interval=50)

  anim.save('movie_catpole_DQN.mp4')
  display(display_animation(anim,default_mode='loop'))

In [63]:
from collections import namedtuple
Tr = namedtuple('tr',('name_a','value_b'))
Tr_object = Tr('名前Aです',100)

print (Tr_object)
print (Tr_object.value_b)

tr(name_a='名前Aです', value_b=100)
100


In [64]:
from collections import namedtuple

Transition = namedtuple('Transition',('state','action','next_state','reward'))

In [65]:
ENV = 'CartPole-v0'
GANMA = 0.99
MAX_STEPS = 200
NUM_EPISODES = 500

In [66]:
#経験を保存するメモリクラスを定義する
class ReplayMemory:

  def __init__(self,CAPACITY):
    self.capacity = CAPACITY
    self.memory = []
    self.index = 0

  def push(self,state,action,state_next, reward):
    #transition = (state,action,state_next,reward)をメモリに保存する

    if len(self.memory)< self.capacity:
      self.memory.append(None)

      self.memory[self.index] = Transition(state,action,state_next,reward)

      self.index = (self.index+1)%self.capacity #保存するindexを１つずらす

  def sample(self,batch_size):
    #batch_size分だけ，ランダムに保存内容を取り出す
    return random.sample(self.memory,batch_size)

  def __len__(self):
    return len(self.memory)

In [67]:
#エージェントがもつ脳となるクラスです，DQNを実行します
#Q関数をディープラーニングのネットワークをクラスとして定義

import random
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchsummary import summary

BATCH_SIZE = 32
CAPACITY = 10000

class Brain:
  def __init__(self,num_states,num_actions):
    #DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.num_actions = num_actions #CartPoleの行動(右に左に押す)の２つ取得

    self.memory = ReplayMemory(CAPACITY)

    #NNを構築
    self.model = nn.Sequential()
    self.model.add_module('fc1',nn.Linear(num_states,32))
    self.model.add_module('relu1',nn.ReLU())
    self.model.add_module('fc2',nn.Linear(32,32))
    self.model.add_module('relu2',nn.ReLU())
    self.model.add_module('fc3',nn.Linear(32,num_actions))

    #model = model.to(DEVICE)
    print (self.model)

    #最適化手法の設定
    self.optimizer = optim.Adam(self.model.parameters(),lr = 0.0001)

  def replay(self):
    #メモリサイズの確認
    if len(self.memory)<BATCH_SIZE:
      return

    #ミニバッチの作成
    transitions = self.memory.sample(BATCH_SIZE)

    #各変数をミニバッチに対応する形に変形
    batch = Transition(*zip(*transitions))

    #各変数の要素をミニバッチに対応する形に変形する
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)

    reward_batch = torch.cat(batch.reward)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                     if s is not None])
  
    #教師信号となるQ(s_tma_t)値を求める
    #ネットワークを推論モードに切り替える
    self.model.eval()

    state_action_values = self.model(state_batch).gather(1,action_batch)

    non_final_mask = torch.ByteTensor(
        tuple(map(lambda s:s is not None,batch.next_state)))
    #まずは全部0にしておく
    next_state_values = torch.zeros(BATCH_SIZE)

  #次の状態があるindexの最大Q値を求める．出力にサクセスし，max(1)で列方向の最大値の「値,index」を求めます
  #そしてそのQ値(index =0)を出力します
  #detachでその値を取り出します
    next_state_values[non_final_mask] = self.model(
        non_final_next_states).max(1)[0].detach()
  #教師となる値をQ学習の式から求める
    expected_state_action_values = reward_batch + GANMA * next_state_values
  #結合パラメータの更新
  #ネットワークを訓練モードに切り替える
    self.model.train()

  #損失関数を計算する(smooth_l1_lossはHuberloss)
  # expected_state_action_valuesはsizeが[minbatch]になっているので，unsqueezeで[minibatch x 1]へ
    loss = F.smooth_l1_loss(state_action_values,expected_state_action_values.unsqueeze(1))

    #結合パラメータを更新する
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

  def decide_action(self,state,episode):
  #ε-greedy法で徐々に最適行動のみを採用する
    epsilon = 0.5 *(1/(episode +1))

    if epsilon <= np.random.uniform(0,1):
      self.model.eval()
      with torch.no_grad():
        action = self.model(state).max(1)[1].view(1,1)
    else:
      action = torch.LongTensor(
          [[random.randrange(self.num_actions)]]) #0.1の行動をランダムに返す
      #actionは[torch.LongTensor of size 1x1]の形になります
    return action

In [68]:
# CartPoleで動くエージェントクラスです．棒付き台車そのものになります
class Agent:
  def __init__(self,num_states,num_actions):
    self.brain = Brain(num_states,num_actions)

  def update_q_function(self):
    #Q関数を更新する
    self.brain.replay()

  def get_action(self,state,episode):
    #行動を決定する
    action = self.brain.decide_action(state,episode)
    return action

  def memorize(self,state,action,state_next,reward):
    self.brain.memory.push(state,action,state_next,reward)

In [71]:
#CartPoleを実行する環境のクラスです
class Enviroment:

  def __init__(self):
    self.env = gym.make(ENV)#実行する課題を設定
    self.num_states = self.env.observation_space.shape[0]
    #課題の状態と行動の数を設定
    self.num_actions = self.env.action_space.n #右左の２択の２を取得
    self.agent = Agent(self.num_states,self.num_actions)

  def run(self):
    episode_10_list = np.zeros(10)

    complete_episodes = 0
    episode_final = False
    frames = []

    #試行回数分繰り返す
    for episode in range(NUM_EPISODES):
      observation = self.env.reset() #環境の初期化

      state = observation #環境をそのまま状態sとして使用
      state = torch.from_numpy(state).type(
          torch.FloatTensor)
      #FloatTensor of size 4をsize 1 x4に変換
      state = torch.unsqueeze(state,0)

      for step in range(MAX_STEPS):#1エピソードのループ
        if episode_final is True:
          frames.append(self.env.render(mode = 'rgb_array'))

        action = self.agent.get_action(state,episode)

        #行動a_tの実行により，s_{t+1}とdoneフラグを求める
        #actionから.item()を指定して，中身を取り出す
        observation_next, _,done, _ = self.env.step(action.item()) #rewardとinfoは使わないので_にする

        #報酬を与える，
        if done:
          state_next = None
          #直近10episodeのたてたstep数リストに追加
          episode_10_list = np.hstack((episode_10_list[1:],step+1)) 
          
          if step< 195:
            reward = torch.FloatTensor([-1.0])#途中で失敗したら罰則として報酬-1を与える
            complete_episodes = 0
          else:
            reward = torch.FloatTensor([1.0]) #立ったまま終了時は報酬1を与える
            complete_episodes = complete_episodes + 1 #連続記録を更新

        else:
          reward = torch.FloatTensor([0.0])
          state_next = observation_next
          state_next = torch.from_numpy(state_next).type(torch.FloatTensor)
          state_next = torch.unsqueeze(state_next,0)

        #メモリに経験を追加
        self.agent.memorize(state,action,state_next,reward)

        #Experimence ReplayでQ関数を更新する
        self.agent.update_q_function()

        #観測の更新
        state = state_next

        if done:
          print ('%d Episode: Finished after %d steps: 10試行の平均step数 = %.1f' %(episode,step+1,episode_10_list.mean()))
          break

      if episode_final is True:
        display_frames_as_gif(frames)
        break
      #10連続で200step立ち続けたら成功
      if complete_episodes >=10:
        print ('10回連続成功')
        episode_final = True

In [78]:
#mainクラス
cartpole_env = Enviroment()
cartpole_env.run()

Sequential(
  (fc1): Linear(in_features=4, out_features=32, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=32, out_features=2, bias=True)
)
0 Episode: Finished after 9 steps: 10試行の平均step数 = 0.9
1 Episode: Finished after 15 steps: 10試行の平均step数 = 2.4
2 Episode: Finished after 10 steps: 10試行の平均step数 = 3.4
3 Episode: Finished after 10 steps: 10試行の平均step数 = 4.4
4 Episode: Finished after 9 steps: 10試行の平均step数 = 5.3
5 Episode: Finished after 10 steps: 10試行の平均step数 = 6.3
6 Episode: Finished after 8 steps: 10試行の平均step数 = 7.1
7 Episode: Finished after 9 steps: 10試行の平均step数 = 8.0
8 Episode: Finished after 10 steps: 10試行の平均step数 = 9.0
9 Episode: Finished after 9 steps: 10試行の平均step数 = 9.9
10 Episode: Finished after 9 steps: 10試行の平均step数 = 9.9
11 Episode: Finished after 8 steps: 10試行の平均step数 = 9.2
12 Episode: Finished after 9 steps: 10試行の平均step数 = 9.1
13 Episode: Finished after 8 steps: 10試行の平均step数 = 8.9
14 Epis



15 Episode: Finished after 10 steps: 10試行の平均step数 = 8.9
16 Episode: Finished after 9 steps: 10試行の平均step数 = 9.0
17 Episode: Finished after 10 steps: 10試行の平均step数 = 9.1
18 Episode: Finished after 10 steps: 10試行の平均step数 = 9.1
19 Episode: Finished after 11 steps: 10試行の平均step数 = 9.3
20 Episode: Finished after 13 steps: 10試行の平均step数 = 9.7
21 Episode: Finished after 17 steps: 10試行の平均step数 = 10.6
22 Episode: Finished after 17 steps: 10試行の平均step数 = 11.4
23 Episode: Finished after 55 steps: 10試行の平均step数 = 16.1
24 Episode: Finished after 182 steps: 10試行の平均step数 = 33.4
25 Episode: Finished after 80 steps: 10試行の平均step数 = 40.4
26 Episode: Finished after 53 steps: 10試行の平均step数 = 44.8
27 Episode: Finished after 74 steps: 10試行の平均step数 = 51.2
28 Episode: Finished after 80 steps: 10試行の平均step数 = 58.2
29 Episode: Finished after 40 steps: 10試行の平均step数 = 61.1
30 Episode: Finished after 30 steps: 10試行の平均step数 = 62.8
31 Episode: Finished after 31 steps: 10試行の平均step数 = 64.2
32 Episode: Finished after 32 steps: 

NameError: ignored

In [75]:
!apt-get install python-opengl -y

!apt install xvfb -y

!pip install pyvirtualdisplay

!pip install piglet

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  libgle3
The following NEW packages will be installed:
  python-opengl
0 upgraded, 1 newly installed, 0 to remove and 14 not upgraded.
Need to get 496 kB of archives.
After this operation, 5,416 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 python-opengl all 3.1.0+dfsg-1 [496 kB]
Fetched 496 kB in 1s (487 kB/s)
Selecting previously unselected package python-opengl.
(Reading database ... 144865 files and directories currently installed.)
Preparing to unpack .../python-opengl_3.1.0+dfsg-1_all.deb ...
Unpacking python-opengl (3.1.0+dfsg-1) ...
Setting up python-opengl (3.1.0+dfsg-1) ...
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  xvfb
0 upgraded, 1 newly installed, 0 to remove and 14 not upgraded.
Need to get 784 kB of 

In [77]:
from pyvirtualdisplay import Display
Display().start()

<pyvirtualdisplay.display.Display at 0x7fe57cdf7240>

In [81]:
! pip install -c conda-forge pyglet
# pyglet==1.2.4?

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'conda-forge'[0m
