## DQNを用いた二自由度マニピュレータの動作計画

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque

from env import Arm2DEnv
from dqn import DQN
from buffer import ReplayBuffer

In [None]:
# 行動を選択(ε-greedy法を使用)
def select_action(model, state, action_dim: int, epsilon: float = 0.2) -> int:
    """Pick an action index with the epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index in
    ``[0, action_dim)`` is returned (exploration). Otherwise the state is
    fed through ``model`` and the index of the largest Q-value is returned
    (exploitation).

    Args:
        model: Callable mapping a ``(1, state_dim)`` float tensor to
            per-action Q-values. When it exposes ``parameters()`` (e.g. an
            ``nn.Module``), the input tensor is created on the device of its
            first parameter.
        state: Single observation (array-like or tensor) convertible to a
            float32 tensor.
        action_dim: Number of discrete actions.
        epsilon: Exploration probability.

    Returns:
        The selected action index as a Python ``int``.
    """
    # Explore: take a uniformly random action.
    if np.random.rand() < epsilon:
        return int(np.random.randint(action_dim))

    # Exploit: build the input on the same device as the model so the forward
    # pass does not fail with a device mismatch when the model is on a GPU.
    params_fn = getattr(model, "parameters", None)
    first_param = next(iter(params_fn()), None) if callable(params_fn) else None
    device = first_param.device if first_param is not None else None

    # Add a leading batch dimension: (state_dim,) -> (1, state_dim).
    state_batch = torch.as_tensor(
        state, dtype=torch.float32, device=device
    ).unsqueeze(0)

    with torch.no_grad():  # action selection needs no gradients
        q_values = model(state_batch)  # Q-value per action, (1, action_dim)

    # argmax() works on the flattened tensor; with a batch of one this is the
    # index of the best action.
    return int(q_values.argmax().item())

In [None]:
# 学習ループ
def train():
    # 環境とハイパパラメータのセット
    env = Arm2DEnv()
    input_dim = len(env.get_state())
    action_dim = len(env.action_space)

    # DQNとtarget networkを設定する
    model = DQN(input_dim, action_dim)
    target_network = DQN(input_dim, action_dim)
    target_network.load_state_dict(model.state_dict()) # モデルと同じパラメータにさせる

    # 最適化アルゴリズムのセット
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # ハイパパラメータ
    batch_size = 64
    gamma = 0.99
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_decay = 0.995
    target_update_freq = 10

    episodes = 1_000
    max_steps = 50

    # バッファのセット
    buffer = ReplayBuffer(batch_size)

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        for step in range(max_steps):
            # ε-greedyで行動選択
            action = select_action(model, state, action_dim, epsilon)
