In [13]:
import gym
import torch
import random
import numpy as np
from collections import deque

In [14]:
# 定义DQN神经网络模型
class DQNet(torch.nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(DQNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, hidden_size)
        self.fc3 = torch.nn.Linear(hidden_size, action_size)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [15]:
# 定义DQN代理类
class DQNAgent():
    def __init__(self, state_size, action_size, hidden_size, lr, gamma, epsilon, epsilon_min, epsilon_decay, memory_size, batch_size):
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = hidden_size
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = DQNet(state_size, action_size, hidden_size).to(self.device)
        self.target_model = DQNet(state_size, action_size, hidden_size).to(self.device)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = torch.nn.MSELoss()
        
    # 存储经验到回放缓存
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    # 从回放缓存中获取批次经验数据
    def get_batch(self):
        batch = random.sample(self.memory, self.batch_size)
        state = torch.FloatTensor([exp[0] for exp in batch]).to(self.device)
        action = torch.LongTensor([exp[1] for exp in batch]).to(self.device)
        reward = torch.FloatTensor([exp[2] for exp in batch]).to(self.device)
        next_state = torch.FloatTensor([exp[3] for exp in batch]).to(self.device)
        done = torch.FloatTensor([exp[4] for exp in batch]).to(self.device)
        return state, action, reward, next_state, done
        
    # 选择动作
    def choose_action(self, state):
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.FloatTensor(state).to(self.device)
            q_values = self.model(state)
            return torch.argmax(q_values).item()
        
    # 更新模型
    def learn(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.get_batch()
        q_values = self.model(state)
        next_q_values = self.target_model(next_state).detach()
        q_target = reward + (1 - done) * self.gamma * torch.max(next_q_values, dim=1)[0]
        q_target = q_target.unsqueeze(1)
        q_values = q_values.gather(1, action.unsqueeze(1))
        loss = self.loss_fn(q_values, q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # 软更新目标模型
        for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
            target_param.data.copy_(self.gamma * param.data + (1 - self.gamma) * target_param.data)
            
        # 衰减探索率
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    # 训练代理
    def train(self, env, episodes):
        scores = deque(maxlen=100)
        for episode in range(episodes):
            state = env.reset()
            score = 0
            done = False
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = env.step(action)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                score += reward
                self.learn()
            scores.append(score)
            mean_score = np.mean(scores)
            print(f'Episode {episode} - Score: {score} - Mean Score: {mean_score:.2f}')
            if mean_score >= 195:
                print(f'CartPole-v1 solved in {episode} episodes!')
                break

In [16]:
# 实例化DQN代理
state_size = 4
action_size = 2
hidden_size = 64
lr = 0.001
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
memory_size = 10000
batch_size = 64
agent = DQNAgent(state_size, action_size, hidden_size, lr, gamma, epsilon, epsilon_min, epsilon_decay, memory_size, batch_size)

# 训练DQN代理
env = gym.make('CartPole-v1')
episodes = 1000
agent.train(env, episodes)

# 关闭环境
env.close()

Episode 0 - Score: 33.0 - Mean Score: 33.00
Episode 1 - Score: 11.0 - Mean Score: 22.00
Episode 2 - Score: 13.0 - Mean Score: 19.00


  state = torch.FloatTensor([exp[0] for exp in batch]).to(self.device)


Episode 3 - Score: 51.0 - Mean Score: 27.00
Episode 4 - Score: 11.0 - Mean Score: 23.80
Episode 5 - Score: 20.0 - Mean Score: 23.17
Episode 6 - Score: 35.0 - Mean Score: 24.86
Episode 7 - Score: 10.0 - Mean Score: 23.00
Episode 8 - Score: 10.0 - Mean Score: 21.56
Episode 9 - Score: 13.0 - Mean Score: 20.70
Episode 10 - Score: 9.0 - Mean Score: 19.64
Episode 11 - Score: 15.0 - Mean Score: 19.25
Episode 12 - Score: 11.0 - Mean Score: 18.62
Episode 13 - Score: 13.0 - Mean Score: 18.21
Episode 14 - Score: 10.0 - Mean Score: 17.67
Episode 15 - Score: 13.0 - Mean Score: 17.38
Episode 16 - Score: 11.0 - Mean Score: 17.00
Episode 17 - Score: 10.0 - Mean Score: 16.61
Episode 18 - Score: 11.0 - Mean Score: 16.32
Episode 19 - Score: 8.0 - Mean Score: 15.90
Episode 20 - Score: 8.0 - Mean Score: 15.52
Episode 21 - Score: 9.0 - Mean Score: 15.23
Episode 22 - Score: 9.0 - Mean Score: 14.96
Episode 23 - Score: 10.0 - Mean Score: 14.75
Episode 24 - Score: 10.0 - Mean Score: 14.56
Episode 25 - Score: 19