In [312]:
import numpy as np
import gymnasium as gym
import random
import matplotlib.pyplot as plt
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [313]:
# Build the CliffWalking environment and record the sizes of its discrete
# observation and action spaces. Later cells read `env`, `state_size` and
# `action_size` as notebook-level globals.
env = gym.make('CliffWalking-v0')
state_size = env.observation_space.n
action_size = env.action_space.n

In [314]:
# Display the number of discrete states reported by the environment.
state_size

np.int64(48)

In [315]:
# Display the number of discrete actions reported by the environment.
action_size

np.int64(4)

In [316]:
class DQN(nn.Module):
    """Small fully connected Q-network.

    Maps an input of ``state_size`` features to ``action_size`` outputs
    through two ReLU-activated hidden layers of equal width.

    Args:
        state_size: Number of input features.
        action_size: Number of outputs (one value per action).
        hidden_size: Width of both hidden layers. Defaults to 24, the value
            that was previously hard-coded.
    """

    def __init__(self,state_size,action_size,hidden_size=24):
        super().__init__()
        self.fc1 = nn.Linear(state_size,hidden_size)
        self.fc2 = nn.Linear(hidden_size,hidden_size)
        self.fc3 = nn.Linear(hidden_size,action_size)

    def forward(self,x):
        """Return the un-activated output of the final layer for input ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [317]:
# Throwaway network instance used only for the forward-pass check below;
# the Agent class builds its own DQN.
dqn = DQN(state_size,action_size)

In [318]:
# Dummy input: a batch of one all-ones vector with 48 features.
x = torch.ones((1,48))

In [319]:
# Display the dummy input tensor.
x

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [320]:
# Forward-pass check: the untrained network returns one value per action.
dqn(x)

tensor([[0.0214, 0.0153, 0.1408, 0.0546]], grad_fn=<AddmmBackward0>)

In [321]:
# Scratch check: draw a single uniform sample, as used for the epsilon test.
np.random.random()

0.5669245214256547

In [322]:
def convert(state, num_states=None):
    """One-hot encode a discrete state index as a float32 tensor.

    Args:
        state: Integer index of the state to encode.
        num_states: Length of the encoding. When omitted, the notebook-level
            ``state_size`` is used, which matches the previous behaviour.

    Returns:
        A 1-D ``torch.FloatTensor`` of length ``num_states`` holding 1 at
        position ``state`` and 0 everywhere else.
    """
    if num_states is None:
        # Fall back to the global so existing single-argument calls still work.
        num_states = state_size
    ans = np.zeros(num_states)
    ans[state] = 1
    return torch.FloatTensor(ans)

In [323]:
class Agent:
    """Epsilon-greedy DQN agent with a replay buffer.

    Relies on the notebook-level ``state_size``, ``action_size``, ``DQN`` and
    ``convert`` definitions.
    """

    def __init__(self):
        # Exploration rate. `update` multiplies it by `eplison_decay` on each
        # call for as long as it is still above `eplison_min`.
        self.eplison = 1.0
        self.eplison_min = 0.01
        self.eplison_decay = 0.995
        # Discount applied to the bootstrapped next-state value.
        self.gamma = 0.95

        self.model = DQN(state_size,action_size)
        self.crition = nn.MSELoss()
        self.optimizer = optim.AdamW(self.model.parameters(),lr=0.001)

        # Replay buffer of (state, action, reward, next_state, done) tuples;
        # the deque drops the oldest entry once 2000 are stored.
        self.memory = deque(maxlen=2000)

    def _greedy_action(self,state):
        """Return the index of the highest-valued action for ``state``."""
        with torch.no_grad():
            action_values = self.model(state)
        return torch.argmax(action_values).item()

    def take_action(self,state,explore=True):
        """Choose an action for ``state``.

        Args:
            state: Tensor passed directly to the model.
            explore: When True, a uniformly random action is returned with
                probability ``self.eplison``; otherwise the choice is greedy.

        Returns:
            The chosen action index as a Python int.
        """
        if explore and np.random.random() < self.eplison:
            return np.random.randint(action_size)
        return self._greedy_action(state)

    def remember(self,state,action,reward,next_state,done):
        """Append one transition to the replay buffer."""
        self.memory.append((state,action,reward,next_state,done))

    def update(self,batch_size=32):
        """Run one gradient step on a random minibatch from the replay buffer.

        The loss is the mean squared difference between Q(state, action) and
        ``reward + gamma * max_a Q(next_state, a)``; the bootstrap term is
        dropped for transitions flagged ``done``. Targets are computed with
        the same network, without gradient tracking.

        Args:
            batch_size: Number of transitions to sample. If the buffer holds
                fewer than this, the call returns without doing anything.
        """
        if len(self.memory) < batch_size:
            # random.sample raises ValueError when asked for more items than
            # the buffer contains.
            return
        batch = random.sample(self.memory,batch_size)
        states,actions,rewards,next_states,dones = zip(*batch)

        # Stack the one-hot encodings so the network is evaluated once per
        # batch rather than once per transition.
        state_batch = torch.stack([convert(s) for s in states])
        next_state_batch = torch.stack([convert(s) for s in next_states])
        action_batch = torch.tensor(actions,dtype=torch.int64).unsqueeze(1)
        reward_batch = torch.tensor(rewards,dtype=torch.float32)
        not_done = 1.0 - torch.tensor(dones,dtype=torch.float32)

        # Value predicted for the action that was actually taken.
        q_taken = self.model(state_batch).gather(1,action_batch).squeeze(1)
        with torch.no_grad():
            next_best = self.model(next_state_batch).max(dim=1).values
        target = reward_batch + self.gamma * next_best * not_done

        loss = self.crition(q_taken,target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.eplison > self.eplison_min:
            self.eplison *= self.eplison_decay

In [336]:
class QtableAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Relies on the notebook-level ``state_size`` and ``action_size``.
    """

    def __init__(self):
        self.gamma = 0.95
        self.lr = 0.01
        self.eplison = 0.1
        # Stored next to epsilon but not read by any method of this class.
        self.eplison_min = 0.01
        self.eplison_decay = 0.995
        # One row per state and one column per action, initialised to zero.
        self.table = np.zeros((state_size,action_size))

    def update(self,state,action,reward,next_state,done):
        """Move ``table[state, action]`` a step of size ``lr`` toward the TD target."""
        # Multiplying the row by (1 - done) zeroes the bootstrap term for
        # transitions flagged as done.
        bootstrap = np.max(self.table[next_state] * (1 - done))
        td_target = reward + self.gamma * bootstrap
        td_error = self.table[state, action] - td_target
        self.table[state, action] -= self.lr * td_error

    def take_action(self,state):
        """Return a random action with probability ``eplison``, else the greedy one."""
        should_explore = np.random.random() < self.eplison
        if should_explore:
            return np.random.randint(action_size)
        return self.best_action(state)

    def best_action(self,state):
        """Return the index of the largest entry in the table row for ``state``."""
        return np.argmax(self.table[state])
        

In [324]:
# Fresh DQN agent (untrained network, empty replay buffer, epsilon = 1.0).
agent = Agent()

In [325]:
# Train the DQN agent: act epsilon-greedily, store every transition, and take
# one replay-based gradient step per environment step once the buffer is
# large enough.
n_episodes = 500
max_steps = 500   # cap on environment steps per episode
warmup = 32       # start updating only once the buffer holds more than this

for e in range(n_episodes):

    state,_ = env.reset()
    count = 0
    rw = 0
    # `_step` rather than `x`, so the notebook-level tensor `x` is not
    # overwritten by the loop index.
    for _step in range(max_steps):
        state_tensor = convert(state)
        action = agent.take_action(state_tensor)
        next_state,reward,done,truncated,_ = env.step(action)
        # Only `done` (termination) is stored, so a truncated episode still
        # bootstraps from its last next_state.
        agent.remember(state,action,reward,next_state,done)
        state = next_state
        count += 1
        rw += reward
        if done or truncated:
            break

        if len(agent.memory) > warmup:
            agent.update()

    if e % 10 == 0:  # corrected condition: log every 10th episode
        print(f"Episode: {e}/{n_episodes}, Reward: {rw}, Epsilon: {agent.eplison:.4f}")


Episode: 0/500, Reward: -891, Epsilon: 0.4373
Episode: 10/500, Reward: -500, Epsilon: 0.0100
Episode: 20/500, Reward: -17, Epsilon: 0.0100
Episode: 30/500, Reward: -17, Epsilon: 0.0100
Episode: 40/500, Reward: -15, Epsilon: 0.0100
Episode: 50/500, Reward: -15, Epsilon: 0.0100
Episode: 60/500, Reward: -51, Epsilon: 0.0100
Episode: 70/500, Reward: -15, Epsilon: 0.0100
Episode: 80/500, Reward: -15, Epsilon: 0.0100
Episode: 90/500, Reward: -15, Epsilon: 0.0100
Episode: 100/500, Reward: -15, Epsilon: 0.0100
Episode: 110/500, Reward: -34, Epsilon: 0.0100
Episode: 120/500, Reward: -15, Epsilon: 0.0100
Episode: 130/500, Reward: -500, Epsilon: 0.0100
Episode: 140/500, Reward: -15, Epsilon: 0.0100
Episode: 150/500, Reward: -500, Epsilon: 0.0100
Episode: 160/500, Reward: -15, Epsilon: 0.0100
Episode: 170/500, Reward: -15, Epsilon: 0.0100
Episode: 180/500, Reward: -15, Epsilon: 0.0100
Episode: 190/500, Reward: -15, Epsilon: 0.0100
Episode: 200/500, Reward: -15, Epsilon: 0.0100
Episode: 210/500, Re

In [328]:
def test_agent(agent, episodes=10, render=True, max_steps=500):
    """Run the DQN agent greedily for a number of evaluation episodes.

    Args:
        agent: Object exposing ``take_action(state_tensor, explore=False)``.
        episodes: Number of evaluation episodes.
        render: When True the environment is created with
            ``render_mode='human'`` and ``env.render()`` is called each step.
        max_steps: Upper bound on steps per episode. A greedy policy that
            never reaches a terminal state would otherwise keep the loop
            running indefinitely.

    Prints the total reward of every episode; returns nothing.
    """
    env = gym.make('CliffWalking-v0', render_mode='human' if render else None)
    try:
        for e in range(episodes):
            state, _ = env.reset()
            total_reward = 0

            for _ in range(max_steps):
                if render:
                    env.render()
                # No exploration at test time.
                action = agent.take_action(convert(state), explore=False)
                state, reward, done, truncated, _ = env.step(action)
                total_reward += reward

                if done or truncated:
                    print(f"Test episode: {e+1}/{episodes}, score: {total_reward}")
                    break
            else:
                # Step budget used up without the episode ending.
                print(f"Test episode: {e+1}/{episodes} stopped after {max_steps} steps, score: {total_reward}")
    finally:
        # Release the environment (and its render window) even on interrupt.
        env.close()

In [329]:
# Evaluate the trained DQN agent with the default settings (10 rendered episodes).
test_agent(agent)

Test episode: 1/10, score: -15



KeyboardInterrupt



In [340]:
# Train a tabular Q-learning agent online: one table update per environment step.
# NOTE: this rebinds `agent`, replacing the DQN agent created earlier.
agent = QtableAgent()
n_episodes = 5000
max_steps = 500   # cap on environment steps per episode

for e in range(n_episodes):

    state,_ = env.reset()
    count = 0
    rw = 0
    # `_step` rather than `x`, so the notebook-level tensor `x` is not
    # overwritten by the loop index.
    for _step in range(max_steps):
        action = agent.take_action(state)
        next_state,reward,done,truncated,_ = env.step(action)
        agent.update(state,action,reward,next_state,done)
        state = next_state
        count += 1
        rw += reward
        if done or truncated:
            break
    if e % 10 == 0:  # corrected condition: log every 10th episode
        # Report against the real episode count (the message used to say /500).
        print(f"Episode: {e}/{n_episodes}, Reward: {rw}")

Episode: 0/500, Reward: -71
Episode: 10/500, Reward: -476
Episode: 20/500, Reward: -344
Episode: 30/500, Reward: -667
Episode: 40/500, Reward: -397
Episode: 50/500, Reward: -606
Episode: 60/500, Reward: -300
Episode: 70/500, Reward: -182
Episode: 80/500, Reward: -348
Episode: 90/500, Reward: -414
Episode: 100/500, Reward: -232
Episode: 110/500, Reward: -205
Episode: 120/500, Reward: -118
Episode: 130/500, Reward: -270
Episode: 140/500, Reward: -371
Episode: 150/500, Reward: -75
Episode: 160/500, Reward: -440
Episode: 170/500, Reward: -80
Episode: 180/500, Reward: -223
Episode: 190/500, Reward: -234
Episode: 200/500, Reward: -101
Episode: 210/500, Reward: -123
Episode: 220/500, Reward: -57
Episode: 230/500, Reward: -68
Episode: 240/500, Reward: -143
Episode: 250/500, Reward: -343
Episode: 260/500, Reward: -395
Episode: 270/500, Reward: -188
Episode: 280/500, Reward: -507
Episode: 290/500, Reward: -60
Episode: 300/500, Reward: -65
Episode: 310/500, Reward: -63
Episode: 320/500, Reward: -

Episode: 4000/500, Reward: -118
Episode: 4010/500, Reward: -13
Episode: 4020/500, Reward: -235
Episode: 4030/500, Reward: -13
Episode: 4040/500, Reward: -13
Episode: 4050/500, Reward: -17
Episode: 4060/500, Reward: -122
Episode: 4070/500, Reward: -21
Episode: 4080/500, Reward: -15
Episode: 4090/500, Reward: -123
Episode: 4100/500, Reward: -15
Episode: 4110/500, Reward: -13
Episode: 4120/500, Reward: -13
Episode: 4130/500, Reward: -28
Episode: 4140/500, Reward: -13
Episode: 4150/500, Reward: -13
Episode: 4160/500, Reward: -13
Episode: 4170/500, Reward: -13
Episode: 4180/500, Reward: -123
Episode: 4190/500, Reward: -15
Episode: 4200/500, Reward: -13
Episode: 4210/500, Reward: -15
Episode: 4220/500, Reward: -122
Episode: 4230/500, Reward: -13
Episode: 4240/500, Reward: -13
Episode: 4250/500, Reward: -21
Episode: 4260/500, Reward: -15
Episode: 4270/500, Reward: -13
Episode: 4280/500, Reward: -14
Episode: 4290/500, Reward: -15
Episode: 4300/500, Reward: -15
Episode: 4310/500, Reward: -13
Ep

In [341]:
def test_agent1(agent, episodes=10, render=True, max_steps=500):
    """Run the Q-table agent greedily for a number of evaluation episodes.

    Args:
        agent: Object exposing ``best_action(state)``.
        episodes: Number of evaluation episodes.
        render: When True the environment is created with
            ``render_mode='human'`` and ``env.render()`` is called each step.
        max_steps: Upper bound on steps per episode. A greedy policy that
            never reaches a terminal state would otherwise keep the loop
            running indefinitely.

    Prints the total reward of every episode; returns nothing.
    """
    env = gym.make('CliffWalking-v0', render_mode='human' if render else None)
    try:
        for e in range(episodes):
            state, _ = env.reset()
            total_reward = 0

            for _ in range(max_steps):
                if render:
                    env.render()
                # No exploration at test time.
                action = agent.best_action(state)
                state, reward, done, truncated, _ = env.step(action)
                total_reward += reward

                if done or truncated:
                    print(f"Test episode: {e+1}/{episodes}, score: {total_reward}")
                    break
            else:
                # Step budget used up without the episode ending.
                print(f"Test episode: {e+1}/{episodes} stopped after {max_steps} steps, score: {total_reward}")
    finally:
        # Release the environment (and its render window) even on interrupt.
        env.close()

In [342]:
# Evaluate the trained Q-table agent with the default settings (10 rendered episodes).
test_agent1(agent)

Test episode: 1/10, score: -13
Test episode: 2/10, score: -13
Test episode: 3/10, score: -13
Test episode: 4/10, score: -13
Test episode: 5/10, score: -13
Test episode: 6/10, score: -13
Test episode: 7/10, score: -13
Test episode: 8/10, score: -13
Test episode: 9/10, score: -13
Test episode: 10/10, score: -13
