Ask for help solving a problem of increasing memory. This part of the code has only been slightly modified, yet memory keeps increasing throughout the training process.
#18 · Closed
HongYegg opened this issue on Mar 5, 2021 · 2 comments
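For reference, growth like this can be quantified by logging the resident memory of the Python process between episodes (a sketch using psutil, which is not part of the original code):

import os
import psutil

process = psutil.Process(os.getpid())

def log_rss(tag):
    # Print the resident set size (RSS) of the current process, in MB
    print(f"{tag}: {process.memory_info().rss / 1e6:.1f} MB")

The code in question: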
# The state input is a camera image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
import gc
from torch.distributions import Categorical
from torch import from_numpy, no_grad, save, load, tensor, clamp
from torch import float as torch_float
from torch import long as torch_long
from torch import min as torch_min
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
import numpy as np
from torch import manual_seed
from collections import namedtuple
import torch
import tracemalloc
class PPOAgent:
    """
    PPOAgent implements the PPO RL algorithm (https://arxiv.org/abs/1707.06347).
    It works with a set of discrete actions.
    It uses the Actor and Critic neural network classes defined below.
    """

    def __init__(self, numberOfActorOutputs, clip_param=0.2, max_grad_norm=0.5, ppo_update_iters=5,
                 batch_size=8, gamma=0.99, use_cuda=False, actor_lr=0.001, critic_lr=0.003, seed=None):
        super().__init__()
        if seed is not None:
            manual_seed(seed)

        # Hyper-parameters
        self.clip_param = clip_param
        self.max_grad_norm = max_grad_norm
        self.ppo_update_iters = ppo_update_iters
        self.batch_size = batch_size
        self.gamma = gamma
        self.use_cuda = use_cuda

        # Models
        self.actor_net = Actor(numberOfActorOutputs)
        self.critic_net = Critic()
        if self.use_cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()

        # Create the optimizers
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), actor_lr)
        self.critic_net_optimizer = optim.Adam(self.critic_net.parameters(), critic_lr)

        # Training stats
        self.buffer = []
    def work(self, agentInput, type_="simple"):
        """
        type_ == "simple"
            Implementation of a simple forward pass.
        type_ == "selectAction"
            Forward pass that returns an action sampled from the probability
            distribution, along with its probability.
        type_ == "selectActionMax"
            Forward pass that returns the action with the maximum probability.
        """
        # agentInput = from_numpy(agentInput).float().unsqueeze(0)
        if self.use_cuda:
            agentInput = agentInput.cuda()
        with no_grad():
            action_prob = self.actor_net(agentInput)

        if type_ == "simple":
            output = [action_prob[0][i].data.tolist() for i in range(len(action_prob[0]))]
            return output
        elif type_ == "selectAction":
            c = Categorical(action_prob)
            action = c.sample()
            return action.item(), action_prob[:, action.item()].item()
        elif type_ == "selectActionMax":
            return np.argmax(action_prob).item(), 1.0
        else:
            raise Exception("Wrong type_ in agent.work()")
    def getValue(self, state):
        """
        Gets the value of the current state according to the critic model.

        :param state: agentInput
        :return: state's value
        """
        state = from_numpy(state)
        with no_grad():
            value = self.critic_net(state)
        return value.item()
    def save(self, path):
        """
        Save the actor and critic models to the path provided.

        :param path: path to save the models
        :return: None
        """
        save(self.actor_net.state_dict(), path + '_actor.pkl')
        save(self.critic_net.state_dict(), path + '_critic.pkl')
        print('Model saved successfully')
    def load(self, path):
        """
        Load actor and critic models from the path provided.

        :param path: path where the models are saved
        :return: None
        """
        actor_state_dict = load(path + '_actor.pkl')
        critic_state_dict = load(path + '_critic.pkl')
        self.actor_net.load_state_dict(actor_state_dict)
        self.critic_net.load_state_dict(critic_state_dict)
        print('Model loaded successfully')
    def storeTransition(self, transition):
        """
        Stores a transition in the buffer to be used later.

        :param transition: state, action, action_prob, reward, next_state
        :return: None
        """
        self.buffer.append(transition)
        print(len(self.buffer))
    def trainStep(self, batchSize=None):
        """
        Performs a training step, i.e. an update of the actor and critic models, based on the
        transitions gathered in the buffer. It then resets the buffer.
        If batchSize is provided, it is used instead of the default self.batch_size.

        :param batchSize: int
        :return: None
        """
        tracemalloc.start()
        snapshot1 = tracemalloc.take_snapshot()
        if batchSize is None:
            if len(self.buffer) < self.batch_size:
                return
            batchSize = self.batch_size

        # print(self.buffer[0].state.size())
        # mm = [t.state for t in self.buffer]
        # print(type(mm))
        # state = tensor(mm[0], dtype=torch_float)
        # Note: building a tensor from a list of numpy arrays copies every stored state on each call
        state = tensor([t.state.numpy() for t in self.buffer], dtype=torch_float)
        state = state.squeeze()
        print(state.size())
        action = tensor([t.action for t in self.buffer], dtype=torch_long).view(-1, 1)
        reward = [t.reward for t in self.buffer]
        old_action_log_prob = tensor([t.a_log_prob for t in self.buffer], dtype=torch_float).view(-1, 1)

        # Unroll rewards
        R = 0
        Gt = []
        for r in reward[::-1]:
            R = r + self.gamma * R
            Gt.insert(0, R)
        Gt = tensor(Gt, dtype=torch_float)

        if self.use_cuda:
            state, action, old_action_log_prob = state.cuda(), action.cuda(), old_action_log_prob.cuda()
            Gt = Gt.cuda()

        for _ in range(self.ppo_update_iters):
            for index in BatchSampler(SubsetRandomSampler(range(len(self.buffer))), batchSize, False):
                # Calculate the advantage at each step
                Gt_index = Gt[index].view(-1, 1)
                V = self.critic_net(state[index])
                delta = Gt_index - V
                advantage = delta.detach()

                # Get the current probabilities under the new policy
                action_prob = self.actor_net(state[index]).gather(1, action[index])  # new policy

                # PPO clipped surrogate objective (despite its name, a_log_prob stores raw
                # probabilities, so the ratio is taken directly rather than via exp of log-probs)
                ratio = (action_prob / old_action_log_prob[index])
                surr1 = ratio * advantage
                surr2 = clamp(ratio, 1 - self.clip_param, 1 + self.clip_param) * advantage

                # Update the actor network
                action_loss = -torch_min(surr1, surr2).mean()  # MAX->MIN descent
                self.actor_optimizer.zero_grad()
                action_loss.backward()
                nn.utils.clip_grad_norm_(self.actor_net.parameters(), self.max_grad_norm)
                self.actor_optimizer.step()

                # Update the critic network
                value_loss = F.mse_loss(Gt_index, V)
                self.critic_net_optimizer.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.critic_net.parameters(), self.max_grad_norm)
                self.critic_net_optimizer.step()

        del self.buffer[:]
        gc.collect()
        print('shit')
        snapshot2 = tracemalloc.take_snapshot()
        top_stats = snapshot2.compare_to(snapshot1, 'lineno')
        print("[ Top 10 differences ]")
        for stat in top_stats[:10]:
            print(stat)
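For context, a minimal sketch of how this agent is typically driven (hypothetical: the environment, image shape, episode lengths, and reward below are placeholders, and Transition is the namedtuple defined in the full listing further down):

# Hypothetical training loop; shapes and counts are assumptions
agent = PPOAgent(numberOfActorOutputs=4)
for episode in range(2000):
    state = torch.rand(1, 3, 224, 224)           # placeholder camera frame
    for step in range(200):
        action, action_prob = agent.work(state, type_="selectAction")
        next_state = torch.rand(1, 3, 224, 224)  # placeholder next frame
        reward = 0.0                             # placeholder reward from the environment
        agent.storeTransition(Transition(state, action, action_prob, reward, next_state))
        state = next_state
    agent.trainStep(batchSize=len(agent.buffer))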
Hello @HongYegg. Thank you for bringing this to our attention.
I can see a minor increase in the memory consumption of the Python process in the task manager. It seems to come from the PPOAgent train step, as you demonstrate in your code using tracemalloc.
The increase is on the order of 1-2 MB after 500-1000 episodes of training; the agent usually solves the problem after 1000-1500 episodes, and the Python process starts at 94.4 MB.
To be honest, it seems really minor and out of the scope of this tutorials repo. In my past use of this implementation, it never seemed to cause any memory issues.
There are many discussions online 1, 2, 3 about increasing memory consumption during training with PyTorch; as far as I can tell, it can be caused by many different things.
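For readers who hit larger leaks than this, one classic cause of steadily growing memory in PyTorch training loops (a general illustration, not necessarily what happens in this repo) is keeping references to tensors that are still attached to the autograd graph:

import torch

losses = []
for step in range(1000):
    x = torch.randn(8, 10, requires_grad=True)
    loss = (x ** 2).mean()
    losses.append(loss)           # leaks: every stored tensor keeps its whole graph alive
    # losses.append(loss.item())  # fix: store a plain Python float instead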
# The state input is a camera image
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
import gc
from torch.distributions import Categorical
from torch import from_numpy, no_grad, save, load, tensor, clamp
from torch import float as torch_float
from torch import long as torch_long
from torch import min as torch_min
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
import numpy as np
from torch import manual_seed
from collections import namedtuple
import torch
import tracemalloc
Transition = namedtuple('Transition', ['state', 'action', 'a_log_prob', 'reward', 'next_state'])
class PPOAgent:
    """
    PPOAgent implements the PPO RL algorithm (https://arxiv.org/abs/1707.06347).
    It works with a set of discrete actions.
    It uses the Actor and Critic neural network classes defined below.
    """
    # ... (methods as in the excerpt above) ...
class Actor(nn.Module):
    def __init__(self, numberOfInputs, numberOfOutputs):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(numberOfInputs, 10000)
        self.fc2 = nn.Linear(10000, 10000)
        self.action_head = nn.Linear(10000, numberOfOutputs)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        action_prob = F.softmax(self.action_head(x), dim=1)
        return action_prob

class Critic(nn.Module):
    def __init__(self, numberOfInputs):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(numberOfInputs, 10000)
        self.fc2 = nn.Linear(10000, 10000)
        self.state_value = nn.Linear(10000, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        value = self.state_value(x)
        return value
# The modified, ResNet18-based versions follow; in Python, these definitions override the classes above.
class Actor(nn.Module):
    def __init__(self, numberOfOutputs):
        super(Actor, self).__init__()
        self.resnet18 = models.resnet18(pretrained=True)
        self.fc1 = nn.Linear(1000, 32)
        self.fc2 = nn.Linear(32, 8)
        self.action_head = nn.Linear(8, numberOfOutputs)

class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.resnet18 = models.resnet18(pretrained=True)
        self.fc1 = nn.Linear(1000, 32)
        self.fc2 = nn.Linear(32, 8)
        self.state_value = nn.Linear(8, 1)
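As posted, the ResNet18-based classes end without forward methods. A minimal sketch of what they would presumably look like, mirroring the MLP versions above (the layer wiring and softmax dim here are assumptions):

    # Assumed forward pass for the ResNet18 Actor (inside the Actor class):
    def forward(self, x):
        x = self.resnet18(x)                          # (N, 3, H, W) -> (N, 1000)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.softmax(self.action_head(x), dim=1)

    # Assumed forward pass for the ResNet18 Critic (inside the Critic class):
    def forward(self, x):
        x = self.resnet18(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.state_value(x)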