In [1]:
import gym
import numpy as np
import random
import torch
from itertools import count


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

device

device(type='cpu')

In [269]:
from collections import deque

class Memory():
    def __init__(self, capacity=10000, short_capacity=2000):
        self.capacity = capacity
        self.short_capacity = short_capacity
        self.reset()
        
    def push(self, old_state, action, reward, new_state, final):
        self.short_memory.append([old_state, action, reward, new_state, final])
        
    def commit(self, final=True): 
        self.short_memory[-1][-1] = final
        self.memory.extendleft(self.short_memory)
        self.short_memory.clear()
        
    def reset(self):
        self.memory = deque(maxlen=self.capacity)
        self.short_memory = deque(maxlen=self.short_capacity)
        

In [393]:
d = deque(maxlen=1000)

In [396]:
d

deque([4, 3, 2, 1])

In [395]:
d.extendleft([1,2,3,4])

In [270]:
# class Memory():
#     def __init__(self, capacity=10000):
#         self.capacity = capacity
#         self.memory = []
#         self.short_memory = []
        
#     def push(self, old_state, action, reward, new_state, final):
#         self.short_memory.append([old_state, action, reward, new_state, final])
        
#     def commit(self, final=True): 
#         if len(self.memory) + len(self.short_memory) < self.capacity:
#             self.memory.extend(self.short_memory)
#         else:
#             for memo in self.short_memory:
#                 self.memory.insert(random.randint(0, len(self.memory) - 1), memo)
                
#         self.memory = self.memory[:self.capacity]
#         self.short_memory = []

In [271]:
class Qnet(torch.nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(4, 32),
#             torch.nn.LeakyReLU(inplace=True),
#             torch.nn.Linear(32, 32),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(32, 2),
#             torch.nn.Sigmoid(),
        #     torch.nn.Softmax(dim=1),
        )
    
    def forward(self, x):
        return self.model(x)

In [272]:
class Agent:
    def __init__(self, observation_space=None, action_space=None, device=None):
        self.device = device
        
        self.reset()
    
    def reset(self):
        self._mode = 'train'
        self._episode_count = 0
        self._eps = 0.99999
        self._eps_min = 0.01
        self._gamma = 0.9995
        
        self.policy_net = Qnet().to(self.device)
        self.target_net = Qnet().to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=0.05)
        
        self.memory = Memory(capacity=10000000)  
    
    def push_observation(self, old_state, action, reward, state, final):
        self.memory.push(old_state, action, reward, state, final)
    
    def episode_ended(self):
        self.memory.commit()
        self._episode_count += 1
        
    def get_action(self, state):
        def get_model_action(state):
            with torch.no_grad():
                input = torch.FloatTensor(state).to(self.device)
                model_output = self.policy_net(input.unsqueeze(0))
                model_output = model_output.cpu().detach().numpy()
                return np.argmax(model_output[0])
            
        if self._mode == 'eval':
            return get_model_action(state)
        if self._mode == 'train':
            self._eps *= self._gamma
            self._eps = max(self._eps, self._eps_min)
            if random.random() > self._eps:
                return get_model_action(state)
            else:
                return random.randint(0, 1)
        raise "unknown mode"
        
    def train_memory(self):
        memory = self.memory.memory
        try:
            memory = random.sample(memory, 512)
        except ValueError:
            return

        s0 = [x[0] for x in memory]
        a = [x[1] for x in memory]
        r = [x[2] for x in memory]
        s1 = [x[3] for x in memory]
        done = [1. if x[4] else 0. for x in memory]


        s0 = torch.FloatTensor(s0).to(self.device)
        s1 = torch.FloatTensor(s1).to(self.device)
        a = torch.LongTensor(a).to(self.device)
        r = torch.FloatTensor(r).to(self.device)
        done = torch.FloatTensor(done).to(self.device)

        q_values = self.policy_net(s0)
        next_q_values = self.target_net(s1)
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + 0.999 * next_q_value * (1.0 - done)
        # Notice that detach the expected_q_value
        loss = (expected_q_value.detach() - q_value).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1)
        self.optim.step()
        
        def soft_update(target_model, source_model, tau=0.001):
            for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
                target_param.data.copy_(tau*source_param.data + (1.0-tau)*target_param.data)
                
        soft_update(self.target_net, self.policy_net)
        
        
#         if self._episode_count % 10 == 0:
#             self.target_net.load_state_dict(self.policy_net.state_dict())
  
    def train(self):
        self._mode = 'train'
    def eval(self):
        self._mode = 'eval'

In [950]:
class ActorCriticNet(torch.nn.Module):
    def __init__(self):
        super(ActorCriticNet, self).__init__()
        self.model = torch.nn.Sequential(
#             torch.nn.BatchNorm1d(4),
            torch.nn.Linear(4, 64),
#             torch.nn.LeakyReLU(inplace=True),
#             torch.nn.Linear(32, 32),
            torch.nn.LeakyReLU(inplace=True),
#             torch.nn.BatchNorm1d(64),
            torch.nn.Linear(64, 64),
            torch.nn.LeakyReLU(inplace=True),
#             torch.nn.BatchNorm1d(64),
#             torch.nn.Sigmoid(),
        #     torch.nn.Softmax(dim=1),
        )
        
        self.critic = torch.nn.Sequential(
#             torch.nn.Linear(64, 64),
#             torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(64, 64),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(64, 1)
        )
        
        self.actor = torch.nn.Sequential(
#             torch.nn.Linear(64, 64),
#             torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(64, 64),
            torch.nn.LeakyReLU(inplace=True),
            torch.nn.Linear(64, 2),
#             torch.nn.LeakyReLU(inplace=True),
#             torch.nn.LogSoftmax(dim=1),
            torch.nn.Softmax(dim=1),
#             torch.nn.functional.softmax(),
        )
    
    def forward(self, x):
        res = self.model(x)
        return self.critic(res), self.actor(res)
    
# def calc_actual_state_values(states, rewards, dones):
#     R = []
#     rewards.reverse()

#     # If we happen to end the set on a terminal state, set next return to zero
#     if dones[-1] == True: next_return = 0
        
#     # If not terminal state, bootstrap v(s) using our critic
#     # TODO: don't need to estimate again, just take from last value of v(s) estimates
#     else: 
#         s = torch.from_numpy(states[-1]).float().unsqueeze(0)
#         next_return = model.get_state_value(Variable(s)).data[0][0] 
    
#     # Backup from last state to calculate "true" returns for each state in the set
#     R.append(next_return)
#     dones.reverse()
#     for r in range(1, len(rewards)):
#         if not dones[r]: this_return = rewards[r] + next_return * GAMMA
#         else: this_return = 0
#         R.append(this_return)
#         next_return = this_return

#     R.reverse()
#     state_values_true = Variable(torch.FloatTensor(R)).unsqueeze(1)
    
#     return state_values_true

class ActorCritic:
    def __init__(self, observation_space=None, action_space=None, device=None):
        self.device = device
        
        self.reset()
    
    def reset(self):
        self._mode = 'train'
        self._episode_count = 0
        self._eps = 0.99999
        self._eps_min = 0.01
        self._gamma = 0.9995
        
        self.policy_net = ActorCriticNet().to(self.device)
        self.target_net = ActorCriticNet().to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=0.003)
        
        self.memory = Memory(capacity=20000)  
    
    def push_observation(self, old_state, action, reward, state, final):
        self.memory.push(old_state, action, reward, state, final)
    
    def episode_ended(self):
        self.memory.commit()
        self._episode_count += 1
        
    def get_action(self, state, deterministic=False):
        self.target_net.eval()
        self.policy_net.eval()
        with torch.no_grad():
            input = torch.FloatTensor(state).to(self.device).unsqueeze(0)
#             sv, action_probs = self.target_net(input)
            sv, action_probs = self.policy_net(input)
#             action_probs = torch.nn.functional.softmax(action_probs_v)
#             print(input, sv, action_probs_v, action_probs)
        
        if deterministic == False:
            return action_probs.multinomial(1).data[0][0].item()
        else:
#             assert False
            return action_probs.max(1)[1].data[0].item()
        
        
#         if self._mode == 'eval':
#             return get_model_action(state)
#         if self._mode == 'train':
#             self._eps *= self._gamma
#             self._eps = max(self._eps, self._eps_min)
#             if random.random() > self._eps:
#                 return get_model_action(state)
#             else:
#                 return random.randint(0, 1)
#         raise "unknown mode"
        
    def train_memory(self):
        self.policy_net.train()
#         self.policy_net.load_state_dict(self.target_net.state_dict())
        
        memory = self.memory.memory
#         try:
#             memory = random.sample(memory, 128)
#         except ValueError:
#             return

        s0 = [x[0] for x in memory]
        a = [x[1] for x in memory]
        r = [x[2] for x in memory]
        s1 = [x[3] for x in memory]
        done = [1. if x[4] else 0. for x in memory]


        s0 = torch.FloatTensor(s0).to(self.device)
        s1 = torch.FloatTensor(s1).to(self.device)
        a = torch.LongTensor(a).to(self.device).view(-1, 1)
        r = torch.FloatTensor(r).to(self.device)
        done = torch.FloatTensor(done).to(self.device)
        
        s0_values, action_probs = self.policy_net(s0)
        action_log_probs = action_probs.log()
        chosen_action_log_probs = action_log_probs.gather(1, a)
        
        s1_values = self.policy_net(s1)[0] * 0.9999 + r# + s0_values
#         expected_s1_values = s1_values * 0.99 * (1-done) #+ r
        
#         r_l = list(r.numpy())
#         R = [0]
#         for rr in r_l[1:]:
#             R.append(R[-1]*0.95 + rr)
#         print(R, r_l)
#         assert False
#         true_state_values = torch.FloatTensor(R).to(self.device).unsqueeze(0)
        
        
        advantages = s1_values - s0_values
        
        entropy = (action_probs * action_log_probs).sum(1).mean()
        action_gain = (chosen_action_log_probs * advantages).mean()
        value_loss = advantages.pow(2).mean()
        total_loss = value_loss*0.8 - action_gain - 0.0001*entropy
        
        self.optim.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1)
        self.optim.step()

#         q_values = self.policy_net(s0)
#         next_q_values = self.target_net(s1)
#         next_q_value = next_q_values.max(1)[0]

#         q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
#         expected_q_value = r + 0.999 * next_q_value * (1.0 - done)
#         # Notice that detach the expected_q_value
#         loss = (expected_q_value.detach() - q_value).pow(2).mean()

#         self.optim.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1)
#         self.optim.step()
        
#         def soft_update(target_model, source_model, tau=0.05):
#             for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
#                 target_param.data.copy_(tau*source_param.data + (1.0-tau)*target_param.data)
                
#         soft_update(self.target_net, self.policy_net)
        
        
#         if self._episode_count % 10 == 0:
#             self.target_net.load_state_dict(self.policy_net.state_dict())
  
    def train(self):
        self._mode = 'train'
    def eval(self):
        self._mode = 'eval'

In [951]:
env = gym.make('CartPole-v0')
env._max_episode_steps = 2000

In [952]:
env.observation_space, env.action_space

(Box(4,), Discrete(2))

In [953]:
agent = ActorCritic(observation_space=4, action_space=2, device=device)
agent.train()

In [954]:
len_mem = []
try:
    for episode in range(100000):
        state = env.reset()

        for i in count():
#             env.render()

#             action = get_model_action(policy_net, state, device)
            action = agent.get_action(state)
            old_state = state
            state, reward, done, info = env.step(action)

            if done:
                env.close()
                len_mem.append(i)
                
                if abs(i-2000) < 10:
                    reward = 1
                    final = False
                else:
                    reward = -1
                    final = True
                
                agent.push_observation(old_state, action, reward, state, final)
                agent.episode_ended()
                break
            else:
                agent.push_observation(old_state, action, reward, state, False)
        
        if episode % 100 == 0:
            print('Episode {:3d}: {:10.3f} iterations, eps={:6.6f}'.format(episode, 
                                                                           np.mean(len_mem[-100:]), 
                                                                           agent._eps))
        agent.train_memory()
        agent.memory.reset()
    
except KeyboardInterrupt:
    env.close()

Episode   0:     11.000 iterations, eps=0.999990
Episode 100:     64.220 iterations, eps=0.999990
Episode 200:    190.900 iterations, eps=0.999990


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure()
plt.plot(len_mem, 'o', alpha=0.2)

In [921]:
agent.eval()

try:
    env.seed = random.randint(0, 100000)
    obs = env.reset()
    env._max_episode_steps = 100000
    for t in count():
        
        if t % 500 == 0:
            print(t)
        
        env.render()
        action = agent.get_action(obs, deterministic=True)
#         print(action)
        obs, reward, done, info = env.step(action)
        if done:
            env.close()
            break

    print(t)
except KeyboardInterrupt:
    pass
finally:
    env.close()
    print (t)

0
288


In [12]:
model = policy_net

In [13]:
model.to('cpu')

Qnet(
  (model): Sequential(
    (0): Linear(in_features=4, out_features=32, bias=True)
    (1): LeakyReLU(negative_slope=0.01, inplace)
    (2): Linear(in_features=32, out_features=2, bias=True)
  )
)

In [14]:
torch.save(model, 'model.torch')

  "type " + obj.__name__ + ". It won't be checked "


In [26]:
model = torch.load('model.torch')

In [27]:
model.to(device)

Qnet(
  (model): Sequential(
    (0): Linear(in_features=4, out_features=32, bias=True)
    (1): LeakyReLU(negative_slope=0.01, inplace)
    (2): Linear(in_features=32, out_features=2, bias=True)
  )
)

In [28]:
policy_net = model