In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from collections import namedtuple, deque
import time 
import torch.optim as optim
%matplotlib inline

In [2]:
env = gym.make('LunarLander-v2')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



In [20]:
class QNetwork(nn.Module):  
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed=0):

        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, x):
        """Build a network that maps state -> action values."""
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

In [21]:
def exp_dist(experiences):
    states, actions, next_states, rewards, dones = zip(*experiences)
    next_states = torch.from_numpy(np.array(list(next_states))).to(device)
    states = torch.from_numpy(np.array(list(states))).to(device)
    rewards = torch.from_numpy(np.array(list(rewards))).to(device)
    actions = torch.from_numpy(np.array(list(actions))).to(device)
    dones = torch.from_numpy(np.array(list(dones))).to(device)
    return (states, actions, next_states, rewards, dones)

In [22]:
class exp_relay:
    
    def __init__(self, window = 1000, sample_size = 100):  
        self.window = window
        self.sample_size = sample_size
        self.memory = deque(maxlen=window)
        
    def add(self,experience):
        self.memory.append(experience)
    
    def sample_mem(self):
        return random.sample(self.memory, self.sample_size)

In [23]:
class Agent:
    
    def __init__(self, gamma, eps_min=0.01):
        self.epsilon = 1
        self.eps_min = eps_min
        self.gamma = gamma
        self.eps_decay = 0.999
        self.nA = env.action_space.n
        self.loss = 0
    
    def q_prob(self, state, network):
        policy = np.ones(self.nA)*(self.epsilon/self.nA)
        qstate = network.forward(state)
        best_pos = np.argmax(policy)
        policy[best_pos] = (1 - self.epsilon) + (self.epsilon/self.nA)
        return policy
    
    def action2(self, state, policy_net):
        if random.uniform(0,1) < self.epsilon:
            return env.action_space.sample()
        else:
            return np.random.choice(np.arange(self.nA), p=self.q_prob(state, policy_net))
        
    def action(self, state, policy_net):
        policy_net.eval()
        with torch.no_grad():
            action_values = policy_net(state)
        policy_net.train()
        # Epsilon-greedy action selection
        if random.random() > self.epsilon:
            return np.argmax(action_values.cpu().data.numpy()) #expliding
        else:
            return random.choice(np.arange(4))  #exploration

        
    def update(self, experiences, policy_net, target_net, optimizer, tau):
        states, actions, next_states, rewards,dones = exp_dist(experiences)
        current = policy_net.forward(states).gather(-1, actions.reshape(actions.size()[0],1))
        target = target_net.forward(next_states)
        max_val = target.argmax(dim=-1)
        final = target.gather(-1, max_val.reshape(max_val.shape[0],1))
        target = rewards.reshape(rewards.size()[0],1) + self.gamma*final*(1-dones.reshape(dones.size()[0],1))
        self.loss = F.mse_loss(target, current)
        optimizer.zero_grad()
        self.loss.backward()
        optimizer.step()
        # performing soft update
        #for target_param, local_param in zip(target_net.parameters(), policy_net.parameters()):
            #target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [24]:
experience = namedtuple('experience',['state','action','next_state','reward','done'])
state_n = 8
learning_rate = 5e-4
action_n = env.action_space.n
policy_net = QNetwork(state_n, action_n).to(device)
target_net = QNetwork(state_n, action_n).to(device)
target_net.eval()
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(params=policy_net.parameters(), lr = learning_rate)

In [25]:
gamma = 0.995; window = int(1e5); sample = 128; eps_min=0.01;update = 4
update_freq = 4
agent = Agent(gamma, eps_min)
memory = exp_relay(window,sample)
num_episode = 10000
reward_data = []
max_t = 1000
tau = 1e-3
stepup = 0

In [26]:
for i in range(1,num_episode+1):
    state = env.reset()
    reward_val = 0
    for _ in range(max_t):
        state = torch.from_numpy(state).to(device)
        action = agent.action(state, policy_net)
        next_state, reward, done, _ = env.step(action)
        if done == False:
            done = 0
        else:
            done = 1
        memory.add(experience(state.cpu().numpy(),action,next_state,reward,done))
#         agent.epsilon = max(agent.epsilon*agent.eps_decay, agent.eps_min)
        reward_val += reward
        stepup = (stepup+1)%update
        if stepup == 0:
            if len(memory.memory) > sample:
                experiences = memory.sample_mem()
                agent.update(experiences, policy_net,target_net,optimizer,tau)
        state = next_state
        if done:
            break
    reward_data.append(reward_val)
    agent.epsilon = max(agent.epsilon*agent.eps_decay, agent.eps_min)
    print("\rAverage rewards {} with epsilon {} with loss {}".format(np.mean(reward_data[:-50]), agent.epsilon, agent.loss), end=' ')
    if i % 100 == 0:
        torch.save(policy_net.state_dict(), './hello64')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Average rewards -55.96013482192812 with epsilon 0.01 with loss 13.267919041001912  518667997677 6 

In [None]:
states, actions, next_states, rewards,dones = exp_dist(experiences)
current = policy_net.forward(states).gather(-1, actions.reshape(actions.size()[0],1))
target = target_net.forward(next_states)
max_val = target.argmax(dim=-1)
final = target.gather(-1, max_val.reshape(max_val.shape[0],1))
target = rewards.reshape(rewards.size()[0],1) + gamma*final*(1-dones.reshape(dones.size()[0],1))
loss = F.mse_loss(target, current)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(loss)

In [None]:
rewards[0:2].size()
current.size()

In [None]:
target = rewards + self.gamma*final*(1-dones)

In [None]:
rewards[0:2]

In [None]:
h + rewards[0:2].reshape(2,1)

In [29]:
experiences = []
policy_net_infer = QNetwork(state_n, action_n).to(device)
policy_net_infer.load_state_dict(torch.load('./hello64'))
policy_net_infer.eval()
for _ in range(3):
    state = env.reset()
    while True:
        env.render()
        action = policy_net_infer.forward(torch.from_numpy(state).to(device)).argmax().cpu().numpy()
        #action = env.action_space.sample()agent.epsilon = max(agent.epsilon*agent.eps_decay, agent.eps_min)
        next_state, reward, done, _ = env.step(action)
        print("\r Rewards : {}".format(reward), end=' ')
        state = next_state
        time.sleep(0.05)
        if done:
            break
env.close()

 Rewards : 100 84906970056909e-06  

In [None]:
states, actions, next_states, rewards = exp_dist(experiences)
next_states = torch.from_numpy(np.array(list(next_states)))
states = torch.from_numpy(np.array(list(states)))
rewards = torch.from_numpy(np.array(list(rewards)))
actions = torch.from_numpy(np.array(list(actions)))
g = policy_net.forward(states)

In [None]:
g = [x for x in policy_net.parameters()]
policy_net.fc1.weight

In [None]:
target_net.fc1.weight

In [None]:
for target_param, local_param in zip(target_net.parameters(), policy_net.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [None]:
tau = 2

In [None]:
target_net.fc1.weight

In [None]:
policy_net.fc1.weight