In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from collections import namedtuple
import time 
import torch.optim as optim
%matplotlib inline

In [3]:
env = gym.make('LunarLander-v2')



In [4]:
class QNetwork(nn.Module):  
    """Actor (Policy) Model."""

    def __init__(self, state_size, action_size, seed=32):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
        """
        super(QNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        "*** YOUR CODE HERE ***"
        self.fc1 = nn.Linear(state_size, 10)
        self.fc2 = nn.Linear(10, 6)
        self.fc3 = nn.Linear(6, action_size)

    def forward(self, x):
        """Build a network that maps state -> action values."""
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.relu(x)
        return x

In [5]:
def exp_dist(experiences):
    states, actions, next_states, rewards = zip(*experiences)
    next_states = torch.from_numpy(np.array(list(next_states)))
    states = torch.from_numpy(np.array(list(states)))
    rewards = torch.from_numpy(np.array(list(rewards)))
    actions = torch.from_numpy(np.array(list(actions)))
    return (states, actions, next_states, rewards)

In [6]:
class exp_relay:
    
    def __init__(self, window = 1000, sample_size = 100):  
        self.window = window
        self.sample_size = sample_size
        self.memory = []
        
    def size(self):
        if len(self.memory) == self.window:
            return True
        else:
            return False
        
    def add(self,experience):
        if self.size() is True:
            self.memory.append(experience)
            del self.memory[0]
        else:
            self.memory.append(experience)
    
    def sample_mem(self):
        return random.sample(self.memory, self.sample_size)

In [7]:
class Agent:
    
    def __init__(self, gamma, alpha, eps_min=0.01):
        self.epsilon = 1
        self.eps_min = eps_min
        self.gamma = gamma
        self.alpha = alpha
        self.eps_decay = 0.999
        self.nA = env.action_space.n
    
    def q_prob(self, state, network):
        policy = np.ones(self.nA)*(self.epsilon/self.nA)
        qstate = network.forward(state)
        best_pos = np.argmax(policy)
        policy[best_pos] = (1 - self.epsilon) + (self.epsilon/self.nA)
        return policy
    
    def action(self, state, policy_net):
        if random.uniform(0,1) < self.epsilon:
            return env.action_space.sample()
        else:
            return np.random.choice(np.arange(self.nA), p=self.q_prob(state, policy_net))
        
    def update(self, experiences, policy_net, target_net, optimizer):
        states, actions, next_states, rewards = exp_dist(experiences)
        data = policy_net.forward(states)
        data_next = target_net.forward(next_states)
        h = data_next.argmax(dim=1)
        data_next = torch.tensor([data_next[(x,h[x])] for x in range(len(h))])
        data = torch.tensor([data[(x,actions[x])] for x in range(len(actions))])
        target_data = rewards + self.gamma*data_next
        loss = F.mse_loss(target_data, data)
        loss.requires_grad = True
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [8]:
experience = namedtuple('experience',['state','action','next_state','reward'])
state_n = 8
learning_rate = 0.1
action_n = env.action_space.n
policy_net = QNetwork(state_n, action_n)
target_net = QNetwork(state_n, action_n)
target_net.eval()
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(params=policy_net.parameters(), lr = learning_rate)

In [9]:
gamma = 1; window = 10000; sample = 1000; eps_min=0.2; alpha = 0.1
update_freq = 100
agent = Agent(gamma,alpha, eps_min)
memory = exp_relay(window,sample)
num_episode = 10000
reward_data = []
max_t = 200

In [10]:
for i in range(1,num_episode+1):
    state = env.reset()
    reward_val = 0
    if i % update_freq == 0:
        target_net.load_state_dict(policy_net.state_dict())
    for _ in range(max_t):
        state = torch.from_numpy(state)
        action = agent.action(state, policy_net)
        next_state, reward, done, _ = env.step(action)
        memory.add(experience(state.numpy(),action,next_state,reward))
#         agent.epsilon = max(agent.epsilon*agent.eps_decay, agent.eps_min)
        reward_val += reward
        if len(memory.memory) >= sample:
            experiences = memory.sample_mem()
            agent.update(experiences, policy_net,target_net,optimizer)
        reward_val += reward
        state = next_state
        if done:
            break
    reward_data.append(reward_val)
    if i%100 == 0:
        agent.epsilon = max(agent.epsilon*agent.eps_decay, agent.eps_min)
        print("Average rewards {} with epsilon {}".format(np.mean(reward_data[:-100]), agent.epsilon))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Average rewards nan with epsilon 0.999
Average rewards -353.7629581701791 with epsilon 0.998001
Average rewards -357.63389682088774 with epsilon 0.997002999
Average rewards -347.3136437056912 with epsilon 0.996005996001
Average rewards -342.20900897333746 with epsilon 0.995009990004999
Average rewards -347.4080702685288 with epsilon 0.994014980014994
Average rewards -348.97323012417183 with epsilon 0.993020965034979
Average rewards -353.8512079972512 with epsilon 0.9920279440699441
Average rewards -358.59341038169805 with epsilon 0.9910359161258742
Average rewards -353.84078969963764 with epsilon 0.9900448802097482
Average rewards -354.065446104087 with epsilon 0.9890548353295385
Average rewards -354.9399101126231 with epsilon 0.988065780494209
Average rewards -359.4194829210453 with epsilon 0.9870777147137147
Average rewards -357.3990196373936 with epsilon 0.986090636999001
Average rewards -359.5422229520402 with epsilon 0.9851045463620021
Average rewards -361.01259037829465 with epsi

KeyboardInterrupt: 

In [None]:
len(reward_data)

In [None]:
experiences = []
for _ in range(1):
    state = env.reset()
    while True:
        env.render()
        action = policy_net.forward(torch.from_numpy(state)).argmax().numpy()
        #action = env.action_space.sample()agent.epsilon = max(agent.epsilon*agent.eps_decay, agent.eps_min)
        next_state, reward, done, _ = env.step(action)
        experiences.append(experience(state, action,reward,next_state))
        state = next_state
        time.sleep(0.1)
        if done:
            break
env.close()

In [None]:
next_states = torch.from_numpy(np.array(list(next_states)))
states = torch.from_numpy(np.array(list(states)))
rewards = torch.from_numpy(np.array(list(rewards)))
actions = torch.from_numpy(np.array(list(actions)))
g = policy_net.forward(states)

In [None]:
torch.tensor([g[(x,actions[x])] for x in range(len(actions))])

In [None]:
stu = namedtuple('stu', ['age', 'nation'])
s = stu(4,5)
f = stu(10,11)
g = []
g.append(s)
g.append(f)
a,b=zip(*g)
b

In [None]:
d = torch.tensor([data_next[(x,h[x])] for x in range(len(h))])

In [None]:
actions

In [None]:
d

In [None]:
np.mean([1,2,3])

In [None]:
memory.memory.__len__()