# Deep Neural Network을 이용한 함수 근사에서 필요한 torch basics

## Experience Replay
```
class ExperienceReplay:
    """Fixed-capacity ring buffer of transitions (replay memory D, capacity N).

    Each entry is a ``(state, action, new_state, reward, done)`` tuple. Once
    ``capacity`` entries have been stored, every further push overwrites the
    slot that was written longest ago.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.capacity = capacity
        self.memory = []
        # Index of the slot that the next push() writes to.
        self.position = 0

    def push(self, state, action, new_state, reward, done):
        """Store one transition, reusing the oldest slot once the buffer is full."""
        slot = self.position
        entry = (state, action, new_state, reward, done)

        if slot < len(self.memory):
            # Buffer already has this slot: overwrite it in place.
            self.memory[slot] = entry
        else:
            # Still growing towards capacity.
            self.memory.append(entry)

        # Advance the write index, wrapping around at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions at random.

        Returns a ``zip`` iterator that yields one tuple per transition field
        (states, actions, new_states, rewards, dones).
        """
        batch = random.sample(self.memory, batch_size)
        return zip(*batch)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

```

In [30]:
import random
# Unbounded list of stored transitions, read and written by the helpers below.
memory = []


def push(state, action, new_state, reward, done):
    """Append one (state, action, new_state, reward, done) tuple to ``memory``."""
    memory.append((state, action, new_state, reward, done))
        
def sample(batch_size):
    """Pick ``batch_size`` distinct transitions from ``memory`` at random.

    Returns a ``zip`` iterator that yields one tuple per transition field
    (states, actions, new_states, rewards, dones).
    """
    batch = random.sample(memory, batch_size)
    return zip(*batch)

In [31]:
# Fill the buffer with ten dummy transitions whose fields are derived from i.
for i in range(10):
    # Argument order: (state, action, new_state, reward, done)
    push(list(range(i, i + 4)), i, i, i, False)

memory

[([0, 1, 2, 3], 0, 0, 0, False),
 ([1, 2, 3, 4], 1, 1, 1, False),
 ([2, 3, 4, 5], 2, 2, 2, False),
 ([3, 4, 5, 6], 3, 3, 3, False),
 ([4, 5, 6, 7], 4, 4, 4, False),
 ([5, 6, 7, 8], 5, 5, 5, False),
 ([6, 7, 8, 9], 6, 6, 6, False),
 ([7, 8, 9, 10], 7, 7, 7, False),
 ([8, 9, 10, 11], 8, 8, 8, False),
 ([9, 10, 11, 12], 9, 9, 9, False)]

In [37]:
# sample() returns a lazy zip; list() materializes it so the per-field tuples are displayed.
list(sample(3))

[([1, 2, 3, 4], [7, 8, 9, 10], [0, 1, 2, 3]),
 (1, 7, 0),
 (1, 7, 0),
 (1, 7, 0),
 (False, False, False)]

## Select Action 

- state가 4 개의 feature로 구성되고 각 state에서의 action이 2 가지인 MDP의 parametrized state action value function

In [38]:
import torch
import torch.nn as nn
import numpy as np

n_inputs = 4  # state feature
n_outputs = 2  # action space
hidden_layer = 64


class NeuralNetwork(nn.Module):
    """Two-layer MLP mapping a state vector to one value per action.

    The network is ``Linear -> tanh -> Linear``.

    Args:
        in_features: Number of state features. ``None`` (default) uses the
            module-level ``n_inputs``.
        out_features: Number of actions. ``None`` (default) uses the
            module-level ``n_outputs``.
        hidden_units: Width of the hidden layer. ``None`` (default) uses the
            module-level ``hidden_layer``.
    """

    def __init__(self, in_features=None, out_features=None, hidden_units=None) -> None:
        super().__init__()
        # Resolve defaults at call time so a plain NeuralNetwork() keeps
        # following the notebook-level size settings, exactly as before.
        in_features = n_inputs if in_features is None else in_features
        out_features = n_outputs if out_features is None else out_features
        hidden_units = hidden_layer if hidden_units is None else hidden_units

        self.linear1 = nn.Linear(in_features, hidden_units)
        self.linear2 = nn.Linear(hidden_units, out_features)

    def forward(self, x):
        """Return the action values for ``x`` (last dimension = state features)."""
        hidden = torch.tanh(self.linear1(x))
        return self.linear2(hidden)

- 입력 : 4 개 feature 로 구성된 state 
- 출력 : 2 개 action values  

- $\max_{a'} Q(s', a'; \theta)$

In [39]:
# Freshly constructed (untrained) network used as the action-value function.
Q = NeuralNetwork()
# Forward a single 4-feature state; the result holds one value per action.
action_values = Q(torch.tensor([0.1, 0.2, 0.3, 0.4]))
action_values

tensor([-0.0354,  0.2647], grad_fn=<AddBackward0>)

In [40]:
# Greedy action: index of the largest action value, converted to a Python int.
action = action_values.argmax().item()
action

1

## Sample random minibatch

- batch size : 3

In [41]:
batch_size = 3

# sample() yields one tuple per transition field, in the order used by push().
states, actions, new_states, rewards, dones = sample(batch_size)

# Convert the sampled fields to tensors (new_states is left as a plain tuple).
states = torch.tensor(states, dtype=torch.float32)
actions = torch.tensor(actions, dtype=torch.long)
# NOTE(review): rewards is wrapped in an extra list, so its shape is
# (1, batch_size) while actions and dones are (batch_size,) - confirm intended.
rewards = torch.tensor([rewards], dtype=torch.float32)
dones = torch.tensor(dones, dtype=torch.float32)

states, actions, rewards, dones

(tensor([[ 8.,  9., 10., 11.],
         [ 2.,  3.,  4.,  5.],
         [ 3.,  4.,  5.,  6.]]),
 tensor([8, 2, 3]),
 tensor([[8., 2., 3.]]),
 tensor([0., 0., 0.]))

## State-Action Value (q value) - DQN 

In [42]:
# Evaluate the batch and drop the autograd graph; these values are only read.
action_values = Q(states).detach()
print(action_values)
# Largest action value in each row (dim 1 runs over actions).
max_action_values = action_values.max(dim=1).values
max_action_values

tensor([[-0.0114,  0.9470],
        [-0.1124,  1.1367],
        [-0.0863,  1.1283]])


tensor([0.9470, 1.1367, 1.1283])

## State-Action Value (q value) - Double DQN

In [43]:
# NOTE(review): this cell uses the same network Q and the same `states` for
# both steps; Double DQN normally selects with the online network and
# evaluates with a separate target network on the next states - confirm that
# this simplification is intended for the demo.
new_values = Q(states).detach()
print(new_values)

# Step 1: index of the greedy action in each row.
max_action_indexes = new_values.max(dim=1).indices
print(max_action_indexes)

# Step 2: read back the value of each selected action; unsqueeze gives the
# (batch, 1) index shape gather needs, squeeze flattens the result again.
selected_column = max_action_indexes.unsqueeze(1)
max_new_state_values = torch.gather(new_values, 1, selected_column).squeeze(1)
print(max_new_state_values)

tensor([[-0.0114,  0.9470],
        [-0.1124,  1.1367],
        [-0.0863,  1.1283]])
tensor([1, 1, 1])
tensor([0.9470, 1.1367, 1.1283])


## torch.gather

- torch.gather 함수 (또는 torch.Tensor.gather)는 다중 인덱스 선택 방법  

- 첫 번째 인수인 input은 요소를 선택할 소스 텐서, 두 번째 인수인 dim은 값을 수집할 차원, 마지막 인수인 index는 input에서 가져올 요소의 위치를 지정하는 인덱스 텐서이다.

In [44]:
# Action values for the sampled batch; not detached, so the result keeps its grad_fn.
q_values = Q(states)
q_values

tensor([[-0.0114,  0.9470],
        [-0.1124,  1.1367],
        [-0.0863,  1.1283]], grad_fn=<AddmmBackward>)

In [45]:
# One action index per row, reshaped to a (3, 1) column for use with gather.
action = torch.tensor([1, 0, 1], dtype=torch.long).unsqueeze(1)
action

tensor([[1],
        [0],
        [1]])

In [46]:
# For each row i, select q_values[i, action[i, 0]] (gather along dim 1).
torch.gather(q_values, 1, action)

tensor([[ 0.9470],
        [-0.1124],
        [ 1.1283]], grad_fn=<GatherBackward>)

In [47]:
# Tensor-method spelling of the same gather along dim 1.
q_values.gather(1, action)

tensor([[ 0.9470],
        [-0.1124],
        [ 1.1283]], grad_fn=<GatherBackward>)

## torch.distributions.Categorical

- sample method  
- log_prob method

In [48]:
from torch.distributions import Categorical

# Uniform categorical distribution over four classes.
m = Categorical(probs=torch.full((4,), 0.25))

In [49]:
# Draw five independent samples; each one prints as a 0-dim tensor holding a class index.
for _ in range(5):
    print(m.sample())  # equal probability of 0, 1, 2, 3

tensor(0)
tensor(1)
tensor(1)
tensor(3)
tensor(3)


In [50]:
# Random scores for five actions, drawn from [0, 1).
action_logits = torch.rand(5)
# Softmax over the last dimension turns the scores into probabilities.
action_probs = action_logits.softmax(dim=-1)
action_probs

tensor([0.1574, 0.1607, 0.3504, 0.1863, 0.1453])

In [51]:
# Sanity check: the softmax probabilities should add up to 1.
action_probs.sum()

tensor(1.)

In [52]:
# Categorical distribution over the softmax probabilities computed earlier.
dist = Categorical(action_probs)
action = dist.sample()
print(action)
# log_prob(action) should equal the log of the probability at the sampled index.
print(dist.log_prob(action), torch.log(action_probs[action]))

tensor(4)
tensor(-1.9293) tensor(-1.9293)
