## Problem 1: Correlation between samples

## Problem 2: Non-stationary targets

## Solution
1. Go deep
2. #### **experience replay**
3. Seperate Networks

In [6]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym.envs.registration import register
import random

In [7]:
env = gym.make('CartPole-v0')
env = env.unwrapped

# DQN

In [8]:
import numpy as np
import torch
import torch.nn as nn

In [9]:
input_dim = env.observation_space.shape[0]
hidden_dim = 128
output_dim = env.action_space.n

In [16]:
class DQN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(DQN, self).__init__()     
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        
        self.lin = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, output_dim)
        )

    def convert_to_tensor(sefl, x):
        return torch.tensor(x, dtype=torch.float)
    
    def forward(self, state):
        if type(state) != torch.tensor:
            state = self.convert_to_tensor(state)
        
        if state.dim() == 1:
            state.unsqueeze_(0)
        
        out = self.lin(state)
        return out

In [21]:
class Fitter():
    def __init__(self, behaviorDQN, targetDQN):
        self.gamma = 0.9
        self.behaviorDQN = behaviorDQN
        self.targetDQN = targetDQN
        self.criterion = nn.MSELoss()
        self.optim = torch.optim.Adam(self.behaviorDQN.parameters(), lr=0.1)
        
    def train(self, train_batch):
        x_stack = torch.empty([0, self.behaviorDQN.input_dim])
        y_stack = torch.empty([0, self.behaviorDQN.output_dim])

        for s, a, r, s_new, done in train_batch:
            q_target = self.targetDQN(s)
            q_target1 = self.targetDQN(s_new)

            if done:
                q_target[0, a] = r
            else:
                q_target[0, a] = r + self.gamma*torch.max(q_target1)
            
            s_tensor = torch.tensor(s, dtype=torch.float).unsqueeze(0)
            x_stack = torch.cat((x_stack, s_tensor), dim=0)
            y_stack = torch.cat([y_stack, q_target], dim=0)
        
        # 예측치(pred)와 목표치(true)
        q_behavior = self.behaviorDQN(x_stack)
        q_true = y_stack
        
        self.optim.zero_grad()
        loss = self.criterion(q_true, q_behavior)
        loss.backward()
        self.optim.step()
        return loss
    
    def update_targetDQN(self):
        self.targetDQN.load_state_dict(self.behaviorDQN.state_dict())
        return 
    
    def bot_test(self):
        s = env.reset()
        n_step = 0
        while True:
            a = torch.argmax(self.behaviorDQN(s)).item()
            s, r, done, _ = env.step(a)
            n_step += 1
            if done:
                print('Total score: %s'%n_step)
                return n_step
            
            if n_step > 10000:
                print('Very nice!')
                self.bot_play()
                
    def bot_play(self):
        s = env.reset()
        n_step = 0
        while True:
            a = torch.argmax(self.behaviorDQN(s)).item()
            env.render()
            s, r, done, _ = env.step(a)
            n_step += 1
            if done:
                print('Total score: %s'%n_step)
                env.close()
                return n_step
            
            

In [22]:
behaviorDQN = DQN(input_dim, hidden_dim, output_dim)
targetDQN = DQN(input_dim, hidden_dim, output_dim)
fitter = Fitter(behaviorDQN, targetDQN)

## Store replay train

In [None]:
from collections import deque

train_epoch = 1000
replay_memory = deque()
max_replay = 50000
reward_ls = []

for i in range(train_epoch):    
    s = env.reset()
    e_rate = 1. / ((i/10)+1)  # exploration rate (random action)
    done = False
    
    while not done:
        # Choose an action by e-greedy
        q_pred = fitter.behaviorDQN(s)
        
        if np.random.rand(1) < e_rate :
            a = env.action_space.sample()
        else:
            a = torch.argmax(q_pred).item()
        
        # action 실행 후, replay_memory에 기록
        s_new, r, done, _ = env.step(a)
        
        if done:
            r = -10
        
        replay_memory.append([s, a , r, s_new, done])
        if len(replay_memory) > max_replay:
            replay_memory.popleft()
        
        s = s_new
               
    # train every 10 epoch
    if i%10 == 1:
        fitter.update_targetDQN()
        
        for _ in range(50):
            batch_size = np.min([10, len(replay_memory)])
            mini_batch = random.sample(replay_memory, batch_size)
            loss = fitter.train(mini_batch)
    
    if i%10 == 1:
        reward = fitter.bot_test()
        reward_ls.append(reward)
        

  


Total score: 301
Total score: 122
Total score: 45
Total score: 41
Total score: 69
Total score: 63
Total score: 57
Total score: 87
Total score: 54
Total score: 95
Total score: 109
Total score: 180
Total score: 130
Total score: 158
Total score: 110
Total score: 175
Total score: 99
Total score: 123
Total score: 139
Total score: 33
Total score: 127
Total score: 23
Total score: 248
Total score: 228
Total score: 1704
Total score: 368
Total score: 281
Total score: 498
Total score: 640
Total score: 227
Total score: 181
Total score: 165
Total score: 149
Total score: 109
Very nice!
Total score: 176292
Total score: 10002
Very nice!
