In [None]:
%run dqn_model.ipynb

In [None]:
import gym
import random
import os
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
class Agent():
    """Deep Q-learning agent with an epsilon-greedy policy and a replay memory.

    The Q-network is a ``DeepQN`` instance (defined outside this cell, brought
    into scope by the notebook's ``%run dqn_model.ipynb``), trained with Adam
    and an MSE loss.
    """

    def __init__(self, state_size, action_size, cold_start=False):
        """Create the Q-network, optimizer and an empty replay memory.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions.
            cold_start: Currently unused; kept so existing callers that pass
                it keep working.
        """
        self.model_path = "./models/model.pth"
        self.state_size = state_size
        self.action_size = action_size
        # Bounded replay memory: the oldest experiences are dropped first.
        self.memory = deque(maxlen=2000)
        self.learning_rate = 0.0001
        self.gamma = 0.95  # discount factor for future rewards
        # Epsilon of the epsilon-greedy policy; decays once per learn() call.
        self.exploration_rate = 1.0
        self.exploration_min = 0.01
        self.exploration_decay = 0.995
        self.model = DeepQN(input_size = self.state_size, output_size = self.action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr = self.learning_rate)
        self.criterion = nn.MSELoss()

    def load_model(self):
        """Loads the model weights from the path set in the object."""
        self.model.load_state_dict(torch.load(self.model_path))

    def act(self, state):
        """Acts according to epsilon greedy policy.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.

        Args:
            state: A single observation (array-like) accepted by the model.

        Returns:
            int: Index of the chosen action.
        """
        if np.random.rand() <= self.exploration_rate:
            return random.randrange(self.action_size)

        state = torch.as_tensor(state, dtype=torch.float)
        # Pure inference: no need to build an autograd graph.
        with torch.no_grad():
            prediction = self.model(state)
        return torch.argmax(prediction).item()

    def remember(self, state, action, reward, next_state, done):
        """Saves experience in memory."""
        self.memory.append((state, action, reward, next_state, done))

    def learn(self, sample_batch_size):
        """Learning from experience in memory.

        Samples ``sample_batch_size`` experiences, builds the Q-learning
        target ``r + gamma * max_a' Q(s', a')`` (just ``r`` for terminal
        transitions) for the action that was actually taken, and performs one
        optimizer step. Afterwards the exploration rate is decayed.

        Does nothing while the memory holds fewer than ``sample_batch_size``
        experiences (or when ``sample_batch_size`` is not positive).

        Args:
            sample_batch_size: Number of experiences to sample.
        """
        if sample_batch_size < 1 or len(self.memory) < sample_batch_size:
            return  # Don't learn until we have at least batch size experiences
        sample_batch = random.sample(self.memory, sample_batch_size)
        batch_size = len(sample_batch)

        states, actions, rewards, next_states, dones = zip(*sample_batch)

        # Stack through numpy first (much faster than torch.tensor on a tuple
        # of arrays) and flatten each state so the batch is (batch, features)
        # regardless of whether states were stored as (1, n) or (n,).
        states = torch.as_tensor(
            np.asarray(states, dtype=np.float32).reshape(batch_size, -1))
        next_states = torch.as_tensor(
            np.asarray(next_states, dtype=np.float32).reshape(batch_size, -1))
        # Actions are integer indices, so keep them as longs for indexing.
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        # 1.0 where the episode continues, 0.0 where it ended.
        not_done = torch.as_tensor(1.0 - np.asarray(dones, dtype=np.float32))

        # Predicted Q values with current state.
        # NOTE(review): assumes the model maps (batch, state_size) to one
        # Q-value per action and has no batch-dependent layers - confirm
        # against DeepQN.
        prediction = self.model(states).reshape(batch_size, -1)

        # The target is a constant for this update: no gradient may flow
        # through it, otherwise the loss also pulls the bootstrap estimate.
        with torch.no_grad():
            next_q_max = self.model(next_states).reshape(batch_size, -1).max(dim=1).values
            q_new = rewards + self.gamma * next_q_max * not_done
            target = prediction.detach().clone()
            # Only the Q-value of the action actually taken gets a new target;
            # all other entries equal the prediction and contribute zero loss.
            target[torch.arange(batch_size), actions] = q_new

        # Zero out gradients
        self.optimizer.zero_grad()

        # Calc loss and do back propagation
        loss = self.criterion(prediction, target)
        loss.backward()

        self.optimizer.step()

        if self.exploration_rate > self.exploration_min:
            self.exploration_rate *= self.exploration_decay

In [None]:
class CartPole:
    """Trains and evaluates a DQN ``Agent`` on the Gym ``CartPole-v1`` task."""

    def __init__(self):
        """Create the training environment and an agent sized to match it."""
        self.sample_batch_size = 128
        self.episodes = 20000
        self.env = gym.make('CartPole-v1')

        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.agent = Agent(self.state_size, self.action_size)

    def load_model(self):
        """Load previously saved weights into the agent's model."""
        self.agent.load_model()

    @staticmethod
    def _reset_env(env):
        """Reset ``env`` and return only the observation.

        Accepts both the old Gym API (``reset()`` returns the observation) and
        the newer one (``reset()`` returns an ``(observation, info)`` tuple).
        """
        result = env.reset()
        if isinstance(result, tuple):
            return result[0]
        return result

    @staticmethod
    def _step_env(env, action):
        """Step ``env`` and return ``(next_state, reward, done)``.

        Accepts both the 4-tuple ``step()`` result of the old Gym API and the
        5-tuple ``(obs, reward, terminated, truncated, info)`` of the newer
        one, where the episode is over if it was terminated or truncated.
        """
        result = env.step(action)
        if len(result) == 5:
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, bool(terminated or truncated)
        next_state, reward, done, _ = result
        return next_state, reward, bool(done)

    def run_test(self):
        """Play one rendered episode with the greedy policy and print its score.

        A dedicated environment is created and always closed, so the training
        environment in ``self.env`` stays usable afterwards. Exploration is
        switched off for the duration of the episode (and restored after it),
        so a freshly loaded model is not evaluated with random actions.
        """
        env = gym.make('CartPole-v1')
        saved_exploration_rate = self.agent.exploration_rate
        self.agent.exploration_rate = 0.0

        steps = 0
        try:
            state = self._reset_env(env)
            env.render()
            done = False

            while not done:
                state = np.reshape(state, [1, self.state_size])
                action = self.agent.act(state)
                state, _, done = self._step_env(env, action)
                steps += 1
                env.render()
        finally:
            self.agent.exploration_rate = saved_exploration_rate
            env.close()

        print(f'Score: {steps}')

    def run_train(self):
        """Train the agent and return the training history.

        Runs up to ``self.episodes`` episodes. Every step is stored in the
        agent's memory and the agent performs one learning step after each
        episode. Training stops early once the mean score of the last 10
        episodes exceeds 490. The model is saved afterwards via
        ``self.agent.model.save()``.

        Returns:
            tuple[list, list]: ``(scores, avg_score)`` where ``scores[i]`` is
            the number of steps survived in episode ``i`` and ``avg_score[i]``
            is the running mean of the scores up to and including episode ``i``.
        """
        scores = []
        avg_score = []
        total_score = 0
        for index_episode in tqdm(range(self.episodes)):
            state = self._reset_env(self.env)
            state = np.reshape(state, [1, self.state_size])

            done = False
            episode_score = 0
            while not done:
                # Choose action
                action = self.agent.act(state)

                # Act
                next_state, reward, done = self._step_env(self.env, action)

                # Remember experience
                next_state = np.reshape(next_state, [1, self.state_size])
                self.agent.remember(state, action, reward, next_state, done)

                state = next_state
                episode_score += 1

            # Save the score of this episode and the average score until now.
            # episode_score already equals the number of steps taken.
            scores.append(episode_score)
            total_score += episode_score
            avg_score.append(total_score / (index_episode + 1))

            # Let the agent learn.
            self.agent.learn(self.sample_batch_size)

            # Early stopping once the last 10 episodes are (almost) perfect
            if len(scores) >= 10:
                last_10 = scores[-10:]
                avg = sum(last_10) / 10
                if avg > 490:
                    break

        # Save model weights
        self.agent.model.save()

        # Return training history
        return scores, avg_score