In [1]:
%run dqn_model.ipynb

In [2]:
import gym
import random
import os
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
# Select the compute device once for the whole notebook: CUDA when PyTorch
# reports it as available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
class Agent():
    """DQN agent: epsilon-greedy policy, replay memory and a target network.

    Attributes:
        model_path: File the policy-network weights are loaded from.
        state_size / action_size: Sizes passed to ``DeepQN`` as input/output size.
        memory: Bounded replay buffer of ``(state, action, reward, next_state, done)``.
        gamma: Discount factor applied to the bootstrapped next-state value.
        exploration_rate / exploration_min / exploration_decay: Epsilon schedule;
            epsilon is multiplied by the decay on every ``learn`` call until it
            reaches the minimum.
        device: Torch device all tensors created by this agent are placed on.
        policy_net: Network that is trained and used to pick actions.
        target_net: Copy of ``policy_net`` (kept in eval mode) used for targets.
    """

    def __init__(self, state_size, action_size, cold_start=True):
        """Build the networks, optimizer and loss.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions.
            cold_start: When False and ``model_path`` exists, the saved weights
                are loaded and exploration starts at its minimum.
        """
        self.model_path = "./models/model.pth"
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=10000)
        self.learning_rate = 0.0005
        self.gamma = 0.95
        self.exploration_rate = 1.0
        self.exploration_min = 0.05
        self.exploration_decay = 0.9999
        # Capture the notebook-level device once so the methods below only
        # depend on instance state.
        self.device = device

        # DeepQN is not defined in this cell; presumably it comes from
        # `%run dqn_model.ipynb` at the top of the notebook.
        self.policy_net = DeepQN(input_size=self.state_size, output_size=self.action_size).to(self.device)
        self.target_net = DeepQN(input_size=self.state_size, output_size=self.action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # NOTE(review): RMSprop runs with its library-default learning rate;
        # self.learning_rate is not passed to the optimizer. Confirm whether
        # that is intended before wiring it in, since it changes training.
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.criterion = nn.SmoothL1Loss()  # Huber loss

        if not cold_start and os.path.isfile(self.model_path):
            self.load_model()
            self.exploration_rate = self.exploration_min

    def load_model(self):
        """Load the policy weights from ``model_path`` and sync the target net."""
        # torch.load unpickles the file: only load checkpoints you trust.
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        state_dict = torch.load(self.model_path, map_location=self.device)
        self.policy_net.load_state_dict(state_dict)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    def act(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: Observation accepted by ``policy_net`` once converted to a
                float tensor.

        Returns:
            int: A uniformly random action with probability ``exploration_rate``,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.exploration_rate:
            return random.randrange(self.action_size)

        with torch.no_grad():
            state = torch.as_tensor(state, dtype=torch.float, device=self.device)
            return torch.argmax(self.policy_net(state)).item()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def learn(self, sample_batch_size):
        """Run one optimisation step on a random batch from the replay memory.

        Does nothing until the memory holds at least ``sample_batch_size``
        transitions. The regression target equals the current prediction
        everywhere except at the action that was taken, where it is
        ``reward + gamma * max_a Q_target(next_state, a)`` (just ``reward`` for
        terminal transitions). Epsilon is decayed after every step.

        Args:
            sample_batch_size: Number of transitions to sample.
        """
        if sample_batch_size <= 0 or len(self.memory) < sample_batch_size:
            return  # Not enough experience collected yet

        batch = random.sample(self.memory, sample_batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        batch_len = len(batch)

        # Stack with numpy first: building a tensor straight from a tuple of
        # ndarrays is very slow.
        states = torch.as_tensor(np.array(states), dtype=torch.float, device=self.device)
        next_states = torch.as_tensor(np.array(next_states), dtype=torch.float, device=self.device)
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
        not_done = 1.0 - torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

        # Flatten to (batch, n_outputs): stored states carry an extra leading
        # axis, so the raw network output may be (batch, 1, n_outputs).
        # Assumes each sample yields exactly action_size outputs - TODO confirm
        # against DeepQN.
        prediction = self.policy_net(states).reshape(batch_len, -1)

        with torch.no_grad():
            next_q = self.target_net(next_states).reshape(batch_len, -1)
            q_new = rewards + self.gamma * next_q.max(dim=1).values * not_done

            target = prediction.clone()
            # Overwrite only the entry of the action that was actually taken;
            # every other entry keeps the prediction and so adds no loss.
            rows = torch.arange(batch_len, device=self.device)
            target[rows, actions] = q_new

        self.optimizer.zero_grad()
        loss = self.criterion(prediction, target)
        loss.backward()
        # Clamp every gradient element to [-1, 1]; parameters without a
        # gradient are skipped by the helper.
        nn.utils.clip_grad_value_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()

        if self.exploration_rate > self.exploration_min:
            self.exploration_rate = max(
                self.exploration_min,
                self.exploration_rate * self.exploration_decay,
            )

In [None]:
class CartPole:
    """Trains and evaluates a DQN ``Agent`` on gym's ``CartPole-v1``.

    Attributes:
        sample_batch_size: Batch size handed to ``Agent.learn``.
        episodes: Maximum number of training episodes.
        env: Environment used for training.
        state_size / action_size: Taken from the environment's spaces.
        agent: The ``Agent`` being trained.
    """

    def __init__(self):
        self.sample_batch_size = 128
        self.episodes = 50000
        self.env = gym.make('CartPole-v1')

        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.agent = Agent(self.state_size, self.action_size)

    def load_model(self):
        """Load previously saved weights into the agent."""
        self.agent.load_model()

    @staticmethod
    def _reset_env(env):
        """Reset ``env`` and return only the observation.

        gym >= 0.26 returns ``(observation, info)``; older versions return the
        observation alone.
        """
        result = env.reset()
        return result[0] if isinstance(result, tuple) else result

    @staticmethod
    def _step_env(env, action):
        """Step ``env`` and return ``(next_state, reward, done)``.

        gym >= 0.26 returns a 5-tuple with separate ``terminated`` and
        ``truncated`` flags; either one ends the episode here. Older versions
        return a 4-tuple with a single ``done`` flag.
        """
        result = env.step(action)
        if len(result) == 5:
            next_state, reward, terminated, truncated, _ = result
            return next_state, reward, bool(terminated or truncated)
        next_state, reward, done, _ = result
        return next_state, reward, done

    def _save_model(self):
        """Write the policy-network weights to ``agent.model_path``."""
        path = self.agent.model_path
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        torch.save(self.agent.policy_net.state_dict(), path)

    def run_test(self):
        """Play one rendered episode greedily and print the number of steps."""
        # A separate environment keeps the training env (self.env) usable.
        env = gym.make('CartPole-v1')
        # Evaluate the learned policy, not the exploration noise: epsilon is
        # still 1.0 on a freshly built agent even after load_model().
        saved_rate = self.agent.exploration_rate
        self.agent.exploration_rate = 0.0
        steps = 0
        try:
            state = self._reset_env(env)
            env.render()
            done = False
            while not done:
                state = np.reshape(state, [1, self.state_size])
                action = self.agent.act(state)
                state, _, done = self._step_env(env, action)
                steps += 1
                env.render()
        finally:
            self.agent.exploration_rate = saved_rate
            env.close()

        print(f'Score: {steps}')

    def run_train(self):
        """Train the agent and save its weights.

        Training stops after ``self.episodes`` episodes, or earlier once the
        mean score of the last 100 episodes exceeds 485.

        Returns:
            tuple[list, list]: Per-episode scores (steps survived) and the
            running average score after each episode.
        """
        scores = []
        avg_score = []
        total_score = 0
        for index_episode in range(self.episodes):
            state = np.reshape(self._reset_env(self.env), [1, self.state_size])

            done = False
            episode_score = 0
            while not done:
                # Rendering stays off while training.
                action = self.agent.act(state)
                next_state, reward, done = self._step_env(self.env, action)

                # Store the transition with the same (1, state_size) layout
                # that is fed to the network.
                next_state = np.reshape(next_state, [1, self.state_size])
                self.agent.remember(state, action, reward, next_state, done)

                state = next_state
                episode_score += 1

            # Perform optimization
            self.agent.learn(self.sample_batch_size)

            # Sync the target network with the policy network every 10 episodes
            if index_episode % 10 == 0:
                self.agent.target_net.load_state_dict(self.agent.policy_net.state_dict())

            # Console output of learning process
            print(f'Episode {index_episode+1}/{self.episodes} Score: {episode_score}')

            # Record this episode's score and the running average so far
            scores.append(episode_score)
            total_score += episode_score
            avg_score.append(total_score / (index_episode + 1))

            # Let the agent learn.
            # NOTE(review): second learn() call in the same episode; kept as-is
            # because removing it halves the number of updates - confirm intent.
            self.agent.learn(self.sample_batch_size)

            # Early stopping once a full 100-episode window is good enough
            if len(scores) >= 100 and sum(scores[-100:]) / 100 > 485:
                break

        # Save model weights
        self._save_model()

        # Return training history
        return scores, avg_score