In [1]:
import torch
import torch.nn as nn
import random
from tqdm import tqdm
import pickle
import gym
import numpy as np
import collections
import time
import pylab as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DQNSolver(nn.Module):
    '''
    Neural Net n linear layers
    '''

    def __init__(self, input_shape, n_actions):
        super(DQNSolver, self).__init__()
        
        self.fc = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
        )
    
    def forward(self, x):
        print(f'Forward!')
        return self.fc(x)

In [48]:
class DQNAgent:

    def __init__(self, state_space, action_space, max_memory_size, batch_size, gamma, lr,
                 dropout, exploration_max, exploration_min, exploration_decay, pretrained):
        
        # Define DQN Layers
        self.state_space = state_space
        self.action_space = action_space
        self.pretrained = pretrained
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # DQN network  
        self.dqn = DQNSolver(state_space, action_space).to(self.device)

        if self.pretrained:
            self.dqn.load_state_dict(torch.load("DQN.pt", map_location=torch.device(self.device)))
        self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)

        # Create memory
        self.max_memory_size = max_memory_size
        #self.STATE_MEM = torch.zeros(max_memory_size, *self.state_space)
        self.STATE_MEM = torch.zeros(max_memory_size, self.state_space)
        #self.ACTION_MEM = torch.zeros(max_memory_size, 1)
        self.ACTION_MEM = torch.zeros(max_memory_size, self.action_space)
        self.REWARD_MEM = torch.zeros(max_memory_size, 1)
        #self.STATE2_MEM = torch.zeros(max_memory_size, *self.state_space)
        self.STATE2_MEM = torch.zeros(max_memory_size, self.state_space)
        self.DONE_MEM = torch.zeros(max_memory_size, 1)
        self.ending_position = 0
        self.num_in_queue = 0

        self.memory_sample_size = batch_size
        
        # Learning parameters
        self.gamma = gamma
        self.l1 = nn.SmoothL1Loss().to(self.device) # Also known as Huber loss
        self.exploration_max = exploration_max
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay


    def remember(self, state, action, reward, state2, done):
        """Store the experiences in a buffer to use later"""
        self.STATE_MEM[self.ending_position] = state.float()
        self.ACTION_MEM[self.ending_position] = action.float()
        self.REWARD_MEM[self.ending_position] = reward.float()
        self.STATE2_MEM[self.ending_position] = state2.float()
        self.DONE_MEM[self.ending_position] = done.float()
        self.ending_position = (self.ending_position + 1) % self.max_memory_size  # FIFO tensor
        self.num_in_queue = min(self.num_in_queue + 1, self.max_memory_size)


    def batch_experiences(self):
        """Randomly sample 'batch size' experiences"""
        idx = random.choices(range(self.num_in_queue), k=self.memory_sample_size)
        STATE = self.STATE_MEM[idx]
        ACTION = self.ACTION_MEM[idx]
        REWARD = self.REWARD_MEM[idx]
        STATE2 = self.STATE2_MEM[idx]
        DONE = self.DONE_MEM[idx]      
        return STATE, ACTION, REWARD, STATE2, DONE


    def random_action(self, max_fingers, max_keys):
        action = np.zeros(max_keys, dtype=np.float32)

        fingers = random.randrange(max_fingers)
        for finger in range(fingers):
            action[random.randrange(max_keys)] += 1
        
        return action
    
    def act(self, state):
        
        """Epsilon-greedy action"""
        
        num_fingers = 10
        num_keys = self.state_space

        if random.random() < self.exploration_rate:
            #return torch.tensor([[random.randrange(self.action_space)]])
            print(f'Random - ', end='')
            return torch.tensor(self.random_action(num_fingers, num_keys))
        else:
            print(f'Exploit - ', end='')
            q_vals = self.dqn(state.to(self.device)).cpu()
            print(f'q_v: {q_vals.dtype} ({q_vals.size()})')

            #high_indexes = torch.topk(q_vals, num_fingers).indices
            #for c in range(len(q_vals)):
            #    if c not in high_indexes:
            #        q_vals[c] = 0

            #return torch.argmax(q_vals).unsqueeze(0).unsqueeze(0).cpu()
            return q_vals


    def experience_replay(self):
        '''
        Perhaps try to train for every step instead...
        '''
        if self.memory_sample_size > self.num_in_queue:
            return
    
        # Sample a batch of experiences
        STATE, ACTION, REWARD, STATE2, DONE = self.batch_experiences()
        STATE = STATE.to(self.device)
        ACTION = ACTION.to(self.device)
        REWARD = REWARD.to(self.device)
        STATE2 = STATE2.to(self.device)
        DONE = DONE.to(self.device)
        
        self.optimizer.zero_grad()
        # Q-Learning target is Q*(S, A) <- r + γ max_a Q(S', a) 
        target = REWARD + torch.mul((self.gamma * self.dqn(STATE2).max(1).values.unsqueeze(1)), 1 - DONE)
        current = self.dqn(STATE).gather(1, ACTION.long())
        
        loss = self.l1(current, target)
        loss.backward() # Compute gradients
        self.optimizer.step() # Backpropagate error

        self.exploration_rate *= self.exploration_decay
        
        # Makes sure that exploration rate is always at least 'exploration min'
        self.exploration_rate = max(self.exploration_rate, self.exploration_min)


    def train(self, state, action, reward, state_next, step):
        '''
        Our own train algorithm
        '''

        self.optimizer.zero_grad()
        # Q-Learning target is Q*(S, A) <- r + γ max_a Q(S', a)
        
        print(f'r: {reward.dtype} ({reward.size()}) s_n: {state_next.dtype} ({state_next.size()}) a: {action.dtype} ({action.size()}), s: {state.dtype} ({state.size()}), ', end='')
        target = (reward + self.gamma * self.dqn(state_next.to(self.device)).cpu()).squeeze(0) #.max(0).values.unsqueeze(0)
        current = self.dqn(state.to(self.device)).cpu()
        
        print(f'train step {step}, t: {target.size()} c: {current.size()}')
        loss = self.l1(current, target)
        loss.backward(retain_graph=True) # Compute gradients
        self.optimizer.step() # Backpropagate error

        self.exploration_rate *= self.exploration_decay
        
        # Makes sure that exploration rate is always at least 'exploration min'
        self.exploration_rate = max(self.exploration_rate, self.exploration_min)

In [4]:
class piano_env:

    def __init__(self, num_keys, num_fingers):
        self.num_keys = num_keys
        self.num_fingers = num_fingers
        self.observation_space = num_keys
        self.action_space = num_keys


    def ext_reward(self, state):
        '''
        state is a np.array of velocities(?)
        What should we reward?
        '''
        num_keys = len(state)
        reward = 0

        for key in state:
            if key:
                reward += key

        return reward

    def step(self, action):
        '''
        Replaces env.step()
        Returns state_next, reward, terminal (array, float, bool)
        '''
        state = action #because this is very simple

        return (state, self.ext_reward(state), False)
    
    def reset(self):
        return np.zeros(self.num_keys, dtype=np.float32)

In [49]:
training_mode = True
pretrained = False
#with torch.autograd.set_detect_anomaly(True):
run(training_mode, pretrained)


  0%|          | 0/10 [00:00<?, ?it/s]

0: state torch.float32 (torch.Size([128]))Random - action: torch.float32 reward: 5.0
r: torch.float32 (torch.Size([1, 1])) s_n: torch.float32 (torch.Size([128])) a: torch.float32 (torch.Size([128])), s: torch.float32 (torch.Size([128])), Forward!
Forward!
train step 1, t: torch.Size([128]) c: torch.Size([128])
1: state torch.float32 (torch.Size([128]))Random - action: torch.float32 reward: 2.0
r: torch.float32 (torch.Size([1, 1])) s_n: torch.float32 (torch.Size([128])) a: torch.float32 (torch.Size([128])), s: torch.float32 (torch.Size([128])), Forward!
Forward!
train step 2, t: torch.Size([128]) c: torch.Size([128])
2: state torch.float32 (torch.Size([128]))Random - action: torch.float32 reward: 8.0
r: torch.float32 (torch.Size([1, 1])) s_n: torch.float32 (torch.Size([128])) a: torch.float32 (torch.Size([128])), s: torch.float32 (torch.Size([128])), Forward!
Forward!
train step 3, t: torch.Size([128]) c: torch.Size([128])
3: state torch.float32 (torch.Size([128]))Random - action: torch




RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 128]], which is output 0 of AsStridedBackward0, is at version 13; expected version 12 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [23]:
def run(training_mode, pretrained, num_episodes=10, exploration_max=1):
   
    #env = gym.make('Breakout-v0') # can change the environmeent accordingly
    keys = 128
    fingers = 10
    env = piano_env(keys, fingers)
    #env = create_env(env)  # Wraps the environment so that frames are grayscale
    observation_space = env.observation_space
    action_space = env.action_space
    
    agent = DQNAgent(state_space=observation_space,
                     action_space=action_space,
                     max_memory_size=30000,
                     batch_size=32,
                     gamma=0.90,
                     lr=0.00025,
                     dropout=0.2,
                     exploration_max=1.0,
                     exploration_min=0.02,
                     exploration_decay=0.99,
                     pretrained=pretrained)
    
    # Restart the enviroment for each episode
    #num_episodes = num_episodes
    #env.reset()

    total_rewards = []
    if training_mode and pretrained:
        with open("total_rewards.pkl", 'rb') as f:
            total_rewards = pickle.load(f)
    
    for ep_num in tqdm(range(num_episodes)):
        state = env.reset()
        state = torch.Tensor(state)
        total_reward = 0
        steps = 0
        while True:
            print(f'{steps}: state {state.dtype} ({state.size()})', end='')
            action = agent.act(state)
            print(f'action: {action.dtype} ', end='')
            steps += 1
            
            #state_next, reward, terminal, info = env.step(int(action[0]))
            state_next, reward, terminal = env.step(action)
            print(f'reward: {reward}')
            total_reward += reward
            #state_next = torch.Tensor([state_next])
            reward = torch.tensor([reward]).unsqueeze(0)
            
            terminal = torch.tensor([int(terminal)]).unsqueeze(0)
            
            if training_mode:
                #agent.remember(state, action, reward, state_next, terminal)
                #agent.experience_replay()
                agent.train(state, action, reward, state_next, steps)
            
            state = state_next
            if terminal:
                break

            if steps >= 100:
                break
        
        total_rewards.append(total_reward)
        
        if ep_num != 0 and ep_num % 100 == 0:
            print("Episode {} score = {}, average score = {}".format(ep_num + 1, total_rewards[-1], np.mean(total_rewards)))
        num_episodes += 1
    
    print("Episode {} score = {}, average score = {}".format(ep_num + 1, total_rewards[-1], np.mean(total_rewards)))
    
    # Save the trained memory so that we can continue from where we stop using 'pretrained' = True
    if training_mode:
        with open("ending_position.pkl", "wb") as f:
            pickle.dump(agent.ending_position, f)
        with open("num_in_queue.pkl", "wb") as f:
            pickle.dump(agent.num_in_queue, f)
        with open("total_rewards.pkl", "wb") as f:
            pickle.dump(total_rewards, f)


        torch.save(agent.dqn.state_dict(), "DQN.pt")  
        torch.save(agent.STATE_MEM,  "STATE_MEM.pt")
        torch.save(agent.ACTION_MEM, "ACTION_MEM.pt")
        torch.save(agent.REWARD_MEM, "REWARD_MEM.pt")
        torch.save(agent.STATE2_MEM, "STATE2_MEM.pt")
        torch.save(agent.DONE_MEM,   "DONE_MEM.pt")
    
    #env.close()