In [1]:
import copy
import random
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt
from PIL import Image
import math
import torchvision.transforms as T
import numpy as np
import time
import socket
from IPython.display import clear_output

In [2]:
class DQN():
    """Small fully connected Q-network bundled with its loss and optimizer.

    The network maps a state vector of length ``state_dim`` to ``action_dim``
    outputs (one Q-value per action) through two LeakyReLU hidden layers of
    ``hidden_dim`` and ``hidden_dim * 2`` units.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=0.05):
        """Build the model, the MSE loss and the Adam optimizer.

        Args:
            state_dim: Number of input features describing a state.
            action_dim: Number of actions, i.e. number of network outputs.
            hidden_dim: Width of the first hidden layer; the second hidden
                layer is twice as wide.
            lr: Learning rate handed to Adam.
        """
        self.criterion = torch.nn.MSELoss()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(state_dim, hidden_dim),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim * 2),
            torch.nn.LeakyReLU(),
            torch.nn.Linear(hidden_dim * 2, action_dim),
        )
        # TODO: the optimizer settings were flagged by the author as something to change.
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    @staticmethod
    def _as_float_tensor(values):
        """Convert a list/array/tensor of numbers to a float32 tensor.

        ``torch.as_tensor`` is used instead of the legacy ``torch.Tensor(...)``
        constructor, which interprets a bare integer as a size, not as data.
        """
        return torch.as_tensor(values, dtype=torch.float32)

    def update(self, state, y):
        """Run one gradient step pulling the prediction for ``state`` towards ``y``.

        Args:
            state: State features (sequence of numbers of length ``state_dim``).
            y: Target Q-values, one per action.
        """
        y_pred = self.model(self._as_float_tensor(state))
        # Plain tensors carry autograd information; the deprecated Variable wrapper is not needed.
        loss = self.criterion(y_pred, self._as_float_tensor(y))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, state):
        """Return the Q-values of every action for ``state`` without tracking gradients."""
        with torch.no_grad():
            return self.model(self._as_float_tensor(state))

state_dim -> tamaño de la entrada de la red neuronal; depende de cómo se represente el estado

action_dim -> 4 acciones: up, down, left y right

hidden_dim -> ajustar hiperparámetro

lr -> ajustar hiperparámetro

In [3]:
action_dim = 4  # number of discrete actions: up, down, left and right
hidden_dim = 100  # width of the first hidden layer (the second one is twice as wide); hyperparameter to tune
lr = 0.05  # learning rate handed to the Adam optimizer; hyperparameter to tune

### Prueba preliminar
Distancia a la pill más cercana en cada una de las direcciones.

In [4]:
state_dim = 4  # network input size: distance to the nearest pill in each direction

In [5]:
# NOTE(review): the q_learning call further down passes a literal 100 and relies on
# its own default gamma, so these two names look unused — confirm before relying on them.
episodes = 100  # number of training episodes
gamma = 0.7  # discount applied to the best next-state Q-value

In [6]:
# Build the Q-network from the dimensions and hyperparameters set in the cells above.
model = DQN(state_dim, action_dim, hidden_dim, lr)

In [7]:
class Game():
    """TCP server side of the link between the learning agent and the game client.

    Protocol as implemented by the methods below:
      * after the client connects, the number of episodes is sent as a
        newline-terminated string;
      * each message from the client has the form ``<state>;<reward>``, where
        ``<state>`` is either the literal ``gameOver`` or a bracketed,
        comma-separated list of integers, and ``<reward>`` is an integer;
      * each action is sent to the client as a newline-terminated string.
    """

    def __init__(self, host="localhost", port=38514, numEpisodes = 100):
        """Create the listening socket and bind it to ``(host, port)``.

        Args:
            host: Interface to bind to.
            port: TCP port to bind to.
            numEpisodes: Number of episodes announced to the client on connect.

        Raises:
            OSError: If the address cannot be bound. The socket is closed first;
                continuing with an unbound socket would only fail later in a
                far less obvious way.
        """
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.conn = None
        self.episodes = numEpisodes
        # Allow re-binding right after a previous run released the port.
        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            self.sock.bind((host, port))
        except socket.error as err:
            # The original message had no placeholder, so the error was never shown.
            print('Bind failed. Error Code : {}'.format(err))
            self.sock.close()
            raise

    def connect(self):
        """Block until one client connects, then announce the number of episodes."""
        self.sock.listen(1)
        self.conn, _ = self.sock.accept()
        # sendall keeps writing until the whole message is out; send may stop early.
        self.conn.sendall(bytes(str(self.episodes) + "\n", 'UTF-8'))

    def get_state(self):
        """Receive one ``<state>;<reward>`` message from the client.

        Returns:
            ``(state, reward)`` where ``state`` is a list of ints, or ``None``
            when the client reports ``gameOver``.

        Raises:
            ConnectionError: If the client closed the connection.
            ValueError: If the message has no ``;`` separator or a field is
                not an integer.
        """
        # NOTE(review): assumes one recv() delivers exactly one complete message;
        # TCP does not guarantee that — confirm the client's framing.
        data = self.conn.recv(512)
        if not data:
            raise ConnectionError("The game client closed the connection")
        text = data.decode(encoding='UTF-8')
        fields = text.split(";")
        if len(fields) < 2:
            raise ValueError("Malformed message from game client: {!r}".format(text))
        reward = int(fields[1])
        if fields[0] == "gameOver":
            return None, reward
        raw_state = fields[0].replace("[", "").replace("]", "")
        next_state = [int(value) for value in raw_state.split(",")]
        return next_state, reward

    def send_action(self, action):
        """Send the chosen action to the client as a newline-terminated string."""
        self.conn.sendall(bytes(str(action) + "\n", 'UTF-8'))

    def close(self):
        """Close the client connection (if any) and the listening socket."""
        for endpoint in (getattr(self, "conn", None), getattr(self, "sock", None)):
            if endpoint is not None:
                endpoint.close()
        self.conn = None

In [8]:
def plot_res(values, title='', goal=195):
    """Plot the per-episode reward curve and a histogram of recent results.

    Args:
        values: Sequence of total rewards, one per episode.
        title: Figure title.
        goal: Reward level drawn as a red reference line on both panels
            (defaults to the previously hard-coded 195).
    """
    fig, (ax_curve, ax_hist) = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
    fig.suptitle(title)

    # Left panel: reward per episode plus the goal line.
    ax_curve.plot(values, label='score per run')
    ax_curve.axhline(goal, c='red', ls='--', label='goal')
    ax_curve.set_xlabel('Episodes')
    ax_curve.set_ylabel('Reward')

    # A straight-line fit needs at least two points.
    if len(values) >= 2:
        x = np.arange(len(values))
        try:
            trend = np.poly1d(np.polyfit(x, values, 1))
        except (TypeError, ValueError, np.linalg.LinAlgError) as err:
            # The trend line is optional; report why it is missing and keep plotting.
            print('Could not compute the trend line: {}'.format(err))
        else:
            ax_curve.plot(x, trend(x), "--", label='trend')
    # Build the legend after the trend line so its label is included.
    ax_curve.legend()

    # Right panel: distribution of the last 50 results.
    ax_hist.hist(values[-50:])
    ax_hist.axvline(goal, c='red', label='goal')
    ax_hist.set_xlabel('Scores per Last 50 Episodes')
    ax_hist.set_ylabel('Frequency')
    ax_hist.legend()
    plt.show()

In [14]:
def q_learning(model, episodes = 100, gamma=0.7, epsilon=0.3, title = 'DQN'):
    """Train ``model`` with online Q-learning against the game client.

    For every step the agent picks an action epsilon-greedily, sends it to the
    client, receives the next state and reward, and performs one network
    update towards the one-step target.

    Args:
        model: Object exposing ``predict(state)`` and ``update(state, targets)``.
        episodes: Number of episodes to play (also announced to the client).
        gamma: Discount applied to the best Q-value of the next state.
        epsilon: Probability of picking a random action instead of the greedy one.
        title: Unused here; only referenced by the disabled plotting call.

    Returns:
        Mean total reward per episode, or ``0.0`` when no episode was played.
    """
    game = Game(numEpisodes = episodes)
    final = []
    try:
        game.connect()
        for episode_i in range(1, episodes + 1):
            # The client opens each episode by sending the initial state.
            state, _ = game.get_state()
            total = 0

            while True:
                # Always evaluate the current state first: the targets below need
                # these Q-values even when the action is chosen at random.
                q_values = model.predict(state)
                if random.random() < epsilon:
                    action = random.randrange(len(q_values))
                else:
                    action = torch.argmax(q_values).item()

                game.send_action(action)
                next_state, reward = game.get_state()
                total += reward

                # Only the entry of the action taken is changed in the target.
                targets = q_values.tolist()
                if next_state is None:
                    # Terminal transition: there is no future value to bootstrap from.
                    targets[action] = reward
                    model.update(state, targets)
                    break

                q_values_next = model.predict(next_state)
                targets[action] = reward + gamma * torch.max(q_values_next).item()
                model.update(state, targets)
                state = next_state

            final.append(total)
            #plot_res(final, title)
            print("episode: {}, total reward: {}".format(episode_i, total))

            # Checkpoint after every episode.
            torch.save(model, "model.mdl")
    finally:
        # Release the sockets so the cell can be re-run without a bind failure.
        conn = getattr(game, "conn", None)
        if conn is not None:
            conn.close()
        game.sock.close()
    return sum(final)/len(final) if final else 0.0

In [10]:
def q_learning_old(model, episodes = 100, gamma=0.7, epsilon=0.3, title = 'DQN'):
    """Earlier variant of the training loop; returns the per-episode totals.

    The previous body called ``game.step`` (which ``Game`` does not define) and
    passed the ``(state, reward)`` tuple returned by ``game.get_state()``
    straight to the model. It now uses ``send_action`` / ``get_state`` and
    treats a ``None`` state as the end of the episode.

    Args:
        model: Object exposing ``predict(state)`` and ``update(state, targets)``.
        episodes: Number of episodes to play (also announced to the client).
        gamma: Discount applied to the best Q-value of the next state.
        epsilon: Probability of picking a random action instead of the greedy one.
        title: Unused here; only referenced by the disabled plotting call.

    Returns:
        List with the total reward of each episode.
    """
    game = Game(numEpisodes = episodes)
    final = []
    try:
        game.connect()
        for episode_i in range(1, episodes + 1):
            # The client opens each episode by sending the initial state.
            state, _ = game.get_state()
            total = 0
            done = False

            while not done:
                # Q-values of the current state, one per action.
                q_values = model.predict(state)
                if random.random() < epsilon:
                    action = random.randint(0,3)
                else:
                    action = torch.argmax(q_values).item()

                game.send_action(action)
                next_state, reward = game.get_state()
                done = next_state is None
                total += reward

                # Only the entry of the action taken is changed in the target.
                targets = q_values.tolist()
                if done:
                    targets[action] = reward
                else:
                    q_values_next = model.predict(next_state)
                    targets[action] = reward + gamma * torch.max(q_values_next).item()
                model.update(state, targets)
                state = next_state

            final.append(total)
            #plot_res(final, title)
            print("episode: {}, total reward: {}".format(episode_i, total))
    finally:
        # Release the sockets so the cell can be re-run without a bind failure.
        conn = getattr(game, "conn", None)
        if conn is not None:
            conn.close()
        game.sock.close()
    return final
    
    

In [15]:
# Train for 100 episodes with the default gamma and epsilon; the value displayed
# by the cell is the mean total reward per episode returned by q_learning.
q_learning(model, 100)

episode: 1, total reward: 2850
episode: 2, total reward: 1370
episode: 3, total reward: 1750
episode: 4, total reward: 1140
episode: 5, total reward: 2390
episode: 6, total reward: 1780
episode: 7, total reward: 2590
episode: 8, total reward: 2050
episode: 9, total reward: 2020
episode: 10, total reward: 1940
episode: 11, total reward: 1180
episode: 12, total reward: 750
episode: 13, total reward: 1520
episode: 14, total reward: 2350
episode: 15, total reward: 2810
episode: 16, total reward: 1740
episode: 17, total reward: 1530
episode: 18, total reward: 1530
episode: 19, total reward: 1760
episode: 20, total reward: 1070
episode: 21, total reward: 1430
episode: 22, total reward: 2210
episode: 23, total reward: 1670
episode: 24, total reward: 1720
episode: 25, total reward: 1670
episode: 26, total reward: 1280
episode: 27, total reward: 3830
episode: 28, total reward: 1910
episode: 29, total reward: 1810
episode: 30, total reward: 1460
episode: 31, total reward: 2240
episode: 32, total

1801.7