<a href="https://colab.research.google.com/github/aashmauprety/Final_UAV/blob/master/MyEnv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from PIL import Image
from google.colab.patches import cv2_imshow
import cv2
import matplotlib.pyplot as plt
import pickle
from matplotlib import style
import time

style.use("ggplot")


In [0]:
SIZE = 10
HM_EPISODES = 25000
COLLISION_PENALTY = -1
SUCCESS_REWARD = 1
MOVE_PENALTY = 0
epsilon = 0.9
EPS_DECAY = 0.9998  # Every episode will be epsilon*EPS_DECAY
SHOW_EVERY = 3000  # how often to play through env visually.

start_q_table = None # None or Filename

LEARNING_RATE = 0.1
DISCOUNT = 0.95
PLAYER_1 = 1
PLAYER_2 = 2
STATION_1 = 3
d = {1: (255, 175, 0),
     2: (0, 255, 0),
     3: (0, 0, 255)}


In [0]:
class Drone:
    def __init__(self):
        self.x = np.random.randint(0, SIZE)
        self.y = np.random.randint(0, SIZE)

    def __str__(self):
        return f"{self.x}, {self.y}"

    def __sub__(self, other):
        return (self.x-other.x, self.y-other.y)
    
    def action(self, choice):
        '''
        Gives us 4 total movement options. (0,1,2,3)
        '''
        if choice == 0:
            self.move(x=1, y=1)
        elif choice == 1:
            self.move(x=0, y=0)
        # elif choice == 2:
        #     self.move(x=-1, y=1)
        # elif choice == 3:
        #     self.move(x=1, y=-1)

    def move(self, x=False, y=False):

        # If no value for x, move randomly
        if not x:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        # If no value for y, move randomly
        if not y:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y


        # If we are out of bounds, fix!
        if self.x < 0:
            self.x = 0
        elif self.x > SIZE-1:
            self.x = SIZE-1
        if self.y < 0:
            self.y = 0
        elif self.y > SIZE-1:
            self.y = SIZE-1
      

In [12]:
player1 = Drone()
player2 = Drone()
station = Drone()
print(player1)
print(player2)
print(player1 - player2)
player1.move(1,2)
print(player1)
player1.action(1)
print(player1)

5, 4
6, 2
(-1, 2)
6, 6
5, 7


In [0]:
if start_q_table is None:
    # initialize the q-table#
    q_table = {}
    for i in range(-SIZE+1, SIZE):
        for ii in range(-SIZE+1, SIZE):
            for iii in range(-SIZE+1, SIZE):
                    for iiii in range(-SIZE+1, SIZE):
                        q_table[((i, ii), (iii, iiii))] = [np.random.uniform(-5, 0) for i in range(2)]
else:
    with open(start_q_table, "rb") as f:
        q_table = pickle.load(f)                       

In [10]:
print(q_table[((-9, -2), (3, 9))])


[-1.8779468650912596, -1.6826995265269922]


In [18]:
episode_rewards = []

for episode in range(HM_EPISODES):
    player1 = Drone()
    player2 = Drone()
    station = Drone()
    if episode % SHOW_EVERY == 0:
        print(f"on #{episode}, epsilon is {epsilon}")
        print(f"{SHOW_EVERY} ep mean: {np.mean(episode_rewards[-SHOW_EVERY:])}")
        show = True
    else:
        show = False

    episode_reward = 0
    for i in range(200):
      obs = (player1-player2, player1-station)
      #print(obs)
      if np.random.random() > epsilon:
          # GET THE ACTION
          action = np.argmax(q_table[obs])
      else:
          action = np.random.randint(0, 2)
      # Take the action!
      player1.action(action) 
       #### MAYBE ###
      player2.move()
        #food.move()
        ##############

      if player1.x == station.x and player1.y == station.y and player2.x == station.x and player2.y == station.y:
          reward = -COLLISION_PENALTY
      elif player1.x == station.x and player1.y == station.y or player2.x == station.x and player2.y == station.y:
          reward = SUCCESS_REWARD
      else:
          reward = MOVE_PENALTY
      ## NOW WE KNOW THE REWARD, LET'S CALC YO
      # first we need to obs immediately after the move.
      new_obs = (player1-player2, player1-station)
      max_future_q = np.max(q_table[new_obs])
      current_q = q_table[obs][action]

      if reward == SUCCESS_REWARD:
          new_q = SUCCESS_REWARD
      else:
          new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
      q_table[obs][action] = new_q

      if show:
          env = np.zeros((SIZE, SIZE, 3), dtype=np.uint8)  # starts an rbg of our size
          env[player1.x][player1.y] = d[PLAYER_1]  # sets the food location tile to green color
          env[player2.x][player2.y] = d[PLAYER_2]  # sets the player tile to blue
          env[station.x][station.y] = d[STATION_1]  # sets the enemy location to red
          img = Image.fromarray(env, 'RGB')  # reading to rgb. Apparently. Even tho color definitions are bgr. ???
          img = img.resize((300, 300))  # resizing so we can see our agent in all its glory.
          cv2.imshow("image", np.array(img))  # show it!
          if reward == SUCCESS_REWARD or reward == -COLLISION_PENALTY:  # crummy code to hang at the end if we reach abrupt end for good reasons or not.
              if cv2.waitKey(500) & 0xFF == ord('q'):
                  break
          else:
              if cv2.waitKey(1) & 0xFF == ord('q'):
                  break

      episode_reward += reward
      if reward == FOOD_REWARD or reward == -ENEMY_PENALTY:
          break

    #print(episode_reward)
    episode_rewards.append(episode_reward)
    epsilon *= EPS_DECAY

moving_avg = np.convolve(episode_rewards, np.ones((SHOW_EVERY,))/SHOW_EVERY, mode='valid')

plt.plot([i for i in range(len(moving_avg))], moving_avg)
plt.ylabel(f"Reward {SHOW_EVERY}ma")
plt.xlabel("episode #")
plt.show()

with open(f"qtable-{int(time.time())}.pickle", "wb") as f:
    pickle.dump(q_table, f)      

on #0, epsilon is 0.9
3000 ep mean: nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


DisabledFunctionError: ignored