# Environment

In [1]:
import numpy as np
import copy
from gym import Env
import datetime

class FrozenLake(Env):
    def __init__(self,studentNum:int=256, nonStationary = False):
        self.studentNum = studentNum
        self.nonStationary = nonStationary
        
        np.random.seed(self.studentNum)
        self.beginMap = make_map(self.studentNum) #*2
        self.beginMap[self.beginMap>1] = 1
        self.endMap = make_map(self.studentNum + 100)
        
        self.changeDir = self.endMap - self.beginMap
        self.changeDir *= 1/11000

        self.fixedMap = self.beginMap

        np.random.seed(datetime.datetime.now().microsecond)
        
        self.map = copy.deepcopy(self.fixedMap)
        self.time = 0
        self.reset()

    def reset(self):
        self.NSreset()
        if not self.nonStationary:
            self.map = copy.deepcopy(self.fixedMap)
            self.time = 0

        return self.state

    def NSreset(self):
        self.time += 1
        self.map += self.changeDir

        self.map[self.map>0.95]=0.95
        self.map[self.map<0.0]=0.0

        self.state = (0,0)
        self.done = False
        return self.state
    
    def states_transitions(self, state, action):
        x = state[0]
        y = state[1]
        states = np.array([[x,y-1], [x,y+1], [x-1 ,y], [x+1,y] ])


        if action == UP:
            selected = states[2]
        if action == DOWN:
            selected = states[3]
        if action == RIGHT:
            selected = states[1]
        if action == LEFT:
            selected = states[0]

        zero = np.zeros((4,2)).astype(int)
        three = (3 * np.ones((4,2))).astype(int)
        output = np.maximum(np.minimum(states, three),zero)
        output, indices = np.unique(output, axis = 0, return_counts= True)

        
        selected = np.maximum(np.minimum(selected, three[0]), zero[0])
        probs = indices * 0.025
        probs[np.argmax(np.sum(selected == output, axis = 1))] += 0.9

        return list(zip(output[:,0],output[:,1])), probs
    
    def possible_consequences(self,action:int,state_now=None):

        if state_now==None:
            state_now = self.state

        state = [state_now[0],state_now[1]]
        states, probs = self.states_transitions(state, action)
        aa = np.array(states) 
        fail_probs = self.map[(aa[:,0]),(aa[:,1])]
        dones = np.sum(aa == 3, axis = 1) == 2
        return states, probs, fail_probs,dones
    
    def step(self, a:int):
        if not (a in range(4)):
            raise Exception("action is not available!!!")
        
        states, probs, fail_probs,dones = self.possible_consequences(a)
        
        next_idx = np.random.choice(np.arange(len(states)), p = probs)
        next_state = states[next_idx]
        self.state = tuple(next_state)
        
        self.done = dones[next_idx]

        r = -1

        if self.done:
            r += 60
        elif np.random.rand()< fail_probs[next_idx]:
            r -= 15
            self.done = True

        return (self.state, r, self.done, {})

    def render(self,state=None):
        if state == None:
            state = self.state

        out = ""
        for i in range(4):
            out += "\n------------------------------\n| "
            for j in range(4):
                if (i,j) == state:
                    out += "\033[44m{:.3f}\033[0m | ".format(self.map[i,j])
                else :
                    out += "{:.3f} | ".format(self.map[i,j])

        out += "\n------------------------------"
        print(out)

    def environment_states(self):
        env_states = []
        for state_index in range(16):
            s0 = state_index % 4
            s1 = state_index//4
            env_states.append((s0,s1))
        return env_states

        
def set_max_min(var,maximum,minimum):
    return min(max(var,minimum),maximum)

def make_map(studentNum):
    np.random.seed(studentNum)  
    move = np.zeros(6)
    idx = np.random.choice(range(6),size=3,replace=False)
    move[idx] = 1

    point = [0,0]
    lowprobs = [tuple(point)]

    for m in move:
        if m:
            point[0] += 1
        else:
            point[1] += 1
        lowprobs.append(tuple(point))
    
    map = np.random.rand(4,4)
    idx = np.array(lowprobs)

    map[idx[:,0],idx[:,1]] = 0.001 
    map[0,0] = 0.0
    map[3,3] = 0.0 

    return map

## Your Student ID

In [2]:
STUDENT_NUM = 400722102

# HyperParameters

In [3]:
#%% allowed actions
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

ACTIONS = [LEFT,DOWN,RIGHT,UP]

#%% hyperparameters
EPISODES = 10000
EPSILON = 0.1
LEARNING_RATE = 0.1
DISCOUNT = 0.9

## Map of environment

In [4]:
environment = FrozenLake(studentNum=STUDENT_NUM)

print("Environment with fail probabilities :")
environment.render()


Environment with fail probabilities :

------------------------------
| [44m0.000[0m | 0.001 | 0.086 | 0.517 | 
------------------------------
| 0.412 | 0.001 | 0.563 | 0.435 | 
------------------------------
| 0.134 | 0.001 | 0.001 | 0.001 | 
------------------------------
| 0.005 | 0.229 | 0.687 | 0.000 | 
------------------------------


## <h2><font color=indigo> Agent Implementation
Implement your q-learning (off-policy TD) agent here. You need to utilize the step function provided in the Environment class to interact with frozen lake environment.

In [5]:
import sys
import itertools
import random

class Q_Learning:
    def __init__(self, id, environment, discount , learning_rate = 0.1 , epsilon = 0.1 ,episodes=10000):

        self.environment = environment
        self.discount = discount
        self.episodes = episodes
        self.id = id
        self.learning_rate = learning_rate
        self.environment = environment
        self.epsilon = epsilon
        self.action_size = 4
        self.state_size = 16
        self.map_size = 16
        self.map_y = 4


        # Create our Q table with state_size rows and action_size columns (16x4)
        self.qtable = np.zeros((self.state_size, self.action_size))

          # convert states into 0 to max_size number (15 in this example)
    def State_Number(self, state):
        return self.map_y*state[0] + state[1]

    def Witch_action(self , state ):
      cstate = self.State_Number(state)
      #Choose an action a in the current world state (s)
      ## First we randomize a number
      exp_exp_tradeoff = random.uniform(0, 1)
      ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
      if exp_exp_tradeoff > self.epsilon:
            action = np.argmax(self.qtable[cstate,:])
            #print(exp_exp_tradeoff, "action", action)
      # Else doing a random choice --> exploration
      else:
            action =  action = np.random.choice(self.action_size)
      return action 



      
    # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
    # qtable[new_state,:] : all the actions we can take from new state
    def update(self ,action, state, newstate, reward ):
      cstate = self.State_Number(state)
      new_state = self.State_Number(newstate)
      max_Q_next_state = self.qtable[new_state][np.argmax(self.qtable[new_state])]
      new_Q = self.qtable[cstate][action] + self.learning_rate * (reward + self.discount *max_Q_next_state - self.qtable[cstate][action])

      self.qtable[cstate][action] = new_Q


    # train the agent and return rewards for the given number of episodes
    def train(self):
      # List of rewards
      rewards = np.zeros(self.episodes)
      # 2 For life or until learning is stopped
      for episode in range(self.episodes):
      # Reset the environment
            state = self.environment.reset()
            step = 0
            done = False
            total_rewards = 0
            current_state = self.environment.reset()

            while (done == False) :

                bestAction = self.Witch_action(current_state)
                next_state, reward, done, _ = environment.step(bestAction)
                self.update(bestAction, current_state, next_state, reward)
                total_rewards += reward
                current_state = next_state         

                if done:
                    break
            rewards[episode] = total_rewards
            
      return rewards

        # print and return policy (greedy policy based on Q values)    
    def print(self):
        actions = ["LEFT","DOWN","RIGHT","UP"]
        policy = []
        policy_string = []
        
        for i in range(len(self.qtable)-1):
            best_Q = np.argmax(self.qtable[i])
            policy.append(best_Q)
            action = actions[best_Q]
            policy_string.append(action)
            

            print( i , "Best Action: ", action)
            
        return policy, policy_string

## <h2><font color=indigo> Q Values
Return the Q values that your agent learns in here:

In [6]:
agent = Q_Learning('aminfathi', environment, 0.9 , learning_rate = 0.5 , epsilon = 0.1 ,episodes= 10000)
train = agent.train()


In [7]:
q= agent.qtable
print(q)

[[19.55073099 21.25117862 29.70987633 20.27781071]
 [22.55640762 34.27277799 18.37572447 24.33366776]
 [24.1196412  12.85833291 -3.41919757  9.57525493]
 [-3.94785194 -0.5        -0.5        -0.5       ]
 [12.43778105 23.52044744 27.48682397 16.95707254]
 [20.88703845 39.79472378 21.19381667 27.06192036]
 [22.09683577 23.67546369 35.13202156 14.71707914]
 [15.14200032 48.48290638 22.5715067  -3.74485665]
 [26.57833206 26.63068419 33.60781843 17.49265789]
 [31.52336654 31.96802675 45.83798053 30.63466927]
 [36.29993423 35.20499579 52.0920756  25.7943259 ]
 [45.58103957 58.99938453 46.14096907 41.96521272]
 [ 7.52030262  5.97761992 32.63835064 17.65312106]
 [24.21896353 31.82155726 36.91914617 29.06706686]
 [32.67412772 35.12168548 58.98029954 39.03929948]
 [ 0.          0.          0.          0.        ]]


In [8]:
environment.render()


------------------------------
| 0.000 | 0.001 | 0.086 | 0.517 | 
------------------------------
| 0.412 | 0.001 | 0.563 | 0.435 | 
------------------------------
| 0.134 | 0.001 | 0.001 | 0.001 | 
------------------------------
| 0.005 | 0.229 | 0.687 | [44m0.000[0m | 
------------------------------


## <h2><font color=darkcyan> Policy
Return the optimal policy that your agent learns in here:

In [9]:
policy, policy_string = agent.print()

0 Best Action:  RIGHT
1 Best Action:  DOWN
2 Best Action:  LEFT
3 Best Action:  DOWN
4 Best Action:  RIGHT
5 Best Action:  DOWN
6 Best Action:  RIGHT
7 Best Action:  DOWN
8 Best Action:  RIGHT
9 Best Action:  RIGHT
10 Best Action:  RIGHT
11 Best Action:  DOWN
12 Best Action:  RIGHT
13 Best Action:  RIGHT
14 Best Action:  RIGHT
