In [1]:
import gym
import copy
from line_profiler import LineProfiler
from option_critic.utils import *
from matplotlib import pyplot as plt
from option_critic.fourrooms import FourRooms

from IPython.display import clear_output

In [2]:
#[env.id for env in gym.envs.registry.all() if env.id.startswith('Taxi')]
import Taxi_v0, Taxi_v1, Taxi_v4
import Taxi_v00, Taxi_v01, Taxi_v03, Taxi_v04
import Taxi_v000,Taxi_v001,Taxi_v003,Taxi_v004
env = gym.make("Taxi-v3")
env.render()
"""
Here's our restructured problem statement (from Gym docs):

"There are 4 locations (labeled by different letters), and our job is to pick up the passenger 
at one location and drop him off at another. We receive +20 points for a successful drop-off and 
lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up 
and drop-off actions."

- The filled square represents the taxi, which is yellow without a passenger and green with a passenger.
- The pipe ("|") represents a wall which the taxi cannot cross.
- R, G, Y, B are the possible pickup and destination locations. The blue letter represents the current passenger
  pick-up location, and the purple letter is the current destination.
"""

+---------+
|R: | : :[35mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



'\nHere\'s our restructured problem statement (from Gym docs):\n\n"There are 4 locations (labeled by different letters), and our job is to pick up the passenger \nat one location and drop him off at another. We receive +20 points for a successful drop-off and \nlose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up \nand drop-off actions."\n\n- The filled square represents the taxi, which is yellow without a passenger and green with a passenger.\n- The pipe ("|") represents a wall which the taxi cannot cross.\n- R, G, Y, B are the possible pickup and destination locations. The blue letter represents the current passenger\n  pick-up location, and the purple letter is the current destination.\n'

In [3]:
%matplotlib widget
goals = [(3,2), (3,9), (9,2), (9,9)]
four_room_envs =[None] * 4
for i in range(len(goals)):
    four_room_envs[i] = FourRooms()
    four_room_envs[i].reset()
    four_room_envs[i].goal =  four_room_envs[i].tostate[goals[i]]
    clear_output(True)
    fig = plt.subplot(2,2,i+1)
    plt.imshow(four_room_envs[i].render(show_goal=True), cmap='Blues')
    plt.axis('off')
    plt.show()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [4]:
env = gym.make("Taxi-v003")
dir(env)
print(dir(env))
hasattr(env, 's')

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_elapsed_steps', '_max_episode_steps', 'action_space', 'class_name', 'close', 'compute_reward', 'env', 'metadata', 'observation_space', 'render', 'reset', 'reward_range', 'seed', 'spec', 'step', 'unwrapped']


True

## Definition

### 0. utils

In [5]:
def arg_max(state_action):
    max_index_list = []
    max_value = state_action[0]
    for index, value in enumerate(state_action):
        if value > max_value:
            max_index_list.clear()
            max_value = value
            max_index_list.append(index)
        elif value == max_value:
            max_index_list.append(index)
    return random.choice(max_index_list)

In [6]:
def count_zero_state(q_table):
    zero_state = 0
    for arr_2d in q_table:
         zero_state += np.sum(np.sum(arr_2d,axis = 1) == 0)
    return zero_state

In [7]:
gym.make("Taxi-v004").render()

+---------+
|R:[43m [0m| :G: |
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [8]:
gym.make("Taxi-v003").render()

+---------+
|[35m[43mR[0m[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [9]:
gym.make("Taxi-v1").render()

+---------+
| : | : : |
| :[35mR[0m| :[34;1mG[0m: |
| : : : : |
| | : | :[43m [0m|
| |Y: | :B|
+---------+



In [10]:
gym.make("Taxi-v4").render()

+---------+
|R: | :[35mG[0m: |
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [11]:
EPISODE_NUM = 4000
MAX_EPISODE_LEN = 200
REPEAT_TIMES = 5  # train agent REAPEAT_TIMES to get averaged learning curves
EVALUATION_TIMES = 1 # evaluate target policy EVALUATION_TIMES after x updates in off-policy RL algorithms
CLIST = ['b','c', 'g', 'k','m', 'r', 'y', 'w'] #colors to plot learning curve


In [12]:
state = env.reset()
env.render()
print("Action space = {}".format(env.action_space))
print("State space = {}".format(env.observation_space))


+---------+
|R: | : :[35mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

Action space = Discrete(6)
State space = Discrete(500)


In [13]:

env.s = state
env.render()
env = gym.make("Taxi-v3")

+---------+
|R: | : :[35mG[0m|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [14]:

state = env.encode(2, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.env.s = state # ( ( (taxi row*5)+taxi col) *5 + pass_index *4) + dest_index)
env.render()

State: 228
+---------+
|[35mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [15]:
env.P[328] # {action: [(probability, nextstate, reward, done)]}

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [16]:
import numpy as np


import random
from IPython.display import clear_output
from time import sleep

def show_frames(env ,i, episode_rewards ):
    clear_output(wait=True)
    env.render()
    print(f"step {i} rewards={episode_rewards}")
    sleep(.1)




In [17]:
def smooth(y, radius):
    '''
    smooth data y by averaging the values in each window [max{index-radius,0}, min{(index+radius), len(y)-1}] 
    
    '''
    if (len(y) < 2 * radius + 1):
        return np.mean(y) * np.ones_like(y)
    else:
        convkernel = np.ones(2 * radius + 1)
        out = np.convolve(y, convkernel, mode = 'same') / np.convolve(np.ones_like(y), convkernel, mode = 'same')
        return out

In [18]:
def policy_evaluate(env, policy, times = 10):
    # store data, make average and return
    all_length, all_penalties, all_rewards = np.zeros(times), np.zeros(times), np.zeros(times)

    for i in range(times):

        state = env.reset()
        done = False
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        
        while not done:
            action = arg_max(policy[state])
            next_state, reward, done, info = env.step(action)
            state = next_state
            #update data for learning curve
            if reward == -10:
                episode_penalties +=1

            episode_rewards += reward
            episode_length += 1

        all_rewards[i] = episode_rewards
        all_penalties[i] = episode_penalties
        all_length[i] = episode_length
    return np.mean(all_rewards), np.mean(all_penalties), np.mean(all_length)

### 1. q-learning

In [19]:
def q_learning(env, train_episodes,init_q_table=None):
    """
    Training the agent
    Q(state,action)←(1−α)Q(state,action)+α(reward+γmaxaQ(next state,all actions))
    """  
    
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning

    timesteps =  train_episodes * MAX_EPISODE_LEN

    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    
    if init_q_table is None:
        q_table = np.zeros([env.observation_space.n, env.action_space.n])
    else:
        q_table = copy.deepcopy(init_q_table)
    
    env_copy = copy.deepcopy(env) # for policy_evaluate
    i = 0
    while i < timesteps:
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False

        while not done and i < timesteps:
            # epsilon greedy alg balancing exporation and exploitation
            if random.uniform(0,1)< epsilon:
                action = env.action_space.sample()
            else:
                action = arg_max(q_table[state])

            # step to next state
            next_state, reward, done, info = env.step(action)       
            
            # update q-value
            q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
            q_table[state, action] = q_value

            state = next_state
            if i%200 == 0:
                #evaluate policy for learning curve after each episode
                episode_rewards, episode_penalties, episode_length = policy_evaluate(env_copy, q_table, EVALUATION_TIMES)
                
                all_episodes_length.append(episode_length)
                all_penalties.append(episode_penalties)
                all_rewards.append(episode_rewards)
                
                #show training progress
                clear_output(wait=True)
                print(f"timesteps: {i}")
            i += 1
         
    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table

### 2. SARSA

In [20]:
def SARSA(env,  train_episodes,init_s_table = None):
    """Training the agent
    Q(state,action)←(1−α)Q(state,action)+α(reward+γQ(next state,next action))

    """  
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning

    train_episodes =  train_episodes 

    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    
    
    if init_s_table is None:
        s_table = np.zeros([env.observation_space.n, env.action_space.n])
    else:
        s_table = copy.deepcopy(init_s_table)
    env_copy = copy.deepcopy(env)
    
    for i in range(train_episodes):
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False

        # epsilon greedy alg balancing exporation and exploitation
        if random.uniform(0,1)< epsilon:
            action = env.action_space.sample()
        else:
            action = arg_max(s_table[state])

        while not done:

            # step to next state
            next_state, reward, done, info = env.step(action)

            # choose next action
            if random.uniform(0,1)< epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = arg_max(s_table[next_state])

            # update q-value
            q_value = (1-alpha) * s_table[state, action] + alpha * (reward + gamma * s_table[next_state,next_action])
            s_table[state, action] = q_value

            state = next_state
            action = next_action

        #evaluate policy for learning curve after each episode
        episode_rewards, episode_penalties, episode_length = policy_evaluate(env_copy, q_table, EVALUATION_TIMES)

        #record data for learning curve
        all_episodes_length.append(episode_length)
        all_penalties.append(episode_penalties)
        all_rewards.append(episode_rewards)

        #show training progress
        if (i+1) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {i}")
    print("Training finished \n")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, s_table

### 3.PRQL 

In [21]:
def prql(env, train_episodes, past_policy):
        
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    fi = 1 # the intial probability to reuse past policy in policy-reuse algo, decays after each step in one episode
    mu = 0.95 # the decaying rate of fi
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []

    q_table = np.zeros([env.observation_space.n, env.action_space.n])
 
    past_table = copy.deepcopy(past_policy)
    env_copy = copy.deepcopy(env)
    
    for i in range(train_episodes):
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        f = fi
        while not done:
            # epsilon greedy alg balancing exporation and exploitation
            if random.uniform(0,1) < f:
                action = arg_max(past_table[state])
                #print(f"f = {f} in episode {i} \n")
            elif random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
                #print("random action \n")
                
            else:
                action = arg_max(q_table[state])
                #print("on the policy now \n")

            # step to next state
            next_state, reward, done, info = env.step(action)

            # update q-value
            q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
            q_table[state, action] = q_value

            state = next_state
            f = f*mu
            
        #evaluate policy for learning curve after each episode
        episode_rewards, episode_penalties, episode_length = policy_evaluate(env_copy, q_table, EVALUATION_TIMES)

        #record data for learning curve
        all_episodes_length.append(episode_length)
        all_penalties.append(episode_penalties)
        all_rewards.append(episode_rewards)

        #show training progress
        if (i+1) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {i}")
    print("Training finished \n")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table    


### 4. OPS-TL

In [22]:
def OPS_TL(env,train_episodes,past_policies):
    """
    Training the agent using algorithm in paper "An Optimal Online Method of Selecting 
    Source Policies for Reinforcement Learning"
    """
    # Hyper parameters
    alpha = 0.05 #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95 # (gamma) is the discount factor (0≤γ≤1)
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    fi = 0.95 # the intial probability to reuse past policy in policy-reuse algo, decays after each step in one episode
    c = 0.0049 # the factor in UCB-tuned deciding the rate of exploration, a lager c will lead to a higher exploration rate
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
   
    #win = {'reuse': [], 'new': []}
    timesteps = train_episodes * MAX_EPISODE_LEN
   
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    env_copy = copy.deepcopy(env)
    
    # pt = 1-k/(k+1500)
    
    #initialize the expected reward of every bandit/past_policy
    policy_expect = np.zeros([len(past_policies)])
    policy_times = np.zeros([len(past_policies)])
    for j in range(len(past_policies)):
        t = 0
        env.reset()
        _,policy_expect[j],_,_= policy_reuse(timesteps, env,q_table, env.reset(), t,[],[],[],past_policies[j])
        policy_times[j] += 1
        
    # training strat  
    i = 0
    while i < timesteps:
        state = env.reset()
        
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        
        k = i // MAX_EPISODE_LEN  + 1
        pt = 1-k/(k+1500)
        
        # use UCB1 to solve MAB(multi armed bandit problem
        if random.uniform(0,1) < pt:
            j = arg_max(policy_expect + np.sqrt(c * np.log(np.sum(policy_times)) / policy_times )) # UCB1-tuned
            episode_penalties, episode_rewards, episode_length,i = policy_reuse(timesteps, env, q_table, state, i,
                                                                              all_episodes_length,all_penalties,all_rewards,
                                                                              past_policy = past_policies[j])
            policy_expect[j] = (policy_expect[j]*policy_times[j] + episode_rewards)/( policy_times[j] +1)
            policy_times[j] += 1
#            if episode_length < 200:
#                win['reuse'].append(k)
        # follow episilon-greedy strategy    
        else:
            episode_penalties, episode_rewards, episode_length,i = epsilon_greedy(timesteps, env, q_table, state, i , 
                                                                                all_episodes_length,all_penalties,all_rewards,
                                                                                epsilon, alpha, gamma)
#            if episode_length < 200:
#                win['new'].append(k)        


        #show training progress
        if (k) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {k}")
    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table    
            
            
        

In [23]:
def OPS_Norm(env,train_episodes,past_policies, c = 0.0049):
    """
    Training the agent using algorithm in paper "An Optimal Online Method of Selecting 
    Source Policies for Reinforcement Learning"
    """
    # Hyper parameters
    alpha = 0.05 #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95 # (gamma) is the discount factor (0≤γ≤1)
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    fi = 0.95 # the intial probability to reuse past policy in policy-reuse algo, decays after each step in one episode
    c = c # the factor in UCB-tuned deciding the rate of exploration, a lager c will lead to a higher exploration rate
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    all_frequency = np.zeros([len(past_policies)+1, train_episodes])
#   win = {'reuse': [], 'new': []}
    timesteps = train_episodes * MAX_EPISODE_LEN
   
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    env_copy = copy.deepcopy(env)
    
    # pt = 1-k/(k+1500)
    
    #initialize the expected reward of every bandit/past_policy
    frequency = np.zeros([len(past_policies)+1, train_episodes])
    policy_expect = np.zeros([len(past_policies)])
    policy_times = np.zeros([len(past_policies)])
    for j in range(len(past_policies)):
        t = 0
        env.reset()
        for _ in range(10):
            _,temp,_,_= policy_reuse(timesteps, env,q_table, env.reset(), t,[],[],[],past_policies[j])
            policy_expect[j] += temp
        policy_expect[j] /= 10
        policy_times[j] += 1
        
    # training strat  
    i = 0
    while i < timesteps:
        state = env.reset()
        
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        
        k = i // MAX_EPISODE_LEN  + 1
        pt = 1-k/(k+1500)
        
        # use UCB1 to solve MAB(multi armed bandit problem
        if random.uniform(0,1) < pt:
            j = arg_max(policy_expect/20 + np.sqrt(c * np.log(np.sum(policy_times)) / policy_times )) # UCB1-tuned
            episode_penalties, episode_rewards, episode_length,i = policy_reuse(timesteps, env, q_table, state, i,
                                                                              all_episodes_length,all_penalties,all_rewards,
                                                                              past_policy = past_policies[j])
            policy_expect[j] = (policy_expect[j]*policy_times[j] + episode_rewards)/( policy_times[j] +1)
            policy_times[j] += 1
            frequency[j][k-1] += 1
#            if episode_length < 200:
#                win['reuse'].append(k)
        # follow episilon-greedy strategy    
        else:
            episode_penalties, episode_rewards, episode_length,i = epsilon_greedy(timesteps, env, q_table, state, i , 
                                                                                all_episodes_length,all_penalties,all_rewards,
                                                                                epsilon, alpha, gamma)
#            if episode_length < 200:
#                win['new'].append(k)        
            frequency[len(past_policies)][k-1] += 1

        #show training progress
        if (k) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {k}")
    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    for line in range(train_episodes):
        for row in range(len(past_policies)+1):
            all_frequency[row][line]=np.sum(frequency[row][:line+1], keepdims= True) /np.sum(np.sum(frequency[:,:line+1]))
    return all_episodes_length, all_penalties, all_rewards, all_frequency, q_table    

In [24]:
frequency =np.array([[1., 1., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.]])
all_frequency = np.zeros([4,3])

for line in range(3):
    for row in range(4):
        all_frequency[row][line]=  np.sum(frequency[row][:line+1]) /np.sum(np.sum(frequency[:,:line+1]))

    #all_frequency[:][line]=frequency[:][line]/ np.sum(frequency[:][:line],axis = 1, keepdims= True)
all_frequency

array([[1.        , 0.66666667, 0.6       ],
       [0.        , 0.33333333, 0.2       ],
       [0.        , 0.        , 0.2       ],
       [0.        , 0.        , 0.        ]])

In [25]:
a=np.arange(12)
print(a)
#结果：[ 0  1  2  3  4  5  6  7  8  9 10 11]

#reshape对一维数组进行修改形状 (4,3)修改为4行3列
a=np.ones([4,3])
a[:,:2]


[ 0  1  2  3  4  5  6  7  8  9 10 11]


array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [26]:
def epsilon_greedy(timesteps, env, q_table, initial_state, i,
                   all_episodes_length,all_penalties,all_rewards,epsilon = 0.1, alpha = 0.05,  gamma = 0.95 ):
    done = False
    state = initial_state
    episode_penalties, episode_rewards, episode_length = 0, 0, 0
    _env_copy = copy.deepcopy(env)
    while not done and i < timesteps:
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            action = arg_max(q_table[state])
            
        # step to next state
        next_state, reward, done, info = env.step(action)

        # update q-value
        q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
        q_table[state, action] = q_value

        state = next_state
      
        #update data for learning curve
        if reward == -10:
            episode_penalties +=1

        episode_rewards += reward
        episode_length += 1
        
        if i%200 == 0:
            #evaluate policy for learning curve after each episode
            _episode_rewards, _episode_penalties, _episode_length = policy_evaluate(_env_copy, q_table, EVALUATION_TIMES)

            all_episodes_length.append(_episode_length)
            all_penalties.append(_episode_penalties)
            all_rewards.append(_episode_rewards)

            #show training progress
            clear_output(wait=True)
            print(f"timesteps: {i}")
        i += 1

    return episode_penalties, episode_rewards, episode_length,i

In [27]:
def policy_reuse(timesteps,env, q_table, initial_state, i, 
                 all_episodes_length,all_penalties,all_rewards, past_policy,  fi = 0.95, mu=0.95, alpha = 0.05,  gamma = 0.95 ):
    done = False
    state = initial_state
    episode_penalties, episode_rewards, episode_length = 0, 0, 0

    _env_copy = copy.deepcopy(env)
    while not done and i < timesteps:
        if random.uniform(0,1) < fi:
            action = arg_max(past_policy[state])
        else:
            action = env.action_space.sample()
            
        # step to next state
        next_state, reward, done, info = env.step(action)

        # update q-value
        q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
        q_table[state, action] = q_value

        state = next_state
        fi = fi*mu
        #update data for learning curve
        if reward == -10:
            episode_penalties +=1

        episode_rewards += reward
        episode_length += 1
        
        if i%200 == 0:
            #evaluate policy for learning curve after each episode
            _episode_rewards, _episode_penalties, _episode_length = policy_evaluate(_env_copy, q_table, EVALUATION_TIMES)

            all_episodes_length.append(_episode_length)
            all_penalties.append(_episode_penalties)
            all_rewards.append(_episode_rewards)

            #show training progress
            clear_output(wait=True)
            print(f"timesteps: {i}")
        i += 1
        
    return episode_penalties, episode_rewards, episode_length,i
    

### tranform

In [39]:
def transform( target_rl_algo, episode_num, repeat_times, target_task, 
              source_task= None, source_rl_algo = None, policy_library = None):
    
    """
    Situation 1:
    no source task available, train agent on target task from scratch 
    
    Situation 2:
    source task is available but source policy unavailable, train agent on source task to get source policy, 
    then train agent on target task using knowledge from source policy
    
    Situation 3:
    source policies are availale, reuse source policies to train agent on target task
    """
    train_episodes = episode_num    
    # data collected during trainning
    all_episodes_length = np.zeros(train_episodes)
    all_penalties = np.zeros(train_episodes)
    all_rewards = np.zeros(train_episodes)
    if target_rl_algo is OPS_Norm and policy_library is not None:
        all_frequency = np.zeros([len(policy_library[0])+1, train_episodes])
    else:
        all_frequency = None
    
    all_trans_knowledge = []
    
    
    
    for i in range(repeat_times):
       
        
        #Situation 3    
        if policy_library is not None: 
            
            if target_task is str:
                env = gym.make(target_task)
            else:
                env = target_task
            past_policies = policy_library # policy_library = [ [[policy11][policy21]...] [[policy12][policy22]...]...]
            if target_rl_algo is OPS_Norm:
                episodes,penalties,rewards,frequency, *knowledge = target_rl_algo(env, train_episodes, past_policies[i]) 
            else:
                episodes,penalties,rewards, *knowledge = target_rl_algo(env, train_episodes, past_policies[i]) 
        #Situation 2   
        elif source_task is not None:
            if source_rl_algo is None:
                source_rl_algo = target_rl_algo
            if source_task is str:
                env = gym.make(source_task)
            else:
                env = source_task
            _, _,_ ,*knowledge = source_rl_algo(env,  train_episodes)
            
            if target_task is str:
                env = gym.make(target_task)
            else:
                env = target_task
            episodes,penalties,rewards,*knowledge = target_rl_algo(env, train_episodes , *knowledge)
         #Situation 1
        else: 
            if target_task is str:
                env = gym.make(target_task)
            else:
                env = target_task
           
            episodes,penalties,rewards,*knowledge = target_rl_algo(env, train_episodes)
            

        all_episodes_length += episodes
        all_penalties += penalties
        all_rewards += rewards
        if target_rl_algo is OPS_Norm and policy_library is not None:
            all_frequency += frequency
        if len(knowledge) > 1:
            all_trans_knowledge.append(knowledge)
        else:
            all_trans_knowledge.append(*knowledge)

    all_episodes_length /= repeat_times
    all_penalties /= repeat_times
    all_rewards /= repeat_times
    if target_rl_algo is OPS_Norm and policy_library is not None:
        all_frequency /= repeat_times
    
    return all_episodes_length, all_penalties, all_rewards, all_frequency, all_trans_knowledge

In [29]:
a,*args = 3,4,5
args[:]
def add(a,b):
    return a+b


In [30]:
np.random.choice(6)

3

add Training

### 1 q-learning

In [35]:
%%time
qlearn_episodes_length_fourroom, qlearn_penalties_fourroom, qlearn_rewards_fourroom, qlearn_q_table_fourroom = [[None] * len(four_room_envs) for _ in range(4)]
for i in range(len(four_room_envs)):
    qlearn_episodes_length_fourroom[i], qlearn_penalties_fourroom[i], qlearn_rewards_fourroom[i], _, qlearn_q_table_fourroom[i] = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[i],
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 10min 18s, sys: 1min 39s, total: 11min 57s
Wall time: 10min 53s


In [32]:
%%time

# training Taxi-v003 from scratch
qlearn_episodes_length_v003, qlearn_penalties_v003, qlearn_rewards_v003, _, qlearn_q_table_v003 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                       `                            repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v003",
                                                                                     source_task = None
                                                                                    )

SyntaxError: invalid syntax (<unknown>, line 5)

In [None]:
%%time

# training Taxi-v000 from scratch
qlearn_episodes_length_v000, qlearn_penalties_v000, qlearn_rewards_v000, _, qlearn_q_table_v000 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v000",
                                                                                     source_task = None
                                                                                    )

In [None]:
%%time

# training Taxi-v001 from scratch
qlearn_episodes_length_v001, qlearn_penalties_v001, qlearn_rewards_v001,_, qlearn_q_table_v001 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v001",
                                                                                     source_task = None
                                                                                    )

In [None]:
%%time

# training Taxi-v004 from scratch
qlearn_episodes_length_v004, qlearn_penalties_v004, qlearn_rewards_v004,_, qlearn_q_table_v004 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v004",
                                                                                     source_task = None
                                                                                    )

### 2 PRQL

In [43]:
%%time

pr_episodes_length_fourroom, pr_penalties_fourroom, pr_rewards_fourroom, pr_q_table_fourroom = [[None] * (len(four_room_envs)-1) for _ in range(4)]

#pr_fourroom_v012_3
for i in range(len(four_room_envs)-1):
    pr_episodes_length_fourroom[i], pr_penalties_fourroom[i], pr_rewards_fourroom[i], _, pr_q_table_fourroom[i] = transform(target_rl_algo = prql,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[3],
                                                                                     policy_library = qlearn_q_table_fourroom[i]
                                                                                    )

Episode 3999
Training finished 

CPU times: user 1min 50s, sys: 8.79 s, total: 1min 59s
Wall time: 1min 49s


In [None]:
%%time
#pr_v014_003
pr_episodes_length_v014_003, pr_penalties_v014_003,pr_rewards_v014_003, _, pr_q_table_v014_003 = transform(target_rl_algo= prql,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = 5,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v004)

###  3 OPS-TL

In [44]:
%%time

tl_episodes_length_fourroom, tl_penalties_fourroom, tl_rewards_fourroom, tl_q_table_fourroom = [[None] * (len(four_room_envs)-1) for _ in range(4)]

#tl_fourroom_v012_3
for i in range(len(four_room_envs)-1):
    tl_episodes_length_fourroom[i], tl_penalties_fourroom[i], tl_rewards_fourroom[i], _, tl_q_table_fourroom[i] = transform(target_rl_algo = OPS_TL,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[3],
                                                                                     policy_library =  [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_fourroom[0],qlearn_q_table_fourroom[1],qlearn_q_table_fourroom[2])]
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 8min 43s, sys: 1min 38s, total: 10min 21s
Wall time: 9min 17s


In [None]:
%%time
#ops_v014_003
tl_episodes_length_v014_003, tl_penalties_v014_003,tl_rewards_v014_003,tl_fre_v014_003, tl_q_table_v014_003 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_v000, qlearn_q_table_v001, qlearn_q_table_v004)])

### 4 CAPS

In [45]:
%%time

ops_episodes_length_fourroom, ops_penalties_fourroom, ops_rewards_fourroom, ops_q_table_fourroom = [[None] * (len(four_room_envs)-1) for _ in range(4)]

#ops_fourroom_v012_3
for i in range(len(four_room_envs)-1):
    ops_episodes_length_fourroom[i], ops_penalties_fourroom[i], ops_rewards_fourroom[i], _, ops_q_table_fourroom[i] = transform(target_rl_algo = OPS_Norm,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[3],
                                                                                     policy_library =  [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_fourroom[0],qlearn_q_table_fourroom[1],qlearn_q_table_fourroom[2])]
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 8min 46s, sys: 1min 22s, total: 10min 9s
Wall time: 9min 14s


In [None]:
%%time
#ops_v014_003
ops_episodes_length_v014_003, ops_penalties_v014_003,ops_rewards_v014_003,ops_fre_v014_003, ops_q_table_v014_003 = transform(target_rl_algo= OPS_Norm,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_v000, qlearn_q_table_v001, qlearn_q_table_v004)])

### 5 option-critic

In [None]:
import Taxi_v000,Taxi_v001,Taxi_v003,Taxi_v004
import copy
option_policies, option_terminations, policy_over_options, nrewards = [ [None]*4 for i in range(4) ] 

In [None]:
env_list = ["Taxi-v000", "Taxi-v001", "Taxi-v003", "Taxi-v004"]

for i in range(len(env_list)):
    option_policies[i], option_terminations[i], policy_over_options[i], nrewards[i] = option_critic(env = gym.make(env_list[i]),nruns = 5)

In [None]:
%%time
trans_rewards = [None]*4
trans_termination = [None]*4
for i in [0,1,3]:
    _, trans_termination[i], _, trans_rewards[i]= option_critic( gym.make("Taxi-v003"), 2, option_policies[i], option_terminations[i], policy_over_options[i])
qlearn_q_table_fourroom

In [None]:
args = 'a b'
args.partition(' ')

In [None]:
%matplotlib widget
level = [0,1,3,4]
nepisodes = 4000
plt.xlabel('episodes')
plt.ylabel('rewards')
plt.plot(range(nepisodes), smooth(nrewards[1][0:4000],500),label ='oc_v3')
for i in [0,1,3]:
   plt.plot(range(nepisodes), smooth(trans_rewards[i],500),label ='oc_v'+str(level[i])+'_3')

plt.legend()
plt.grid(True)
plt.show()

In [None]:
import seaborn as sns

In [None]:
#%matplotlib widget
RATIO = 1
SMOOTH_RADIUS = 50
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003, SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003,SMOOTH_RADIUS), label = "ops_norm_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003,SMOOTH_RADIUS), label = "ops_norm_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003,SMOOTH_RADIUS), label = "ops_norm_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(pr_rewards_v014_003,SMOOTH_RADIUS), label = "pr_v014_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(tl_rewards_v014_003,SMOOTH_RADIUS), label = "ops_v014_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v014_003,SMOOTH_RADIUS), label = "caps_v014_003",color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(trans_rewards[3],SMOOTH_RADIUS), label = "oc_v014_003",color='m')





plt.title("0-4000 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
RATIO = 4
SMOOTH_RADIUS = 50
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003,SMOOTH_RADIUS), label = "ops_norm_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003,SMOOTH_RADIUS), label = "ops_norm_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003,SMOOTH_RADIUS), label = "ops_norm_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(pr_rewards_v014_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "pr_v014_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(tl_rewards_v014_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "ops_v014_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v014_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v014_003",color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(trans_rewards[3][:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "oc_v014_003",color='m')





plt.title("0-1000 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
import seaborn as sns
RATIO = 1
SMOOTH_RADIUS = 100
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v01_003[0],SMOOTH_RADIUS), label = 'ploicy_v001')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v01_003[1],SMOOTH_RADIUS), label = 'ploicy_v004')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v01_003[2],SMOOTH_RADIUS), label = 'target policy')
plt.legend()
plt.xlabel('episodes')
plt.ylabel('frequency')
plt.title("frequency of policy selection")
plt.show()
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v04_003,SMOOTH_RADIUS), label = "ops_norm_v04_003",color='g')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v14_003,SMOOTH_RADIUS), label = "ops_norm_v14_003",color='r')

In [None]:
RATIO = 10
SMOOTH_RADIUS = 20

plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "ops_norm_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "ops_norm_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "ops_norm_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v01_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v01_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v04_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v04_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v14_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v14_003",color='r')



plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v014_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "caps_v014_003", color='navy')

plt.title("first 400 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
%%time
#ops_v1_003
ops_episodes_length_v1_003, ops_penalties_v1_003,ops_rewards_v1_003, ops_q_table_v1_003 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v1",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v1])

In [None]:
%%time
#ops_v4_003
ops_episodes_length_v4_003, ops_penalties_v4_003,ops_rewards_v4_003, ops_q_table_v4_003 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v4])

In [None]:
%%time
#ops_v14_003
ops_episodes_length_v14_003, ops_penalties_v14_003,ops_rewards_v14_003, ops_q_table_v14_003 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func1,q_func2 ] for q_func1,q_func2 in zip(qlearn_q_table_v1, qlearn_q_table_v4)])

In [None]:
%%time
#ops_norm_v14_003
ops_norm_episodes_length_v14_003, ops__norm_penalties_v14_003,ops_norm_rewards_v14_003, ops_norm_q_table_v14_003 = transform(target_rl_algo= OPS_Norm,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = 5,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func1,q_func2 ] for q_func1,q_func2 in zip(qlearn_q_table_v1, qlearn_q_table_v4)])

In [None]:
import functools
#ops_norm_v14_003
pl=[[q_func1,q_func2 ] for q_func1,q_func2 in zip(qlearn_q_table_v1, qlearn_q_table_v4)]
ctuple = (0, 0.0049, 0.1, 0.2, 0.4,0.8, 1, 2, 4, 8, 16)
ops_cnorm_episodes_length_v14_003,ops__cnorm_penalties_v14_003,ops_cnorm_rewards_v14_003,ops_cnorm_q_table_v14_003 = [0] * len(ctuple), [0]*len(ctuple),[0]*len(ctuple), [0]*len(ctuple)
for i in range(len(ctuple)):
    OPS_patial = functools.partial( OPS_Norm, c = ctuple[i])
    ops_cnorm_episodes_length_v14_003[i], ops__cnorm_penalties_v14_003[i],ops_cnorm_rewards_v14_003[i], ops_cnorm_q_table_v14_003[i] = transform(target_rl_algo= OPS_patial,
               episode_num = EPISODE_NUM,
               repeat_times = REPEAT_TIMES,
               target_task = "Taxi-v003",
               policy_library = pl)



In [None]:
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v003, 100), label = "qlearn_v003", color='k')
for i in range(len(ctuple)):
    plt.plot(range(EPISODE_NUM), smooth(ops_cnorm_rewards_v14_003[i], 100), label = "c = "+str(ctuple[i]))
    
plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
RATIO = 20
SMOOTH_RADIUS = 200


for i in range(len(ctuple)):
    plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_cnorm_rewards_v14_003[i][:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "c = "+str(ctuple[i]))
    
plt.title("reward-epsidoe graph using OPS_norm different c")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
for i in range(len(ctuple)):
    print(f"c = {ctuple[i]}, reward = {smooth(ops_cnorm_rewards_v14_003[i][:EPISODE_NUM//20], 200)[0]}" )

In [None]:
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

reward = [smooth( ops_cnorm_rewards_v14_003[i][:EPISODE_NUM//20], 200)[0] for i in range(len(ctuple)) ]
"""
绘制水平条形图方法barh
参数一：y轴
参数二：x轴
"""
plt.barh(range(len(ctuple)), reward, height=0.7, color='steelblue', alpha=0.8)      # 从下往上画
plt.yticks(range(len(ctuple)), [f"c={str(ctuple[i])}" for i in range(len(ctuple))])
plt.xlim(15,20)
plt.xlabel("average reward ")
plt.title("average reward over first 200 episode under different c")
for x, y in enumerate(reward):
    plt.text(y + 0.2, x - 0.1, '%s' % y)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style = "darkgrid")


plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v003, 100), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3",color='c')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3",color='g')

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v4_003, 100), label = "ops_v4_003", color='r')
plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v1_003, 100), label = "ops_v1_003", color='b')

plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v14_003, 100), label = "ops_v14_003", color='y')
plt.plot(range(EPISODE_NUM), smooth(ops_norm_rewards_v14_003, 100), label = "ops_norm_v14_003", color='navy')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:

RATIO = 8
SMOOTH_RADIUS = 25
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3",color='c')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3",color='g')

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v4_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_v4_003", color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v1_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_v1_003", color='b')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v14_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_v14_003", color='y')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_norm_rewards_v14_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_norm_v14_003", color='navy')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
RATIO = 8
SMOOTH_WINDOW = 20
plt.plot(range(EPISODE_NUM//RATIO),smooth(qlearn_episodes_length_v003[:EPISODE_NUM//RATIO],SMOOTH_WINDOW), label = 'qlearn_v3')
'''
plt.plot(range(train_episodes),qlearn_episodes_length_v1_3, label = 'qlearn_v1_3')
plt.plot(range(train_episodes),qlearn_episodes_length_v4_3, label = 'qlearn_v4_3')

plt.plot(range(train_episodes),sarsa_episodes_length_v3, label = 'sarsa_v3')
plt.plot(range(train_episodes),sarsa_episodes_length_v1_3, label = 'sarsa_v1_3')
plt.plot(range(train_episodes),sarsa_episodes_length_v4_3, label = 'sarsa_v4_3')
'''
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_episodes_length_v1_003[:EPISODE_NUM//RATIO],SMOOTH_WINDOW), label = 'ops_v1_3')
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_episodes_length_v4_003[:EPISODE_NUM//RATIO],SMOOTH_WINDOW), label = 'ops_v4_3')

plt.title("episode-length graph")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
RATIO = 20
SMOOTH_RADIUS = 20
plt.plot(range(EPISODE_NUM//RATIO),smooth(qlearn_penalties_v03[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'qlearn_v3')
'''
plt.plot(range(train_episodes),qlearn_episodes_length_v1_3, label = 'qlearn_v1_3')
plt.plot(range(train_episodes),qlearn_episodes_length_v4_3, label = 'qlearn_v4_3')

plt.plot(range(train_episodes),sarsa_episodes_length_v3, label = 'sarsa_v3')
plt.plot(range(train_episodes),sarsa_episodes_length_v1_3, label = 'sarsa_v1_3')
plt.plot(range(train_episodes),sarsa_episodes_length_v4_3, label = 'sarsa_v4_3')
'''
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_penalties_v1_03[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'ops_v1_3')
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_penalties_v4_03[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'ops_v4_3')

plt.title("episode-penalties graph")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show



In [None]:


def visit_all(dic):
    for value in dic.values():
        if value == 0:
            return False
    return True

sum = 0
times = 1000
for i in range(times):    
    dic = {i: 0 for i in range(100)}
    while not visit_all(dic):
        key = random.choice(range(len(dic)))
        dic[key] = 1
        sum += 1
sum /= times
print(sum)

In [None]:
def div_add(n):
    sum = 0
    for i in range(1,n+1):
        sum += 1/i
        
    return n * sum
div_add(3000)/200


In [None]:
sum = 0
times = 1000
for i in range(times):    
    dic = {i: 0 for i in range(100)}
    while dic[0] == 0:
        key = random.choice(range(len(dic)))
        dic[key] = 1
        sum += 1
sum /= times
print(sum)

In [None]:
def p_reuse(env, past_policy, train_episodes, fi = 1, mu = 0.95):
        
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []

    q_table = np.zeros([env.observation_space.n, env.action_space.n])
 
    past_table = copy.deepcopy(past_policy[0])
    
    
    for i in range(train_episodes):
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        f = fi
        while not done:
            # epsilon greedy alg balancing exporation and exploitation
            if random.uniform(0,1) < f:
                action = arg_max(past_table[state])
                #print(f"f = {f} in episode {i} \n")
            elif random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
                #print("random action \n")
                
            else:
                action = arg_max(q_table[state])
                #print("on the policy now \n")

            # step to next state
            next_state, reward, done, info = env.step(action)

            # update q-value
            q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
            q_table[state, action] = q_value

            state = next_state
            f = f*mu
            #update data for learning curve
            if reward == -10:
                episode_penalties +=1

            episode_rewards += reward
            episode_length += 1

        #record data for learning curve
        all_episodes_length.append(episode_length)
        all_penalties.append(episode_penalties)
        all_rewards.append(episode_rewards)

        #show training progress
        if (i+1) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {i}")
    print("Training finished \n")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style = "darkgrid")



plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v3, 100), label = "qlearn_v3", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3",color='c')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3",color='g')

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v4_3, 100), label = "ops_v4_3", color='r')
plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v1_3, 100), label = "ops_v1_3", color='b')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
%%time


# training v03 from scratch

qlearn_episodes_length_v03, qlearn_penalties_v03, qlearn_rewards_v03, qlearn_q_table_v03 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM,  
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v03",
                                                                                     source_task = None
                                                                                    )



In [None]:
%%time


# training v01 from scratch

qlearn_episodes_length_v01, qlearn_penalties_v01, qlearn_rewards_v01, qlearn_q_table_v01 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM,  
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v01",
                                                                                     source_task = None
                                                                                    )


In [None]:
%%time


# training v04 from scratch

qlearn_episodes_length_v04, qlearn_penalties_v04, qlearn_rewards_v04, qlearn_q_table_v04 = transform(target_rl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM,  
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v04",
                                                                                     source_task = None
                                                                                    )


In [None]:
%%time
#ops_v04_03
ops_episodes_length_v04_03, ops_penalties_v04_03,ops_rewards_v04_03, ops_q_table_v04_03 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v03",
                                                                                       source_task = "Taxi-v04",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v04])

In [None]:
%time
#ops_v01_03
ops_episodes_length_v01_03, ops_penalties_v01_03,ops_rewards_v01_03, ops_q_table_v01_03 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v03",
                                                                                       source_task = "Taxi-v01",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v01])

In [None]:
plt.plot(range(500), smooth(qlearn_rewards_v5[:500], 20), label = "qlearn_v5")
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3")
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3")

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(500), smooth(ops_rewards_v4_5[:500], 20), label = "ops_v4_5")
plt.plot(range(500), smooth(ops_rewards_v1_5[:500], 20), label = "ops_v1_5")

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
for i in qlearn_rewards_v3:
    print(i)

In [None]:
env = gym.make("Taxi-v3")
r,_, _ = policy_evaluate(env, np.zeros([env.observation_space.n,env.action_space.n]), 1)
print(r)

In [None]:

plt.plot(range(EPISODE_NUM//10 ),smooth(pr_rewards_v4_3[-EPISODE_NUM//10: ],100), label = 'pr_v4_3')
plt.plot(range(EPISODE_NUM//10 ),smooth(pr_rewards_v4_3[-EPISODE_NUM//10: ],100), label = 'prql_v4_3')
plt.plot(range(EPISODE_NUM//10), smooth(qlearn_rewards_v3[-EPISODE_NUM//10:], 100), label = "qlearn_v3")
plt.plot(range(EPISODE_NUM//10), smooth(qlearn_rewards_v4[-EPISODE_NUM//10:], 100), label = "qlearn_v4")
plt.plot(range(EPISODE_NUM//10), smooth(qlearn_rewards_v1_3[-EPISODE_NUM//10:],100), label = "qlearn_v1_3")

plt.title("smoothed episode-reward graph: last {} episodes".format(EPISODE_NUM//10))
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:
%%time
ops_episodes_length_v4_3, ops_penalties_v4_3,ops_rewards_v4_3, ops_q_table_v4_3 = transform(target_rl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = 10,
                                                                                       target_task = "Taxi-v3",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v4])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

train_episodes = EPISODE_NUM

plt.plot(range(train_episodes),smooth(qlearn_rewards_v3,100), 'k', label = 'qlearn_v3')
plt.plot(range(train_episodes),smooth(qlearn_rewards_v1_3,100), label = 'qlearn_v1_3')
plt.plot(range(train_episodes),smooth(qlearn_rewards_v4_3,100), label = 'qlearn_v4_3')


plt.plot(range(train_episodes),smooth(pr_rewards_v1_3,100), label = 'pr_v1_3')
plt.plot(range(train_episodes),smooth(pr_rewards_v4_3,100), label = 'pr_v4_3')


plt.plot(range(train_episodes),smooth(ops_rewards_v1_3,100), label = 'ops_v1_3')
plt.plot(range(train_episodes),smooth(ops_rewards_v4_3,100), label = 'ops_v4_3')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show




In [None]:
plt.plot(range(train_episodes//10 ),smooth(qlearn_rewards_v3[-train_episodes//10: ],100), 'k',label = 'qlearn_v3')
plt.plot(range(train_episodes//10 ),smooth(qlearn_rewards_v1_3[-train_episodes//10: ],100), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//10 ),smooth(qlearn_rewards_v4_3[-train_episodes//10: ],100), label = 'qlearn_v1_3')

#plt.plot(range(train_episodes//10 ),smooth(sarsa_rewards_v3[-train_episodes//10: ],100), label = 'sarsa_v3')
#plt.plot(range(train_episodes//10 ),smooth(sarsa_rewards_v1_3[-train_episodes//10: ],100), label = 'sarsa_v1_3')
#plt.plot(range(train_episodes//10 ),smooth(sarsa_rewards_v4_3[-train_episodes//10: ],100), label = 'sarsa_v1_3')

plt.plot(range(train_episodes//10 ),smooth(pr_rewards_v1_3[-train_episodes//10: ],100), label = 'pr_v1_3')
plt.plot(range(train_episodes//10 ),smooth(pr_rewards_v4_3[-train_episodes//10: ],100), label = 'pr_v4_3')

plt.plot(range(train_episodes//10 ),smooth(pr1_rewards_v1_3[-train_episodes//10: ],100), label = 'pr1_v1_3')
plt.plot(range(train_episodes//10 ),smooth(pr1_rewards_v4_3[-train_episodes//10: ],100), label = 'pr1_v4_3')


plt.title("smoothed episode-reward graph: last 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:

plt.plot(range(train_episodes//100),qlearn_rewards_v3[:train_episodes//100], label = 'qlearn_v3')
plt.plot(range(train_episodes//100),qlearn_rewards_v1_3[:train_episodes//100], label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),qlearn_rewards_v4_3[:train_episodes//100], label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),sarsa_rewards_v3[:train_episodes//100], label = 'sarsa_v3')
plt.plot(range(train_episodes//100),sarsa_rewards_v1_3[:train_episodes//100], label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),sarsa_rewards_v4_3[:train_episodes//100], label = 'sarsa_v4_3')

plt.title("episode-reward graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show



In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),smooth(sarsa_rewards_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_rewards_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_rewards_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("smoothed episode-reward graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')


plt.title("q-learning smoothed episode-reward graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),qlearn_episodes_length_v3[:train_episodes//100], label = 'qlearn_v3')
plt.plot(range(train_episodes//100),qlearn_episodes_length_v1_3[:train_episodes//100], label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),qlearn_episodes_length_v4_3[:train_episodes//100], label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),sarsa_episodes_length_v3[:train_episodes//100], label = 'sarsa_v3')
plt.plot(range(train_episodes//100),sarsa_episodes_length_v1_3[:train_episodes//100], label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),sarsa_episodes_length_v4_3[:train_episodes//100], label = 'sarsa_v4_3')

plt.title("episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show


In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("smoothed episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.title("q-learning smoothed episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("sarsa smoothed episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes),qlearn_penalties_v3, label = 'qlearn_v3')
plt.plot(range(train_episodes),qlearn_penalties_v1_3, label = 'qlearn_v1_3')
plt.plot(range(train_episodes),qlearn_penalties_v4_3, label = 'qlearn_v4_3')

plt.plot(range(train_episodes),sarsa_penalties_v3, label = 'sarsa_v3')
plt.plot(range(train_episodes),sarsa_penalties_v1_3, label = 'sarsa_v1_3')
plt.plot(range(train_episodes),sarsa_penalties_v4_3, label = 'sarsa_v4_3')

plt.title("episode-penalties graph")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show



In [None]:
plt.plot(range(train_episodes//100),qlearn_penalties_v3[:train_episodes//100], label = 'qlearn_v3')
plt.plot(range(train_episodes//100),qlearn_penalties_v1_3[:train_episodes//100], label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),qlearn_penalties_v4_3[:train_episodes//100], label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),sarsa_penalties_v3[:train_episodes//100], label = 'sarsa_v3')
plt.plot(range(train_episodes//100),sarsa_penalties_v1_3[:train_episodes//100], label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),sarsa_penalties_v4_3[:train_episodes//100], label = 'sarsa_v4_3')

plt.title("episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("smoothed episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties ")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')


plt.title("q-learning smoothed episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties ")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("sarsa smoothed episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties ")
plt.legend()
plt.show

In [None]:
def evaluate(run_times,policy_over_options,option_policies, q_table = None ,task = "Taxi-v003", mode = 'learning agent'):
    """Evaluate agent's performance after policy updates"""
    env = gym.make(task)
    if q_table is None:
        list_length = 1
    else:
        list_length = len(q_table)
    all_epochs, all_penalties, all_rewards, all_success_rate =  np.zeros(list_length), np.zeros(list_length),np.zeros(list_length),np.zeros(list_length)
    all_epochs_std, all_penalties_std, all_rewards_std = np.zeros(list_length), np.zeros(list_length),np.zeros(list_length)
    
    
        
    for i in range( list_length ):
        table_epochs, table_penalties, table_rewards = np.zeros(run_times), np.zeros(run_times),np.zeros(run_times)
        if q_table is not None:
            table = q_table[i]
        success = 0
        for j in range(run_times):
            state = env.reset()
            epochs, penalties, reward, sum_reward = 0, 0, 0, 0

            done = False

            while not done:
                if mode == 'learning agent':
                    action = arg_max(table[state])
                elif mode == 'random agent':
                    action = np.random.randint(0,env.action_space.n)
                state, reward, done, info = env.step(action)

                if reward == -10:
                    penalties += 1
                elif reward == 20:
                    success += 1

                epochs += 1
                sum_reward += reward
                show_frames(env,epochs,sum_reward )
            table_penalties[j] = penalties
            table_epochs[j] = epochs
            table_rewards[j] = sum_reward

        all_success_rate[i] = success/run_times    
        all_epochs[i], all_penalties[i], all_rewards[i]  = np.mean(table_epochs), np.mean(table_penalties), np.mean(table_rewards)
        all_epochs_std[i], all_penalties_std[i], all_rewards_std[i] = np.std(table_epochs), np.std(table_penalties), np.std(table_rewards)
      
    print(f"Results after {run_times} runs:")
    print(f"Success rate : {np.mean(all_success_rate)}")
    print(f"Average  episode length : {np.mean(all_epochs)} ± {np.mean(all_epochs_std)}")
    print(f"Average penalties per episode: {np.mean(all_penalties)} ± {np.mean(all_penalties_std)}")
    print(f"Average rewards per episode: {np.mean(all_rewards)} ± {np.mean(all_rewards_std)}")

In [None]:
def op_evaluate(run_times,option_policies, option_terminations, policy_over_options, q_table = None ,task = "Taxi-v003", mode = 'learning agent'):
    """Evaluate agent's performance after policy updates"""
    env = gym.make(task)
    if q_table is None:
        list_length = 1
    else:
        list_length = len(q_table)
    all_epochs, all_penalties, all_rewards, all_success_rate =  np.zeros(list_length), np.zeros(list_length),np.zeros(list_length),np.zeros(list_length)
    all_epochs_std, all_penalties_std, all_rewards_std = np.zeros(list_length), np.zeros(list_length),np.zeros(list_length)
    
    
        
    for i in range( list_length ):
        table_epochs, table_penalties, table_rewards = np.zeros(run_times), np.zeros(run_times),np.zeros(run_times)
        if q_table is not None:
            table = q_table[i]
        success = 0
        for j in range(run_times):
            state = env.reset()
            epochs, penalties, reward, sum_reward = 0, 0, 0, 0

            done = False
            option = policy_over_options.evaluate(state)
            while not done:
                if mode == 'learning agent':
                    if option_terminations[option].sample(state):
                        option = policy_over_options.evaluate(state)

                    action = option_policies[option].evaluate(state)
                    
                    
                elif mode == 'random agent':
                    action = np.random.randint(0,env.action_space.n)
                state, reward, done, info = env.step(action)

                if reward == -10:
                    penalties += 1
                elif reward == 20:
                    success += 1

                epochs += 1
                sum_reward += reward
                show_frames(env,epochs,sum_reward )
            table_penalties[j] = penalties
            table_epochs[j] = epochs
            table_rewards[j] = sum_reward

        all_success_rate[i] = success/run_times    
        all_epochs[i], all_penalties[i], all_rewards[i]  = np.mean(table_epochs), np.mean(table_penalties), np.mean(table_rewards)
        all_epochs_std[i], all_penalties_std[i], all_rewards_std[i] = np.std(table_epochs), np.std(table_penalties), np.std(table_rewards)
      
    print(f"Results after {run_times} runs:")
    print(f"Success rate : {np.mean(all_success_rate)}")
    print(f"Average  episode length : {np.mean(all_epochs)} ± {np.mean(all_epochs_std)}")
    print(f"Average penalties per episode: {np.mean(all_penalties)} ± {np.mean(all_penalties_std)}")
    print(f"Average rewards per episode: {np.mean(all_rewards)} ± {np.mean(all_rewards_std)}")

In [None]:
i=1
op_evaluate(3,option_policies[i], option_terminations[i], policy_over_options[i])

In [None]:
evaluate(1000,qlearn_q_table_v1_3)

In [None]:
evaluate(1000,qlearn_q_table_v4_3)

In [None]:
evaluate(1000,sarsa_q_table_v3)

In [None]:
evaluate(1000,sarsa_q_table_v1_3)

In [None]:
evaluate(1000,sarsa_q_table_v4_3)

In [None]:
evaluate(1000, mode = 'random agent')

In [None]:
%%time
# training Taxi-v3 reusing policy from Taxi-v1
pr1_rewards_v1_3 = np.zeros(10000)
for i in range(10):
    pr1_episodes_length_v1_3, pr1_penalties_v1_3,rewards, pr1_q_table_v1_3 = prql(gym.make("Taxi-v3"), qlearn_q_table_v1[i], 10000)
    pr1_rewards_v1_3 += rewards
pr1_rewards_v1_3/=10

In [None]:
%%time
# training Taxi-v3 reusing policy from Taxi-v1
pr1_rewards_v4_3 = np.zeros(10000)
for i in range(10):
    pr1_episodes_length_v4_3, pr1_penalties_v4_3,rewards, pr1_q_table_v4_3 = prql(gym.make("Taxi-v3"), qlearn_q_table_v4[i], 10000)
    pr1_rewards_v4_3 += rewards
pr1_rewards_v4_3/=10

In [None]:
%%time

env = gym.make("Taxi-v4")
# training Taxi-v4 from scratch
lprofiler = LineProfiler(q_learning)
lprofiler.run('qlearn_episodes_length_v41, qlearn_penalties_v41, qlearn_rewards_v41, qlearn_q_table_v41 = q_learning(env,np.zeros([env.observation_space.n,env.action_space.n]),EPISODE_NUM)')
lprofiler.print_stats()