In [140]:
import gym
import copy
import time
import pandas as pd
from line_profiler import LineProfiler
from option_critic.utils import *
from matplotlib import pyplot as plt
from option_critic.fourrooms_copy import FourRooms
from IPython.display import clear_output

In [205]:
%load_ext autoreload
%autoreload 2
#[env.id for env in gym.envs.registry.all() if env.id.startswith('Taxi')]
import Taxi_v0, Taxi_v1, Taxi_v4
import Taxi_v00, Taxi_v01, Taxi_v03, Taxi_v04
import Taxi_v000,Taxi_v001,Taxi_v003,Taxi_v004
env = gym.make("Taxi-v01")
env.render()
"""
Here's our restructured problem statement (from Gym docs):

"There are 4 locations (labeled by different letters), and our job is to pick up the passenger 
at one location and drop him off at another. We receive +20 points for a successful drop-off and 
lose 1 point for every time-step it takes. There is also a 10 point penalty for illegal pick-up 
and drop-off actions."

- The filled square represents the taxi, which is yellow without a passenger and green with a passenger.
- The pipe ("|") represents a wall which the taxi cannot cross.
- R, G, Y, B are the possible pickup and destination locations. The blue letter represents the current passenger
  pick-up location, and the purple letter is the current destination.
"""

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


DeprecatedEnv: Env Taxi-v01 not found (valid versions include ['Taxi-v3', 'Taxi-v0', 'Taxi-v1', 'Taxi-v4', 'Taxi-v00', 'Taxi-v04', 'Taxi-v000', 'Taxi-v001', 'Taxi-v003', 'Taxi-v004'])

In [142]:
%matplotlib widget
%autoreload 2
#goals = [(8,8), (8,20), (20, 8), (20,20)]
goals = [(27,27), (18,24), (24, 18), (20,20)]
four_room_envs =[None] * 4
for i in range(len(goals)):
    four_room_envs[i] = FourRooms()
    four_room_envs[i].reset()
    four_room_envs[i].goal =  four_room_envs[i].tostate[goals[i]]
    clear_output(True)
    plt.subplot(2,2,i+1)
    plt.imshow(four_room_envs[i].render(show_goal=True), cmap='Blues')
    plt.axis('off')
    plt.title('level ' + str(i))
    plt.show()
    


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [25]:
env=gym.make("Taxi-v003")
env.seed(0)
env.reset()
env.render()

+---------+
|R:[43m [0m| : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [26]:
env.render()

+---------+
|R:[43m [0m| : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [27]:
gym.make("Taxi-v003").render()

+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [28]:
gym.make("Taxi-v000").render()

+---------+
| : | : :R|
| : |[43m [0m: :B|
| : : : :[35mY[0m|
| | : | : |
| | : | :[34;1mG[0m|
+---------+



In [29]:
gym.make("Taxi-v3").render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |B: |
+---------+



In [30]:
state = env.reset()
env.render()
print("Action space = {}".format(env.action_space))
print("State space = {}".format(env.observation_space))


+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B:[43m [0m|
+---------+

Action space = Discrete(6)
State space = Discrete(500)


In [31]:
env.P[328] # {action: [(probability, nextstate, reward, done)]}

{0: [(1.0, 428, 0, False)],
 1: [(1.0, 228, 0, False)],
 2: [(1.0, 348, 0, False)],
 3: [(1.0, 328, 0, False)],
 4: [(1.0, 328, 0, False)],
 5: [(1.0, 328, 0, False)]}

In [33]:
map1 = gym.make("Taxi-v003").desc.flatten()
map2 = gym.make("Taxi-v001").desc.flatten()
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
damerau_levenshtein_distance(map1, map2)  # expected result: 1

8

### distance

In [143]:
from tqdm import tqdm
def state_dis(p1,p2,epsilon = 0.01, nsteps = 1000):
    # p {state:{action:{[(1.0, new_state, reward, done)]}}} 
    assert len(p1[0]) == len(p2[0]),'transitions have different action spaces'
    actions = range(len(p1[0]))
    states_dis, next_states_dis = np.zeros([len(p1), len(p2)]), np.zeros([len(p1), len(p2)])
    epsilon = epsilon
    gamma = 0.95
    step = 0
    
    for step in tqdm(range(nsteps)):
        for i  in range(len(p1)):
            for j in range(len(p2)):
                # d(si,sj) = max{cr*|rsi - rsj| + ct*Tk(d)(si,sj)} with ct =gamma, cr = 1-gamma , Tkd A.K.A emd
                dists = []
                for action in actions:
                    # print(i,j,action)
                    state_action1=p1[i][action][0] # contains (1.0, new_state, reward, done)
                    state_action2=p2[j][action][0]
                    
                    emd = abs(states_dis[state_action1[1]][state_action2[1]]) # d(s_i^{next_state} , s_j^{next_state})
                    dis = (1-gamma)*abs(state_action1[2] - state_action2[2]) + gamma*emd 
#                     if dis > 0:
#                          print(i,j,dis)
                    dists.append(dis)
                next_states_dis[i][j] = max(dists)
        if np.max(abs(next_states_dis - states_dis) ) < epsilon:

            break   
        states_dis = next_states_dis
        next_states_dis = np.zeros([len(p1), len(p2)])
        step += 1 
#         clear_output(wait=True)
#         print(f"step: {step}")

    return next_states_dis

In [41]:
%%time
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
import random
random.seed(1234)
np.random.seed(1234)

dl_list= []
pass_locs = [(i,j*2) for i in range(6) for j in range(5)]
obs_locs = [(i,j*2+1) for i in range(6) for j in range(4)]
for i in range(10000):    
    map1,map2 = np.zeros([6,10]),np.zeros([6,10])
    for mapi in [map1,map2]:
        for pos in  np.random.choice(range(30),4,replace = False):
            pos = pass_locs[pos]
            mapi[pos[0]][pos[1]] = 1
        for pos in obs_locs:
            mapi[pos[0]][pos[1]] = np.random.randint(2,4)
    dl_list.append(damerau_levenshtein_distance(map1.flatten(), map2.flatten())) 

KeyboardInterrupt: 

In [53]:
def generate_P(MAP):
    desc = np.asarray(MAP, dtype='c')
    locR, locG, locY, locB = np.asarray(desc == b'R').nonzero(), np.asarray(desc == b'G').nonzero(),np.asarray(desc == b'Y').nonzero(),np.asarray(desc == b'B').nonzero()
    #(loc[0][0] , loc[1][0] ) contains (array([x]),array([y])
    locs = [(locR[0][0] - 1 , locR[1][0] //2), (locG[0][0] -1 , locG[1][0] //2 ), (locY[0][0] -1  , locY[1][0] //2), (locB[0][0] -1 , locB[1][0] //2)]

    num_states = 500
    num_rows = 5
    num_columns = 5
    max_row = num_rows - 1
    max_col = num_columns - 1
    num_actions = 6
    P = {state: {action: []
                 for action in range(num_actions)} for state in range(num_states)}
    for row in range(num_rows):
        for col in range(num_columns):
            for pass_idx in range(len(locs) + 1):  # +1 for being inside taxi
                for dest_idx in range(len(locs)):
                    state = encode(row, col, pass_idx, dest_idx)
                    for action in range(num_actions):
                        # defaults
                        new_row, new_col, new_pass_idx = row, col, pass_idx
                        reward = 0  # default reward when there is no pickup/dropoff
                        done = False
                        taxi_loc = (row, col)

                        if action == 0:
                            new_row = min(row + 1, max_row)
                        elif action == 1:
                            new_row = max(row - 1, 0)
                        if action == 2 and desc[1 + row, 2 * col + 2] == b":":
                            new_col = min(col + 1, max_col)
                        elif action == 3 and desc[1 + row, 2 * col] == b":":
                            new_col = max(col - 1, 0)
                        elif action == 4:  # pickup
                            if (pass_idx < 4 and taxi_loc == locs[pass_idx]):
                                new_pass_idx = 4
                            else:  # passenger not at location
                                reward = 0
                        elif action == 5:  # dropoff
                            if (taxi_loc == locs[dest_idx]) and pass_idx == 4:
                                new_pass_idx = dest_idx
                                done = True
                                reward = 20
                            elif (taxi_loc in locs) and pass_idx == 4:
                                new_pass_idx = locs.index(taxi_loc)
                            else:  # dropoff at wrong location
                                reward = 0
                        new_state = encode(
                            new_row, new_col, new_pass_idx, dest_idx)
                        P[state][action].append(
                            (1.0, new_state, reward, done))
    return P

In [54]:
def encode( taxi_row, taxi_col, pass_loc, dest_idx):
    # (5) 5, 5, 4
    i = taxi_row
    i *= 5
    i += taxi_col
    i *= 5
    i += pass_loc
    i *= 4
    i += dest_idx
    return i

def decode(i):
    out = []
    out.append(i % 4)
    i = i // 4
    out.append(i % 5)
    i = i // 5
    out.append(i % 5)
    i = i // 5
    out.append(i)
    assert 0 <= i < 5
    return reversed(out)

In [145]:
def my_metric(dist_table):
    return np.sum(np.min(dist_table,axis= 0))

In [56]:
def make_random_maps(num,seed):
    random.seed(seed)
    np.random.seed(seed)
    dl_list= []
    pass_locs = [(i+1,j*2+1) for i in range(6) for j in range(5)]
    obs_locs = [(i+1,j*2+2) for i in range(6) for j in range(4)]
    obs = [':','|']
    maps = []
    for i in range(num):    
        MAP =  [
        "+---------+",
        "| : | : : |",
        "| : | : : |",
        "| : : : : |",
        "| | : | : |",
        "| | : | : |",
        "+---------+",
        ]  
        MAP = np.asarray(MAP,dtype = 'c')
        for pos, loc in  zip( np.random.choice(range(30),4,replace = False), ['R','G','Y','B']):
            pos = pass_locs[pos]
            MAP[pos[0]][pos[1]] = loc
        for pos in obs_locs:
            MAP[pos[0]][pos[1]] = obs[np.random.randint(0,2)]
        maps.append(MAP)
    return maps

In [148]:
%%time
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
dl_dists= []
huasdorff_dists = []
kt_dists = []
my_dists = []
next_states_diststs = [ [None for j in range(len(maps))] for i in range(len(maps))]
maps = make_random_maps(20,2323)
for i in range(len(maps)):
    for j in range(i+1, len(maps)):
        clear_output(wait = True)
        print(f"iteration: {i}x{j}")        
        
        next_states_dis = state_dis(generate_P(maps[i]), generate_P(maps[j]),nsteps=1000)
        
        kt_dis = np.trace(next_states_dis)
        hausdorff_dis = max( np.max(np.min(next_states_dis,axis = 0)), np.max(np.min(next_states_dis,axis = 1)) )  
        my_dis = my_metric(next_states_dis)
        
        huasdorff_dists.append(hausdorff_dis)
        kt_dists.append(kt_dis)
        dl_dists.append(damerau_levenshtein_distance(maps[i].flatten(), maps[j].flatten()))
        my_dists.append(my_dis)
        next_states_diststs[i][j] = next_states_dis

  0%|          | 0/1000 [00:00<?, ?it/s]

iteration: 18x19


  9%|▉         | 90/1000 [02:22<24:02,  1.59s/it]

CPU times: user 7h 21min 12s, sys: 1min 3s, total: 7h 22min 15s
Wall time: 7h 23min 15s





In [254]:
all_data['Close_transfer_caps'][0].shape

(4000,)

In [149]:
%%time
np.random.RandomState(1234)
random.seed(1234)
dl_dists2= []
huasdorff_dists2 = []
kt_dists2 = []
my_dists2 = []

maps1 = make_random_maps(19,1234)
maps2 = make_random_maps(10,4567)
next_states_diststs2 = [ [None for j in range(len(maps2))] for i in range(len(maps1))]
for i in range(len(maps1)):
    for j in range(len(maps2)):
        clear_output(wait = True)
        print(f"iteration: {i}x{j}")  
        next_states_dis = state_dis(generate_P(maps1[i]), generate_P(maps2[j]))
        
        kt_dis = np.trace(next_states_dis)
        hausdorff_dis = max( np.max(np.min(next_states_dis,axis = 0)), np.max(np.min(next_states_dis,axis = 1)) )  
        my_dis = my_metric(next_states_dis)
        
        huasdorff_dists2.append(hausdorff_dis)
        kt_dists2.append(kt_dis)
        dl_dists2.append(damerau_levenshtein_distance(maps1[i].flatten(), maps2[j].flatten()))
        my_dists2.append(my_dis)
        next_states_diststs2[i][j] = next_states_dis

  0%|          | 0/1000 [00:00<?, ?it/s]

iteration: 18x9


  9%|▉         | 90/1000 [02:26<24:40,  1.63s/it]

CPU times: user 7h 35min 57s, sys: 13.9 s, total: 7h 36min 11s
Wall time: 7h 38min 20s





In [251]:
a = np.asarray(next_states_diststs2)
# np.argmin(a)
a.shape

(19, 10, 500, 500)

In [144]:
def autolabel(ax, rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    heights = rects[0]
    edges = rects[1]
    for i in range(len(heights)):
        height = heights[i]
        if height == 0:
            continue
        edge_left = edges[i]
        edge_right = edges[i+1]
        ax.annotate('{}'.format(int(height)),
                    xy=((edge_left + edge_right) / 2, height),
                    xytext=(0, -2),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


In [219]:
%matplotlib
fig,ax = plt.subplots(1,4,figsize = (15,3))
fig.suptitle('Distance between 20 random generated levels (190 instance) ')
rect = ax[0].hist(dl_dists,10)
ax[0].set_ylabel('counts')
ax[0].set_xlabel('damerau levenshtein distance')
autolabel(ax[0],rect)
rect = ax[1].hist(huasdorff_dists,10)
ax[1].set_ylabel('counts')
ax[1].set_xlabel('hausdorff distance')
autolabel(ax[1],rect)
rect = ax[2].hist(kt_dists,10)
ax[2].set_ylabel('counts')
ax[2].set_xlabel('simple kantorovich distance')
autolabel(ax[2],rect)
plt.savefig(fname = './3_dist_20maps ',dpi = 600)
rect = ax[3].hist(my_dists,10)
ax[3].set_ylabel('counts')
ax[3].set_xlabel('my distance')
autolabel(ax[3],rect)
plt.savefig(fname = './3_dist_20maps ',dpi = 600)

Using matplotlib backend: module://ipympl.backend_nbagg


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [220]:
%matplotlib
fig,ax = plt.subplots(1,4,figsize = (15,3))
fig.suptitle('Distance between 19*10 random generated levels (190 instance) ')
rect = ax[0].hist(dl_dists2,10)
ax[0].set_ylabel('counts')
ax[0].set_xlabel('damerau levenshtein distance')
autolabel(ax[0],rect)
rect = ax[1].hist(huasdorff_dists2,10)
ax[1].set_ylabel('counts')
ax[1].set_xlabel('hausdorff distance')
autolabel(ax[1],rect)
rect = ax[2].hist(kt_dists2,10)
ax[2].set_ylabel('counts')
ax[2].set_xlabel('simple kantorovich distance')
autolabel(ax[2],rect)
rect = ax[3].hist(my_dists2,10)
ax[3].set_ylabel('counts')
ax[3].set_xlabel('my distance')
autolabel(ax[3],rect)
plt.savefig(fname = './3_dist_19*10 maps ',dpi = 600)

Using matplotlib backend: module://ipympl.backend_nbagg


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [298]:
%matplotlib
fig,ax = plt.subplots()
ax.hist(dl_dists2,10)
ax.set_ylabel('counts')
ax.set_xlabel('damerau levenshtein distance')

Using matplotlib backend: module://ipympl.backend_nbagg


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 0, 'damerau levenshtein distance')

In [334]:
env1 = gym.make("Taxi-v001")
env2 = gym.make("Taxi-v003")
state_dis(env1.P,env2.P)

  0%|          | 3/1000 [00:04<24:45,  1.49s/it]


KeyboardInterrupt: 

## Definition

### 0. utils

In [145]:
import pickle


def saveData(data, path):
    output = open(path, 'wb')
    # Pickle dictionary using protocol 0.
    pickle.dump(data, output)
    output.close()
    
def loadData(path):
    pkl_file = open(path, 'rb')
    segContent = pickle.load(pkl_file)
    pkl_file.close()
    return segContent

In [146]:
def transfer_rate(lc_no_trans, lc_trans):
    max_no_trans = np.max(lc_no_trans)
    max_trans = np.max(lc_trans)
    c_no = 2*abs(max_no_trans) + 1
    c = 2*abs(max_trans) + 1
    if max_trans > max_no_trans:
        return (max_trans + c_no)/(max_no_trans +c_no)
    elif max_trans < max_no_trans:
        return - (max_no_trans + c)/(max_trans + c)
    else:
       
        time_trans = np.min( np.where(lc_trans >= max_no_trans))
        time_no_trans = np.min( np.where(lc_no_trans >= max_no_trans) )
        if time_trans < time_no_trans:
            return (time_no_trans - time_trans) / time_no_trans
        elif time_trans ==  time_no_trans: 
            return 0
        else:
            return (time_no_trans - time_trans) / (len(lc_no_trans) - time_no_trans)
        

#### show continuality of TR

In [28]:
from matplotlib.animation import FuncAnimation 
from  matplotlib import pyplot as plt
import numpy as np
from IPython.display import HTML
algs = ['qlearn','pr','ops','caps']
data = [None] * len(algs)
all_data = {}
for i,alg in enumerate(algs):
    data[i] = loadData('./past_ex_data/' + "similarity_diff" + '_' + alg + '_'  + '20210429' )
    all_data.update(data[i])
lc_no_trans = all_data['No_transfer_qlearn'][0][0:500]
lc_trans_3long = np.concatenate((np.linspace(-20, 0 ,500),lc_no_trans,[lc_no_trans[-1]]*500,np.linspace(20,40,500)))
lc_trans_3long.shape

(2000,)

In [71]:
len(err_acc_rewards)

2807

In [86]:
err_trans_rate, err_jumpstart, err_asympotic, err_time_to_threshold, err_acc_rewards = [],[],[],[],[]
metrics = ['trans_rate', 'jumpstart', 'asympotic', 'time_to_threshold', 'acc_rewards']
sample_len = len(lc_trans_3long) - len(lc_trans)
for i in range(sample_len):
    for j in range(i,sample_len):
        lc_no_trans= lc_trans_3long[j:j+len(lc_trans)]
        lc_trans = lc_trans_3long[i:i+len(lc_trans)]
        trans_rate, jumpstart, asympotic, time_to_threshold, acc_rewards = get_metrics_val(lc_no_trans,lc_trans)
        transfer_flag = None
        if np.max(lc_trans) > np.max(lc_no_trans):
            transfer_flag = 1
        elif np.max(lc_trans) < np.max(lc_no_trans):
            transfer_flag = -1
        else:
            if time_to_threshold > 0: 
                transfer_flag = 1
            elif time_to_threshold < 0:
                 transfer_flag = -1
            else: transfer_flag = 0
        for metric in metrics:
            if locals()[metric] * transfer_flag < 0 :
                locals()['err_'+ metric].append([i,j])
            elif locals()[metric] * transfer_flag == 0 and locals()[metric] + transfer_flag != 0:
                locals()['err_'+ metric].append([i,j])
for metric in metrics:
    print(metric + ': ' , end ='')
    print(len( locals()['err_'+ metric] ) / (sample_len*(sample_len - 1)/2))

trans_rate: 0.0
jumpstart: 0.15196975761618858
asympotic: 0.15239759839893263
time_to_threshold: 0.02259106070713809
acc_rewards: 0.0024967756281965756


In [138]:
i = 850
j = 877
fig,ax = plt.subplots()
lc_trans_i = lc_trans_3long[i:i+len(lc_trans)]
lc_trans_j = lc_trans_3long[j:j+len(lc_trans)]
t_train = len(lc_trans)
time_trans_i = np.min(np.where(lc_trans_i == np.max(lc_trans_i))) #the first point that reach the max value in lc_trans
time_trans_j = np.min(np.where(lc_trans_j == np.max(lc_trans_j)))
ax.plot(range(t_train), lc_trans_i)
ax.plot(range(t_train), lc_trans_j)

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [136]:
get_metrics_val(lc_trans_i,lc_trans_j)

(0, 2.0, 0.0, 0, 168.0)

In [None]:
max = 0
id_max = 0
for err in err_acc_rewards:
    if err[0] - err[1]

In [88]:
%matplotlib widget
t_train = len(lc_no_trans)
fig, ax = plt.subplots() 
lc_no_trans = lc_trans_3long[t_train:2*t_train]

# def init():
# #     line_no, = plt.plot(range(t_train), lc_no_trans,color='blue', label = 'No_transfer')
#     line_trans, = plt.plot(range(t_train), lc_trans_3long[:t_train])
#     ax.set_xlim(0, 500)
#     ax.set_ylim(-20, 40)    
#     return line_trans

#calculate transfer rate before animation in case of calculation delay between frames
tr = []
for frame_idx in range(3*t_train):
    lc_trans = lc_trans_3long[frame_idx : frame_idx + t_train]
    tr.append( transfer_rate(lc_no_trans,lc_trans) )
 
def update(frame_idx):
    plt.cla()
    ax.set_xlim(0, t_train)
    ax.set_ylim(-20, 40) 
    lc_trans = lc_trans_3long[frame_idx : frame_idx + t_train]
    line_trans, = ax.plot(range(t_train), lc_no_trans,color = 'blue', label='No_Transfer')
    line_trans=ax.plot(range(t_train), lc_trans,color = 'orange',label = 'Transfer')    
    time_trans = np.min(np.where(lc_trans == np.max(lc_trans))) #the first point that reach the max value in lc_trans
    ax.axhline(y=max(lc_trans),xmin=0,xmax = time_trans/t_train, color ='r',linestyle='--')
    yticks = ax.get_yticks()
    yticks_  = np.append(yticks,max(lc_trans))
    ax.set_yticks(yticks_)
#     colors = ['b']*len(yticks)
#     colors.append('red')
    ax.set_yticklabels(np.append(yticks,f'p* = {max(lc_trans):.1f}'))
    plt.title(f"Transfer Rate = {tr[frame_idx]:+.4f}")  
    plt.legend(loc = 'lower right')
    return line_trans
ani = FuncAnimation(fig, update, frames=3*t_train, 
                   interval=1,repeat = False)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [91]:
fig, ax = plt.subplots() 
ax.plot(range(len(lc_trans_3long)), lc_trans_3long,color = 'orange')
ax.set_xlim(right = 2000)
plt.title(f'Orange learning curve')


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Orange learning curve')

In [92]:
fig, ax = plt.subplots() 
ax.plot(range(len(tr)), tr)
ax.set_xlim(right = 1600)
plt.title(f'Transfer rate during {len(tr)} frames')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0.5, 1.0, 'Transfer rate during 1500 frames')

In [93]:
1500*1499/2

1124250.0

In [10]:
from matplotlib import animation, rc
from IPython.display import HTML

In [7]:
# First set up the figure, the axis, and the plot element we want to animate
fig, ax = plt.subplots()

ax.set_xlim(( 0, 2))
ax.set_ylim((-2, 2))

line, = ax.plot([], [], lw=2)
# initialization function: plot the background of each frame
def init():
    line.set_data([], [])
    return (line,)
# animation function. This is called sequentially
def animate(i):
    x = np.linspace(0, 2, 1000)
    y = np.sin(2 * np.pi * (x - 0.01 * i))
    line.set_data(x, y)
    return (line,)
# call the animator. blit=True means only re-draw the parts that have changed.
anim = FuncAnimation(fig, animate, init_func=init,
                               frames=100, interval=20, blit=True)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [147]:
def get_metrics_val(learning_curve_no_trans, learning_curve_trans):
    """
    get the value of 5 metrics
    1.jumpstart
    2.asympotic performance 
    3.time to threshold
    4.accumulated rewards
    5.transfer rate
    """
    lc_no_trans = learning_curve_no_trans
    lc_trans = learning_curve_trans
    
    #jumpstart
    jumpstart = lc_trans[0] - lc_no_trans[0]
    
    #asympotic performance 
    asympotic = lc_trans[-1] - lc_no_trans[-1]
    
    # time to threshold(convergence)
    threshold = min( np.max(lc_no_trans),np.max(lc_trans)  )
   
    time_trans = np.min( np.where(lc_trans >= threshold))
    time_no_trans = np.min( np.where(lc_no_trans >= threshold) )
    time_to_threshold = time_no_trans - time_trans
    
    # accumulated rewards
    acc_rewards = np.sum(lc_trans - lc_no_trans)
    
    #transfer rate 
    trans_rate = transfer_rate(lc_no_trans, lc_trans)
    
    return trans_rate, jumpstart, asympotic, time_to_threshold, acc_rewards

In [148]:
def get_results(lc_no_transfer, lc_transfer):
    assert lc_no_transfer.shape[0] == lc_transfer.shape[0],\
    "two learning curve must have same run times"
    nruns =  lc_no_transfer.shape[0]
    trans_rate, jumpstart, asympotic, time_to_threshold, acc_rewards = [np.zeros(nruns), np.zeros(nruns),np.zeros(nruns),np.zeros(nruns),np.zeros(nruns)]
    for i in range(nruns):
        trans_rate[i], jumpstart[i], asympotic[i], time_to_threshold[i], acc_rewards[i] = get_metrics_val(lc_no_transfer[i], lc_transfer[i])
    res = { 'Transfer rate': trans_rate,
            'Jumpstart': jumpstart,
            'Asympotic performace': asympotic,
            'Time to threshold': time_to_threshold,
            'Accumulated rewards': acc_rewards       
           }
    return res

In [149]:
def make_color_dict(data):
    keys = [key for key in data.keys()]
    
    trans_settings = set()
    algs = set()
    for key in keys:
        temp = key.split('_')
        trans_settings.add(temp[0])
        algs.add(temp[2])
    trans_settings = list(trans_settings) 
    algs = list(algs)
    trans_settings.sort(reverse = True)
    algs.sort(reverse = True)
    cmap_names =['Blues', 'Reds', 'Greens']
    color_dict = {}
    for i, trans_setting in enumerate(trans_settings):
        for j, alg in enumerate(algs):
            assert len(algs) > 0, 'data is empty'
            color = plt.get_cmap(cmap_names[i%len(cmap_names)])(0.8 - 0.8 / len(algs) * j +0.1)
            if trans_setting not in color_dict:
                color_dict.update({trans_setting:{alg: color}})
            elif alg not in color_dict[trans_setting]:
                color_dict[trans_setting].update({alg: color})
    return color_dict

In [150]:
def plot_result(data, nepisodes, smooth_radius=50, combine_figures = True, average_group = True, show_std = True, figsize = None, title="episode-reward graph", xlabel="episode", ylabel = "rewards"):
   
    plt.rcParams['savefig.dpi'] = 600
    data = copy.deepcopy(data)
    color_dict = make_color_dict(data)
    # plot n runs as only 1 curve
    if average_group:
        for label,rewards in data.items():
            mean = np.mean(rewards, axis = 0)
            std = np.std(rewards, axis = 0)
            data[label]= {'mean': mean, 'std': std}
            
   
    if combine_figures:
        f, axarr = plt.subplots(1, 1, sharex=False, squeeze=False, figsize=figsize)
        ax= axarr[0][0]
        assert average_group == True, "average_group should be True when combine_figures is True"
        for label in data.keys():
                color = color_dict[label.split('_')[0]][label.split('_')[2]]
                y = data[label]['mean'][:nepisodes]
                ystd = data[label]['std'][:nepisodes]
                #print(content.shape)
                ax.plot(range(nepisodes), smooth(y,smooth_radius),color = color,label = label)
                if show_std:
                    ax.fill_between(range(nepisodes), smooth(y -  ystd,smooth_radius), smooth(y +  ystd,smooth_radius), color = color, alpha = 0.2)
                ax.set_xlabel(xlabel)
                ax.set_ylabel(ylabel)
                if title == "episode-reward graph":
                    ax.set_title(label.split('_')[2] +' ' + label.split('_')[-2] + ":  " +title)
                else:
                    ax.set_title(title)
                ax.legend()
    else:
        f, axarr = plt.subplots(len(data), 1, sharex=True, sharey=True, squeeze=False, figsize=figsize)
        for idx,label in enumerate(data.keys()):
            ax = axarr[idx][0]            
            color = color_dict[label.split('_')[0]][label.split('_')[2]]
            
            
            if average_group:
               
                y = data[label]['mean'][:nepisodes]
                ystd = data[label]['std'][:nepisodes]
                ax.plot(range(nepisodes), smooth(data[label]['mean'][:nepisodes],smooth_radius),color = color,label = label)
                if show_std:
                    ax.fill_between(range(nepisodes), smooth(y -  ystd,smooth_radius), smooth(y +  ystd,smooth_radius), color = color, alpha = 0.2)
                ax.set_xlabel(xlabel)
                ax.set_ylabel(ylabel)
                ax.set_title(label+ ":  " +title)
                ax.legend()
                
            else:
                for i,reward in enumerate(data[label]):
                    if i==0:
                        ax.plot(range(nepisodes),smooth(reward[:nepisodes],smooth_radius),label = label)
                        ax.set_xlabel(xlabel)
                        ax.set_ylabel(ylabel)
                        ax.set_title(label+ ":  " +title)
                        ax.legend()
                    else:
                        ax.plot(range(nepisodes),smooth(reward[:nepisodes],smooth_radius),label = None)

    plt.tight_layout()
                           

In [151]:
def arg_max(state_action):
    max_index_list = []
    max_value = state_action[0]
    for index, value in enumerate(state_action):
        if value > max_value:
            max_index_list.clear()
            max_value = value
            max_index_list.append(index)
        elif value == max_value:
            max_index_list.append(index)
    return random.choice(max_index_list)

In [152]:
def count_zero_state(q_table):
    zero_state = 0
    for arr_2d in q_table:
         zero_state += np.sum(np.sum(arr_2d,axis = 1) == 0)
    return zero_state

#### consts

In [173]:
EPISODE_NUM = 4000
MAX_EPISODE_LEN = 200
REPEAT_TIMES = 10  # train agent REAPEAT_TIMES to get averaged learning curves
EVALUATION_TIMES = 10 # evaluate target policy EVALUATION_TIMES after x updates in off-policy RL algorithms

In [154]:
import numpy as np


import random
from IPython.display import clear_output
from time import sleep

def show_frames(env,j ,i, episode_rewards ):
    clear_output(wait=True)
    env.render()
    print(f"episode {j+1}  step {i}  rewards={episode_rewards}")
    sleep(0.1)




In [155]:
def smooth(y, radius):
    '''
    smooth data y by averaging the values in each window [max{index-radius,0}, min{(index+radius), len(y)-1}] 
    
    '''
    if (len(y) < 2 * radius + 1):
        return np.mean(y) * np.ones_like(y)
    else:
        convkernel = np.ones(2 * radius + 1)
        out = np.convolve(y, convkernel, mode = 'same') / np.convolve(np.ones_like(y), convkernel, mode = 'same')
        return out

In [156]:
def policy_evaluate(env, policy, times = 10):
    # store data, make average and return
    all_length, all_penalties, all_rewards = np.zeros(times), np.zeros(times), np.zeros(times)

    for i in range(times):

        state = env.reset()
        done = False
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        
        for j in range(MAX_EPISODE_LEN):
            action = arg_max(policy[state])
            next_state, reward, done, info = env.step(action)
            state = next_state
            #update data for learning curve
            if reward == -10:
                episode_penalties +=1

            episode_rewards += reward
            episode_length += 1
            if done:
                break

        all_rewards[i] = episode_rewards
        all_penalties[i] = episode_penalties
        all_length[i] = episode_length
    return np.mean(all_rewards), np.mean(all_penalties), np.mean(all_length)

### 1.1 q-learning

In [178]:
random.randint(0,10)

6

In [157]:
def qlearn(env, train_episodes, init_q_table=None):
    """
    Training the agent
    Q(state,action)←(1−α)Q(state,action)+α(reward+γmaxaQ(next state,all actions))
    """  
    
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning

    timesteps =  train_episodes * MAX_EPISODE_LEN

    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    
    if init_q_table is None:
        q_table = np.zeros([env.observation_space.n, env.action_space.n])
    else:
        q_table = copy.deepcopy(init_q_table)
    
    env_copy = copy.deepcopy(env) # for policy_evaluate
    i = 0
    while i < timesteps:
        env.seed(random.randint(0,timesteps))
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False

        while not done and i < timesteps:
            # epsilon greedy alg balancing exporation and exploitation
            if random.uniform(0,1)< epsilon:
                action = env.action_space.sample()
            else:
                action = arg_max(q_table[state])

            # step to next state
            next_state, reward, done, info = env.step(action)       
            
            # update q-value
            q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
            q_table[state, action] = q_value

            state = next_state
            if i%200 == 0:
                #evaluate policy for learning curve after each episode
                episode_rewards, episode_penalties, episode_length = policy_evaluate(env_copy, q_table, EVALUATION_TIMES)
                
                all_episodes_length.append(episode_length)
                all_penalties.append(episode_penalties)
                all_rewards.append(episode_rewards)
                
                #show training progress
                clear_output(wait=True)
                print(f"timesteps: {i}")
            i += 1
         
    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table

### 1.2 sarsa

In [158]:
def sarsa(env,  train_episodes,init_s_table = None):
    """Training the agent
    Q(state,action)←(1−α)Q(state,action)+α(reward+γQ(next state,next action))

    """  
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning

    train_episodes =  train_episodes 
    timesteps =  train_episodes * MAX_EPISODE_LEN
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    
    
    if init_s_table is None:
        s_table = np.zeros([env.observation_space.n, env.action_space.n])
    else:
        s_table = copy.deepcopy(init_s_table)
    env_copy = copy.deepcopy(env)
    
    i = 0
    while i < timesteps:
        env.seed(random.randint(0,timesteps))
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False

        # epsilon greedy alg balancing exporation and exploitation
        if random.uniform(0,1)< epsilon:
            action = env.action_space.sample()
        else:
            action = arg_max(s_table[state])

        while not done and i < timesteps:

            # step to next state
            next_state, reward, done, info = env.step(action)

            # choose next action
            if random.uniform(0,1)< epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = arg_max(s_table[next_state])

            # update q-value
            q_value = (1-alpha) * s_table[state, action] + alpha * (reward + gamma * s_table[next_state,next_action])
            s_table[state, action] = q_value

            state = next_state
            action = next_action

            if i%200 == 0:
                #evaluate policy for learning curve after each episode
                episode_rewards, episode_penalties, episode_length = policy_evaluate(env_copy, s_table, EVALUATION_TIMES)
                
                all_episodes_length.append(episode_length)
                all_penalties.append(episode_penalties)
                all_rewards.append(episode_rewards)
                
                #show training progress
                clear_output(wait=True)
                print(f"timesteps: {i}")
            i += 1
    print("Training finished \n")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, s_table

### 2 PRQL 

In [159]:
def prql(env, train_episodes, past_policy):
        
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    fi = 1 # the intial probability to reuse past policy in policy-reuse algo, decays after each step in one episode
    mu = 0.95 # the decaying rate of fi
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []

    timesteps =  train_episodes * MAX_EPISODE_LEN
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
 
    past_table = copy.deepcopy(past_policy)
    env_copy = copy.deepcopy(env)
    
    i = 0
    while i < timesteps:
        env.seed(random.randint(0,timesteps))
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        f = fi
        while not done and i < timesteps:
            # epsilon greedy alg balancing exporation and exploitation
            if random.uniform(0,1) < f:
                action = arg_max(past_table[state])
                #print(f"f = {f} in episode {i} \n")
            elif random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
                #print("random action \n")
                
            else:
                action = arg_max(q_table[state])
                #print("on the policy now \n")

            # step to next state
            next_state, reward, done, info = env.step(action)

            # update q-value
            q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
            q_table[state, action] = q_value

            state = next_state
            f = f*mu
            
            if i%200 == 0:
                #evaluate policy for learning curve after each episode
                episode_rewards, episode_penalties, episode_length = policy_evaluate(env_copy, q_table, EVALUATION_TIMES)
                
                all_episodes_length.append(episode_length)
                all_penalties.append(episode_penalties)
                all_rewards.append(episode_rewards)
                
                #show training progress
                clear_output(wait=True)
                print(f"timesteps: {i}")
            i += 1
    print("Training finished \n")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table    


### 3 OPS-TL

In [160]:
def ops(env,train_episodes,past_policies):
    """
    Training the agent using algorithm in paper "An Optimal Online Method of Selecting 
    Source Policies for Reinforcement Learning"
    """
    # Hyper parameters
    alpha = 0.05 #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95 # (gamma) is the discount factor (0≤γ≤1)
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    fi = 0.95 # the intial probability to reuse past policy in policy-reuse algo, decays after each step in one episode
    c = 0.0049 # the factor in UCB-tuned deciding the rate of exploration, a lager c will lead to a higher exploration rate
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
   
    #win = {'reuse': [], 'new': []}
    timesteps = train_episodes * MAX_EPISODE_LEN
   
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    env_copy = copy.deepcopy(env)
    
    # pt = 1-k/(k+1500)
    
    #initialize the expected reward of every bandit/past_policy
    policy_expect = np.zeros([len(past_policies)])
    policy_times = np.zeros([len(past_policies)])
    for j in range(len(past_policies)):
        t = 0
        env.reset()
        _,policy_expect[j],_,_= policy_reuse(timesteps, env,q_table, env.reset(), t,[],[],[],past_policies[j])
        policy_times[j] += 1
        
    # training strat  
    i = 0
    while i < timesteps:
        env.seed(random.randint(0,timesteps))
        state = env.reset()
        
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        
        k = i // MAX_EPISODE_LEN  + 1
        pt = 1-k/(k+1500)
        
        # use UCB1 to solve MAB(multi armed bandit problem
        if random.uniform(0,1) < pt:
            j = arg_max(policy_expect + np.sqrt(c * np.log(np.sum(policy_times)) / policy_times )) # UCB1-tuned
            episode_penalties, episode_rewards, episode_length,i = policy_reuse(timesteps, env, q_table, state, i,
                                                                              all_episodes_length,all_penalties,all_rewards,
                                                                              past_policy = past_policies[j])
            policy_expect[j] = (policy_expect[j]*policy_times[j] + episode_rewards)/( policy_times[j] +1)
            policy_times[j] += 1
#            if episode_length < 200:
#                win['reuse'].append(k)
        # follow episilon-greedy strategy    
        else:
            episode_penalties, episode_rewards, episode_length,i = epsilon_greedy(timesteps, env, q_table, state, i , 
                                                                                all_episodes_length,all_penalties,all_rewards,
                                                                                epsilon, alpha, gamma)
#            if episode_length < 200:
#                win['new'].append(k)        


        #show training progress
        if (k) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {k}")
    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table    
            
            
        

In [161]:
def epsilon_greedy(timesteps, env, q_table, initial_state, i,
                   all_episodes_length,all_penalties,all_rewards,epsilon = 0.1, alpha = 0.05,  gamma = 0.95 ):
    done = False
    state = initial_state
    episode_penalties, episode_rewards, episode_length = 0, 0, 0
  
    _env_copy = copy.deepcopy(env)
    while not done and i < timesteps:
        if random.uniform(0,1) < epsilon:
            action = env.action_space.sample()
        else:
            action = arg_max(q_table[state])
            
        # step to next state
        next_state, reward, done, info = env.step(action)

        # update q-value
        q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
        q_table[state, action] = q_value

        state = next_state
      
        #update data for learning curve
        if reward == -10:
            episode_penalties +=1

        episode_rewards += reward
        episode_length += 1
        
        if i%200 == 0:
            #evaluate policy for learning curve after each episode
            _episode_rewards, _episode_penalties, _episode_length = policy_evaluate(_env_copy, q_table, EVALUATION_TIMES)

            all_episodes_length.append(_episode_length)
            all_penalties.append(_episode_penalties)
            all_rewards.append(_episode_rewards)

            #show training progress
            clear_output(wait=True)
            print(f"timesteps: {i}")
        i += 1

    return episode_penalties, episode_rewards, episode_length,i

In [162]:
def policy_reuse(timesteps,env, q_table, initial_state, i, 
                 all_episodes_length,all_penalties,all_rewards, past_policy,  fi = 0.95, mu=0.95, alpha = 0.05,  gamma = 0.95 ):
    done = False
    state = initial_state
    episode_penalties, episode_rewards, episode_length = 0, 0, 0
    
    _env_copy = copy.deepcopy(env)
    while not done and i < timesteps:
        if random.uniform(0,1) < fi:
            action = arg_max(past_policy[state])
        else:
            action = env.action_space.sample()
            
        # step to next state
        next_state, reward, done, info = env.step(action)

        # update q-value
        q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
        q_table[state, action] = q_value

        state = next_state
        fi = fi*mu
        #update data for learning curve
        if reward == -10:
            episode_penalties +=1

        episode_rewards += reward
        episode_length += 1
        
        if i%200 == 0:
            #evaluate policy for learning curve after each episode
            _episode_rewards, _episode_penalties, _episode_length = policy_evaluate(_env_copy, q_table, EVALUATION_TIMES)

            all_episodes_length.append(_episode_length)
            all_penalties.append(_episode_penalties)
            all_rewards.append(_episode_rewards)

            #show training progress
            clear_output(wait=True)
            print(f"timesteps: {i}")
        i += 1
        
    return episode_penalties, episode_rewards, episode_length,i
    

### 4 caps

In [163]:
def caps(env,train_episodes,past_policies):
    """
    Training the agent using algorithm in paper "Context-aware policy reuse"
    """
    # Hyper parameters
    alpha = 0.05 #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95 # (gamma) is the discount factor (0≤γ≤1)
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    fi = 0.95 # the intial probability to reuse past policy in policy-reuse algo, decays after each step in one episode
    c = 0.0049 # the factor in UCB-tuned deciding the rate of exploration, a lager c will lead to a higher exploration rate
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    all_frequency = np.zeros([len(past_policies)+1, train_episodes])
#   win = {'reuse': [], 'new': []}
    timesteps = train_episodes * MAX_EPISODE_LEN
   
    q_table = np.zeros([env.observation_space.n, env.action_space.n])
    env_copy = copy.deepcopy(env)
    
    # pt = 1-k/(k+1500)
    
    #initialize the expected reward of every bandit/past_policy
    frequency = np.zeros([len(past_policies)+1, train_episodes])
    policy_expect = np.zeros([len(past_policies)])
    policy_times = np.zeros([len(past_policies)])
    for j in range(len(past_policies)):
        t = 0
        env.reset()
        for _ in range(5):
            _,temp,_,_= policy_reuse(timesteps, env,q_table, env.reset(), t,[],[],[],past_policies[j])
            policy_expect[j] += temp
        policy_expect[j] /= 10
        policy_times[j] += 1
        
    # training strat  
    i = 0
    while i < timesteps:
        env.seed(random.randint(0,timesteps))
        state = env.reset()
        
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        
        k = i // MAX_EPISODE_LEN  + 1
        pt = 1-k/(k+1500)
        
        # use UCB1 to solve MAB(multi armed bandit problem
        if random.uniform(0,1) < pt:
            j = arg_max(policy_expect/20 + np.sqrt(c * np.log(np.sum(policy_times)) / policy_times )) # UCB1-tuned
            episode_penalties, episode_rewards, episode_length,i = policy_reuse(timesteps, env, q_table, state, i,
                                                                              all_episodes_length,all_penalties,all_rewards,
                                                                              past_policy = past_policies[j])
            policy_expect[j] = (policy_expect[j]*policy_times[j] + episode_rewards)/( policy_times[j] +1)
            policy_times[j] += 1
            frequency[j][k-1] += 1
#            if episode_length < 200:
#                win['reuse'].append(k)
        # follow episilon-greedy strategy    
        else:
            episode_penalties, episode_rewards, episode_length,i = epsilon_greedy(timesteps, env, q_table, state, i , 
                                                                                all_episodes_length,all_penalties,all_rewards,
                                                                                epsilon, alpha, gamma)
#            if episode_length < 200:
#                win['new'].append(k)        
            frequency[len(past_policies)][k-1] += 1


    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    for line in range(train_episodes):
        for row in range(len(past_policies)+1):
            all_frequency[row][line]=np.sum(frequency[row][:line+1], keepdims= True) /np.sum(np.sum(frequency[:,:line+1]))
    return all_episodes_length, all_penalties, all_rewards, all_frequency, q_table    

### 5 OC

In [164]:
def oc(env ,episode_num ,  option_policies_lib = None, 
                  option_terminations_lib = None, policy_over_options = None, critic = None, noptions = 4,seed = 1 ):
    
    # Discount
    discount = 0.99
    

    # Learning rates - termination, intra-option, critic
    lr_term = 0.25
    lr_intra = 0.25
    lr_critic = 0.5

    # Epsilon for epsilon-greedy for policy over options
    epsilon = 1e-1

    # Temperature for softmax
    temperature = 0.01
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []
    

    rng = np.random.RandomState(seed)
    #trainning timesteps
    timesteps =  episode_num * MAX_EPISODE_LEN

    nstates = env.observation_space.n
    nactions = env.action_space.n
    
    # for option_evaluate
    env.seed(random.randint(0,timesteps))
    state = env.reset()
    env_copy = copy.deepcopy(env) 

    # Following three belong to the Actor

    # 1. The intra-option policies - linear softmax functions
    if option_policies_lib is None:
        option_policies = [SoftmaxPolicy(rng, lr_intra, nstates, nactions, epsilon, temperature) for _ in range(noptions)]
      
       
    else:
        option_policies = [None] * noptions
        for i in range(noptions):
            option_policies[i] = copy.deepcopy(option_policies_lib[i])


    # 2. The termination function - linear sigmoid function
    if option_terminations_lib is None:
        option_terminations = [SigmoidTermination(rng, lr_term, nstates) for _ in range(noptions)]
        
    else:
        option_terminations =  [None] * noptions
        for i in range(noptions):
            option_terminations[i] = copy.deepcopy(option_terminations_lib[i])



    # 3. The epsilon-greedy policy over options
    if policy_over_options is None:
        policy_over_options = EpsGreedyPolicy(rng, nstates, noptions, epsilon)

        
    else:
        policy_over_options = copy.deepcopy(policy_over_options)


    # Critic
    if critic is None:
        critic = Critic(lr_critic, discount, policy_over_options.Q_Omega_table, nstates, noptions, nactions)
    else:
        critic = copy.deepcopy(critic)

   
    
    i=0
    while i < timesteps:

        # Change goal location after 1000 episodes 
        # Comment it for not doing transfer experiments
#        if episode == 1000:
#            env.goal = rng.choice(possible_next_goals)
#            print('New goal: ', env.goal)

        state = env.reset()

        option = policy_over_options.sample(state)
        action = option_policies[option].sample(state)

        critic.cache(state, option, action)
        
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        
        
        while not done and i < timesteps:
            state, reward, done, _ = env.step(action)

            # Termination might occur upon entering new state
            if option_terminations[option].sample(state):
                option = policy_over_options.sample(state)


            action = option_policies[option].sample(state)

            # Critic update
            critic.update_Qs(state, option, action, reward, done, option_terminations)

            # Intra-option policy update with baseline
            Q_U = critic.Q_U(state, option, action)
            Q_U = Q_U - critic.Q_Omega(state, option)
            option_policies[option].update(state, action, Q_U)

            # Termination condition update
            option_terminations[option].update(state, critic.A_Omega(state, option))
            
          
            if i%200 == 0:
                #evaluate policy for learning curve after each episode
                episode_rewards, episode_penalties, episode_length = option_evaluate(env_copy, option_policies, option_terminations, policy_over_options, EVALUATION_TIMES)
                
                all_episodes_length.append(episode_length)
                all_penalties.append(episode_penalties)
                all_rewards.append(episode_rewards)

                #show training progress
                clear_output(wait=True)
                print(f"timesteps: {i}")
            i += 1



    print(f"Training finished")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, option_policies, option_terminations, policy_over_options,critic

In [165]:
def option_evaluate(env,  option_policies, option_terminations, policy_over_options , times):
    # store data, make average and return
    all_length, all_penalties, all_rewards = np.zeros(times), np.zeros(times), np.zeros(times)

    for i in range(times):

        state = env.reset()
        done = False
        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        
        option = policy_over_options.evaluate(state)
        action = option_policies[option].evaluate(state)
       
        for j in range(MAX_EPISODE_LEN):
            # Termination might occur upon entering new state
            if option_terminations[option].sample(state):
                option = policy_over_options.evaluate(state)


            action = option_policies[option].evaluate(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            #update data for learning curve
            if reward == -10:
                episode_penalties +=1

            episode_rewards += reward
            episode_length += 1
            if done:
                break

        all_rewards[i] = episode_rewards
        all_penalties[i] = episode_penalties
        all_length[i] = episode_length
    return np.mean(all_rewards), np.mean(all_penalties), np.mean(all_length)

###  tranform

In [188]:
random.seed(2)

In [166]:
def transform( tl_algo, episode_num, repeat_times, target_task, 
              source_task= None, source_rl_algo = None, policy_library = None):
    
    """
    return all_episodes_length, all_penalties, all_rewards, all_frequency, all_trans_knowledge
    
    Situation 1:
    no source task available, train agent on target task from scratch 
    
    Situation 2:
    source task is available but source policy unavailable, train agent on source task to get source policy, 
    then train agent on target task using knowledge from source policy
    
    Situation 3:
    source policies are availale, reuse source policies to train agent on target task
    """
    train_episodes = episode_num    
    # data collected during trainning
    all_episodes_length = np.zeros([repeat_times,train_episodes])
    all_penalties = np.zeros([repeat_times,train_episodes])
    all_rewards = np.zeros([repeat_times,train_episodes])
    if tl_algo is caps and policy_library is not None:
        all_frequency = np.zeros([len(policy_library[0])+1, train_episodes])
    else:
        all_frequency = None
    
    all_trans_knowledge = []
    
    
    
    for i in range(repeat_times):
       
        np.random.seed(i)
        random.seed(i)
        #Situation 3    
        if policy_library is not None: 
            
            if type(target_task) is str:
                env = gym.make(target_task)
            else:
                env = target_task
            past_policies = policy_library # policy_library = [ [[policy11][policy21]...] [[policy12][policy22]...]...]
            if tl_algo is caps:
                episodes,penalties,rewards,frequency, *knowledge = tl_algo(env, train_episodes, past_policies[i]) 
            else:
                episodes,penalties,rewards, *knowledge = tl_algo(env, train_episodes,  past_policies[i]) 
        #Situation 2   
        elif source_task is not None:
            if source_rl_algo is None:
                source_rl_algo = tl_algo
            if type(source_task) is str:
                env = gym.make(source_task)
            else:
                env = source_task
            _, _,_ ,*knowledge = source_rl_algo(env,  train_episodes)
            
            if type(target_task) is str:
                env = gym.make(target_task)
            else:
                env = target_task
            episodes,penalties,rewards,*knowledge = tl_algo(env, train_episodes , *knowledge)
         #Situation 1
        else: 
            if type(target_task) is str:
                env = gym.make(target_task)
            else:
                env = target_task
                
           
            episodes,penalties,rewards,*knowledge = tl_algo(env, train_episodes)
            

        all_episodes_length[i] = episodes
        all_penalties[i] = penalties
        all_rewards[i] = rewards
        if tl_algo is caps and policy_library is not None:
            all_frequency += frequency
        if len(knowledge) > 1:
            all_trans_knowledge.append(knowledge)
        else:
            all_trans_knowledge.append(*knowledge)

    
    if tl_algo is caps and policy_library is not None:
        all_frequency /= repeat_times
    
    return all_episodes_length, all_penalties, all_rewards, all_frequency, all_trans_knowledge

In [190]:
a= np.zeros([2,3])
a[0]

array([0., 0., 0.])

## Training

### 1 q-learning

In [39]:
%%time
qlearn_episodes_length_fourroom, qlearn_penalties_fourroom, qlearn_rewards_fourroom, qlearn_q_table_fourroom = [[None] * len(four_room_envs) for _ in range(4)]
for i in range(len(four_room_envs)):
    qlearn_episodes_length_fourroom[i], qlearn_penalties_fourroom[i], qlearn_rewards_fourroom[i], _, qlearn_q_table_fourroom[i] = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[i],
                                                                                     source_task = None
                                                                                    )

NameError: name 'CAPS' is not defined

In [40]:
%%time

# training Taxi-v003 from scratch
qlearn_episodes_length_v003, qlearn_penalties_v003, qlearn_rewards_v003, _, qlearn_q_table_v003 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v003",
                                                                                     source_task = None
                                                                                    )

NameError: name 'CAPS' is not defined

In [48]:
%%time

# training Taxi-v000 from scratch
qlearn_episodes_length_v000, qlearn_penalties_v000, qlearn_rewards_v000, _, qlearn_q_table_v000 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v000",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 12min 15s, sys: 1min 8s, total: 13min 24s
Wall time: 14min 51s


In [49]:
%%time

# training Taxi-v001 from scratch
qlearn_episodes_length_v001, qlearn_penalties_v001, qlearn_rewards_v001,_, qlearn_q_table_v001 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v001",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 12min 4s, sys: 1min 14s, total: 13min 19s
Wall time: 14min 56s


In [50]:
%%time

# training Taxi-v004 from scratch
qlearn_episodes_length_v004, qlearn_penalties_v004, qlearn_rewards_v004,_, qlearn_q_table_v004 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v004",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 12min 55s, sys: 1min 27s, total: 14min 22s
Wall time: 15min 31s


### 2 sarsa

In [51]:
%%time

# training Taxi-v003 from scratch
sarsa_episodes_length_v003, sarsa_penalties_v003, sarsa_rewards_v003, _, sarsa_q_table_v003 = transform(tl_algo = sarsa,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v003",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished 

CPU times: user 10min 37s, sys: 1min 44s, total: 12min 21s
Wall time: 12min 53s


In [52]:
%%time

# training Taxi-v001 from scratch
sarsa_episodes_length_v001, sarsa_penalties_v001, sarsa_rewards_v001,_, sarsa_q_table_v001 = transform(tl_algo = sarsa,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v001",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished 

CPU times: user 7min 21s, sys: 2min 3s, total: 9min 24s
Wall time: 8min 14s


In [53]:
%%time

# training Taxi-v004 from scratch
sarsa_episodes_length_v004, sarsa_penalties_v004, sarsa_rewards_v004,_, sarsa_q_table_v004 = transform(tl_algo = sarsa,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v004",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished 

CPU times: user 7min 58s, sys: 2min 29s, total: 10min 27s
Wall time: 8min 57s


### 3 PRQL

In [40]:
%%time

pr_episodes_length_fourroom, pr_penalties_fourroom, pr_rewards_fourroom, pr_q_table_fourroom = [[None] * (len(four_room_envs)-1) for _ in range(4)]

#pr_fourroom_v012_3
for i in range(len(four_room_envs)-1):
    pr_episodes_length_fourroom[i], pr_penalties_fourroom[i], pr_rewards_fourroom[i], _, pr_q_table_fourroom[i] = transform(tl_algo = prql,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[3],
                                                                                     policy_library = qlearn_q_table_fourroom[i]
                                                                                    )

NameError: name 'qlearn_q_table_fourroom' is not defined

In [41]:
%%time
#pr_v014_003
pr_episodes_length_v014_003, pr_penalties_v014_003,pr_rewards_v014_003, _, pr_q_table_v014_003 = transform(tl_algo= prql,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v004)

TypeError: randint() missing 1 required positional argument: 'b'

###  3 OPS-TL

In [42]:
%%time

ops_episodes_length_fourroom, ops_penalties_fourroom, ops_rewards_fourroom, ops_q_table_fourroom = [[None] * (len(four_room_envs)-1) for _ in range(4)]

#ops_fourroom_v012_3

ops_episodes_length_fourroom, ops_penalties_fourroom, ops_rewards_fourroom, _, ops_q_table_fourroom = transform(tl_algo = OPS_TL,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[3],
                                                                                     policy_library =  [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_fourroom[0],qlearn_q_table_fourroom[1],qlearn_q_table_fourroom[2])]
                                                                                    )

NameError: name 'qlearn_q_table_fourroom' is not defined

In [43]:
%%time
#ops_v014_003
ops_episodes_length_v014_003, ops_penalties_v014_003,ops_rewards_v014_003,ops_fre_v014_003, ops_q_table_v014_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_v000, qlearn_q_table_v001, qlearn_q_table_v004)])

NameError: name 'qlearn_q_table_v000' is not defined

### 4 caps

In [44]:
%%time


#caps_fourroom_v012_3

caps_episodes_length_fourroom, caps_penalties_fourroom, caps_rewards_fourroom, _, caps_q_table_fourroom = transform(tl_algo = caps,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                     target_task = four_room_envs[3],
                                                                                     policy_library =  [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_fourroom[0],qlearn_q_table_fourroom[1],qlearn_q_table_fourroom[2])]
                                                                                    )

NameError: name 'qlearn_q_table_fourroom' is not defined

In [45]:
%%time
#caps_v014_003
caps_episodes_length_v014_003, caps_penalties_v014_003,caps_rewards_v014_003,caps_fre_v014_003, caps_q_table_v014_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_v000, qlearn_q_table_v001, qlearn_q_table_v004)])

NameError: name 'qlearn_q_table_v000' is not defined

### 5 OC

In [46]:

%autoreload 2
option_policies, option_terminations, policy_over_options, critic, nrewards = [ [None]*4 for i in range(5) ] 

ERROR:root:Line magic function `%autoreload` not found.


In [336]:
%%time

for i in [0,1,2,3]:
    _, _ ,nrewards[i] ,option_policies[i], option_terminations[i], policy_over_options[i],critic[i]= oc( four_room_envs[i], 4000, noptions=1)


timesteps: 799800
Training finished
CPU times: user 13min 44s, sys: 2min 30s, total: 16min 15s
Wall time: 14min 31s


In [255]:
%%time
trans_rewards,trans_terminations, trans_policies = [ [None]*4 for _ in range(3)]
 
for i in [0,1,2]:
    _,_, trans_rewards[i], trans_policies[i], trans_terminations[i], _,_= oc( four_room_envs[3], 4000, option_policies[i], option_terminations[i])


IndexError: list index out of range

In [308]:
source_lib = [1,2]
_,_, trans_rewards_012, _, _, _,_= oc( four_room_envs[3], 4000, [option_policies[i][0] for i in source_lib], noptions=2)

timesteps: 799800
Training finished


In [373]:
%matplotlib widget
import seaborn as sns
RATIO = 1
SMOOTH_RADIUS = 50
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_fourroom[2], SMOOTH_RADIUS), label = "NoneTransfer")
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003,SMOOTH_RADIUS), label = "caps_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003,SMOOTH_RADIUS), label = "caps_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003,SMOOTH_RADIUS), label = "caps_v4_003",color='y')
i = 0
#plt.plot(range(EPISODE_NUM//RATIO), smooth(pr_rewards_fourroom[i],SMOOTH_RADIUS), label = "pr_v014_003",color='c')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(tl_rewards_fourroom,SMOOTH_RADIUS), label = "ops_v014_003",color='g')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_fourroom,SMOOTH_RADIUS), label = "caps_v014_003",color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(trans_rewards_012,SMOOTH_RADIUS), label = "oc_v01_3")
plt.plot(range(EPISODE_NUM//RATIO), smooth(nrewards[3], SMOOTH_RADIUS), label = "oc_v3")





plt.title("0-4000 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [242]:
%matplotlib widget
RATIO = 4
SMOOTH_RADIUS = 100
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_fourroom[3][:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003,SMOOTH_RADIUS), label = "caps_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003,SMOOTH_RADIUS), label = "caps_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003,SMOOTH_RADIUS), label = "caps_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(pr_rewards_fourroom[0][:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "pr_v014_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(tl_rewards_fourroom[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "ops_v014_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_fourroom[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v014_003",color='r')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(trans_rewards[3][:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "oc_v014_003",color='m')





plt.title("0-1000 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

NameError: name 'pr_rewards_fourroom' is not defined

In [372]:
#%matplotlib widget
RATIO = 1
SMOOTH_RADIUS = 50
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003, SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003,SMOOTH_RADIUS), label = "caps_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003,SMOOTH_RADIUS), label = "caps_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003,SMOOTH_RADIUS), label = "caps_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(pr_rewards_v014_003,SMOOTH_RADIUS), label = "pr_v014_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(tl_rewards_v014_003,SMOOTH_RADIUS), label = "ops_v014_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v014_003,SMOOTH_RADIUS), label = "caps_v014_003",color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(trans_rewards[3],SMOOTH_RADIUS), label = "oc_v014_003",color='m')





plt.title("0-4000 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

NameError: name 'pr_rewards_v014_003' is not defined

In [None]:
RATIO = 4
SMOOTH_RADIUS = 50
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003,SMOOTH_RADIUS), label = "caps_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003,SMOOTH_RADIUS), label = "caps_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003,SMOOTH_RADIUS), label = "caps_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(pr_rewards_v014_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "pr_v014_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(tl_rewards_v014_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "ops_v014_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v014_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v014_003",color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(trans_rewards[3][:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "oc_v014_003",color='m')





plt.title("0-1000 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
import seaborn as sns
RATIO = 1
SMOOTH_RADIUS = 100
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v01_003[0],SMOOTH_RADIUS), label = 'ploicy_v001')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v01_003[1],SMOOTH_RADIUS), label = 'ploicy_v004')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v01_003[2],SMOOTH_RADIUS), label = 'target policy')
plt.legend()
plt.xlabel('episodes')
plt.ylabel('frequency')
plt.title("frequency of policy selection")
plt.show()
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v04_003,SMOOTH_RADIUS), label = "caps_v04_003",color='g')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_fre_v14_003,SMOOTH_RADIUS), label = "caps_v14_003",color='r')

In [None]:
RATIO = 10
SMOOTH_RADIUS = 20

plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v000_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v0_003",color='b')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v001_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v1_003",color='m')
#plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v004_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v4_003",color='y')

plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v01_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v01_003",color='c')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v04_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v04_003",color='g')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v14_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = "caps_v14_003",color='r')



plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v014_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "caps_v014_003", color='navy')

plt.title("first 400 episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
%%time
#ops_v1_003
ops_episodes_length_v1_003, ops_penalties_v1_003,ops_rewards_v1_003, ops_q_table_v1_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v1",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v1])

In [None]:
%%time
#ops_v4_003
ops_episodes_length_v4_003, ops_penalties_v4_003,ops_rewards_v4_003, ops_q_table_v4_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v4])

In [None]:
%%time
#ops_v14_003
ops_episodes_length_v14_003, ops_penalties_v14_003,ops_rewards_v14_003, ops_q_table_v14_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func1,q_func2 ] for q_func1,q_func2 in zip(qlearn_q_table_v1, qlearn_q_table_v4)])

In [None]:
%%time
#caps_v14_003
caps_episodes_length_v14_003, ops__norm_penalties_v14_003,caps_rewards_v14_003, caps_q_table_v14_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = 5,
                                                                                       target_task = "Taxi-v003",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func1,q_func2 ] for q_func1,q_func2 in zip(qlearn_q_table_v1, qlearn_q_table_v4)])

In [None]:
import functools
#caps_v14_003
pl=[[q_func1,q_func2 ] for q_func1,q_func2 in zip(qlearn_q_table_v1, qlearn_q_table_v4)]
ctuple = (0, 0.0049, 0.1, 0.2, 0.4,0.8, 1, 2, 4, 8, 16)
ops_cnorm_episodes_length_v14_003,ops__cnorm_penalties_v14_003,ops_cnorm_rewards_v14_003,ops_cnorm_q_table_v14_003 = [0] * len(ctuple), [0]*len(ctuple),[0]*len(ctuple), [0]*len(ctuple)
for i in range(len(ctuple)):
    OPS_patial = functools.partial( caps, c = ctuple[i])
    ops_cnorm_episodes_length_v14_003[i], ops__cnorm_penalties_v14_003[i],ops_cnorm_rewards_v14_003[i], ops_cnorm_q_table_v14_003[i] = transform(tl_algo= OPS_patial,
               episode_num = EPISODE_NUM,
               repeat_times = REPEAT_TIMES,
               target_task = "Taxi-v003",
               policy_library = pl)



In [None]:
sns.set(style = "darkgrid")
plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v003, 100), label = "qlearn_v003", color='k')
for i in range(len(ctuple)):
    plt.plot(range(EPISODE_NUM), smooth(ops_cnorm_rewards_v14_003[i], 100), label = "c = "+str(ctuple[i]))
    
plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
RATIO = 20
SMOOTH_RADIUS = 200


for i in range(len(ctuple)):
    plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_cnorm_rewards_v14_003[i][:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "c = "+str(ctuple[i]))
    
plt.title("reward-epsidoe graph using caps different c")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
for i in range(len(ctuple)):
    print(f"c = {ctuple[i]}, reward = {smooth(ops_cnorm_rewards_v14_003[i][:EPISODE_NUM//20], 200)[0]}" )

In [None]:
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

reward = [smooth( ops_cnorm_rewards_v14_003[i][:EPISODE_NUM//20], 200)[0] for i in range(len(ctuple)) ]
"""
绘制水平条形图方法barh
参数一：y轴
参数二：x轴
"""
plt.barh(range(len(ctuple)), reward, height=0.7, color='steelblue', alpha=0.8)      # 从下往上画
plt.yticks(range(len(ctuple)), [f"c={str(ctuple[i])}" for i in range(len(ctuple))])
plt.xlim(15,20)
plt.xlabel("average reward ")
plt.title("average reward over first 200 episode under different c")
for x, y in enumerate(reward):
    plt.text(y + 0.2, x - 0.1, '%s' % y)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style = "darkgrid")


plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v003, 100), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3",color='c')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3",color='g')

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v4_003, 100), label = "ops_v4_003", color='r')
plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v1_003, 100), label = "ops_v1_003", color='b')

plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v14_003, 100), label = "ops_v14_003", color='y')
plt.plot(range(EPISODE_NUM), smooth(caps_rewards_v14_003, 100), label = "caps_v14_003", color='navy')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:

RATIO = 8
SMOOTH_RADIUS = 25
plt.plot(range(EPISODE_NUM//RATIO), smooth(qlearn_rewards_v003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "qlearn_v003", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3",color='c')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3",color='g')

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v4_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_v4_003", color='r')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v1_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_v1_003", color='b')
plt.plot(range(EPISODE_NUM//RATIO), smooth(ops_rewards_v14_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "ops_v14_003", color='y')
plt.plot(range(EPISODE_NUM//RATIO), smooth(caps_rewards_v14_003[:EPISODE_NUM//RATIO], SMOOTH_RADIUS), label = "caps_v14_003", color='navy')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
RATIO = 8
SMOOTH_WINDOW = 20
plt.plot(range(EPISODE_NUM//RATIO),smooth(qlearn_episodes_length_v003[:EPISODE_NUM//RATIO],SMOOTH_WINDOW), label = 'qlearn_v3')
'''
plt.plot(range(train_episodes),qlearn_episodes_length_v1_3, label = 'qlearn_v1_3')
plt.plot(range(train_episodes),qlearn_episodes_length_v4_3, label = 'qlearn_v4_3')

plt.plot(range(train_episodes),sarsa_episodes_length_v3, label = 'sarsa_v3')
plt.plot(range(train_episodes),sarsa_episodes_length_v1_3, label = 'sarsa_v1_3')
plt.plot(range(train_episodes),sarsa_episodes_length_v4_3, label = 'sarsa_v4_3')
'''
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_episodes_length_v1_003[:EPISODE_NUM//RATIO],SMOOTH_WINDOW), label = 'ops_v1_3')
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_episodes_length_v4_003[:EPISODE_NUM//RATIO],SMOOTH_WINDOW), label = 'ops_v4_3')

plt.title("episode-length graph")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
RATIO = 20
SMOOTH_RADIUS = 20
plt.plot(range(EPISODE_NUM//RATIO),smooth(qlearn_penalties_v03[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'qlearn_v3')
'''
plt.plot(range(train_episodes),qlearn_episodes_length_v1_3, label = 'qlearn_v1_3')
plt.plot(range(train_episodes),qlearn_episodes_length_v4_3, label = 'qlearn_v4_3')

plt.plot(range(train_episodes),sarsa_episodes_length_v3, label = 'sarsa_v3')
plt.plot(range(train_episodes),sarsa_episodes_length_v1_3, label = 'sarsa_v1_3')
plt.plot(range(train_episodes),sarsa_episodes_length_v4_3, label = 'sarsa_v4_3')
'''
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_penalties_v1_03[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'ops_v1_3')
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_penalties_v4_03[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'ops_v4_3')

plt.title("episode-penalties graph")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show



In [None]:
def visit_all(dic):
    for value in dic.values():
        if value == 0:
            return False
    return True

sum = 0
times = 1000
for i in range(times):    
    dic = {i: 0 for i in range(100)}
    while not visit_all(dic):
        key = random.choice(range(len(dic)))
        dic[key] = 1
        sum += 1
sum /= times
print(sum)

In [None]:
def div_add(n):
    sum = 0
    for i in range(1,n+1):
        sum += 1/i
        
    return n * sum
div_add(3000)/200


In [None]:
sum = 0
times = 1000
for i in range(times):    
    dic = {i: 0 for i in range(100)}
    while dic[0] == 0:
        key = random.choice(range(len(dic)))
        dic[key] = 1
        sum += 1
sum /= times
print(sum)

In [None]:
def p_reuse(env, past_policy, train_episodes, fi = 1, mu = 0.95):
        
    # Hyper parameters
    alpha = 0.05  #(alpha) is the learning rate (0<α≤1) 
    gamma = 0.95  # (gamma) is the discount factor (0≤γ≤1) 
    epsilon = 0.1 # the rate to act randomly in the epsilon-greedy alg for expolration in Q-learning
    
    # To plot learning curve
    all_episodes_length = []
    all_penalties = []
    all_rewards = []

    q_table = np.zeros([env.observation_space.n, env.action_space.n])
 
    past_table = copy.deepcopy(past_policy[0])
    
    
    for i in range(train_episodes):
        state = env.reset()

        episode_penalties, episode_rewards, episode_length = 0, 0, 0
        done = False
        f = fi
        while not done:
            # epsilon greedy alg balancing exporation and exploitation
            if random.uniform(0,1) < f:
                action = arg_max(past_table[state])
                #print(f"f = {f} in episode {i} \n")
            elif random.uniform(0,1) < epsilon:
                action = env.action_space.sample()
                #print("random action \n")
                
            else:
                action = arg_max(q_table[state])
                #print("on the policy now \n")

            # step to next state
            next_state, reward, done, info = env.step(action)

            # update q-value
            q_value = (1-alpha) * q_table[state, action] + alpha * (reward + gamma * np.max(q_table[next_state]))
            q_table[state, action] = q_value

            state = next_state
            f = f*mu
            #update data for learning curve
            if reward == -10:
                episode_penalties +=1

            episode_rewards += reward
            episode_length += 1

        #record data for learning curve
        all_episodes_length.append(episode_length)
        all_penalties.append(episode_penalties)
        all_rewards.append(episode_rewards)

        #show training progress
        if (i+1) % 100 == 0:
            clear_output(wait=True)
            print(f"Episode {i}")
    print("Training finished \n")
    
    # return ndarray
    all_episodes_length = np.array(all_episodes_length)
    all_penalties = np.array(all_penalties)
    all_rewards = np.array(all_rewards)
    
    return all_episodes_length, all_penalties, all_rewards, q_table

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style = "darkgrid")



plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v3, 100), label = "qlearn_v3", color='k')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3",color='c')
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3",color='g')

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v4_3, 100), label = "ops_v4_3", color='r')
plt.plot(range(EPISODE_NUM), smooth(ops_rewards_v1_3, 100), label = "ops_v1_3", color='b')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
%%time


# training v03 from scratch

qlearn_episodes_length_v03, qlearn_penalties_v03, qlearn_rewards_v03, qlearn_q_table_v03 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM,  
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v03",
                                                                                     source_task = None
                                                                                    )



In [None]:
%%time


# training v01 from scratch

qlearn_episodes_length_v01, qlearn_penalties_v01, qlearn_rewards_v01, qlearn_q_table_v01 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM,  
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v01",
                                                                                     source_task = None
                                                                                    )


In [None]:
%%time


# training v04 from scratch

qlearn_episodes_length_v04, qlearn_penalties_v04, qlearn_rewards_v04, qlearn_q_table_v04 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM,  
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v04",
                                                                                     source_task = None
                                                                                    )


In [None]:
%%time
#ops_v04_03
ops_episodes_length_v04_03, ops_penalties_v04_03,ops_rewards_v04_03, ops_q_table_v04_03 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v03",
                                                                                       source_task = "Taxi-v04",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v04])

In [None]:
%time
#ops_v01_03
ops_episodes_length_v01_03, ops_penalties_v01_03,ops_rewards_v01_03, ops_q_table_v01_03 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v03",
                                                                                       source_task = "Taxi-v01",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v01])

In [None]:
plt.plot(range(500), smooth(qlearn_rewards_v5[:500], 20), label = "qlearn_v5")
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v5, 100), label = "qlearn_v5")

#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v1_3,100), label = "qlearn_v1_3")
#plt.plot(range(EPISODE_NUM), smooth(qlearn_rewards_v4_3,100), label = "qlearn_v4_3")

#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v4_3, 100), label = "pr_v4_3")
#plt.plot(range(EPISODE_NUM), smooth(pr_rewards_v1_3, 100), label = "pr_v1_3")

plt.plot(range(500), smooth(ops_rewards_v4_5[:500], 20), label = "ops_v4_5")
plt.plot(range(500), smooth(ops_rewards_v1_5[:500], 20), label = "ops_v1_5")

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("rewards")
plt.legend()
plt.show()

In [None]:
for i in qlearn_rewards_v3:
    print(i)

In [None]:
env = gym.make("Taxi-v3")
r,_, _ = policy_evaluate(env, np.zeros([env.observation_space.n,env.action_space.n]), 1)
print(r)

In [None]:

plt.plot(range(EPISODE_NUM//10 ),smooth(pr_rewards_v4_3[-EPISODE_NUM//10: ],100), label = 'pr_v4_3')
plt.plot(range(EPISODE_NUM//10 ),smooth(pr_rewards_v4_3[-EPISODE_NUM//10: ],100), label = 'prql_v4_3')
plt.plot(range(EPISODE_NUM//10), smooth(qlearn_rewards_v3[-EPISODE_NUM//10:], 100), label = "qlearn_v3")
plt.plot(range(EPISODE_NUM//10), smooth(qlearn_rewards_v4[-EPISODE_NUM//10:], 100), label = "qlearn_v4")
plt.plot(range(EPISODE_NUM//10), smooth(qlearn_rewards_v1_3[-EPISODE_NUM//10:],100), label = "qlearn_v1_3")

plt.title("smoothed episode-reward graph: last {} episodes".format(EPISODE_NUM//10))
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:
%%time
ops_episodes_length_v4_3, ops_penalties_v4_3,ops_rewards_v4_3, ops_q_table_v4_3 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = 10,
                                                                                       target_task = "Taxi-v3",
                                                                                       source_task = "Taxi-v4",
                                                                                       policy_library = [[q_func] for q_func in qlearn_q_table_v4])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

train_episodes = EPISODE_NUM

plt.plot(range(train_episodes),smooth(qlearn_rewards_v3,100), 'k', label = 'qlearn_v3')
plt.plot(range(train_episodes),smooth(qlearn_rewards_v1_3,100), label = 'qlearn_v1_3')
plt.plot(range(train_episodes),smooth(qlearn_rewards_v4_3,100), label = 'qlearn_v4_3')


plt.plot(range(train_episodes),smooth(pr_rewards_v1_3,100), label = 'pr_v1_3')
plt.plot(range(train_episodes),smooth(pr_rewards_v4_3,100), label = 'pr_v4_3')


plt.plot(range(train_episodes),smooth(ops_rewards_v1_3,100), label = 'ops_v1_3')
plt.plot(range(train_episodes),smooth(ops_rewards_v4_3,100), label = 'ops_v4_3')

plt.title("episode-reward graph")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show




In [None]:
plt.plot(range(train_episodes//10 ),smooth(qlearn_rewards_v3[-train_episodes//10: ],100), 'k',label = 'qlearn_v3')
plt.plot(range(train_episodes//10 ),smooth(qlearn_rewards_v1_3[-train_episodes//10: ],100), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//10 ),smooth(qlearn_rewards_v4_3[-train_episodes//10: ],100), label = 'qlearn_v1_3')

#plt.plot(range(train_episodes//10 ),smooth(sarsa_rewards_v3[-train_episodes//10: ],100), label = 'sarsa_v3')
#plt.plot(range(train_episodes//10 ),smooth(sarsa_rewards_v1_3[-train_episodes//10: ],100), label = 'sarsa_v1_3')
#plt.plot(range(train_episodes//10 ),smooth(sarsa_rewards_v4_3[-train_episodes//10: ],100), label = 'sarsa_v1_3')

plt.plot(range(train_episodes//10 ),smooth(pr_rewards_v1_3[-train_episodes//10: ],100), label = 'pr_v1_3')
plt.plot(range(train_episodes//10 ),smooth(pr_rewards_v4_3[-train_episodes//10: ],100), label = 'pr_v4_3')

plt.plot(range(train_episodes//10 ),smooth(pr1_rewards_v1_3[-train_episodes//10: ],100), label = 'pr1_v1_3')
plt.plot(range(train_episodes//10 ),smooth(pr1_rewards_v4_3[-train_episodes//10: ],100), label = 'pr1_v4_3')


plt.title("smoothed episode-reward graph: last 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:

plt.plot(range(train_episodes//100),qlearn_rewards_v3[:train_episodes//100], label = 'qlearn_v3')
plt.plot(range(train_episodes//100),qlearn_rewards_v1_3[:train_episodes//100], label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),qlearn_rewards_v4_3[:train_episodes//100], label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),sarsa_rewards_v3[:train_episodes//100], label = 'sarsa_v3')
plt.plot(range(train_episodes//100),sarsa_rewards_v1_3[:train_episodes//100], label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),sarsa_rewards_v4_3[:train_episodes//100], label = 'sarsa_v4_3')

plt.title("episode-reward graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show



In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),smooth(sarsa_rewards_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_rewards_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_rewards_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("smoothed episode-reward graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_rewards_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')


plt.title("q-learning smoothed episode-reward graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("reward")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),qlearn_episodes_length_v3[:train_episodes//100], label = 'qlearn_v3')
plt.plot(range(train_episodes//100),qlearn_episodes_length_v1_3[:train_episodes//100], label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),qlearn_episodes_length_v4_3[:train_episodes//100], label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),sarsa_episodes_length_v3[:train_episodes//100], label = 'sarsa_v3')
plt.plot(range(train_episodes//100),sarsa_episodes_length_v1_3[:train_episodes//100], label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),sarsa_episodes_length_v4_3[:train_episodes//100], label = 'sarsa_v4_3')

plt.title("episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show


In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("smoothed episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_episodes_length_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.title("q-learning smoothed episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_episodes_length_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("sarsa smoothed episode-length graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("length")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes),qlearn_penalties_v3, label = 'qlearn_v3')
plt.plot(range(train_episodes),qlearn_penalties_v1_3, label = 'qlearn_v1_3')
plt.plot(range(train_episodes),qlearn_penalties_v4_3, label = 'qlearn_v4_3')

plt.plot(range(train_episodes),sarsa_penalties_v3, label = 'sarsa_v3')
plt.plot(range(train_episodes),sarsa_penalties_v1_3, label = 'sarsa_v1_3')
plt.plot(range(train_episodes),sarsa_penalties_v4_3, label = 'sarsa_v4_3')

plt.title("episode-penalties graph")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show



In [None]:
plt.plot(range(train_episodes//100),qlearn_penalties_v3[:train_episodes//100], label = 'qlearn_v3')
plt.plot(range(train_episodes//100),qlearn_penalties_v1_3[:train_episodes//100], label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),qlearn_penalties_v4_3[:train_episodes//100], label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),sarsa_penalties_v3[:train_episodes//100], label = 'sarsa_v3')
plt.plot(range(train_episodes//100),sarsa_penalties_v1_3[:train_episodes//100], label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),sarsa_penalties_v4_3[:train_episodes//100], label = 'sarsa_v4_3')

plt.title("episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')

plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("smoothed episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties ")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v3[:train_episodes//100],10), label = 'qlearn_v3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v1_3[:train_episodes//100],10), label = 'qlearn_v1_3')
plt.plot(range(train_episodes//100),smooth(qlearn_penalties_v4_3[:train_episodes//100],10), label = 'qlearn_v4_3')


plt.title("q-learning smoothed episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties ")
plt.legend()
plt.show

In [None]:
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v3[:train_episodes//100],10), label = 'sarsa_v3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v1_3[:train_episodes//100],10), label = 'sarsa_v1_3')
plt.plot(range(train_episodes//100),smooth(sarsa_penalties_v4_3[:train_episodes//100],10), label = 'sarsa_v4_3')

plt.title("sarsa smoothed episode-penalties graph: first 1000 episodes")
plt.xlabel("episode")
plt.ylabel("penalties ")
plt.legend()
plt.show

## Experiment

### 1. Exp1: similarity diff

#### q-learning

In [54]:
%%time
#qlearn_v004_003
qlearn_episodes_length_v004_003, qlearn_penalties_v004_003,qlearn_rewards_v004_003, _, qlearn_q_table_v004_003 = transform(tl_algo= q_learning,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v004)

timesteps: 799800
Training finished
CPU times: user 8min, sys: 1min 42s, total: 9min 42s
Wall time: 8min 41s


In [55]:
%%time
#qlearn_v001_003
qlearn_episodes_length_v001_003, qlearn_penalties_v001_003,qlearn_rewards_v001_003, _, qlearn_q_table_v001_003 = transform(tl_algo= q_learning,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v001)

timesteps: 799800
Training finished
CPU times: user 21min 52s, sys: 5min 52s, total: 27min 45s
Wall time: 24min 17s


#### sarsa

In [56]:
%%time
#sarsa_v001_003
sarsa_episodes_length_v001_003, sarsa_penalties_v001_003,sarsa_rewards_v001_003, _, sarsa_q_table_v001_003 = transform(tl_algo= sarsa,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  sarsa_q_table_v001)

timesteps: 799800
Training finished 

CPU times: user 23min 8s, sys: 8min 17s, total: 31min 26s
Wall time: 27min 4s


In [57]:
%%time
#sarsa_v004_003
sarsa_episodes_length_v004_003, sarsa_penalties_v004_003,sarsa_rewards_v004_003, _, sarsa_q_table_v004_003 = transform(tl_algo= sarsa,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  sarsa_q_table_v004)

timesteps: 799800
Training finished 

CPU times: user 7min 31s, sys: 1min 47s, total: 9min 18s
Wall time: 8min 48s


#### Train all

In [189]:
qlearn_rewards_v4

array([[-770.6, -792.8, -707.6, ...,    8.4,    8.6,    8.2],
       [-794.2, -730.2, -764.3, ...,    8.3,    7.8,    7.9],
       [-794. , -783.2, -736.7, ...,    8.3,    7.6,    8.4],
       ...,
       [-733.7, -891.2, -970.4, ...,    6.2,    9.7,    7.8],
       [-803.9, -693.2, -729.9, ...,    7.6,    7. ,    7.4],
       [-756.3, -965.9, -763.4, ...,    8.5,    8.6,    8.1]])

In [185]:
%%time
ENV = 'Taxi'
rl_algs = ['qlearn','sarsa']
#alias
qtable = qlearn
stable =sarsa
tl_algs = [ 'qtable', 'stable', 'prql','ops','caps']
levels =['v1', 'v3','v4']
for rl_alg in tqdm(rl_algs):    
    for level in tqdm(levels):
        # source task training from scratch
        _,_,locals()[rl_alg + '_rewards_'+level ],_,locals()[rl_alg + '_q_table_'+level ] = transform(tl_algo= eval(rl_alg),
                                                                            episode_num = EPISODE_NUM,
                                                                            repeat_times = REPEAT_TIMES,
                                                                            target_task = ENV + '-' + levels[1],
                                                                                  )
         # transfer training
        if level != levels[1]:
            for tl_alg in tl_algs:
                if tl_alg not in ['ops','caps']:
                    _,_,locals()[tl_alg + '_' + rl_alg +'_rewards_'+level +'_'+ levels[1]],_,_ = transform(tl_algo= eval(tl_alg),
                                                                                episode_num = EPISODE_NUM,
                                                                                repeat_times = REPEAT_TIMES,
                                                                                target_task = ENV + '-' + levels[1],
                                                                                policy_library =  eval(rl_alg + '_q_table_' + level))
                else:
                    _,_,locals()[tl_alg + '_' + rl_alg + '_rewards_'+level +'_'+ levels[1]],_,_ = transform(tl_algo= eval(tl_alg),
                                                                                episode_num = EPISODE_NUM,
                                                                                repeat_times = REPEAT_TIMES,
                                                                                target_task = ENV + '-' + levels[1],
                                                                                policy_library =  [[qfunc] for qfunc in eval(rl_alg + '_q_table_' + level)])

timesteps: 799800
Training finished



100%|██████████| 3/3 [8:09:35<00:00, 9791.88s/it]   [A
100%|██████████| 2/2 [16:19:21<00:00, 29380.76s/it] 

CPU times: user 15h 47min 19s, sys: 45min 3s, total: 16h 32min 23s
Wall time: 16h 19min 21s





In [199]:
all_data

{'Close_transfer_caps_qlearn_F1': array([[ -785.9,  -804.8,  -821. , ...,     7.5,     8.9,     9.5],
        [ -784.1,  -776. ,  -813.8, ...,     8.8,     7.7,     7.9],
        [ -790.4,  -766.1,  -865.1, ...,     8.4,     7.4,     7. ],
        ...,
        [ -794.9,  -941.6,  -751.3, ...,     8.4,     8.3,     8.4],
        [ -744.9,  -810.6,  -804.8, ...,     7.3,     6.6,     7.6],
        [ -785. , -1026.2,  -872.3, ...,     7.3,     8.4,     7.6]]),
 'Close_transfer_caps_sarsa_F1': array([[-794. , -822.8, -814.7, ...,    7.4,    7.6,    8.1],
        [-762.5, -774.2, -821.9, ...,    7.8,    7.5,    8.7],
        [-773.3, -896.6, -810.2, ...,    7.2,    8. ,    7.3],
        ...,
        [-782.3, -827.3, -742.7, ...,    8.3,    7.4,    7.9],
        [-796.7, -795.8, -807.5, ...,   10. ,    7.5,    8.4],
        [-768.8, -863.3, -815.6, ...,    7.9,    8.5,    6.7]]),
 'Close_transfer_ops_qlearn_F1': array([[-766.3, -737.9, -787.7, ...,    7.4,    5.7,    8.7],
        [-790.4, -

In [196]:
tl_algs = ['qtable', 'stable', 'prql','ops','caps']
rl_algs = ['qlearn','sarsa']
data = [None] * (len(tl_algs) * len(rl_algs))
all_data = {}
i = 0
for rl_alg in rl_algs:
    for tl_alg in (tl_algs):
        data[i] = cache(tl_alg =tl_alg, rl_alg = rl_alg)
        all_data.update(data[i])
        i += 1

In [194]:
import os
def cache(ex_name = "Taxi", tl_alg = "qlearn", rl_alg = '', all_data = None, reward_mode = 'F1', target_level = 'v3', save = True):
    
    if os.path.exists("./"+ time.strftime("%Y%m%d", time.localtime())) is not True:
        os.mkdir("./"+ time.strftime("%Y%m%d", time.localtime()) )
    if all_data is None:
        data = {"No_transfer_null_" + rl_alg + '_' + reward_mode: eval(rl_alg +"_rewards_" + target_level),
                "Far_transfer_"+tl_alg + '_' + rl_alg + '_' + reward_mode: eval(tl_alg+'_'+ rl_alg +"_rewards_"+ target_level[0] + '1' + '_' + target_level),
                "Close_transfer_"+tl_alg +'_' + rl_alg + '_' + reward_mode: eval(tl_alg+'_'+ rl_alg +"_rewards_"+ target_level[0] + '4' + '_' + target_level)
               }
        if save:
            path = "./" + time.strftime("%Y%m%d", time.localtime()) +'/' + ex_name + '_' + tl_alg + '_'  + rl_alg 
            saveData(data,path)
    else:
        data = all_data
        path = "./"+ time.strftime("%Y%m%d", time.localtime()) +'/'  + ex_name + '_' +  'alldata_'  + rl_alg 
        saveData(data,path)
    return data

#### PRQL

In [63]:
%%time
#pr_v001_003
pr_episodes_length_v001_003, pr_penalties_v001_003,pr_rewards_v001_003, _, pr_q_table_v001_003 = transform(tl_algo= prql,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v001)

timesteps: 799800
Training finished 

CPU times: user 7min 4s, sys: 1min 9s, total: 8min 13s
Wall time: 7min 30s


In [69]:
%%time
#pr_v004_003
pr_episodes_length_v004_003, pr_penalties_v004_003,pr_rewards_v004_003, _, pr_q_table_v004_003 = transform(tl_algo= prql,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v004)

timesteps: 799800
Training finished 

CPU times: user 7min 14s, sys: 1min 28s, total: 8min 42s
Wall time: 7min 50s


#### OPS-TL

In [87]:
%%time
#ops_v001_003
ops_episodes_length_v001_003, ops_penalties_v001_003,ops_rewards_v001_003, _, ops_q_table_v001_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                   repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  [[qfunc] for qfunc in qlearn_q_table_v001])

Episode 4000
Training finished
CPU times: user 31min 22s, sys: 1min, total: 32min 23s
Wall time: 31min 35s


In [88]:
%%time
#ops_v004_003
ops_episodes_length_v004_003, ops_penalties_v004_003,ops_rewards_v004_003, _, ops_q_table_v004_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  [[qfunc] for qfunc in qlearn_q_table_v004])

Episode 4000
Training finished
CPU times: user 47min 26s, sys: 39.8 s, total: 48min 6s
Wall time: 47min 28s


#### caps

In [89]:
%%time
#caps_v001_003
caps_episodes_length_v001_003, caps_penalties_v001_003,caps_rewards_v001_003, _, caps_q_table_v001_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  [[qfunc] for qfunc in qlearn_q_table_v001])

timesteps: 799800
Training finished
CPU times: user 31min 14s, sys: 1min 18s, total: 32min 33s
Wall time: 31min 38s


In [90]:
%%time
#caps_v004_003
caps_episodes_length_v004_003, caps_penalties_v004_003,caps_rewards_v004_003, _, caps_q_table_v004_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  [[qfunc] for qfunc in qlearn_q_table_v004])

timesteps: 799800
Training finished
CPU times: user 49min 35s, sys: 44.3 s, total: 50min 20s
Wall time: 49min 49s


#### Table

##### qlearn

In [589]:
# qlearn-Close transfer 
pd.set_option('display.float_format', lambda x: '%.3f' % x)
res = get_results(qlearn_rewards_v003,qlearn_rewards_v004_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,0.01,10.2,0.0,24.8,1310.2
std,0.539,3.327,0.0,228.083,1841.795
min,-0.842,6.0,0.0,-303.0,-988.0
25%,-0.231,8.0,0.0,-90.0,118.0
50%,-0.084,10.0,0.0,-32.5,927.0
75%,0.336,12.0,0.0,155.0,2323.5
max,0.811,16.0,0.0,365.0,5114.0


In [590]:
# qlearn-Far transfer
res = get_results(qlearn_rewards_v003,qlearn_rewards_v001_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,-inf,-1.0,-13.8,-inf,-68990.8
std,,1.414,3.19,,2596.006
min,-inf,-4.0,-18.0,-inf,-72612.0
25%,-inf,-2.0,-16.0,-inf,-71320.5
50%,-inf,0.0,-14.0,-inf,-69026.0
75%,-inf,0.0,-10.5,-inf,-66954.0
max,-7.928,0.0,-10.0,-3504.0,-64660.0


##### PRQL

In [591]:
# pr-Close transfer
res = get_results(qlearn_rewards_v003,pr_rewards_v004_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,0.91,-0.4,0.0,385.8,5854.6
std,0.016,1.578,0.0,59.243,829.821
min,0.886,-4.0,0.0,314.0,4970.0
25%,0.898,0.0,0.0,338.0,5212.5
50%,0.912,0.0,0.0,390.0,5705.0
75%,0.917,0.0,0.0,412.5,6268.5
max,0.934,2.0,0.0,491.0,7528.0


In [592]:
# pr-Far transfer
res = get_results(qlearn_rewards_v003,pr_rewards_v001_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,0.397,-0.2,0.0,171.0,2371.0
std,0.105,1.476,0.0,66.483,1034.359
min,0.226,-4.0,0.0,92.0,1198.0
25%,0.359,0.0,0.0,140.0,1890.0
50%,0.388,0.0,0.0,155.0,2155.0
75%,0.399,0.0,0.0,182.75,2335.5
max,0.637,2.0,0.0,335.0,4742.0


##### OPS-TL

In [229]:
# ops-Close transfer
res = get_results(qlearn_rewards_v003,ops_rewards_v004_003)
res = pd.DataFrame(res)
res.describe()

NameError: name 'ops_rewards_v004_003' is not defined

In [594]:
# ops-Far transfer
res = get_results(qlearn_rewards_v003,ops_rewards_v001_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,0.469,-0.6,0.0,200.8,2892.6
std,0.142,1.647,0.0,69.723,976.823
min,0.272,-4.0,0.0,98.0,1606.0
25%,0.384,-1.5,0.0,153.0,2026.0
50%,0.46,0.0,0.0,222.5,3138.0
75%,0.577,0.0,0.0,247.75,3570.5
max,0.681,2.0,0.0,301.0,4226.0


##### caps

In [595]:
# caps-Close transfer
res = get_results(qlearn_rewards_v003,caps_rewards_v004_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,0.904,0.0,0.0,383.4,5813.6
std,0.021,1.886,0.0,61.446,845.584
min,0.877,-4.0,0.0,308.0,4908.0
25%,0.889,0.0,0.0,335.5,5190.5
50%,0.901,0.0,0.0,381.0,5567.0
75%,0.915,1.5,0.0,411.25,6331.0
max,0.943,2.0,0.0,489.0,7450.0


In [597]:
# caps-Far transfer
res = get_results(qlearn_rewards_v003,caps_rewards_v001_003)
res = pd.DataFrame(res)
res.describe()

Unnamed: 0,Transfer rate,Jumpstart,Asympotic performace,Time to threshold,Accumulated rewards
count,10.0,10.0,10.0,10.0,10.0
mean,0.436,0.6,0.0,190.9,2839.4
std,0.205,2.503,0.0,106.82,1560.574
min,0.154,-4.0,0.0,59.0,322.0
25%,0.279,0.0,0.0,102.5,1780.0
50%,0.439,0.0,0.0,181.5,2611.0
75%,0.632,2.0,0.0,287.75,4118.0
max,0.693,4.0,0.0,343.0,5212.0


##### OC

|Transfer Setting|**Transfer Rate**|**Jumpstart(100 epsidoe)**|**Asymptotic Performance**|**Time to Threshold**|**Accumulated Rewards**|
|---|---|---|---|---|---|
|***Close transfer***|p =0.000|p =0.000|p =0.000|p =0.000|p =0.000|
|Q-table|0.010$\pm$0.539|8.400$\pm$3.978|0.000$\pm$0.000|24.800$\pm$228.083|1310.200$\pm$1841.795|
|PRQL|**0.910**$\pm$0.016|**17.600**$\pm$1.578|0.000$\pm$0.000|**385.800**$\pm$59.243|**5854.600**$\pm$829.821|
|OPS-TL|0.883$\pm$0.032|17.400$\pm$1.647|0.000$\pm$0.000|375.000$\pm$63.821|5682.600$\pm$904.087|
|caps|0.904$\pm$0.021|**17.600**$\pm$1.578|0.000$\pm$0.000|383.400$\pm$61.446|5813.600$\pm$845.584|
|***Far transfer***|
|Q-table|-4.08$\pm$0.234|-2.400$\pm$1.578|-13.800$\pm$3.190|-inf$\pm$nan|-68990.800$\pm$2596.006|
|PRQL|0.397$\pm$0.105|1.000$\pm$2.160|0.000$\pm$0.000|171.000$\pm$66.483|2371.000$\pm$1034.359|
|OPS-TL|**0.469**$\pm$0.142|4.600$\pm$5.502|0.000$\pm$0.000|**200.800**$\pm$69.723|**2892.600**$\pm$976.823|
|caps|0.436$\pm$0.205|**2.400**$\pm$3.627|0.000$\pm$0.000|190.900$\pm$106.820|2839.400$\pm$1560.574|

In [235]:
import scipy.stats as stats
import scipy.optimize as opt
algs = ['qlearn','pr','ops','caps']
level=['Close_transfer_', 'Far_transfer_']
from scipy.stats import ks_2samp, kstest   
for alg in algs:
    print(alg)
    stat_val, p_val = stats.ks_2samp(np.mean(all_data[level[0]+alg],axis=0), np.mean(all_data[level[1]+alg],axis=0))
    #看看两个分布在均值上有没有显著差异
    #注意，这里我们生成的第二组数据样本大小、方差和第一组均不相等，在运用t检验时需要使用Welch's t-test
    #即指定ttest_ind中的equal_var=False。
    print ('Two-sample ks-test statistic = %6.3f, p-value = %6.4f' % (stat_val, p_val))

qlearn
Two-sample ks-test statistic =  1.000, p-value = 0.0000
pr
Two-sample ks-test statistic =  0.092, p-value = 0.0000
ops
Two-sample ks-test statistic =  0.068, p-value = 0.0000
caps
Two-sample ks-test statistic =  0.112, p-value = 0.0000


In [191]:
%%time
option_policies, option_terminations, policy_over_options, nrewards,critic = [ [None]*4 for i in range(5) ] 
env_list = [ "Taxi-v001", "Taxi-v003", "Taxi-v004"]

for i in range(len(env_list)):
    _, _ ,nrewards[i] ,option_policies[i], option_terminations[i], policy_over_options[i],critic[i] = oc(env = gym.make(env_list[i]),episode_num = 4000)

timesteps: 799800
Training finished
CPU times: user 12min 34s, sys: 3min 3s, total: 15min 37s
Wall time: 13min 42s


In [192]:
%%time
oc_rewards_v001_003, oc_rewards_v004_003 = np.zeros([10,4000]), np.zeros([10,4000])
for i in range(10):
    _, _ ,oc_rewards_v001_003[i] ,_, _, _,_ = oc(env = gym.make( "Taxi-v003"),episode_num = 4000,option_policies_lib=option_policies[0],option_terminations_lib =option_terminations[0] )
    _, _ ,oc_rewards_v004_003[i] ,_, _, _,_ = oc(env = gym.make("Taxi-v003"),episode_num = 4000,option_policies_lib=option_policies[2],option_terminations_lib =option_terminations[2] )

timesteps: 799800
Training finished
CPU times: user 2h 50min 45s, sys: 43min 35s, total: 3h 34min 20s
Wall time: 3h 6min 46s


In [163]:
oc_rewards_v001_003

NameError: name 'oc_rewards_v001_003' is not defined

In [232]:
algs = ['qlearn','pr','ops','caps']
for level in ['Close_transfer_', 'Far_transfer_']:
    print(level)
    for alg in algs:
        print(alg)
        res = get_results(qlearn_rewards_v003,all_data[level+alg])
        res = pd.DataFrame(res)
        for i in range(len(res.std())):
            print("|{:.3f}$\pm${:.3f}".format(round(res.mean()[i],3),round(res.std()[i],3)),end='')
        print("|")

Close_transfer_
qlearn
|0.106$\pm$0.431|9.800$\pm$2.898|0.000$\pm$0.000|50.400$\pm$179.353|2054.800$\pm$1465.836|
pr
|0.915$\pm$0.015|-0.800$\pm$1.932|0.000$\pm$0.000|411.400$\pm$85.930|6599.200$\pm$1285.746|
ops
|0.890$\pm$0.029|0.200$\pm$2.201|0.000$\pm$0.000|400.600$\pm$87.983|6427.200$\pm$1286.940|
caps
|0.910$\pm$0.017|-0.400$\pm$2.066|0.000$\pm$0.000|409.000$\pm$85.825|6558.200$\pm$1260.261|
Far_transfer_
qlearn
|-inf$\pm$nan|-1.400$\pm$1.350|-13.800$\pm$3.190|-inf$\pm$nan|-68246.200$\pm$2539.523|
pr
|0.418$\pm$0.154|-0.600$\pm$1.897|0.000$\pm$0.000|196.600$\pm$105.695|3115.600$\pm$1626.796|
ops
|0.497$\pm$0.127|-1.000$\pm$1.944|0.000$\pm$0.000|226.400$\pm$83.228|3637.200$\pm$1187.541|
caps
|0.477$\pm$0.162|0.200$\pm$2.394|0.000$\pm$0.000|216.500$\pm$90.092|3584.000$\pm$1006.378|


#### all_data

#### Plot

##### Separated figure

In [208]:
%matplotlib widget
plt.rcParams['savefig.dpi'] = 600
algs = ["qtable",'stable', "prql", "ops", "caps"]
data =[None] *len(algs)
for i,alg in enumerate(algs):
    data[i] = cache(ex = "similarity_diff", tl_alg = alg,rl_alg='qlearn')
    plot_result(data[i],1000,smooth_radius=1)
#     plt.savefig(fname = './combined_figure_similarity_diff_'+ alg ,dpi = 600)
    plot_result(data[i],500,combine_figures =False,average_group=False,figsize=(9,12),smooth_radius=1) 
#     plt.savefig(fname = './separated_figure_similarity_diff_'+ alg ,dpi = 600)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

##### combined figure

In [197]:
cache(all_data = all_data,rl_alg ='sarsa')

{'Close_transfer_caps': array([[ 0.,  0.,  6., ..., 20., 20., 20.],
        [ 0.,  0.,  4., ..., 20., 20., 20.],
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        ...,
        [ 2.,  2.,  6., ..., 20., 20., 20.],
        [ 4.,  0.,  2., ..., 20., 20., 20.],
        [ 0.,  8.,  8., ..., 20., 20., 20.]]),
 'Close_transfer_ops': array([[ 0.,  2.,  4., ..., 20., 20., 20.],
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 0.,  0.,  4., ..., 20., 20., 20.],
        ...,
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 0.,  0.,  2., ..., 20., 20., 20.],
        [ 0.,  4.,  0., ..., 20., 20., 20.]]),
 'Close_transfer_prql': array([[ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 0.,  2.,  0., ..., 20., 20., 20.],
        [ 4.,  2.,  2., ..., 20., 20., 20.],
        ...,
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 0.,  2.,  6., ..., 20., 20., 20.]]),
 'Far_transfer_caps': array([[ 2.,  2.,  4., ..., 20., 20., 20.],
        [ 2

In [5]:
algs = ['qlearn','pr','ops','caps']
data = [None] * len(algs)
all_data = {}
for i,alg in enumerate(algs):
    data[i] = loadData('./past_ex_data/' + "similarity_diff" + '_' + alg + '_'  + '20210429' )
    all_data.update(data[i])
all_data    

{'Close_transfer_caps': array([[ 0.,  0.,  4., ..., 20., 20., 20.],
        [ 0.,  0.,  2., ..., 20., 20., 20.],
        [ 0.,  0.,  2., ..., 20., 20., 20.],
        ...,
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 2.,  0.,  0., ..., 20., 20., 20.],
        [ 2.,  2.,  0., ..., 20., 20., 20.]]),
 'Close_transfer_ops': array([[ 2.,  0.,  0., ..., 20., 20., 20.],
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        [ 2.,  0.,  4., ..., 20., 20., 20.],
        ...,
        [ 2.,  0.,  0., ..., 20., 20., 20.],
        [ 2.,  4.,  0., ..., 20., 20., 20.],
        [ 4.,  2.,  0., ..., 20., 20., 20.]]),
 'Close_transfer_pr': array([[ 0.,  4.,  0., ..., 20., 20., 20.],
        [ 0.,  0.,  2., ..., 20., 20., 20.],
        [ 0.,  0.,  0., ..., 20., 20., 20.],
        ...,
        [ 0.,  0.,  2., ..., 20., 20., 20.],
        [ 0.,  2.,  2., ..., 20., 20., 20.],
        [ 2.,  0.,  2., ..., 20., 20., 20.]]),
 'Close_transfer_qlearn': array([[12., 18.,  8., ..., 20., 20., 20.],
        [

In [199]:
algs = ['prql','ops','caps']
data = [None] * len(algs)
all_data = {}
for i,alg in enumerate(algs):
    data[i] = cache(tl_alg =alg,rl_alg ='sarsa')
    all_data.update(data[i])
    

In [209]:

plot_result(all_data,4000,title = "3 similarity + 4 algs : 4000 episode-rewareds graph")
#plt.savefig('./3similarity+4algs:4000episode',dpi =  600)
plot_result(all_data,1000,title = "3 similarity + 4 algs : 1000 episode-rewareds graph", show_std= False ,smooth_radius = 20)
#plt.savefig('./3similarity+4algs:1000episode',dpi =  600)

['qlearn', 'pr', 'ops', 'caps']


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

['qlearn', 'pr', 'ops', 'caps']


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### 2. Exp2

In [58]:
%%time

# training Taxi-v3 from scratch
qlearn_episodes_length_v3, qlearn_penalties_v3, qlearn_rewards_v3, _, qlearn_q_table_v3 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v3",
                                                                                     source_task = None
                                                                                    )

timesteps: 155800


KeyboardInterrupt: 

In [38]:
%%time

# training Taxi-v1 from scratch
qlearn_episodes_length_v1, qlearn_penalties_v1, qlearn_rewards_v1,_, qlearn_q_table_v1 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v1",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 8min 48s, sys: 1min 21s, total: 10min 9s
Wall time: 9min 15s


In [39]:
%%time

# training Taxi-v4 from scratch
qlearn_episodes_length_v4, qlearn_penalties_v4, qlearn_rewards_v4,_, qlearn_q_table_v4 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v4",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 9min 6s, sys: 1min 13s, total: 10min 19s
Wall time: 9min 29s


In [36]:
%%time

# training Taxi-v03 from scratch
qlearn_episodes_length_v03, qlearn_penalties_v03, qlearn_rewards_v03, _, qlearn_q_table_v03 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v03",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 9min 5s, sys: 1min 23s, total: 10min 28s
Wall time: 9min 32s


In [38]:
%%time

# training Taxi-v01 from scratch
qlearn_episodes_length_v01, qlearn_penalties_v01, qlearn_rewards_v01,_, qlearn_q_table_v01 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v01",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 8min 48s, sys: 1min 21s, total: 10min 9s
Wall time: 9min 15s


In [39]:
%%time

# training Taxi-v04 from scratch
qlearn_episodes_length_v04, qlearn_penalties_v04, qlearn_rewards_v04,_, qlearn_q_table_v04 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM, 
                                                                                     repeat_times = REPEAT_TIMES,
                                                                                     target_task = "Taxi-v04",
                                                                                     source_task = None
                                                                                    )

timesteps: 799800
Training finished
CPU times: user 9min 6s, sys: 1min 13s, total: 10min 19s
Wall time: 9min 29s


### 3. Exp3

In [51]:
%%time
#caps_v014_003
caps_episodes_length_v014_003, caps_penalties_v014_003,caps_rewards_v014_003,caps_fre_v014_003, caps_q_table_v014_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_v000, qlearn_q_table_v001, qlearn_q_table_v004)])

timesteps: 154400


KeyboardInterrupt: 

In [52]:
%%time
#pr_v014_003
pr_episodes_length_v014_003, pr_penalties_v014_003,pr_rewards_v014_003, _, pr_q_table_v014_003 = transform(tl_algo= prql,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                                                                                                            policy_library =  qlearn_q_table_v004)

NameError: name 's_table' is not defined

In [53]:
%%time
#ops_v014_003
tl_episodes_length_v014_003, tl_penalties_v014_003,tl_rewards_v014_003,tl_fre_v014_003, tl_q_table_v014_003 = transform(tl_algo= OPS_TL,
                                                                                       episode_num = EPISODE_NUM,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1,q_func2,q_func3 ] for q_func1,q_func2,q_func3 in zip(qlearn_q_table_v000, qlearn_q_table_v001, qlearn_q_table_v004)])

timesteps: 112000


KeyboardInterrupt: 

In [None]:
%%time
taxiEnvs = [gym.make("Taxi-v000"),gym.make("Taxi-v001"),None,gym.make("Taxi-v003"),gym.make("Taxi-v004")]
for i in [0,1,3,4]:
    _, _ ,nrewards[i] ,option_policies[i], option_terminations[i], policy_over_options[i],critic[i]= oc( taxiEnvs[i], 4000, noptions=1)


In [None]:
%%time

source_lib = [0,1,4]
_,_, trans_rewards_012, _, _, _,_= oc( four_room_envs[3], 4000, [option_policies[i][0] for i in source_lib], noptions=3)

## Evaluation

In [191]:
def evaluate(run_times, q_table = None ,task = "Taxi-v003", mode = 'learning agent', show_frame = True):
    """Evaluate agent's performance after policy updates"""
    env = gym.make(task)
    if q_table is None:
        list_length = 1
    else:
        list_length = len(q_table)
    all_epochs, all_penalties, all_rewards, all_success_rate =  np.zeros(list_length), np.zeros(list_length),np.zeros(list_length),np.zeros(list_length)
    all_epochs_std, all_penalties_std, all_rewards_std = np.zeros(list_length), np.zeros(list_length),np.zeros(list_length)
    
    
        
    for i in range( list_length ):
        table_epochs, table_penalties, table_rewards = np.zeros(run_times), np.zeros(run_times),np.zeros(run_times)
        if q_table is not None:
            table = q_table[i]
        success = 0
        for j in range(run_times):
            state = env.reset()
            epochs, penalties, reward, sum_reward = 0, 0, 0, 0

            done = False

            while not done:
                if mode == 'learning agent':
                    action = arg_max(table[state])
                elif mode == 'random agent':
                    action = np.random.randint(0,env.action_space.n)
                state, reward, done, info = env.step(action)

                if reward == -10:
                    penalties += 1
                elif reward == 20:
                    success += 1

                epochs += 1
                sum_reward += reward
                if show_frame:
                    show_frames(env,j, epochs,sum_reward )
            table_penalties[j] = penalties
            table_epochs[j] = epochs
            table_rewards[j] = sum_reward

        all_success_rate[i] = success/run_times    
        all_epochs[i], all_penalties[i], all_rewards[i]  = np.mean(table_epochs), np.mean(table_penalties), np.mean(table_rewards)
        all_epochs_std[i], all_penalties_std[i], all_rewards_std[i] = np.std(table_epochs), np.std(table_penalties), np.std(table_rewards)
      
    print(f"Results over {run_times} evaluating episodes:")
    print(f"Success rate : {np.mean(all_success_rate)}")
    print(f"Average  episode length : {np.mean(all_epochs)} ± {np.mean(all_epochs_std)}")
    print(f"Average penalties per episode: {np.mean(all_penalties)} ± {np.mean(all_penalties_std)}")
    print(f"Average rewards per episode: {np.mean(all_rewards)} ± {np.mean(all_rewards_std)}")

In [144]:
def op_evaluate(run_times,option_policies, option_terminations, policy_over_options, q_table = None ,task = "Taxi-v003", mode = 'learning agent'):
    """Evaluate agent's performance after policy updates"""
    env = gym.make(task)
    if q_table is None:
        list_length = 1
    else:
        list_length = len(q_table)
    all_epochs, all_penalties, all_rewards, all_success_rate =  np.zeros(list_length), np.zeros(list_length),np.zeros(list_length),np.zeros(list_length)
    all_epochs_std, all_penalties_std, all_rewards_std = np.zeros(list_length), np.zeros(list_length),np.zeros(list_length)
    
    
        
    for i in range( list_length ):
        table_epochs, table_penalties, table_rewards = np.zeros(run_times), np.zeros(run_times),np.zeros(run_times)
        if q_table is not None:
            table = q_table[i]
        success = 0
        for j in range(run_times):
            state = env.reset()
            epochs, penalties, reward, sum_reward = 0, 0, 0, 0

            done = False
            option = policy_over_options.evaluate(state)
            while not done:
                if mode == 'learning agent':
                    if option_terminations[option].sample(state):
                        option = policy_over_options.evaluate(state)

                    action = option_policies[option].evaluate(state)
                    
                    
                elif mode == 'random agent':
                    action = np.random.randint(0,env.action_space.n)
                state, reward, done, info = env.step(action)

                if reward == -10:
                    penalties += 1
                elif reward == 20:
                    success += 1

                epochs += 1
                sum_reward += reward
                show_frames(env,j,epochs,sum_reward )
            table_penalties[j] = penalties
            table_epochs[j] = epochs
            table_rewards[j] = sum_reward

        all_success_rate[i] = success/run_times    
        all_epochs[i], all_penalties[i], all_rewards[i]  = np.mean(table_epochs), np.mean(table_penalties), np.mean(table_rewards)
        all_epochs_std[i], all_penalties_std[i], all_rewards_std[i] = np.std(table_epochs), np.std(table_penalties), np.std(table_rewards)
      
    print(f"Results after {run_times} runs:")
    print(f"Success rate : {np.mean(all_success_rate)}")
    print(f"Average  episode length : {np.mean(all_epochs)} ± {np.mean(all_epochs_std)}")
    print(f"Average penalties per episode: {np.mean(all_penalties)} ± {np.mean(all_penalties_std)}")
    print(f"Average rewards per episode: {np.mean(all_rewards)} ± {np.mean(all_rewards_std)}")

In [160]:
%%time
#ops_v0_003
ops_episodes_length_v0_003, ops_penalties_v0_003,ops_rewards_v0_003,ops_fre_v0_003, ops_q_table_v0_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM//10,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1] for q_func1 in qlearn_q_table_v000])

timesteps: 79800
Training finished
CPU times: user 12.3 s, sys: 675 ms, total: 13 s
Wall time: 12.5 s


In [161]:
%%time
#ops_v1_003
ops_episodes_length_v1_003, ops_penalties_v1_003,ops_rewards_v1_003,ops_fre_v1_003, ops_q_table_v1_003 = transform(tl_algo= caps,
                                                                                       episode_num = EPISODE_NUM//10,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                      
                                                                                       policy_library = [[q_func1] for q_func1 in qlearn_q_table_v001])

timesteps: 79800
Training finished
CPU times: user 13 s, sys: 527 ms, total: 13.5 s
Wall time: 13.2 s


In [179]:
%%time
pr_episodes_length_v1_003, pr_penalties_v1_003,pr_rewards_v1_003, _, pr_q_table_v1_003 = transform(tl_algo= prql,
                                                                                       episode_num = EPISODE_NUM//10,
                                                                                       repeat_times = REPEAT_TIMES,
                                                                                       target_task = "Taxi-v003",
                                                                                       
                                                                                       policy_library =  qlearn_q_table_v001)

Episode 399
Training finished 

CPU times: user 2.21 s, sys: 106 µs, total: 2.21 s
Wall time: 2.2 s


In [175]:
%%time
qlearn_episodes_length_v3, qlearn_penalties_v3, qlearn_rewards_v3, _, qlearn_q_table_v3 = transform(tl_algo = q_learning,
                                                                                     episode_num = EPISODE_NUM//10, 
                                                                                     repeat_times = 1,
                                                                                     target_task = "Taxi-v003",
                                                                                     source_task = None
                                                                                    )

timesteps: 79800
Training finished
CPU times: user 3.7 s, sys: 455 ms, total: 4.15 s
Wall time: 3.81 s


In [197]:
%matplotlib widget
RATIO = 10
SMOOTH_RADIUS = 20
plt.plot(range(EPISODE_NUM//RATIO),smooth(qlearn_rewards_v3[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'qlearn_v3')

plt.plot(range(EPISODE_NUM//RATIO),smooth(pr_rewards_v1_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'pr_v1_3')
plt.plot(range(EPISODE_NUM//RATIO),smooth(ops_rewards_v1_003[:EPISODE_NUM//RATIO],SMOOTH_RADIUS), label = 'ops_v1_3')

plt.title("episode-penalties graph")
plt.xlabel("episode")
plt.ylabel("penalties")
plt.legend()
plt.show


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<function matplotlib.pyplot.show>

In [192]:
evaluate(1000,qlearn_q_table_v3,show_frame = False)

Results over 1000 evaluating episodes:
Success rate : 0.779
Average  episode length : 84.576 ± 75.92020958875179
Average penalties per episode: 0.0 ± 0.0
Average rewards per episode: 15.58 ± 8.298409486160585


In [204]:
evaluate(3,ops_q_table_v1_003)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
episode 3  step 6  rewards=20
Results over 3 evaluating episodes:
Success rate : 1.0
Average  episode length : 17.0 ± 9.41629792788369
Average penalties per episode: 0.0 ± 0.0
Average rewards per episode: 20.0 ± 0.0


In [193]:
evaluate(3,pr_q_table_v1_003)

Results over 1000 evaluating episodes:
Success rate : 0.988
Average  episode length : 18.847 ± 25.006750908504685
Average penalties per episode: 0.0 ± 0.0
Average rewards per episode: 19.76 ± 2.1777052142105915


In [194]:
evaluate(1000,pr_q_table_v1_003,show_frame = False)

Results over 1000 evaluating episodes:
Success rate : 0.706
Average  episode length : 85.098 ± 82.33279052722554
Average penalties per episode: 0.0 ± 0.0
Average rewards per episode: 14.12 ± 9.111838453352869


In [None]:
evaluate(1000, mode = 'random agent')

In [None]:
%%time
# training Taxi-v3 reusing policy from Taxi-v1
pr1_rewards_v1_3 = np.zeros(10000)
for i in range(10):
    pr1_episodes_length_v1_3, pr1_penalties_v1_3,rewards, pr1_q_table_v1_3 = prql(gym.make("Taxi-v3"), qlearn_q_table_v1[i], 10000)
    pr1_rewards_v1_3 += rewards
pr1_rewards_v1_3/=10

In [None]:
%%time
# training Taxi-v3 reusing policy from Taxi-v1
pr1_rewards_v4_3 = np.zeros(10000)
for i in range(10):
    pr1_episodes_length_v4_3, pr1_penalties_v4_3,rewards, pr1_q_table_v4_3 = prql(gym.make("Taxi-v3"), qlearn_q_table_v4[i], 10000)
    pr1_rewards_v4_3 += rewards
pr1_rewards_v4_3/=10

In [None]:
%%time

env = gym.make("Taxi-v4")
# training Taxi-v4 from scratch
lprofiler = LineProfiler(q_learning)
lprofiler.run('qlearn_episodes_length_v41, qlearn_penalties_v41, qlearn_rewards_v41, qlearn_q_table_v41 = q_learning(env,np.zeros([env.observation_space.n,env.action_space.n]),EPISODE_NUM)')
lprofiler.print_stats()