# Data Processing for Safe RL

In [11]:
import pandas as pd
import numpy as np


In [12]:
x_limit = 80
y_limit = 80

In [13]:
data_1 = pd.read_csv('../../../robo_limb_ml/ml_data/train_data.csv')
data_2 = pd.read_csv('../../../robo_limb_ml/ml_data/test_data.csv')
data = pd.concat([data_1, data_2])


## Computing rewards

To avoid numbers that are too big, change everything to radians

In [14]:
def deg2rad_normalized(degrees):
    radians = np.deg2rad(degrees)
    normalized_radians = np.mod(radians + np.pi, 2 * np.pi) - np.pi
    return normalized_radians

In [15]:
data['theta_x'] = deg2rad_normalized(data['theta_x'])
data['theta_y'] = deg2rad_normalized(data['theta_y'])
data['vel_x'] = np.sign(data['vel_x']) * deg2rad_normalized(np.abs(data['vel_x']))
data['vel_y'] = np.sign(data['vel_y']) * deg2rad_normalized(np.abs(data['vel_y']))

In [16]:
data

Unnamed: 0,time_begin,time_begin_traj,theta_x,theta_y,vel_x,vel_y,X_throttle,Y_throttle
0,0.358,0.358,-0.086219,-0.187797,0.0,0.000000,0.0,0.0
1,0.421,0.421,-0.086219,-0.187797,0.0,0.000000,0.0,0.0
2,0.485,0.485,-0.086219,-0.187797,0.0,0.000000,0.0,0.0
3,0.548,0.548,-0.086219,-0.187797,0.0,0.000000,0.0,0.0
4,0.612,0.612,-0.086219,-0.187797,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...
31261,1172.517,121.126,-0.384496,-0.656069,0.0,0.000000,-2.0,10.0
31262,1172.589,121.198,-0.384496,-0.656069,0.0,0.000000,-2.0,10.0
31263,1172.661,121.270,-0.384496,-0.646819,0.0,0.128476,-2.0,10.0
31264,1172.733,121.342,-0.384496,-0.646819,0.0,0.000000,-2.0,10.0


In [17]:
# seperate into trajectories
data['delta_t'] = data['time_begin'].diff()
data['new_traj'] = data['delta_t'] > 10
data['traj_num'] = data['new_traj'].cumsum().ffill().astype(int)

In [18]:
seq_len = 10
df_list = []
for traj_num, traj_data in data.groupby('traj_num'):
    traj_data = traj_data.copy()
    traj_data = traj_data.drop(columns=['traj_num', 'new_traj', 'delta_t'])
    num_sequences = traj_data.shape[0] - seq_len
    tiled_input_data = np.array([traj_data[i:i+seq_len] for i in range(num_sequences)])
    df_list.append(tiled_input_data)
data_tiled = np.concatenate(df_list, axis=0)
len(df_list)

522

In [19]:
max_dist = deg2rad_normalized(60)
gamma = 0.99
reward_list = []
action_list = []

curr_states_list = []
next_states_list = []

curr_states_no_unsafe_list = []
next_states_no_unsafe_list = []

reward_no_unsafe_list = []
actions_no_unsafe_list = []

rewards_no_unsafe_exponential_list = []
rewards_exponential_list = []
rewards_base_e_no_unsafe_exponential_list = []
rewards_base_e_exponential_list = []
alpha = 0.01

for traj in df_list:
    dist = np.linalg.norm(traj[:, -1, 2:4], axis=1)
    # Remove last state due to next state offset
    reward = dist[:-1]/max_dist
    action = traj[:-1, -1, 6:]
    index_into_unsafe = np.argmax(dist >= max_dist)

    states = traj[:-1]
    next_states = traj[1:]
    
    reward[index_into_unsafe] = -1/((1-gamma)*gamma**(index_into_unsafe + seq_len))
    reward[index_into_unsafe+1:] = -1
    
    curr_states_no_unsafe_list.append(states[:index_into_unsafe+1]) 
    next_states_no_unsafe_list.append(next_states[:index_into_unsafe+1])
    
    curr_states_list.append(states)
    next_states_list.append(next_states)
    
    reward_list.append(reward)
    action_list.append(action)
    
    actions_no_unsafe_list.append(action[:index_into_unsafe+1])
    reward_no_unsafe_list.append(reward[:index_into_unsafe+1])
    
    #computing exponential rewards
    l = dist[:-1]/max_dist
    t = np.arange(len(l)) + seq_len
    reward_exp = np.power(1 + alpha, -np.power(gamma, t)/l)
    reward_exp_e = np.exp(-np.power(gamma, t)/l)
    reward_exp[index_into_unsafe + 1:] = 0
    reward_exp_e[index_into_unsafe + 1:] = 0
    rewards_no_unsafe_exponential_list.append(reward_exp[:index_into_unsafe + 1])
    rewards_exponential_list.append(reward_exp)
    rewards_base_e_no_unsafe_exponential_list.append(reward_exp_e[:index_into_unsafe + 1])
    rewards_base_e_exponential_list.append(reward_exp_e)
rewards = np.concatenate(reward_list)
actions = np.concatenate(action_list, axis=0)
curr_states_no_unsafe = np.concatenate(curr_states_no_unsafe_list, axis=0)
next_states_no_unsafe = np.concatenate(next_states_no_unsafe_list, axis=0)
curr_states = np.concatenate(curr_states_list, axis=0)
next_states = np.concatenate(next_states_list, axis=0)
rewards_no_unsafe = np.concatenate(reward_no_unsafe_list)
actions_no_unsafe = np.concatenate(actions_no_unsafe_list, axis=0)
rewards_no_unsafe_exponential = np.concatenate(rewards_no_unsafe_exponential_list)
rewards_exponential = np.concatenate(rewards_exponential_list)
rewards_base_e_no_unsafe_exponential = np.concatenate(rewards_base_e_no_unsafe_exponential_list)
rewards_base_e_exponential = np.concatenate(rewards_base_e_exponential_list)
rewards.shape, actions.shape, curr_states.shape, next_states.shape, rewards_no_unsafe.shape, actions_no_unsafe.shape, curr_states_no_unsafe.shape, next_states_no_unsafe.shape, rewards_no_unsafe_exponential.shape, rewards_exponential.shape, rewards_base_e_no_unsafe_exponential.shape, rewards_base_e_exponential.shape

((137465,),
 (137465, 2),
 (137465, 10, 8),
 (137465, 10, 8),
 (37239,),
 (37239, 2),
 (37239, 10, 8),
 (37239, 10, 8),
 (37239,),
 (137465,),
 (37239,),
 (137465,))

In [20]:
np.save('../../safe_rl_data/states_len10.npy', curr_states)
np.save('../../safe_rl_data/rewards_reg_markovian_len10.npy', rewards)
np.save('../../safe_rl_data/actions_len10.npy', actions)
np.save('../../safe_rl_data/states_len10_no_action.npy', curr_states[:, :, :6])
np.save('../../safe_rl_data/states_no_unsafe_no_action_len10.npy', curr_states_no_unsafe[:, :, :6])
np.save('../../safe_rl_data/states_no_unsafe_len10.npy', curr_states_no_unsafe)
np.save('../../safe_rl_data/next_states_len10.npy', next_states)
np.save('../../safe_rl_data/next_states_no_unsafe_len10.npy', next_states_no_unsafe)
np.save('../../safe_rl_data/next_states_no_unsafe_no_action_len10.npy', next_states_no_unsafe[:, :, :6])
np.save('../../safe_rl_data/next_states_no_action_len10.npy', next_states[:, :, :6])
np.save('../../safe_rl_data/rewards_no_unsafe_reg_markovian_len10.npy', rewards_no_unsafe)
np.save('../../safe_rl_data/actions_no_unsafe_len10.npy', actions_no_unsafe)
np.save('../../safe_rl_data/rewards_no_unsafe_exponential_len10.npy', rewards_no_unsafe_exponential)
np.save('../../safe_rl_data/rewards_exponential_len10.npy', rewards_exponential)
np.save('../../safe_rl_data/rewards_base_e_no_unsafe_exponential_len10.npy', rewards_base_e_no_unsafe_exponential)
np.save('../../safe_rl_data/rewards_base_e_exponential_len10.npy', rewards_base_e_exponential)

# import pickle

# with open('../../safe_rl_data/states_len10.pkl', 'wb') as file:
#     pickle.dump(curr_states, file)

# with open('../../safe_rl_data/rewards_reg_markovian_len10.pkl', 'wb') as file:
#     pickle.dump(rewards, file)

# with open('../../safe_rl_data/actions_len10.pkl', 'wb') as file:
#     pickle.dump(actions, file)

# with open('../../safe_rl_data/states_len10_no_action.pkl', 'wb') as file:
#     curr_states_no_action = curr_states[:, :, :6]
#     pickle.dump(curr_states_no_action, file)

# with open('../../safe_rl_data/states_no_unsafe_no_action_len10.pkl', 'wb') as file:
#     curr_states_no_unsafe_no_action = curr_states_no_unsafe[:, :, :6]
#     pickle.dump(curr_states_no_unsafe_no_action, file)

# with open('../../safe_rl_data/states_no_unsafe_len10.pkl', 'wb') as file:
#     pickle.dump(curr_states_no_unsafe, file)

# with open('../../safe_rl_data/next_states_len10.pkl', 'wb') as file:
#     pickle.dump(next_states, file)

# with open('../../safe_rl_data/next_states_no_unsafe_len10.pkl', 'wb') as file:
#     pickle.dump(next_states_no_unsafe, file)

# with open('../../safe_rl_data/next_states_no_unsafe_no_action_len10.pkl', 'wb') as file:
#     next_states_no_unsafe_no_action = next_states_no_unsafe[:, :, :6]
#     pickle.dump(next_states_no_unsafe_no_action, file)

# with open('../../safe_rl_data/next_states_no_action_len10.pkl', 'wb') as file:
#     next_states_no_action = next_states[:, :, :6]
#     pickle.dump(next_states_no_action, file)

# with open('../../safe_rl_data/rewards_no_unsafe_reg_markovian_len10.pkl', 'wb') as file:
#     pickle.dump(rewards_no_unsafe, file)

# with open('../../safe_rl_data/actions_no_unsafe_len10.pkl', 'wb') as file:
#     pickle.dump(actions_no_unsafe, file)
    

In [93]:
# with open('../../safe_rl_data/rewards_no_unsafe_exponential_len10.pkl', 'wb') as file:
#     pickle.dump(rewards_no_unsafe_exponential, file)

# with open('../../safe_rl_data/rewards_exponential_len10.pkl', 'wb') as file:
#     pickle.dump(rewards_exponential, file)

# with open('../../safe_rl_data/rewards_base_e_no_unsafe_exponential_len10.pkl', 'wb') as file:
#     pickle.dump(rewards_base_e_no_unsafe_exponential, file)

# with open('../../safe_rl_data/rewards_base_e_exponential_len10.pkl', 'wb') as file:
#     pickle.dump(rewards_base_e_exponential, file)

In [14]:
reward = []
x_limit = deg2rad_normalized(x_limit)
y_limit = deg2rad_normalized(y_limit)
D = 0.5*(x_limit**2 + y_limit**2)**0.5
gamma = 0.99
for row_num, row in data.iterrows():
    if row['theta_x'] <= x_limit and row['theta_x'] >= -x_limit and row['theta_y'] <= y_limit and row['theta_y'] >= -y_limit:
        dist_to_bounds_x = min(abs(x_limit - row['theta_x']), abs(-x_limit - row['theta_x']))
        dist_to_bounds_y = min(abs(y_limit - row['theta_y']), abs(-y_limit - row['theta_y']))
        dist_to_bounds = (dist_to_bounds_x**2 + dist_to_bounds_y**2)**0.5
        r = 1 + dist_to_bounds
        reward.append(r)
    else:
        reward.append(-D/(1-gamma))
data['reward'] = reward    

In [15]:
data

Unnamed: 0,time_begin,time_begin_traj,theta_x,theta_y,vel_x,vel_y,X_throttle,Y_throttle,reward
0,0.358,0.358,-0.086219,-0.187797,0.0,0.000000,0.0,0.0,2.782303
1,0.421,0.421,-0.086219,-0.187797,0.0,0.000000,0.0,0.0,2.782303
2,0.485,0.485,-0.086219,-0.187797,0.0,0.000000,0.0,0.0,2.782303
3,0.548,0.548,-0.086219,-0.187797,0.0,0.000000,0.0,0.0,2.782303
4,0.612,0.612,-0.086219,-0.187797,0.0,0.000000,0.0,0.0,2.782303
...,...,...,...,...,...,...,...,...,...
31261,1172.517,121.126,-0.384496,-0.656069,0.0,0.000000,-2.0,10.0,2.253619
31262,1172.589,121.198,-0.384496,-0.656069,0.0,0.000000,-2.0,10.0,2.253619
31263,1172.661,121.270,-0.384496,-0.646819,0.0,7.361111,-2.0,10.0,2.259103
31264,1172.733,121.342,-0.384496,-0.646819,0.0,0.000000,-2.0,10.0,2.259103


In [16]:
data.to_csv('../safe_rl_data/data.csv', index=False)

In [17]:
data.describe()

Unnamed: 0,time_begin,time_begin_traj,theta_x,theta_y,vel_x,vel_y,X_throttle,Y_throttle,reward
count,143207.0,143207.0,143207.0,143207.0,143207.0,143207.0,143207.0,143207.0,143207.0
mean,953.331442,30.882267,-0.006678,-8.3e-05,0.368827,-0.152111,-0.123332,-0.028162,-13.898717
std,674.203298,41.364048,0.793214,0.799035,16.972159,17.594994,2.173461,2.289834,37.005254
min,0.348,0.348,-1.813572,-1.816713,-165.294118,-151.571429,-10.0,-10.0,-98.730732
25%,356.609,6.497,-0.570548,-0.55135,-10.149254,-10.615385,-1.0,-1.0,1.70095
50%,875.516,15.39,0.001047,-0.039444,0.0,0.0,0.0,0.0,2.203503
75%,1462.8505,37.1595,0.549953,0.572643,10.298507,10.0,1.0,1.0,2.489879
max,3118.448,330.651,1.790533,1.803449,121.176471,153.880597,10.0,10.0,2.970174
