In [1]:
!pip install gym-jsbsim --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
!pip install gym==0.17.3 --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import pkg_resources
pkg_resources.require("gym==0.17.3")
import gym
import gym_jsbsim
from gym_jsbsim.catalogs.catalog import Catalog as c
import numpy as np
import math
import random
import torch
from torch import nn
import torch.nn.functional as F



In [4]:
# Silence every Python warning for the rest of the session ('ignore' is applied
# with no category or module filter, so nothing is exempt).
import warnings
warnings.filterwarnings('ignore')

In [5]:
class PolicyNetwork(nn.Module):
    """Small fully connected policy: Linear -> tanh -> Linear -> tanh.

    Because the final activation is tanh, every output component lies in
    [-1, 1].

    Args:
        num_hidden_neurons: Width of the single hidden layer.
        num_inputs: Size of the input vector (defaults to 9, the value that
            was previously hard-coded).
        num_outputs: Size of the output vector (defaults to 4, the value that
            was previously hard-coded).
    """
    def __init__(self,num_hidden_neurons,num_inputs=9,num_outputs=4):
        super(PolicyNetwork, self).__init__()
        # layer1 is built before layer2 on purpose: the parameter
        # initialisation consumes the torch RNG in creation order, so keeping
        # this order keeps seeded runs reproducible.
        self.layer1 = nn.Linear(num_inputs,num_hidden_neurons)
        self.layer2 = nn.Linear(num_hidden_neurons,num_outputs)

    def forward(self, x):
        """Map an input tensor of trailing size ``num_inputs`` to ``num_outputs`` values in [-1, 1]."""
        hidden = torch.tanh(self.layer1(x))
        return torch.tanh(self.layer2(hidden))

In [6]:
# Seed every random source used by the notebook before anything stochastic runs.
torch.manual_seed(58308)
random.seed(0)
np.random.seed(0)

# Build the 16-hidden-unit policy and freeze it: the weights are only ever
# overwritten directly, never trained by backpropagation.
model = PolicyNetwork(16)
model.eval()
for param in model.parameters():
    param.requires_grad = False

# Flatten the parameters into one vector, ordered as
# [layer1 weights, layer2 weights, layer1 bias, layer2 bias].
init_center = np.concatenate([
    tensor.data.numpy().flatten()
    for tensor in (model.layer1.weight, model.layer2.weight, model.layer1.bias, model.layer2.bias)
])

In [7]:
def vec_to_weights(vec,num_hidden_neurons,num_inputs=9,num_outputs=4):
    """Split a flat parameter vector into the tensors of a two-layer network.

    The vector layout is ``[w1, w2, b1, b2]`` where ``w1`` is stored row-major
    as ``(num_hidden_neurons, num_inputs)`` and ``w2`` as
    ``(num_outputs, num_hidden_neurons)``.

    Args:
        vec: 1-D sequence/array holding at least
            ``h*num_inputs + num_outputs*h + h + num_outputs`` values
            (``h = num_hidden_neurons``). Extra trailing values are ignored.
        num_hidden_neurons: Width of the hidden layer.
        num_inputs: Network input size (default 9, the previous hard-coded value).
        num_outputs: Network output size (default 4, the previous hard-coded value).

    Returns:
        Tuple ``(w1, w2, b1, b2)`` of float32 torch tensors.

    Raises:
        ValueError: If ``vec`` is too short to hold all parameters.
    """
    # Cumulative end offsets of each parameter group inside ``vec``.
    w1_end = num_hidden_neurons*num_inputs
    w2_end = w1_end + num_outputs*num_hidden_neurons
    b1_end = w2_end + num_hidden_neurons
    b2_end = b1_end + num_outputs
    if len(vec) < b2_end:
        raise ValueError(
            "vec has %d values but %d are required" % (len(vec), b2_end)
        )
    w1 = torch.from_numpy(np.reshape(vec[0:w1_end], (num_hidden_neurons, num_inputs))).float()
    w2 = torch.from_numpy(np.reshape(vec[w1_end:w2_end], (num_outputs, num_hidden_neurons))).float()
    b1 = torch.from_numpy(np.reshape(vec[w2_end:b1_end], (num_hidden_neurons))).float()
    b2 = torch.from_numpy(np.reshape(vec[b1_end:b2_end], (num_outputs))).float()
    return w1,w2,b1,b2

In [8]:
class Obs_TupleToBoxWrapper(gym.ObservationWrapper):
    """Expose a tuple-structured observation space as one flat Box.

    Assumes ``env.observation_space`` is iterable and that each sub-space
    provides 1-D ``low``/``high`` arrays - TODO confirm against the wrapped env.
    """
    def __init__(self, env):
        super().__init__(env)
        # The empty float array in front keeps the result float64 and makes an
        # empty tuple space yield empty bounds.
        lower_parts = [np.empty(shape=(0,))] + [space.low for space in env.observation_space]
        upper_parts = [np.empty(shape=(0,))] + [space.high for space in env.observation_space]
        self.observation_space = gym.spaces.Box(
            low=np.concatenate(lower_parts),
            high=np.concatenate(upper_parts),
            dtype="float",
        )

    def observation(self, obs):
        """Concatenate the components of ``obs`` into a single 1-D array."""
        pieces = [np.empty(shape=(0,))]
        pieces.extend(obs)
        return np.concatenate(pieces)

In [9]:
class Act_TupleToBoxWrapper(gym.ActionWrapper):
    """Advertise a tuple-structured action space as one flat Box.

    Only the declared ``action_space`` changes; actions themselves are passed
    to the wrapped environment untouched. Assumes each sub-space of
    ``env.action_space`` provides 1-D ``low``/``high`` arrays - TODO confirm.
    """
    def __init__(self, env):
        super().__init__(env)
        # The empty float array in front keeps the result float64 and makes an
        # empty tuple space yield empty bounds.
        lower_parts = [np.empty(shape=(0,))] + [space.low for space in env.action_space]
        upper_parts = [np.empty(shape=(0,))] + [space.high for space in env.action_space]
        self.action_space = gym.spaces.Box(
            low=np.concatenate(lower_parts),
            high=np.concatenate(upper_parts),
            dtype="float",
        )

    def action(self, act):
        """Forward the action unchanged."""
        return act

In [10]:
# Heading-control task with both observation and action spaces flattened to Box.
env = Act_TupleToBoxWrapper(
    Obs_TupleToBoxWrapper(
        gym.make("GymJsbsim-HeadingControlTask-v0")
    )
)

In [11]:
def find_and_take_action(env,curr_state,nn):
    """Query the policy for the current state and advance the environment one step.

    Args:
        env: Environment exposing ``step(action)`` that returns
            ``(state, reward, done, info)``.
        curr_state: Current observation as a NumPy array.
        nn: Callable policy mapping a float32 tensor to an action tensor.
            (The parameter name shadows ``torch.nn`` inside this function only.)

    Returns:
        Tuple ``(state, reward, done)`` from ``env.step``; ``info`` is dropped.
    """
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        action = nn(torch.from_numpy(curr_state).float())
    # The original code called ``action.numpy()`` but discarded the result, so
    # a torch tensor was handed to the environment. Keep the converted array.
    action = action.detach().numpy()
    # Rescale the last component from [-1, 1] to [0, 0.9]
    # (presumably the throttle command - confirm against the task's action variables).
    action[-1]=(action[-1]+1)*0.45
    state, reward, done, _ = env.step(action)
    return state, reward, done

In [12]:
def run_episode(env,nn,find_and_take_action):
    """Roll out one full episode and summarise it.

    Args:
        env: Environment providing ``reset()`` and ``get_sim_time()``.
        nn: Policy object forwarded untouched to ``find_and_take_action``.
        find_and_take_action: Callable ``(env, state, nn) -> (state, reward, done)``
            that performs a single environment step.

    Returns:
        Tuple ``(sim_time, total_reward, delta_heading_altitude)`` where the
        last item is ``1/(|s0|+1) * 1/(|s1|+1)`` computed from the first two
        entries of the final state (presumably heading/altitude deviations -
        confirm against the task's state variables). It equals 1 when both
        entries are zero and shrinks as they grow.
    """
    total_reward = 0
    observation = env.reset()
    # At least one step is always taken before the termination flag is checked.
    while True:
        observation, step_reward, finished = find_and_take_action(env,observation,nn)
        total_reward += step_reward
        if finished:
            break

    first_term = 1/(abs(observation[0])+1)
    second_term = 1/(abs(observation[1])+1)
    closeness = first_term*second_term

    return env.get_sim_time(),total_reward,closeness

In [13]:
#Muller-Marsaglia method: normalise a vector of independent standard normals
def spherepicking(n):
    """Draw a point uniformly at random from the unit sphere in ``n`` dimensions.

    Uses Python's global ``random`` generator, so results are reproducible
    after ``random.seed``.

    Args:
        n: Dimension of the ambient space; must be at least 1.

    Returns:
        1-D NumPy array of length ``n`` with unit Euclidean norm.

    Raises:
        ValueError: If ``n`` is smaller than 1 (the rejection loop below could
            otherwise never terminate, because the squared norm stays 0).
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    while True:           # redraw in the degenerate all-zeros case
        l = [random.gauss(0, 1) for i in range(n)]
        sumsq = sum([x * x for x in l])
        if sumsq > 0:
            break
    norm = 1.0 / math.sqrt(sumsq)
    pt = [x * norm for x in l]
    return np.array(pt)

In [14]:
# Score the untouched, seeded network once; these values are the baseline that
# the first hill-climbing stage has to beat.
init_center_time, init_center_reward, init_delta_heading_altitude = run_episode(
    env, model, find_and_take_action
)
print(init_center_time, init_center_reward, init_delta_heading_altitude, sep="\n")

150.0000000000115
168.50110852723716
0.00018086107160599323


In [15]:
def _checkpoint_status(time, epsilon, interval):
    """Return ``(at_checkpoint, checkpoint)``; at a checkpoint when ``time % interval < epsilon``."""
    if time%interval<epsilon:
        return True, time//interval
    return False, 0


def _best_candidate(values, candidates, baseline):
    """Return the first index in ``candidates`` with the largest ``values[i]`` if it is ``>= baseline``, else -1."""
    best = -1
    for i in candidates:
        value = values[i]
        if value != value:      # NaN can never win a comparison; skip it
            continue
        if best == -1 or value > values[best]:
            best = i
    if best != -1 and values[best] >= baseline:
        return best
    return -1


def hill_climbing(env,lr,init_center,init_center_time,init_center_reward,init_delta_heading_altitude,model,num_hidden_neurons,num_points,epsilon,checkpoint_interval=150):
    """Random-search hill climbing over the flat weight vector of ``model``.

    Each iteration samples ``num_points`` weight vectors on the sphere of
    radius ``lr`` around the current center, evaluates each with one episode,
    and moves the center to the best acceptable sample. The search stops as
    soon as no sample is acceptable.

    Acceptance rule:
      * Center not at a checkpoint: take the sample with the highest reward,
        provided it is at least the center's reward.
      * Center at a checkpoint (its simulation time is within ``epsilon``
        above a multiple of ``checkpoint_interval``): first try the
        highest-reward sample among those that ran beyond the checkpoint
        window; if none is at least the center's reward, fall back to the
        sample inside the window with the highest delta-heading-altitude
        score, provided it is at least the center's score.

    Args:
        env: Environment passed through to ``run_episode``.
        lr: Radius of the sampling sphere around the current center.
        init_center: Starting weight vector (NumPy array).
        init_center_time: Simulation time reached by ``init_center``.
        init_center_reward: Episode reward obtained by ``init_center``.
        init_delta_heading_altitude: Closeness score of ``init_center``.
        model: Network whose ``layer1``/``layer2`` weights and biases are
            overwritten for every evaluated sample. On return it holds the
            weights of the LAST sample evaluated, not of the best center.
        num_hidden_neurons: Hidden-layer width used to size the weight vector.
        num_points: Number of samples evaluated per iteration.
        epsilon: Width of the time window that counts as "at a checkpoint".
        checkpoint_interval: Spacing of checkpoints in simulation seconds
            (default 150, the previously hard-coded value; presumably tied to
            the task's episode structure - confirm).

    Returns:
        Tuple ``(centers, times, episode_rewards, delta_heading_altitudes)``
        of lists with one entry per accepted center, starting with the
        initial one.
    """
    # The 9 inputs / 4 outputs match the layout expected by vec_to_weights.
    num_weights = num_hidden_neurons*9 + 4*num_hidden_neurons + num_hidden_neurons + 4
    step = 0
    center = init_center
    time = init_center_time
    episode_reward = init_center_reward
    delta_heading_altitude = init_delta_heading_altitude
    at_checkpoint, checkpoint = _checkpoint_status(time, epsilon, checkpoint_interval)

    centers = [center]
    times = [time]
    episode_rewards = [episode_reward]
    delta_heading_altitudes = [delta_heading_altitude]
    print("Step = ",step)
    print("Simulation Time = ",time," sec")
    print("Reward = ",episode_reward)
    if at_checkpoint:
        print("Delta heading altitude = ",delta_heading_altitude)

    while True:
        rewards = []
        sim_time = []
        dhas = []
        points = []
        for _ in range(num_points):
            point = center + lr*spherepicking(num_weights)
            points.append(point)
            w1, w2, b1, b2 = vec_to_weights(point,num_hidden_neurons)
            model.layer1.weight.data = w1
            model.layer2.weight.data = w2
            model.layer1.bias.data = b1
            model.layer2.bias.data = b2
            t, r, dha = run_episode(env,model,find_and_take_action)
            rewards.append(r)
            sim_time.append(t)
            dhas.append(dha)

        if at_checkpoint:
            window_start = checkpoint_interval*checkpoint
            window_end = window_start+epsilon
            # Samples that ran past the checkpoint window compete on reward.
            beyond = [i for i in range(num_points) if sim_time[i] >= window_end]
            # Samples that ended inside the window compete on closeness.
            within = [i for i in range(num_points) if window_start <= sim_time[i] < window_end]
            # The winner is searched only among the filtered candidates and
            # without a 0 sentinel: the previous list.index(max) lookup could
            # return a sample outside the candidate set, or raise ValueError
            # when every candidate reward was negative.
            idx = _best_candidate(rewards, beyond, episode_reward)
            if idx == -1:
                idx = _best_candidate(dhas, within, delta_heading_altitude)
        else:
            idx = _best_candidate(rewards, range(num_points), episode_reward)

        if idx == -1:
            # No sample is acceptable: the current center is a local optimum.
            break

        step+=1
        center = points[idx]
        time = sim_time[idx]
        episode_reward = rewards[idx]
        delta_heading_altitude = dhas[idx]
        print("Step = ",step)
        print("Simulation Time = ",time," sec")
        print("Reward = ",episode_reward)
        at_checkpoint, checkpoint = _checkpoint_status(time, epsilon, checkpoint_interval)
        if at_checkpoint:
            print("Delta heading altitude = ",delta_heading_altitude)
        centers.append(center)
        times.append(time)
        episode_rewards.append(episode_reward)
        delta_heading_altitudes.append(delta_heading_altitude)

    return centers, times, episode_rewards, delta_heading_altitudes

In [16]:
# Stage 1: coarse search (radius 1) starting from the seeded initial network.
centers, times, episode_rewards, delta_heading_altitudes = hill_climbing(
    env,
    lr=1,
    init_center=init_center,
    init_center_time=init_center_time,
    init_center_reward=init_center_reward,
    init_delta_heading_altitude=init_delta_heading_altitude,
    model=model,
    num_hidden_neurons=16,
    num_points=25000,
    epsilon=0.5,
)

Step =  0
Simulation Time =  150.0000000000115  sec
Reward =  168.50110852723716
Delta heading altitude =  0.00018086107160599323
Step =  1
Simulation Time =  166.16666666669104  sec
Reward =  849.1886858760104
Step =  2
Simulation Time =  450.08333333325265  sec
Reward =  996.6629582735893
Delta heading altitude =  0.0008231450064820971


In [17]:
# Stage 2: restart from the last stage-1 center with the radius halved to 0.5.
centers_new, times_new, episode_rewards_new, delta_heading_altitudes_new = hill_climbing(
    env,
    lr=0.5,
    init_center=centers[-1],
    init_center_time=times[-1],
    init_center_reward=episode_rewards[-1],
    init_delta_heading_altitude=delta_heading_altitudes[-1],
    model=model,
    num_hidden_neurons=16,
    num_points=25000,
    epsilon=0.5,
)

Step =  0
Simulation Time =  450.08333333325265  sec
Reward =  996.6629582735893
Delta heading altitude =  0.0008231450064820971
Step =  1
Simulation Time =  450.08333333325265  sec
Reward =  777.2390819436755
Delta heading altitude =  0.012910315730236767
Step =  2
Simulation Time =  624.1666666664277  sec
Reward =  2057.0789794581638


In [18]:
# Stage 3: restart from the last stage-2 center with radius 0.25.
centers_new_new, times_new_new, episode_rewards_new_new, delta_heading_altitudes_new_new = hill_climbing(
    env,
    lr=0.25,
    init_center=centers_new[-1],
    init_center_time=times_new[-1],
    init_center_reward=episode_rewards_new[-1],
    init_delta_heading_altitude=delta_heading_altitudes_new[-1],
    model=model,
    num_hidden_neurons=16,
    num_points=25000,
    epsilon=0.5,
)

Step =  0
Simulation Time =  624.1666666664277  sec
Reward =  2057.0789794581638
Step =  1
Simulation Time =  1060.3333333326977  sec
Reward =  3781.9506939934195
Step =  2
Simulation Time =  1511.5833333322873  sec
Reward =  4538.95719481957
Step =  3
Simulation Time =  1360.666666665758  sec
Reward =  4637.431188657872
Step =  4
Simulation Time =  1518.4999999989477  sec
Reward =  4764.115926805554
Step =  5
Simulation Time =  2705.5833333401724  sec
Reward =  5858.75858017614
Step =  6
Simulation Time =  1800.083333332025  sec
Reward =  6838.863256071615
Delta heading altitude =  0.009881892434263032
Step =  7
Simulation Time =  7500.249999975004  sec
Reward =  8427.502652427962
Delta heading altitude =  0.0001312816931876806


In [19]:
# Stage 4: restart from the last stage-3 center with radius 0.1.
centers_new_new_new, times_new_new_new, episode_rewards_new_new_new, delta_heading_altitudes_new_new_new = hill_climbing(
    env,
    lr=0.1,
    init_center=centers_new_new[-1],
    init_center_time=times_new_new[-1],
    init_center_reward=episode_rewards_new_new[-1],
    init_delta_heading_altitude=delta_heading_altitudes_new_new[-1],
    model=model,
    num_hidden_neurons=16,
    num_points=25000,
    epsilon=0.5,
)

Step =  0
Simulation Time =  7500.249999975004  sec
Reward =  8427.502652427962
Delta heading altitude =  0.0001312816931876806
Step =  1
Simulation Time =  7542.4999999743895  sec
Reward =  17613.03651317054
Step =  2
Simulation Time =  7079.916666647788  sec
Reward =  20207.574301389166
Step =  3
Simulation Time =  7544.333333307696  sec
Reward =  23470.4845696408


In [20]:
# Simulation times of the centers accepted during the final (lr=0.1) stage.
times_new_new_new

[7500.249999975004, 7542.4999999743895, 7079.916666647788, 7544.333333307696]

In [21]:
# Stitch the four stages into one chronological history per quantity.
# Each stage begins with the final center of the previous one, so those
# boundary entries appear twice in the merged lists.
d = {
    "centers_final": [*centers, *centers_new, *centers_new_new, *centers_new_new_new],
    "times_final": [*times, *times_new, *times_new_new, *times_new_new_new],
    "episode_rewards_final": [
        *episode_rewards,
        *episode_rewards_new,
        *episode_rewards_new_new,
        *episode_rewards_new_new_new,
    ],
    "delta_heading_altitudes_final": [
        *delta_heading_altitudes,
        *delta_heading_altitudes_new,
        *delta_heading_altitudes_new_new,
        *delta_heading_altitudes_new_new_new,
    ],
}

In [23]:
import os
import pickle

# Persist the merged search history. The target directory is created first so
# that a fresh checkout (without ./weights) does not fail with
# FileNotFoundError after the long-running search has already finished.
output_path = './weights/58308.pickle'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as handle:
    pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)