In [1]:
import gym
import math
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
#for using gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
env = gym.make('MountainCarContinuous-v0')
print(env.action_space, env.observation_space)
env.seed(101)
np.random.seed(101)



Box(1,) Box(2,)


In [45]:
class Agent(nn.Module):
    def __init__(self, env, action_space, state_space):
        super(Agent, self).__init__()
        self.env = env
        self.a_s = action_space
        self.s_s = state_space
        self.h_s = 16
        self.fc1 = nn.Linear(self.s_s, self.h_s).to(device)
        self.fc2 = nn.Linear(self.h_s, self.a_s).to(device)
    
    def get_wights_dim(self):
        return (self.s_s+1)*self.h_s + (self.h_s+1)*self.a_s

    def adjust_to_weights(self,weights):
        end = self.s_s*self.h_s + self.h_s
        fc1_w = torch.from_numpy(weights[:self.s_s*self.h_s]).reshape(self.s_s, self.h_s)
        fc1_b = torch.from_numpy(weights[self.s_s*self.h_s:end])
        fc2_w = torch.from_numpy(weights[end:end+self.h_s*self.a_s]).reshape(self.h_s, self.a_s)
        fc2_b = torch.from_numpy(weights[end+self.h_s*self.a_s:])
        #copying weights and bias
        self.fc1.weight.data.copy_(fc1_w.view_as(self.fc1.weight.data))
        self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
        self.fc2.weight.data.copy_(fc2_w.view_as(self.fc2.weight.data))
        self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))
        
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.tanh(self.fc2(x))
        return x.cpu().data
        
    def evaluate(self, weight, itera):
        self.adjust_to_weights(weight)
        self.state = self.env.reset()
        scores = 0.0
        for _ in range(itera):
            self.state = torch.from_numpy(self.state).float().to(device)
            action = self.forward(self.state)
            next_state, reward, done, _ = self.env.step(action)
            scores += reward
            self.state = next_state
            if done:
                break
        return scores

agent = Agent(env, 1,2)

In [53]:
def cross_entropy(n_episode = 100, n_iter=1000, batch_per_epi=50, prob_selection=20, sigma = 0.5):
    best_weight = np.random.randn(agent.get_wights_dim())
    number_of_selection = int(prob_selection*batch_per_epi/100.0)
    for i in range(n_episode):
        memory = [best_weight + sigma*np.random.randn(agent.get_wights_dim()) for _ in range(batch_per_epi)]
        reward = [agent.evaluate(weight, n_iter) for weight in memory]
        arranged_index = np.array(reward).argsort()[-number_of_selection:]
        weights = [memory[j] for j in arranged_index]  #collected top n weights according to reward
        best_weight = (sum(weights)/number_of_selection)
        #print(agent.fc1.weight)
        print(' Average Score {}'.format(sum(reward)/batch_per_epi))
        #averaging weights 
cross_entropy()

 Average Score -80.55932934439852
 Average Score -54.98671675460832
 Average Score -41.15999907280778
 Average Score -51.481335192480834
 Average Score -39.022604498894566
 Average Score -41.845870183367516
 Average Score -42.9917245950736
 Average Score -40.37925371691987
 Average Score -49.42347312087852
 Average Score -46.46574556594254


In [52]:
state = env.reset()
while True:
    state = torch.from_numpy(state).float().to(device)
    with torch.no_grad():
        action = agent(state)
    env.render()
    next_state, reward, done, _ = env.step(action)
    state = next_state
    if done:
        break

env.close()

In [37]:
next_state

array([tensor(-0.5116, device='cuda:0', grad_fn=<AddBackward0>),
       tensor(0.0013, device='cuda:0', grad_fn=<AddBackward0>)],
      dtype=object)

In [None]:
state = env.reset()
h = torch.from_numpy(state).float().to(device)


end = self.s_s*self.h_s + self.h_s
fc1_w = torch.from_numpy(weights[:self.s_s*self.h_s]).reshape(self.s_s, self.h_s)
fc1_b = torch.from_numpy(weights[self.s_s*self.h_s:end])
fc2_w = torch.from_numpy(weights[end:end+self.h_s*self.a_s]).reshape(self.h_s, self.a_c)
fc2_b = torch.from_numpy(weights[end+self.h_s*self.a_s:])
        
        

In [None]:
weight = np.random.rand(agent.get_wights_dim())
a = weight[:32].reshape(2,16)
b = weight[32:32+16]
c = weight[48:48+16].reshape(1,16)
d = weight[64:]
print(a.shape, b.shape, c.shape, d.shape)
c = torch.from_numpy(c)

In [None]:
agent.fc2.weight.data.copy_(c.view_as(agent.fc2.weight.data))

In [None]:
agent.fc2.weight

In [None]:
agent.fc2.weight

In [None]:
sum([b,b,b])/2

In [None]:
3*16 + 17*1

In [None]:
agent.fc1.bias.size()

In [None]:
agent.fc2.weight.size()

In [None]:
agent.fc2.bias.size()