In [1]:
import torch
import numpy as np
import mujoco_py
import gym
import load_policy
import pickle
import argparse
import tensorflow as tf
from torch.autograd import Variable

# Notebook-wide tensor defaults: CPU execution with 32-bit floats.
# (Neither name is referenced by the visible cells.)
device = torch.device("cpu")
dtype = torch.float32  # torch.float is an alias of torch.float32

  from ._conv import register_converters as _register_converters


In [2]:
class Agent(torch.nn.Module):
    """Behavioural-cloning policy: an MLP that regresses actions from observations.

    Args:
        dim: pair ``(observation_size, action_size)`` giving the width of the
            network's input and output layers.
        learning_rate: step size handed to the Adam optimizer.

    Attributes set in ``__init__``:
        model: the ``Linear(150) -> ReLU -> Linear(50) -> ReLU -> Linear`` stack.
        loss_fn: mean-squared-error loss between predicted and target actions.
        history: one mean training loss per completed epoch.
        rewards: one mean return per ``record_reward`` call.
        optimizer: Adam over ``model``'s parameters.
    """

    def __init__(self, dim, learning_rate=0.01):
        super(Agent, self).__init__()
        self.model = torch.nn.Sequential(
            torch.nn.Linear(dim[0], 150),
            torch.nn.ReLU(),
            torch.nn.Linear(150, 50),
            torch.nn.ReLU(),
            torch.nn.Linear(50, dim[1]),
        )
        # MSELoss with its default reduction averages over every element.
        self.loss_fn = torch.nn.MSELoss()
        self.history = []
        self.rewards = []
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)

    def train(self, training_data=True, batch_size=None, epochs=None):
        """Fit the network on demonstrations, or switch the module's train/eval mode.

        This method shares its name with ``torch.nn.Module.train(mode)``, which
        PyTorch itself calls (``eval()`` is ``train(False)``). A boolean first
        argument is therefore forwarded to the base class so mode switching
        keeps working; anything else is treated as training data.

        Args:
            training_data: dict with NumPy arrays under ``"observations"`` and
                ``"actions"`` (first axis indexes examples), or a bool mode flag.
            batch_size: number of examples per gradient step.
            epochs: number of full passes over the data.

        Returns:
            ``self`` when used as a mode switch, otherwise ``None``.

        Raises:
            TypeError: if data is given without ``batch_size`` and ``epochs``.
        """
        if isinstance(training_data, bool):
            return super(Agent, self).train(training_data)
        if batch_size is None or epochs is None:
            raise TypeError("train() needs batch_size and epochs when given training data")

        X = torch.from_numpy(training_data["observations"]).float()
        Y = torch.from_numpy(training_data["actions"]).float()
        m = X.shape[0]  # number of training examples
        # Recorded actions may carry a singleton axis (Expert.act returns a
        # batch of one); flatten to (m, -1) so the target lines up with the
        # network output instead of silently broadcasting inside MSELoss.
        Y = Y.reshape(m, -1)

        for _ in range(epochs):
            # Fresh list every epoch so history holds per-epoch means rather
            # than a running mean over everything seen so far.
            epoch_losses = [
                self.train_step(X[j:j + batch_size], Y[j:j + batch_size])
                for j in range(0, m, batch_size)
            ]
            self.history.append(np.mean(epoch_losses))

    def train_step(self, X, Y):
        """Run one optimizer step on a mini-batch and return the loss as a float."""
        y_pred = self.model(X)
        loss = self.loss_fn(y_pred, Y)
        self.optimizer.zero_grad()
        loss.backward()  # compute gradients
        self.optimizer.step()  # update weights from the gradients
        return loss.item()

    def act(self, obs):
        """Return the network output (a tensor with a leading batch axis of 1) for one observation."""
        obs = torch.from_numpy(np.expand_dims(obs, 0)).float()
        return self.model(obs)

    def record_reward(self, reward):
        """Append the mean of ``reward`` (e.g. a list of episode returns) to ``rewards``."""
        self.rewards.append(np.mean(reward))



In [3]:
class Expert:
    """Adapter exposing a loaded expert policy through the agent interface (act / record_reward)."""

    def __init__(self, expert_policy_file):
        # load_policy.load_policy yields the callable that act() invokes.
        policy = load_policy.load_policy(expert_policy_file)
        self.policy_fn = policy

    def act(self, obs):
        """Query the expert policy for a single observation.

        The observation gets a new leading axis (a batch of one) before being
        passed to the policy; whatever the policy returns is passed through.
        """
        batched_obs = obs[None, :]
        return self.policy_fn(batched_obs)

    def record_reward(self, reward):
        """No-op: the expert does not keep track of rewards."""
        return None

In [4]:
class Simulator():
    """Runs an agent in a gym environment and records the visited observations and chosen actions.

    Works with any object offering ``act(obs)`` and ``record_reward(returns)``
    (both ``Expert`` and ``Agent`` do).
    """

    def __init__(self, envname):
        self.initialize_env(envname)
        self.envname = envname

    def initialize_env(self, envname):
        """Create the TensorFlow session and the gym environment named ``envname``."""
        # NOTE(review): the session is presumably needed by the TensorFlow-based
        # expert policy obtained via load_policy -- confirm.
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        self.env = gym.make(envname)

    @staticmethod
    def _to_env_action(action):
        """Convert a torch tensor action to a NumPy array; pass anything else through."""
        if isinstance(action, torch.Tensor):
            # detach() so this also works when gradients are being tracked;
            # an implicit NumPy conversion would raise in that case.
            return action.detach().cpu().numpy()
        return action

    def simulate(self, agent, max_steps, num_rollouts, render):
        """Play ``num_rollouts`` episodes with ``agent``.

        Args:
            agent: object with ``act(obs)`` and ``record_reward(returns)``.
            max_steps: hard cap on the number of steps per episode.
            num_rollouts: number of episodes to play.
            render: when true, render the environment after every step.

        Returns:
            ``(observations, actions)``: two lists, one entry per step across
            all episodes, holding each observation and the action exactly as
            ``agent.act`` returned it.

        Side effects:
            Prints the mean/std of the episode returns and hands the list of
            returns to ``agent.record_reward``.
        """
        with self.session.as_default():
            returns = []
            observations = []
            actions = []
            for _ in range(num_rollouts):
                obs = self.env.reset()
                done = False
                total_reward = 0
                steps = 0
                while not done:
                    action = agent.act(obs)
                    observations.append(obs)
                    actions.append(action)
                    obs, reward, done, info = self.env.step(self._to_env_action(action))
                    total_reward += reward
                    steps += 1
                    if render:
                        self.env.render()
                    if steps >= max_steps:
                        break
                returns.append(total_reward)
            print('Return summary: mean=',np.mean(returns),"  std=",np.std(returns))
            agent.record_reward(returns)
            return (observations, actions)
     
       

In [5]:
def arguments(argv=None):
    """Parse the run configuration from command-line style arguments.

    Args:
        argv: optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]`` as before; passing a list lets the
            parser be used from a notebook, where ``sys.argv`` belongs to the
            kernel.

    Returns:
        argparse.Namespace with ``expert_policy_file``, ``envname``, ``render``,
        ``max_timesteps`` and ``num_rollouts``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("expert_policy_file", type=str)
    parser.add_argument("envname", type=str)
    parser.add_argument("--render", action='store_true')
    parser.add_argument("--max_timesteps", type=int, default=500)
    parser.add_argument("--num_rollouts", type=int, default=1)  # number of expert rollouts

    return parser.parse_args(argv)

In [6]:
# Notebook configuration, used here in place of the command-line parser arguments().
args = dict(
    batch_size=128,  # Examples per gradient step (mini-batch size)
    epochs=50,  # Number of passes over the training data
    envname='Humanoid-v2',  # Gym environment id
    expert_policy_file='./experts/Humanoid-v2.pkl',  # File the expert policy is read from
    num_rollouts=5,  # Rollouts per simulate() call (only used by the disabled expert run below)
    render=False,  # Whether to render the environment while simulating
    max_timesteps=100000,  # Per-episode step cap passed to simulate()
)

# Expert rollouts are disabled; the demonstrations are loaded from a pickle instead.
# Note that simulate() returns an (observations, actions) tuple, whereas the
# training code indexes training_data by key.
# expert = Expert(args['expert_policy_file'])
# simulator = Simulator(args['envname'])
# training_data = simulator.simulate(expert, max_steps=args["max_timesteps"], num_rollouts=args["num_rollouts"], render=False)

In [7]:
# Load the recorded demonstrations; the object is indexed below by the keys
# 'observations' and 'actions'.
with open('./data/Humanoid-v2.pkl', 'rb') as data_file:
    training_data = pickle.load(data_file)

# (observation width, action width): the size of each array's last axis,
# used to size the network's input and output layers.
dim = tuple(training_data[key].shape[-1] for key in ('observations', 'actions'))

In [8]:
# Build a fresh policy network and fit it to the demonstrations.
# Both hyper-parameters come from the config dict so it stays the single
# place to tune them (epochs was previously a hard-coded literal).
agent = Agent(dim)
agent.train(training_data, batch_size=args['batch_size'], epochs=args['epochs'])

NameError: name 't' is not defined

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Dump the raw loss history (one entry per training epoch) before plotting it.
print(agent.history)

def gen_new_plot(title, ylabel, xlabel):
    """Create a new single-axes figure labelled with the given title and axis labels.

    Returns the Axes object; the figure itself is not returned.
    """
    _, axes = plt.subplots(nrows=1, ncols=1)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    return axes


# Plot the recorded loss values (one point per epoch) on a fresh labelled axes.
ax1 = gen_new_plot(title='Training Loss', ylabel='Train Loss', xlabel='Epoch')
ax1.plot(agent.history)
plt.show()

In [None]:
# Persist the trained weights (state_dict only; the optimizer and loss history are not saved).
torch.save(agent.state_dict(), "./data/trained_data3.pkl")


In [None]:
# Rebuild the network and restore the weights written by the save cell.
# The time-limit print that used to sit here referenced `trained_agent`
# before any cell had defined it (NameError on a fresh Run All); the
# evaluation cell prints the same value once the simulator exists.
agent = Agent(dim)
agent.load_state_dict(torch.load("./data/trained_data3.pkl"))

In [None]:

# Roll out the restored agent with rendering enabled.
# `trained_agent` is the Simulator driving the agent, not the agent itself.
trained_agent = Simulator(args['envname'])
try:
    # Inference only: no gradients are needed while acting.
    with torch.no_grad():
        data = trained_agent.simulate(agent, max_steps=10000000, num_rollouts=10, render=True)
    print(trained_agent.env.spec.timestep_limit)
finally:
    # Close the environment (and its render window) even if the rollout fails.
    trained_agent.env.close()