In [1]:
%matplotlib inline

from __future__ import unicode_literals
import matplotlib
matplotlib.rcParams['text.usetex'] = True
matplotlib.rcParams['text.latex.unicode'] = True

import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.utils.data

import math
from itertools import product
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np
import time
import matplotlib.patches as patches
import types, io, os
import moviepy.editor as mpy
from moviepy.editor import VideoClip
from moviepy.video.io.bindings import mplfig_to_npimage

from PIL import Image

import matplotlib.pyplot as plt
from matplotlib import animation as animation_plt
from matplotlib import rc
from IPython import display
from IPython.display import HTML, clear_output
from ipywidgets.widgets.interaction import show_inline_matplotlib_plots
import inspect

from IPython.core.display import display as mydisplay
mydisplay(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
# Cap PyTorch's intra-op CPU parallelism at two threads for this notebook.
torch.set_num_threads(2)

In [3]:
def mkdir_p(path):
    """Create directory ``path`` together with any missing parents (``mkdir -p``).

    Calling it on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, e.g. ``path`` exists but
            is not a directory (``FileExistsError``) or permissions are missing.
    """
    # exist_ok=True reproduces the intended "ignore EEXIST for directories"
    # behaviour without needing the errno module, which this file never imports
    # (the previous except-branch raised NameError instead of passing).
    os.makedirs(path, exist_ok=True)

def get_space_shape(space):
    """Return the shape of a gym space as a list.

    * ``Discrete``      -> ``[space.n]``
    * ``MultiDiscrete`` -> ``list(space.nvec)``
    * ``Box``           -> ``list(space.low.shape)``

    Any other space type yields ``None``.
    """
    shape = None
    if isinstance(space, gym.spaces.discrete.Discrete):
        shape = [space.n]
    elif isinstance(space, gym.spaces.multi_discrete.MultiDiscrete):
        shape = list(space.nvec)
    elif isinstance(space, gym.spaces.box.Box):
        shape = list(space.low.shape)
    return shape

class figure_compiler_video():
    """Accumulates rendered matplotlib figures as frames and builds a clip from them."""

    def __init__(self):
        # One image array per captured figure, in insertion order.
        self.imlist = list()

    def add_figure(self, myfig):
        """Render ``myfig`` to an in-memory PNG and store it as an image array."""
        png_buffer = io.BytesIO()
        myfig.savefig(png_buffer, format='png')
        png_buffer.seek(0)
        # np.array(...) forces the image to be decoded before the buffer is closed.
        frame = np.array(Image.open(png_buffer))
        png_buffer.close()
        self.imlist.append(frame)

    def __call__(self, out_size=5, fps=5, out_file=None,):
        """Build an ``ImageSequenceClip`` from the stored frames.

        ``out_size`` is accepted but not used. When ``out_file`` is given the
        clip is also written to disk: as a GIF if the name ends with ``'gif'``,
        otherwise through ``write_videofile``. The clip is returned either way.
        """
        clip = mpy.ImageSequenceClip(self.imlist, fps=fps)
        if out_file is None:
            return clip

        save = clip.write_gif if out_file.endswith('gif') else clip.write_videofile
        save(out_file)
        return clip

def dict_append(main_dict, update_dict):
    """Append each value of ``update_dict`` to the list stored under the same key
    in ``main_dict``, creating an empty list first for keys not seen before.

    ``main_dict`` is modified in place; nothing is returned.
    """
    for name, item in update_dict.items():
        main_dict.setdefault(name, []).append(item)

def get_angle(self, cosine, sine):
    """Recover angles in ``[0, 2*pi)`` from their cosine and sine values.

    ``arccos`` supplies the magnitude and the sign of ``sine`` picks the half
    plane; a sine that is not strictly positive is treated as negative.
    ``self`` is unused (the function is bound to an environment as a method).
    """
    # +1 where sine > 0, -1 everywhere else (including sine == 0).
    half_plane = np.where(np.array(sine) > 0, 1, -1)
    signed_angle = half_plane * np.arccos(cosine)
    return np.mod(signed_angle, 2*np.pi)

def plt_render(self, animation=True, exp_index=-1, act_per_frame = 1, create_mpy = False):
    """Plot (and optionally animate) one recorded episode from ``self.exp_history``.

    The episode ``self.exp_history[exp_index]`` must contain the keys
    ``'states'``, ``'actions'`` and ``'rewards'``. Every other key except
    ``'next_states'`` is drawn as an additional time-series panel.

    Args:
        animation: If True, a new figure with an extra trajectory panel is
            always created and a ``FuncAnimation`` is stored in ``self.ani``.
            If False, the curves are drawn once as a static plot and an
            existing figure is reused when its column count still matches.
        exp_index: Index of the episode in ``self.exp_history`` to draw.
        act_per_frame: Number of recorded steps advanced per animation frame.
        create_mpy: Only used when ``animation`` is True; additionally builds
            a moviepy ``VideoClip`` and stores it in ``self.mpy_ani``.

    Reads ``self.use_triangular_states``, ``self.time_step``, ``self.l`` and
    ``self.get_angle``; writes ``self.figure``, ``self.plt_rows``, the axes
    attributes, ``self.plot_lines`` and, when animating, ``self.acrobot`` and
    ``self.ani``. Returns None.
    """
    all_states = self.exp_history[exp_index]['states']
    all_actions = self.exp_history[exp_index]['actions']
    all_rewards = self.exp_history[exp_index]['rewards']
    # Any extra recorded series (everything but the four standard keys) gets its own panel.
    other_keywords = [key for key in self.exp_history[exp_index].keys() if key not in ['states', 'actions', 'rewards', 'next_states']]

    if self.use_triangular_states:
        # State entries are read as (cosine, sine, omega); the angle is rebuilt
        # from the first two, converted to degrees, shifted by 180 and wrapped to [0, 360).
        theta = (180./np.pi) * np.array(self.get_angle(cosine = [s[0] for _,s in enumerate(all_states)],
                                                       sine = [s[1] for _,s in enumerate(all_states)]))
        theta = (theta + 180) % 360
        omega = [s[2] for _,s in enumerate(all_states)]
    else:
        # State entries are read as (angle, omega); the angle is only scaled by 180/pi.
        theta = [(180/np.pi) * s[0] for _,s in enumerate(all_states)]
        omega = [s[1] for _,s in enumerate(all_states)]
        
    torques = all_actions
    time_array = self.time_step * np.arange(len(theta))
    
    # Despite the name, plt_rows is used as the number of grid COLUMNS below:
    # two columns for the four base panels, one column per two extra series,
    # plus two more columns for the trajectory panel when animating.
    additional_rows = int(np.ceil(len(other_keywords) / 2.))
    plt_rows = (additional_rows + 4) if animation else (additional_rows + 2)
    if not(hasattr(self, 'plt_rows')):
        create_new_figure = True
    else:
        if plt_rows==self.plt_rows:
            create_new_figure = False
        else:
            create_new_figure = True                
    # An animation (or a first call) always starts from a fresh figure.
    if not(hasattr(self,'figure')) or animation:
        create_new_figure = True

    self.plt_rows = plt_rows
    if create_new_figure:
        self.figure = plt.figure()

    self.figure.set_size_inches(plt_rows * 4, 2 * 4, forward=True)        

    if create_new_figure:
        # 2 x plt_rows grid: action/angle on the top row, reward/omega below,
        # extra series fill the following columns top-to-bottom.
        self.act_ax = plt.subplot2grid((2, plt_rows), (0, 0))
        self.theta_ax = plt.subplot2grid((2, plt_rows), (0, 1))
        self.reward_ax = plt.subplot2grid((2, plt_rows), (1, 0))
        self.omega_ax = plt.subplot2grid((2, plt_rows), (1, 1))
        self.other_axes = [plt.subplot2grid((2, plt_rows), (int(uu % 2) , 2 + int(uu/2))) for uu,key in enumerate(other_keywords)]
    if animation:
        # The trajectory panel spans both rows of the last two columns.
        self.traj_ax = plt.subplot2grid((2, plt_rows), (0, additional_rows + 2), colspan=2, rowspan=2)

        self.traj_ax.set_xlim([-(self.l)*1.05, (self.l)*1.05])
        self.traj_ax.set_ylim([-(self.l)*1.05, (self.l)*1.05])

    # The three lists below are index-aligned: axis i shows data i under title i.
    plot_ax_list = [self.act_ax, self.theta_ax,
                    self.reward_ax, self.omega_ax] + self.other_axes
    plot_data_list = [torques, theta, all_rewards, omega] + [self.exp_history[exp_index][key] for _,key in enumerate(other_keywords)]
    plot_title_list = [r'$\tau$', r'$\theta^\circ$', 
                       r'$R$', r'$\omega$'] + other_keywords


    for i,curr_ax in enumerate(plot_ax_list):
        curr_data=plot_data_list[i]
        curr_ax.set_xlim([0,time_array[-1]])
        if i==1:
            # The angle panel always shows the full 0..360 degree range.
            curr_ax.set_ylim([0,360])
        else:
            if np.min(curr_data) == np.max(curr_data):
                # Constant series: pad by +/-1 so the axis has a non-zero height.
                curr_ax.set_ylim([np.min(curr_data) - 1, np.max(curr_data) + 1])
            else:
                # Otherwise pad each bound by 5% of its own magnitude.
                curr_ax.set_ylim([np.min(curr_data)-0.05*np.abs(np.min(curr_data)),
                                  np.max(curr_data)+0.05*np.abs(np.max(curr_data))])
        curr_ax.set_title(plot_title_list[i], fontsize=16)

    if animation:
        # Line artist for the pendulum rod (pivot -> tip); despite the attribute
        # name it is drawn as a single segment in animate().
        self.acrobot, = self.traj_ax.plot([], [], 'o-', lw=2)
        traj_title = self.traj_ax.set_title('Trajectory', fontsize=16)

    if create_new_figure:
        # One (initially empty) black line per panel; animate() fills them in.
        self.plot_lines = []
        for i,curr_ax in enumerate(plot_ax_list):
            curr_line, = curr_ax.plot([], [], color='k')
            self.plot_lines.append(curr_line)

    def init():
        """Reset every artist to an empty state (FuncAnimation ``init_func``)."""
        if animation:
            self.acrobot.set_data([], [])
            traj_title.set_text('Trajectory')
        for curr_line in self.plot_lines:
            curr_line.set_data([], [])
        # NOTE(review): only the first four lines are returned although blit=True is
        # used below; lines of the extra panels are probably not redrawn — confirm.
        return_list = [self.plot_lines[0], self.plot_lines[1], self.plot_lines[2], self.plot_lines[3]]
        if animation:
            return_list = [self.acrobot, traj_title] + return_list
        return return_list

    def animate(i):
        """Draw the episode up to frame ``i`` (step ``act_per_frame * i``)."""
        i = act_per_frame * i
        if animation:
            # theta is in degrees; theta == 0 draws the rod pointing straight down.
            thisx = [0,  self.l * np.sin(theta[i]*np.pi/180)]
            thisy = [0, -self.l * np.cos(theta[i]*np.pi/180)]

            self.acrobot.set_data(thisx, thisy)
            traj_title.set_text('Trajectory (time= %.2f)'%(i*self.time_step))

        for j,curr_line in enumerate(self.plot_lines):
            curr_data = plot_data_list[j]
            # The slice [:i] excludes sample i itself.
            curr_line.set_data(time_array[:i], curr_data[:i])

        # NOTE(review): same four-line return list as in init(); see the note there.
        return_list = [self.plot_lines[0], self.plot_lines[1], self.plot_lines[2], self.plot_lines[3]]
        if animation:
            return_list = [self.acrobot, traj_title] + return_list
        return return_list

    if animation:
        final_idx = int(len(theta)/act_per_frame)
        self.ani = animation_plt.FuncAnimation(self.figure, animate, np.arange(final_idx),
                                           interval=25, blit=True, init_func=init)
        if create_mpy:
            duration =  (final_idx * self.time_step)
            # The dpi is lowered only while the VideoClip object is constructed and
            # restored right after.
            old_dpi = self.figure.get_dpi()
            self.figure.set_dpi(20)
            def make_frame(t):
                animate(int(t/self.time_step))
                return mplfig_to_npimage(self.figure)
            self.mpy_ani = VideoClip(make_frame, duration=duration)
            self.figure.set_dpi(old_dpi)

    else:
        # Static plot: draw everything up to (but, because of the [:i] slice,
        # not including) the last sample.
        animate(len(theta)-1)
        
def mycumprodsum(my_delta, my_gamma):
    """Discounted running sum along the first axis.

    Computes ``out[t] = sum_{k <= t} my_gamma**(t - k) * my_delta[k]`` by
    scaling with ``(1/my_gamma)**t``, taking a cumulative sum and undoing the
    scaling.

    Args:
        my_delta: 1-D torch tensor, or anything ``np.array`` accepts.
        my_gamma: Non-zero decay factor.

    Returns:
        A float torch tensor when ``my_delta`` is a tensor (on the same device),
        otherwise a numpy array.

    Note:
        Because of the ``(1/my_gamma)**t`` scaling the intermediate values grow
        geometrically, so very long sequences with a small ``my_gamma`` can
        overflow.
    """
    if torch.is_tensor(my_delta):
        my_delta = my_delta.to(dtype=torch.float)
        steps = torch.arange(my_delta.numel()).to(my_delta)
        # torch.pow keeps the whole computation in torch. The previous np.power
        # call forced a tensor -> numpy conversion, which fails for CUDA tensors.
        scale = torch.pow(float(1./my_gamma), steps)
        scaled_sum = torch.cumsum(my_delta * scale, dim=0)
        return scaled_sum / scale
    else:
        my_delta = np.array(my_delta)
        scale = np.power(1./my_gamma, np.arange(my_delta.size))
        scaled_sum = np.cumsum(my_delta * scale)
        return scaled_sum / scale

def mycumprodsum_rev(my_delta, my_gamma): 
    """Backward-in-time counterpart of ``mycumprodsum``.

    The input is reversed, passed through ``mycumprodsum`` and the result is
    reversed again, so entry ``t`` accumulates the entries from ``t`` onwards.
    Tensors are reversed with ``flip(0)``, everything else with ``[::-1]``.
    """
    if not torch.is_tensor(my_delta):
        backwards = my_delta[::-1]
        return mycumprodsum(backwards, my_gamma)[::-1]

    backwards = my_delta.flip(0)
    return mycumprodsum(backwards, my_gamma).flip(0)
        

In [4]:
class tester_env_class():
    """Evaluation harness: owns one environment, rolls a policy out in it and plots the episode."""

    def __init__(self, Environment_Maker):
        """Create the evaluation environment and attach the plotting helpers to it.

        Args:
            Environment_Maker: Zero-argument callable returning a gym environment.
                The returned object must expose ``env.dt`` (read as the time step).
        """
        test_env = Environment_Maker()

        #Gym specific
        # Attributes read by plt_render / get_angle once they are bound below.
        test_env.use_triangular_states = True
        test_env.time_step = test_env.env.dt
        test_env.l = 1
        test_env.m = 1

        test_env.get_angle = types.MethodType( get_angle, test_env)
        test_env.plt_render = types.MethodType( plt_render, test_env )

        test_env.exp_history=[]
        self.env = test_env
    
    def __call__(self, main_net, steps=200, reward_shaping = lambda x: x):
        """Run one episode of exactly ``steps`` steps and render it as a static plot.

        Args:
            main_net: Callable mapping a batched state tensor to
                ``(action mean, action log-std, state value)``.
            steps: Number of environment steps; the environment's own ``done``
                flag is ignored.
            reward_shaping: Callable applied to every reward before it is stored.

        The episode is appended to ``self.env.exp_history``; nothing is returned.
        """
        test_env = self.env
        s = test_env.reset()
        test_env.exp_history.append({'states':[], 'actions':[], 'rewards':[], 
                                     'next_states':[], 'std(a)':[], 'V(s)':[]})
        episode = test_env.exp_history[-1]
        
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        for t in range(steps):
            with torch.no_grad():
                s_tensor = torch.from_numpy(s).to(device=device, dtype=torch.float)
                s_tensor = s_tensor.unsqueeze(0)
                mean_act, logstd_act, value_state = main_net(s_tensor)
                # Drop the batch dimension that was added for the forward pass.
                mean_act.squeeze_(0)
                logstd_act.squeeze_(0)
                value_state.squeeze_(0)
                chosen_act = torch.randn_like(logstd_act) * torch.exp(logstd_act) + mean_act
            
            # .cpu() is a no-op on CPU tensors but is required before .numpy()
            # when the network outputs live on a CUDA device.
            chosen_act_np = chosen_act.cpu().numpy()
            chosen_act_np = np.clip(chosen_act_np, -2, 2)
            # The done/info values are deliberately discarded: the rollout always
            # lasts `steps` steps.
            s_prime, r, _, _ = test_env.step(chosen_act_np)
            r = reward_shaping(r)

            episode['states'].append(s)
            episode['actions'].append(chosen_act_np)
            episode['rewards'].append(r)
            episode['std(a)'].append(torch.exp(logstd_act).cpu().numpy())
            episode['V(s)'].append(value_state.cpu().numpy())
            episode['next_states'].append(s_prime)

            s = s_prime
                
        test_env.plt_render(animation=False, exp_index = -1)
        show_inline_matplotlib_plots()
        clear_output()

In [5]:
class mean_std_val_net_v1(nn.Module):
    """MLP with three LeakyReLU hidden layers of 20, 100 and 50 units.

    Inputs are reshaped to ``(-1, *state_space_shape)`` and outputs to
    ``(-1, *action_space_shape)``.
    """

    def __init__(self, state_space_shape, action_space_shape):
        super().__init__()
        self.state_space_shape = state_space_shape
        self.action_space_shape = action_space_shape
        n_inputs = np.prod(state_space_shape)
        n_outputs = np.prod(action_space_shape)
        # Attribute names (including the l1 -> l2 -> l4 -> l3 ordering) define the
        # state_dict keys and the parameter creation order, so they are kept as is.
        self.l1 = nn.Linear(n_inputs, 20)
        self.l1_relu = nn.LeakyReLU()
        self.l2 = nn.Linear(20, 100)
        self.l2_relu = nn.LeakyReLU()
        self.l4 = nn.Linear(100, 50)
        self.l4_relu = nn.LeakyReLU()
        self.l3 = nn.Linear(50, n_outputs)

    def forward(self, input):
        """Apply the layer stack to ``input`` and reshape to the output shape."""
        hidden = input.reshape(-1, *self.state_space_shape)
        pipeline = (self.l1, self.l1_relu,
                    self.l2, self.l2_relu,
                    self.l4, self.l4_relu,
                    self.l3)
        for stage in pipeline:
            hidden = stage(hidden)
        return hidden.reshape(-1, *self.action_space_shape)

class mean_std_val_net_v2(nn.Module):
    """MLP with a single ReLU hidden layer of 100 units.

    Inputs are reshaped to ``(-1, *state_space_shape)`` and outputs to
    ``(-1, *action_space_shape)``.
    """

    def __init__(self, state_space_shape, action_space_shape):
        super().__init__()
        self.state_space_shape = state_space_shape
        self.action_space_shape = action_space_shape
        n_inputs = np.prod(state_space_shape)
        n_outputs = np.prod(action_space_shape)

        # Attribute names define the state_dict keys and are kept unchanged.
        self.l1 = nn.Linear(n_inputs, 100)
        self.l1_relu = nn.ReLU()
        self.l2 = nn.Linear(100, n_outputs)

    def forward(self, input):
        """Apply linear -> ReLU -> linear and reshape to the output shape."""
        flat_states = input.reshape(-1, *self.state_space_shape)
        hidden = self.l1_relu(self.l1(flat_states))
        return self.l2(hidden).reshape(-1, *self.action_space_shape)

class mean_std_val_net_v3(nn.Module):
    """MLP with a configurable stack of ReLU hidden layers.

    Args:
        state_space_shape: Shape of one input sample; inputs are reshaped to
            ``(-1, *state_space_shape)``.
        out_space_shape: Shape of one output sample; outputs are reshaped to
            ``(-1, *out_space_shape)``.
        hidden_layers_units: Width of each hidden layer, in order. An empty
            sequence yields a single linear layer.
    """

    def __init__(self, state_space_shape, out_space_shape, hidden_layers_units = [100]):
        super().__init__()
        self.state_space_shape = state_space_shape
        self.out_space_shape = out_space_shape
        # Copy so the instance never aliases the shared default list.
        self.hidden_layers_units = list(hidden_layers_units)
        inp_dim = int(np.prod(state_space_shape))
        out_dim = int(np.prod(out_space_shape))
        
        # nn.ModuleList registers the layers as sub-modules. With a plain Python
        # list their weights were invisible to parameters(), state_dict(),
        # .to(device) and .apply(), so the network could neither be trained nor saved.
        self.layers = nn.ModuleList()
        last_dim = inp_dim
        for units in self.hidden_layers_units:
            self.layers.append(nn.Linear(last_dim, units))
            self.layers.append(nn.ReLU())
            last_dim = units
        self.layers.append(nn.Linear(last_dim, out_dim))

    def forward(self, myinput):
        """Run ``myinput`` through every layer in order and reshape the result."""
        output = myinput.reshape(-1, *self.state_space_shape)
        for layer in self.layers:
            output = layer(output)
        return output.reshape(-1, *self.out_space_shape)

def init_weights(m): 
    """Initialise the weights of a single module; meant for ``Module.apply``.

    * classes whose name contains ``'Linear'`` and ``nn.Conv2d``: Xavier-uniform
      weights, zero bias;
    * classes whose name contains ``'BatchNorm2d'``: weights drawn from
      N(1.0, 0.02), zero bias;
    * every other module is left untouched.

    Bias (and, for batch norm, weight) tensors that are ``None`` — e.g. layers
    built with ``bias=False`` or ``affine=False`` — are skipped instead of
    raising ``AttributeError``.
    """
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.Conv2d):
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0)
    elif classname.find('BatchNorm2d') != -1:
        if m.weight is not None:
            torch.nn.init.normal_(m.weight, 1.0, 0.02)
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.0)

# Shared loss / activation modules used by the training code and the default
# hyperparameter lambdas.
# 'mean' is the current spelling of the deprecated 'elementwise_mean' reduction;
# both average the squared error over all elements.
mse_loss = torch.nn.MSELoss(reduction='mean')
softplus = nn.Softplus()
tanh = nn.Tanh()

In [6]:
def run_ppo_experiment( N = 3, 
                        T = 32,
                        K = 10,
                        gamma = 0.9,
                        lamda = 1,
                        epsilon = 0.2,
                        num_loops = 10000,
                        batch_size = 32,
                        c_1 = 2,
                        c_2 = 0,
                        seperate_val_net = True,
                        reward_shaping = lambda r: (r+8)/8.,
                        action_sigma = None,
                        max_ep_len = 480,
                        learning_rate = 0.00001,
                        action_mean_transformation = lambda proposed_act_mean: 2 * tanh(proposed_act_mean),
                        action_std_transformation = lambda proposed_act_std: softplus(proposed_act_std),
                        neural_net_maker = lambda s_dim, out_dim: mean_std_val_net_v3(s_dim, out_dim, hidden_layers_units = [100]),
                        Environment_Maker = lambda : gym.make('Pendulum-v0') ):


    #Creating the environments
    env_list = [Environment_Maker() for _ in range(N)]
    for i_thread in range(N):
        env = env_list[i_thread]
        env.last_state = env.reset()
        env.my_timer = 0
        env.exp_history = []
        env.need_reset=True

    #Creating the network
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    out_act_factor = 2 if action_sigma is None else 1
    if not seperate_val_net:
        main_nnet = mean_std_val_net_v1(state_space_shape = get_space_shape(env.observation_space),
                                        action_space_shape = [out_act_factor * np.prod(get_space_shape(env.action_space)) + 1]).to(device)
        main_nnet.apply(init_weights)
        main_optimizer = torch.optim.Adam(main_nnet.parameters(), lr=learning_rate)

        act_shape = [int(x) for _,x in enumerate(get_space_shape(env.action_space))]
        def main_net(x):
            net_out = main_nnet(x)
            act_means = net_out[...,0:np.prod(act_shape)].reshape(-1,*act_shape)
            act_means = action_mean_transformation(act_means)
            if action_sigma is None:
                act_stds = net_out[...,np.prod(act_shape):(2*np.prod(act_shape))].reshape(-1,*act_shape)
                act_stds = torch.log( action_std_transformation(act_stds) )
            else:
                act_stds = torch.full_like(act_means, fill_value=float(np.log(action_sigma)), 
                                           dtype=torch.float, device=device, requires_grad=False)
            state_values = net_out[...,-1]
            return act_means, act_stds, state_values
    else:
        main_nnet = mean_std_val_net_v2(state_space_shape = get_space_shape(env.observation_space),
                                        action_space_shape = [out_act_factor * np.prod(get_space_shape(env.action_space))]).to(device)
        val_nnet = mean_std_val_net_v2(state_space_shape = get_space_shape(env.observation_space), action_space_shape = [1]).to(device)

        main_nnet.apply(init_weights)
        val_nnet.apply(init_weights)

        main_optimizer = torch.optim.Adam(list(main_nnet.parameters()) + list(val_nnet.parameters()), lr=learning_rate)

        act_shape = [int(x) for _,x in enumerate(get_space_shape(env.action_space))]

        my_softplus = nn.Softplus()
        def main_net(x):
            net_out = main_nnet(x)
            val_out = val_nnet(x)
            act_means = net_out[...,0:np.prod(act_shape)].reshape(-1,*act_shape)
            act_means = action_mean_transformation(act_means)
            if action_sigma is None:
                act_stds = net_out[...,np.prod(act_shape):(2*np.prod(act_shape))].reshape(-1,*act_shape)
                act_stds = torch.log( action_std_transformation(act_stds) )
            else:
                act_stds = torch.full_like(act_means, fill_value=float(np.log(action_sigma)), 
                                           dtype=torch.float, device=device, requires_grad=False)
            state_values = val_out[...,-1]
            return act_means, act_stds, state_values

    #Creating the test env
    test_env = tester_env_class(Environment_Maker = Environment_Maker)

    #Other stuff
    loss_hist = []
    other_hist = []
    learning_vidmaker = figure_compiler_video()

    %matplotlib inline
    np.set_printoptions(precision=2, suppress=True)

    #Training
    for i_loop in range(num_loops):
        #Running the simulations using the old policy
        for i_thread in range(N):
            env = env_list[i_thread]
            if env.need_reset:
                env.last_state = env.reset()
                env.my_timer = 0
                env.need_reset = False
            t_start = env.my_timer
            env.exp_history.append({})
            while env.my_timer < t_start + T:
                s = env.last_state
                with torch.no_grad():
                    s_tensor = torch.from_numpy(s).to(device=device, dtype=torch.float)
                    s_tensor = s_tensor.unsqueeze(0)
                    mean_act, logstd_act, value_state = main_net(s_tensor)
                    mean_act.squeeze_(0), logstd_act.squeeze_(0), value_state.squeeze_(0)
                    chosen_act = torch.randn_like(logstd_act) * torch.exp(logstd_act) + mean_act

                chosen_act_np = chosen_act.numpy()
                s_prime, r, done, info = env.step(chosen_act_np)
                r = reward_shaping(r)
                done = env.my_timer > (max_ep_len-2)

                with torch.no_grad():
                    s_new_tensor = torch.from_numpy(s_prime).to(device=device, dtype=torch.float)
                    s_new_tensor = s_new_tensor.unsqueeze(0)
                    _, _, value_next_state = main_net(s_new_tensor)
                    value_next_state.squeeze_(0)

                sample_data = {'state': np.array(s, copy=True),
                               'next state': np.array(s_prime, copy=True),
                               'action': chosen_act_np,
                               'reward': r,
                               'done' : done,
                               'time' : env.my_timer,
                               'network action mean': mean_act.numpy(),
                               'network action logstd': logstd_act.numpy(),
                               'network state value': value_state.numpy(),
                               'network next state value': value_next_state.numpy()}

                dict_append(env.exp_history[-1],sample_data)
                env.last_state = s_prime
                env.my_timer += 1

                if done:
                    env.need_reset=True
                    break

        data = []
        for i_thread in range(N):
            env = env_list[i_thread]
            #Computing delta and advantage values
            r_t = np.array(env.exp_history[-1]['reward'])
            v_st = np.array(env.exp_history[-1]['network state value'])
            v_st_plus_one = np.array(env.exp_history[-1]['network next state value'])

            with torch.no_grad():
                r_t_tensor = torch.from_numpy(r_t).to(device=device, dtype=torch.float)
                v_st1_tensor = torch.from_numpy(v_st_plus_one).to(device=device, dtype=torch.float).reshape(-1)
                v_st_tensor = torch.from_numpy(v_st).to(device=device, dtype=torch.float).reshape(-1)
                delta = r_t_tensor +  gamma * v_st1_tensor - v_st_tensor
                A_hat_t = mycumprodsum_rev(delta, lamda*gamma)

                mu_old = torch.from_numpy(np.array(env.exp_history[-1]['network action mean'])).\
                            to(device=device, dtype=torch.float)
                logstd_old = torch.from_numpy(np.array(env.exp_history[-1]['network action logstd'])).\
                                to(device=device, dtype=torch.float)
                a_t = torch.from_numpy(np.array(env.exp_history[-1]['action'])).to(device=device, dtype=torch.float)
                s_t = torch.from_numpy(np.array(env.exp_history[-1]['state'])).to(device=device, dtype=torch.float)
                s_t1 = torch.from_numpy(np.array(env.exp_history[-1]['next state'])).to(device=device, dtype=torch.float)

                log_pi_old =  (torch.pow(a_t - mu_old,2))/((-2) * torch.exp(2 * logstd_old)) - logstd_old 
                log_pi_old = log_pi_old.sum(dim=1)  

            data.append([A_hat_t, log_pi_old, a_t, s_t, s_t1, r_t_tensor, logstd_old])

        with torch.no_grad():
            A_hat_t = torch.cat([d[0] for _,d in enumerate(data)], dim=0)
            log_pi_old = torch.cat([d[1] for _,d in enumerate(data)], dim=0)
            a_t = torch.cat([d[2] for _,d in enumerate(data)], dim=0)
            s_t = torch.cat([d[3] for _,d in enumerate(data)], dim=0)
            s_t1 = torch.cat([d[4] for _,d in enumerate(data)], dim=0)
            r_t = torch.cat([d[5] for _,d in enumerate(data)], dim=0)
            log_std_old_t = torch.cat([d[6] for _,d in enumerate(data)], dim=0)

        loss_hist.append({})
        other_hist.append({'Action std': np.exp(log_std_old_t.numpy()),
                           'Reward': r_t.numpy().mean()})
        for k in range(K):
            ep_perm = torch.randperm(s_t.shape[0])

            it_per_ep = np.ceil(s_t.shape[0]/batch_size)
            for batch_iter in range(int(it_per_ep)):
                main_nnet.zero_grad()
                if seperate_val_net:
                    val_nnet.zero_grad()

                with torch.no_grad():
                    curr_idx = ep_perm[batch_iter * batch_size: (batch_iter+1) * batch_size]
                    curr_s_t = torch.index_select(s_t , dim = 0 , index = curr_idx)
                    curr_s_t1 = torch.index_select(s_t1 , dim = 0 , index = curr_idx)
                    curr_a_t = torch.index_select(a_t , dim = 0 , index = curr_idx)
                    curr_r_t = torch.index_select(r_t , dim = 0 , index = curr_idx)
                    curr_logpi_old = torch.index_select(log_pi_old , dim = 0 , index = curr_idx)
                    curr_A_hat_t = torch.index_select(A_hat_t , dim = 0 , index = curr_idx)

                curr_mu_new, curr_logstd_new, curr_v_st = main_net(curr_s_t)

                log_pi_new = torch.pow(curr_a_t - curr_mu_new, 2) / (-2 * torch.exp(2 * curr_logstd_new)) - curr_logstd_new 
                log_pi_new = log_pi_new.sum(dim=1)  

                log_pi_diff = log_pi_new - curr_logpi_old
                pi_ratio = torch.exp(log_pi_diff)

                loss_clip_vec = torch.min(pi_ratio * curr_A_hat_t, 
                                          torch.clamp(pi_ratio, min=1-epsilon, max=1+epsilon) * curr_A_hat_t)

                loss_clip = loss_clip_vec.mean()

                _, _, curr_v_st1 = main_net(curr_s_t1)
                with torch.no_grad():
                    target_v = curr_v_st1 * gamma + curr_r_t
                loss_VF = mse_loss(curr_v_st, target_v)

                if c_2:
                    loss_Entropy = torch.sum(curr_logstd_new + torch.log(torch.tensor(2*np.pi)))
                    total_loss = c_1 * loss_VF - loss_clip - c_2 * loss_Entropy
                else:
                    total_loss = c_1 * loss_VF - loss_clip

                total_loss.backward()
                main_optimizer.step()

                dict_append(loss_hist[-1], {'Total loss': total_loss.detach().numpy(),
                                            'Clip loss': loss_clip.detach().numpy(),
                                            'Value Function loss' : loss_VF.detach().numpy()})            
                if c_2:
                    dict_append(loss_hist[-1], {'Entropy loss' : loss_Entropy.detach().numpy()})

        if i_loop%100 == 0:
            test_env(main_net, reward_shaping = reward_shaping)
            learning_vidmaker.add_figure(test_env.env.figure)
        
        if i_loop%10 == 0:
            print('Loop ' +str(i_loop) + '--> ' + str([key + ': ' +str(np.mean(val))
                                                       + ', ' for key,val in loss_hist[-1].items()] + 
                                                     [key + ': ' +str(np.mean(val))
                                                       + ', ' for key,val in other_hist[-1].items()])) 

    store_prefix = 'Experiments/Pendulum'
    #Storing the results
    ctr=0
    while True:
        if not os.path.exists(store_prefix + '/' + str(ctr)):
            break
        ctr += 1
    

    store_folder = store_prefix + '/' + str(ctr)
    mkdir_p(store_folder)


    #Creating the loss figure
    fig = plt.figure(figsize=(20,10))
    ax = plt.subplot2grid((1, 2), (0, 0))
    ax2 = plt.subplot2grid((1, 2), (0, 1))
    for my_label in loss_hist[0].keys():
        x = range(len(loss_hist))
        y = [np.mean(loss_hist[t][my_label]) for _,t in enumerate(x)]
        ax.plot(x, y, label=my_label)
        
    for my_label in other_hist[0].keys():
        x = range(len(other_hist))
        y = [np.mean(other_hist[t][my_label]) for _,t in enumerate(x)]
        ax2.plot(x, y, label=my_label)

    ax.legend(), ax2.legend()
    ax.set_title('Mean Losses vs mini-episode')
    ax2.set_title('Statistics vs mini-episode')
    show_inline_matplotlib_plots()
    fig.savefig(store_folder+'/LossPlot.png')
    clear_output()

    #Learning Samples Clip
    learning_clip = learning_vidmaker(fps=1)
    learning_clip.write_videofile(store_folder+'/LearningSamples.mp4')

    #The network
    store_material={'cuda':main_nnet.state_dict(), 'cpu':main_nnet.cpu().state_dict()}
    torch.save(store_material, store_folder+'/model.pth')

    #The animation
    test_env.env.plt_render(animation=True, act_per_frame = 1)
    show_inline_matplotlib_plots()
    clear_output()
    matplotlib.rcParams['animation.embed_limit'] = 80
    %time test_env.env.ani.save(store_folder+'/Trajectory.gif', writer=animation_plt.PillowWriter(fps=10))
    gifclip = mpy.VideoFileClip(store_folder+'/Trajectory.gif')
    gifclip.write_videofile(store_folder+'/Trajectory.mp4')
    
    kwargs = {  'N' : N, 
                'T' : T,
                'K' : K,
                'gamma' : gamma,
                'lamda' : lamda,
                'epsilon' : epsilon,
                'Number of loops' : num_loops,
                'Batch size' : batch_size,
                'c_1' : c_1,
                'c_2' : c_2,
                'Seperate Value Network' : seperate_val_net,
                'Reward Shaping' : reward_shaping,
                'Action Sigma' : action_sigma,
                'Max episode length' : max_ep_len,
                'Learning rate' : learning_rate,
                'Action mean transformation' : action_mean_transformation,
                'Action std transformation' : action_std_transformation,
                'Neural net maker' : neural_net_maker,
                'Environment Maker' : Environment_Maker  }
    kwargs_str = {}
    for key, val in kwargs.items():
        if callable(val):
            kwargs_str[str(key)]=str(inspect.getsource(val))
        else:
            kwargs_str[str(key)]=str(val)
    
    for key, val in kwargs_str.items():
        print(key + ': ' + val, file=open(store_folder+'/Hyperparameters.txt', "w"))
    
    for key, val in kwargs_str.items():
        print(key + ': ' + val, file=open(store_prefix + '/ExperimentsIndex.txt', "a+"))
    print('----------------------', file=open(store_prefix + '/ExperimentsIndex.txt', "a+"))
    
    clear_output(wait=True)
    print('Resuls are stored in : ' + store_folder)
    return gifclip

In [7]:
# Baseline: train with every hyperparameter at its default value and show the
# returned trajectory clip inline.
run_ppo_experiment().ipython_display()

Resuls are stored in : Experiments/Pendulum/0


100%|█████████▉| 200/201 [00:01<00:00, 143.30it/s]


In [8]:
# Results using a single simulator per loop (N = 1 instead of the default 3).
run_ppo_experiment(N = 1).ipython_display()

Resuls are stored in : Experiments/Pendulum/1


100%|█████████▉| 200/201 [00:01<00:00, 161.87it/s]


In [9]:
# Results using a larger discount rate (gamma = 0.99 instead of 0.9).
# Note that K is also raised from its default of 10 to 20 in this run.
run_ppo_experiment(K = 20, gamma = 0.99).ipython_display()

Resuls are stored in : Experiments/Pendulum/2


100%|█████████▉| 200/201 [00:01<00:00, 128.44it/s]


In [10]:
# Results using a smaller clipping range (epsilon = 0.02 instead of 0.2).
run_ppo_experiment(epsilon = 0.02).ipython_display()

Resuls are stored in : Experiments/Pendulum/3


100%|█████████▉| 200/201 [00:01<00:00, 143.04it/s]


In [11]:
# Results with the entropy term enabled (c_2 = 0.01; the default 0 disables it).
run_ppo_experiment(c_2 = 0.01).ipython_display()

Resuls are stored in : Experiments/Pendulum/4


100%|█████████▉| 200/201 [00:01<00:00, 144.99it/s]


In [12]:
# Results when policy and value estimates come from one shared network
# (seperate_val_net = False) instead of two separate networks.
run_ppo_experiment(seperate_val_net = False).ipython_display()

Resuls are stored in : Experiments/Pendulum/5


100%|█████████▉| 200/201 [00:01<00:00, 73.79it/s]


In [13]:
# Results using a fixed action standard deviation of 0.3 instead of one
# predicted by the network.
run_ppo_experiment(action_sigma=0.3).ipython_display()

Resuls are stored in : Experiments/Pendulum/6


100%|█████████▉| 200/201 [00:01<00:00, 121.42it/s]


In [14]:
# Results using shorter episodes (max_ep_len = 240 instead of 480).
# Note that num_loops is also doubled from its default of 10000 to 20000.
run_ppo_experiment(max_ep_len = 240, num_loops = 20000).ipython_display()

Resuls are stored in : Experiments/Pendulum/7


100%|█████████▉| 200/201 [00:02<00:00, 95.21it/s]


In [15]:
# Results using no reward shaping: rewards are passed through unchanged instead
# of the default (r+8)/8 rescaling.
run_ppo_experiment(reward_shaping = lambda r: r).ipython_display()

Resuls are stored in : Experiments/Pendulum/8


100%|█████████▉| 200/201 [00:01<00:00, 123.97it/s]


In [16]:
# Results using no action mean transformation/limitation: the raw network
# output is used as the mean instead of the default 2*tanh squashing.
run_ppo_experiment(action_mean_transformation = lambda proposed_act_mean: proposed_act_mean).ipython_display()

Resuls are stored in : Experiments/Pendulum/9


100%|█████████▉| 200/201 [00:01<00:00, 134.62it/s]


In [17]:
#Results using no action std transformation/limitation:
# the std is obtained as exp(network output), i.e. the raw output is treated as a log-std
run_ppo_experiment(action_std_transformation = lambda proposed_act_std: torch.exp(proposed_act_std)).ipython_display()

Resuls are stored in : Experiments/Pendulum/10


100%|█████████▉| 200/201 [00:01<00:00, 105.41it/s]


In [18]:
#Results using deeper nets: three hidden layers with 50, 100 and 20 units
# NOTE: this run also passes K = 30.
run_ppo_experiment(neural_net_maker = lambda s_dim, out_dim: 
                   mean_std_val_net_v3(s_dim, out_dim, hidden_layers_units = [50,100,20]),
                   K=30).ipython_display()

Resuls are stored in : Experiments/Pendulum/11


100%|█████████▉| 200/201 [00:01<00:00, 138.80it/s]


In [None]:
#Default PPO Hyperparameters
# NOTE(review): this dict is not consumed by any code visible in this part of the notebook;
# presumably it mirrors the keyword defaults of run_ppo_experiment -- confirm.
# The per-entry notes describe how the same-named variables are used by the training-script cell.
default_hyperparams = dict(
    N = 3, #Simulator Threads: number of environments stepped in every loop
    T = 32, #Run sim for this many actions at most
    K = 10, #Number of passes over the collected samples in every loop
    gamma = 0.9, #Discount factor
    lamda = 1, #Multiplied with gamma when the advantages are accumulated
    epsilon = 0.2, #Clipping range of the probability ratio
    num_loops = 10000, #Number of collect-and-update loops
    batch_size = 32, #Minibatch size of the updates
    c_1 = 2, #Weight of the value-function loss
    c_2 = 0,  # 0.01 ; weight of the entropy term (a zero value disables the term)
    seperate_val_net = True, #Use a second network for the state value
    reward_shaping = lambda r: (r+8)/8., #Applied to every reward returned by the environment
    action_sigma = None, #0.3 ; None: the std comes from the network, otherwise this fixed std is used

    max_ep_len = 480, #Cap on the episode length
    learning_rate = 0.00001, #Learning rate of the Adam optimizer

    # NOTE: tanh and softplus are bare names here; they must be defined elsewhere in the notebook.
    action_mean_transformation = lambda proposed_act_mean: 2 * tanh(proposed_act_mean),
    action_std_transformation = lambda proposed_act_std: softplus(proposed_act_std),

    neural_net_maker = lambda s_dim, out_dim: mean_std_val_net_v3(s_dim, out_dim, hidden_layers_units = [100]),
    Environment_Maker = lambda : gym.make('Pendulum-v0'),
)


In [None]:
#PPO Hyperparameters
# Module-level settings read by the training-script cell that follows.
N = 3 #Simulator Threads: number of environments; they are stepped one after another in a Python loop
T = 32 #Run sim for this many actions at most
K = 10 # number of passes over the collected samples in every loop
gamma = 0.9 # discount factor used in the TD residuals and in the value target
lamda = 1 # multiplied with gamma when the advantages are accumulated
epsilon = 0.2 # the probability ratio is clamped to [1-epsilon, 1+epsilon]
num_loops = 10000 # NOTE: the training-script cell overrides this with 200
batch_size = 32 # minibatch size of the updates
c_1 = 2 # weight of the value-function loss
c_2 = 0  # 0.01 ; weight of the entropy term (a zero value disables the term)
seperate_val_net = True # True: a second network produces the state value
reward_shaping = lambda r: (r+8)/8. # applied to every reward returned by the environment
action_sigma = None #0.3 ; None: the std comes from the network, otherwise this fixed std is used

max_ep_len = 480 # an episode is ended once its step counter exceeds max_ep_len-2
learning_rate = 0.00001 # learning rate of the Adam optimizer

# NOTE: tanh and softplus are bare names here; they must be defined elsewhere in the notebook.
action_mean_transformation = lambda proposed_act_mean: 2 * tanh(proposed_act_mean)
action_std_transformation = lambda proposed_act_std: softplus(proposed_act_std)

# NOTE(review): neural_net_maker is not referenced by the training-script cell, which
# instantiates mean_std_val_net_v1 / mean_std_val_net_v2 directly.
neural_net_maker = lambda s_dim, out_dim: mean_std_val_net_v3(s_dim, out_dim, hidden_layers_units = [100])
Environment_Maker = lambda : gym.make('Pendulum-v0')

In [None]:
if True:
    # Short run for this script cell; replaces the num_loops value set in the hyperparameter cell.
    num_loops = 200
    #Creating the environments
    # Each environment carries its own bookkeeping attributes: the last observed state,
    # a step counter, the list of recorded rollouts and a flag requesting a reset.
    env_list = [Environment_Maker() for _ in range(N)]
    for i_thread, env in enumerate(env_list):
        env.last_state = env.reset()
        env.my_timer, env.exp_history, env.need_reset = 0, [], True

    #Creating the network
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    
    # The policy head emits a mean and a std per action dimension unless the std is fixed.
    out_act_factor = 1 if action_sigma is not None else 2
    if seperate_val_net:
        # Two networks: one for the action distribution, one for the state value.
        main_nnet = mean_std_val_net_v2(state_space_shape = get_space_shape(env.observation_space),
                                        action_space_shape = [out_act_factor * np.prod(get_space_shape(env.action_space))]).to(device)
        val_nnet = mean_std_val_net_v2(state_space_shape = get_space_shape(env.observation_space), action_space_shape = [1]).to(device)

        main_nnet.apply(init_weights)
        val_nnet.apply(init_weights)

        # A single optimizer updates the parameters of both networks.
        main_optimizer = torch.optim.Adam(list(main_nnet.parameters()) + list(val_nnet.parameters()), lr=learning_rate)

        act_shape = [int(dim) for dim in get_space_shape(env.action_space)]

        my_softplus = nn.Softplus()
        def main_net(x):
            """Return (action means, action log-stds, state values) for a batch of states x."""
            policy_out = main_nnet(x)
            value_out = val_nnet(x)
            n_act = np.prod(act_shape)
            act_means = action_mean_transformation(policy_out[..., :n_act].reshape(-1, *act_shape))
            if action_sigma is not None:
                # Fixed std: a constant log-std tensor shaped like the means.
                act_stds = torch.full_like(act_means, fill_value=float(np.log(action_sigma)), 
                                           dtype=torch.float, device=device, requires_grad=False)
            else:
                raw_stds = policy_out[..., n_act:(2 * n_act)].reshape(-1, *act_shape)
                act_stds = torch.log(action_std_transformation(raw_stds))
            return act_means, act_stds, value_out[..., -1]
    else:
        # One network: its last output is the state value, the others describe the actions.
        main_nnet = mean_std_val_net_v1(state_space_shape = get_space_shape(env.observation_space),
                                        action_space_shape = [out_act_factor * np.prod(get_space_shape(env.action_space)) + 1]).to(device)
        main_nnet.apply(init_weights)
        main_optimizer = torch.optim.Adam(main_nnet.parameters(), lr=learning_rate)

        act_shape = [int(dim) for dim in get_space_shape(env.action_space)]
        def main_net(x):
            """Return (action means, action log-stds, state values) for a batch of states x."""
            joint_out = main_nnet(x)
            n_act = np.prod(act_shape)
            act_means = action_mean_transformation(joint_out[..., :n_act].reshape(-1, *act_shape))
            if action_sigma is not None:
                # Fixed std: a constant log-std tensor shaped like the means.
                act_stds = torch.full_like(act_means, fill_value=float(np.log(action_sigma)), 
                                           dtype=torch.float, device=device, requires_grad=False)
            else:
                raw_stds = joint_out[..., n_act:(2 * n_act)].reshape(-1, *act_shape)
                act_stds = torch.log(action_std_transformation(raw_stds))
            return act_means, act_stds, joint_out[..., -1]

    #Creating the test env
    # tester_env_class is defined elsewhere in the notebook; the instance is called with the
    # policy every 100 training loops and exposes the rendered figure as test_env.env.figure.
    test_env = tester_env_class()

    #Other stuff
    loss_hist = []   # one dict per training loop, holding the per-minibatch loss values
    other_hist = []  # one dict per training loop, holding the action stds and the mean reward
    learning_vidmaker = figure_compiler_video()  # collects the test figures for the learning video

    %matplotlib inline
    np.set_printoptions(precision=2, suppress=True)

    #Training
    # Every outer loop
    #   (a) rolls the current policy out for at most T steps in each of the N simulators,
    #   (b) turns the fresh rollouts into advantages and old-policy log-probabilities, and
    #   (c) runs K passes of minibatch updates on the clipped surrogate + value loss.
    # Tensors are moved to the CPU before every NumPy conversion and the shuffling indices are
    # moved to `device`, so the loop also runs when `device` is a GPU.
    for i_loop in range(num_loops):
        #Running the simulations using the old policy
        for i_thread in range(N):
            env = env_list[i_thread]
            if env.need_reset:
                env.last_state = env.reset()
                env.my_timer = 0
                env.need_reset = False
            t_start = env.my_timer
            # A new, empty record for the rollout collected in this loop
            env.exp_history.append({})
            while env.my_timer < t_start + T:
                s = env.last_state
                with torch.no_grad():
                    s_tensor = torch.from_numpy(s).to(device=device, dtype=torch.float)
                    s_tensor = s_tensor.unsqueeze(0)
                    mean_act, logstd_act, value_state = main_net(s_tensor)
                    mean_act.squeeze_(0), logstd_act.squeeze_(0), value_state.squeeze_(0)
                    # Sample from the Gaussian policy: mean + std * standard normal noise
                    chosen_act = torch.randn_like(logstd_act) * torch.exp(logstd_act) + mean_act

                # .cpu() is needed before .numpy() whenever the tensor lives on a GPU
                chosen_act_np = chosen_act.cpu().numpy()
                s_prime, r, done, info = env.step(chosen_act_np)
                r = reward_shaping(r)
                # The done flag returned by the environment is discarded; an episode ends
                # only when the step counter exceeds max_ep_len-2.
                done = env.my_timer > (max_ep_len-2)

                with torch.no_grad():
                    s_new_tensor = torch.from_numpy(s_prime).to(device=device, dtype=torch.float)
                    s_new_tensor = s_new_tensor.unsqueeze(0)
                    _, _, value_next_state = main_net(s_new_tensor)
                    value_next_state.squeeze_(0)

                sample_data = {'state': np.array(s, copy=True),
                               'next state': np.array(s_prime, copy=True),
                               'action': chosen_act_np,
                               'reward': r,
                               'done' : done,
                               'time' : env.my_timer,
                               'network action mean': mean_act.cpu().numpy(),
                               'network action logstd': logstd_act.cpu().numpy(),
                               'network state value': value_state.cpu().numpy(),
                               'network next state value': value_next_state.cpu().numpy()}

                dict_append(env.exp_history[-1],sample_data)
                env.last_state = s_prime
                env.my_timer += 1

                if done:
                    env.need_reset=True
                    break

        data = []
        for i_thread in range(N):
            env = env_list[i_thread]
            #Computing delta and advantage values
            r_t = np.array(env.exp_history[-1]['reward'])
            v_st = np.array(env.exp_history[-1]['network state value'])
            v_st_plus_one = np.array(env.exp_history[-1]['network next state value'])

            with torch.no_grad():
                r_t_tensor = torch.from_numpy(r_t).to(device=device, dtype=torch.float)
                v_st1_tensor = torch.from_numpy(v_st_plus_one).to(device=device, dtype=torch.float).reshape(-1)
                v_st_tensor = torch.from_numpy(v_st).to(device=device, dtype=torch.float).reshape(-1)
                # One-step TD residuals
                delta = r_t_tensor +  gamma * v_st1_tensor - v_st_tensor
                # mycumprodsum_rev is defined elsewhere; presumably a reversed cumulative sum
                # of delta discounted by lamda*gamma -- confirm against its definition.
                A_hat_t = mycumprodsum_rev(delta, lamda*gamma)

                mu_old = torch.from_numpy(np.array(env.exp_history[-1]['network action mean'])).\
                            to(device=device, dtype=torch.float)
                logstd_old = torch.from_numpy(np.array(env.exp_history[-1]['network action logstd'])).\
                                to(device=device, dtype=torch.float)
                a_t = torch.from_numpy(np.array(env.exp_history[-1]['action'])).to(device=device, dtype=torch.float)
                s_t = torch.from_numpy(np.array(env.exp_history[-1]['state'])).to(device=device, dtype=torch.float)
                s_t1 = torch.from_numpy(np.array(env.exp_history[-1]['next state'])).to(device=device, dtype=torch.float)

                # Gaussian log-density without its additive constant; the constant cancels
                # in the probability ratio formed during the update.
                log_pi_old =  (torch.pow(a_t - mu_old,2))/((-2) * torch.exp(2 * logstd_old)) - logstd_old 
                log_pi_old = log_pi_old.sum(dim=1)  

            data.append([A_hat_t, log_pi_old, a_t, s_t, s_t1, r_t_tensor, logstd_old])

        # Pool the samples of all simulators into single tensors
        with torch.no_grad():
            A_hat_t = torch.cat([d[0] for d in data], dim=0)
            log_pi_old = torch.cat([d[1] for d in data], dim=0)
            a_t = torch.cat([d[2] for d in data], dim=0)
            s_t = torch.cat([d[3] for d in data], dim=0)
            s_t1 = torch.cat([d[4] for d in data], dim=0)
            r_t = torch.cat([d[5] for d in data], dim=0)
            log_std_old_t = torch.cat([d[6] for d in data], dim=0)

        loss_hist.append({})
        other_hist.append({'Action std': np.exp(log_std_old_t.cpu().numpy()),
                           'Reward': r_t.cpu().numpy().mean()})
        for k in range(K):
            # Fresh shuffle for every pass; the indices must live on the same device as the
            # tensors they select from.
            ep_perm = torch.randperm(s_t.shape[0]).to(device)

            it_per_ep = np.ceil(s_t.shape[0]/batch_size)
            for batch_iter in range(int(it_per_ep)):
                main_nnet.zero_grad()
                if seperate_val_net:
                    val_nnet.zero_grad()

                with torch.no_grad():
                    curr_idx = ep_perm[batch_iter * batch_size: (batch_iter+1) * batch_size]
                    curr_s_t = torch.index_select(s_t , dim = 0 , index = curr_idx)
                    curr_s_t1 = torch.index_select(s_t1 , dim = 0 , index = curr_idx)
                    curr_a_t = torch.index_select(a_t , dim = 0 , index = curr_idx)
                    curr_r_t = torch.index_select(r_t , dim = 0 , index = curr_idx)
                    curr_logpi_old = torch.index_select(log_pi_old , dim = 0 , index = curr_idx)
                    curr_A_hat_t = torch.index_select(A_hat_t , dim = 0 , index = curr_idx)

                curr_mu_new, curr_logstd_new, curr_v_st = main_net(curr_s_t)

                log_pi_new = torch.pow(curr_a_t - curr_mu_new, 2) / (-2 * torch.exp(2 * curr_logstd_new)) - curr_logstd_new 
                log_pi_new = log_pi_new.sum(dim=1)  

                log_pi_diff = log_pi_new - curr_logpi_old
                pi_ratio = torch.exp(log_pi_diff)

                # Clipped surrogate objective (to be maximised)
                loss_clip_vec = torch.min(pi_ratio * curr_A_hat_t, 
                                          torch.clamp(pi_ratio, min=1-epsilon, max=1+epsilon) * curr_A_hat_t)

                loss_clip = loss_clip_vec.mean()

                # The bootstrap target is a constant for the optimizer, so its forward pass
                # is run without recording an autograd graph.
                with torch.no_grad():
                    _, _, curr_v_st1 = main_net(curr_s_t1)
                    target_v = curr_v_st1 * gamma + curr_r_t
                loss_VF = mse_loss(curr_v_st, target_v)

                if c_2:
                    # Entropy surrogate: log-std plus a constant, summed over the minibatch
                    loss_Entropy = torch.sum(curr_logstd_new + torch.log(torch.tensor(2*np.pi)))
                    total_loss = c_1 * loss_VF - loss_clip - c_2 * loss_Entropy
                else:
                    total_loss = c_1 * loss_VF - loss_clip

                total_loss.backward()
                main_optimizer.step()

                dict_append(loss_hist[-1], {'Total loss': total_loss.detach().cpu().numpy(),
                                            'Clip loss': loss_clip.detach().cpu().numpy(),
                                            'Value Function loss' : loss_VF.detach().cpu().numpy()})
                if c_2:
                    dict_append(loss_hist[-1], {'Entropy loss' : loss_Entropy.detach().cpu().numpy()})

        # Every 100 loops: evaluate the policy and keep the rendered figure for the video
        if i_loop%100 == 0:
            test_env(main_net, reward_shaping = reward_shaping)
            learning_vidmaker.add_figure(test_env.env.figure)
        
        # Every 10 loops: print the mean of each recorded statistic of the latest loop
        if i_loop%10 == 0:
            print('Loop ' +str(i_loop) + '--> ' + str([key + ': ' +str(np.mean(val))
                                                       + ', ' for key,val in loss_hist[-1].items()] + 
                                                     [key + ': ' +str(np.mean(val))
                                                       + ', ' for key,val in other_hist[-1].items()])) 

    #Storing the results
    # Use the first numbered sub-folder of Experiments/Pendulum/ that does not exist yet.
    ctr = 0
    while os.path.exists('Experiments/Pendulum/'+str(ctr)):
        ctr += 1

    store_folder = 'Experiments/Pendulum/'+str(ctr)
    mkdir_p(store_folder)



    #Creating the loss figure
    # Left panel: per-loop mean of every recorded loss; right panel: per-loop mean of the
    # other recorded statistics.
    fig = plt.figure(figsize=(20,10))
    ax = plt.subplot2grid((1, 2), (0, 0))
    ax2 = plt.subplot2grid((1, 2), (0, 1))
    for hist, target_ax in ((loss_hist, ax), (other_hist, ax2)):
        for my_label in hist[0].keys():
            x = range(len(hist))
            y = [np.mean(hist[t][my_label]) for t in x]
            target_ax.plot(x, y, label=my_label)

    ax.legend()
    ax2.legend()
    ax.set_title('Mean Losses vs mini-episode')
    ax2.set_title('Statistics vs mini-episode')
    # Show the figure inline, save it to the result folder, then clear the cell output.
    show_inline_matplotlib_plots()
    fig.savefig(store_folder+'/LossPlot.png')
    clear_output()

    #Learning Samples Clip
    learning_clip = learning_vidmaker(fps=1)
    learning_clip.write_videofile(store_folder+'/LearningSamples.mp4')

    #The network
    store_material={'cuda':main_nnet.state_dict(), 'cpu':main_nnet.cpu().state_dict()}
    torch.save(store_material, store_folder+'/model.pth')

    #The animation
    test_env.env.plt_render(animation=True, act_per_frame = 1)
    show_inline_matplotlib_plots()
    clear_output()
    matplotlib.rcParams['animation.embed_limit'] = 80
    %time test_env.env.ani.save(store_folder+'/Trajectory.gif', writer=animation_plt.PillowWriter(fps=10))
    gifclip = mpy.VideoFileClip(store_folder+'/Trajectory.gif')
    gifclip.write_videofile(store_folder+'/Trajectory.mp4')

    # Write the main hyperparameters next to the results. The context manager closes the
    # file, so the text is flushed to disk even if a later statement fails.
    with open(store_folder+'/Hyperparameters.txt', "w") as hyperparams_file:
        print('N, T, K, gamma, lamda, epsilon, num_loops, batch_size, c_1, c_2 \n' + 
              str((N, T, K, gamma, lamda, epsilon, num_loops, batch_size, c_1, c_2)), 
              file=hyperparams_file)


In [None]:

# Show the source text of the currently defined neural_net_maker lambda
print(inspect.getsource(neural_net_maker))

In [None]:
# Debug check: number of values recorded under my_label in the second training loop.
# NOTE(review): my_label is left over from the plotting loops of the training cell, so this
# cell depends on that cell having run first.
print(my_label, len([uu.shape for uu in loss_hist[1][my_label]]))

In [None]:
# Manual evaluation of the trained policy: run a fresh test environment for 200 steps,
# enlarge the rendered figure, display it and print the returned values as a flat array.
# NOTE(review): what the tester call returns is not visible here -- confirm against tester_env_class.
test_env = tester_env_class()
a = test_env(main_net, steps=200)
#learning_vidmaker.add_figure(test_env.env.figure)
test_env.env.figure.set_size_inches(16, 8, forward=True)
display.display(test_env.env.figure)
print(np.array(a).reshape(-1))
#learning_clip = learning_vidmaker(fps=1)
#learning_clip.write_videofile(store_folder+'/LearningSamples.mp4')

In [None]:
# Inspect the most recent rollout recorded by the test environment
test_env.env.exp_history[-1]

In [None]:

# Run the tester again with its default settings and show the resulting figure
test_env(main_net)
test_env.env.figure



In [None]:


# Static (non-animated) rendering of the last recorded experiment.
# NOTE(review): other cells reach plt_render through test_env.env (test_env.env.plt_render);
# confirm that the tester object itself also provides plt_render.
test_env.plt_render(animation=False, exp_index = -1)
show_inline_matplotlib_plots()

#clear_output()
#HTML(env.ani.to_jshtml(fps = 5))

In [None]:

#HTML(env.ani.to_jshtml(fps = 5))

In [None]:
# Save the rendered animation as a GIF at 10 frames per second.
# NOTE(review): the training cell saves the animation through test_env.env.ani; confirm that
# the tester object itself also exposes ani.
test_env.ani.save(store_folder+'/Trajectory.gif', writer=animation_plt.PillowWriter(fps=10))

In [None]:
import torch
# Scratch check of torch.chunk: splitting the last axis (size 3) into 3 pieces gives
# pieces that keep a trailing axis of size 1.
a = torch.randn(2, 2, 2, 3)
b = a.chunk(3, dim=-1)
a[..., 1] - b[0].squeeze(-1)
#a[:,:,:,0] - b[2]

In [None]:
# Scratch check of sequence unpacking
x, y, z = 1, 2, 3

In [None]:
# Scratch check: a negative-step slice reverses a 1-D array
a = np.arange(1, 4)
a[::-1]

In [None]:

# Scratch check: apply mycumprodsum (defined elsewhere in the notebook) with factor 0.9 to
# the reversed list and reverse the result again.
mycumprodsum([1,2,3][::-1],0.9)[::-1]
    

In [None]:
# Scratch check: numel() gives the number of elements of a tensor.
# The original cell contained an empty subscript `a[]`, which is a SyntaxError and kept the
# whole cell from running; it produced no value, so it is dropped.
a = torch.tensor([1.,2,3])
a.numel()

In [None]:
import torch
# Scratch check: picking index 0 on the last axis removes that axis
a = torch.randn(2, 2, 2, 3)
a.select(-1, 0).shape

In [None]:
# Scratch check: shape of the log(2*pi) constant used in the entropy term of the training cell
torch.log(torch.tensor(2*np.pi)).shape

In [None]:
import tensorflow as tf
import baselines
import gym
from baselines import ppo2
from baselines.ppo2.ppo2 import learn
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

In [None]:
#tf.reset_default_graph()
# Reference run with the PPO2 implementation of the baselines package on the same
# environment: 16 Pendulum-v0 copies for training and 16 for evaluation, an 'mlp' policy
# with 3 hidden layers of 64 units, placed on the first GPU.
with tf.device('/gpu:0'):
    my_env = DummyVecEnv([lambda : gym.make('Pendulum-v0') for _ in range(16)])
    eval_env = DummyVecEnv([lambda : gym.make('Pendulum-v0') for _ in range(16)])
    mymodel = learn(network='mlp', env=my_env, total_timesteps=20000000,
                    eval_env = eval_env, seed=None, nsteps=200, ent_coef=0.0, lr=3e-4,
                    vf_coef=0.5,  max_grad_norm=0.5, gamma=0.99, lam=0.95,
                    log_interval=10, nminibatches=160, noptepochs=4, cliprange=0.2,
                    save_interval=0, load_path=None,
                    num_hidden=64, num_layers=3, )

In [None]:
# Concatenate a list of strings. The built-in sum() starts from the integer 0 and therefore
# raises TypeError on strings; str.join is the supported way to do this.
''.join(['a','b','c'])

In [None]:
import tensorflow as tf
import numpy as np


# Scratch check of feeding a value for an intermediate tensor in a TF1-style session.
# w1 and w2 are 2x2 identity matrices, so y = x @ w1 and z = y @ w2.
x = tf.constant([[37.0, -23.0], [1.0, 4.0]])
w1 = tf.constant([[1., 0.], [0.0, 1.0]])
y = tf.matmul(x, w1)
w2 = tf.constant([[1., 0.], [0.0, 1.0]])
z = tf.matmul(y, w2)

with tf.Session() as sess:

    # feed_dict supplies a value for the intermediate tensor y when z is evaluated;
    # presumably z is then computed from the fed value instead of from x -- confirm.
    print(sess.run(z, feed_dict={y:np.array([[1,2],[2.,3]])}))