# Solving optimal control problems with policy gradient method 

In [11]:
import numpy as np
import time
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import matplotlib.patches as patches
font = font_manager.FontProperties(style='normal', size=20)
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{amsmath}')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.set_default_dtype(torch.float32) # improved the speed when the parameters are float32
import random

import json

from IPython.display import display, Math, Markdown

import datetime
import os
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
timestamp
version = '_0.1.0'
import math
PI = math.pi

## Defining control by a neural network

$$\phi(t,x;\theta)$$



In [34]:
num_neurons = 20 # Modify the number of neurons

model = torch.nn.Sequential(
            torch.nn.Linear(1, num_neurons),
            torch.nn.ReLU(),
            torch.nn.Linear(num_neurons,1),
        )

## Predefine global variables

In [35]:
N = 10 # number of time steps
T = 1.0 # terminal time
dt = T/N # time step
num_samples = 1_000 # Modify the number of samples
x0 = 2*torch.rand([num_samples,1]) -1 # initial condition uniformly random in [-1,1]
dW = torch.sqrt(torch.tensor(dt))*torch.randn([num_samples,N,1]) # Brownian increments

## State variable

$$dX_t = (x_t-u_t)dt + dW_t$$

$$u_t=\phi(t,x_t,\theta)$$

In [36]:
# update function
def update(t,x,u):
    return x + (x -  u)*dt + dW[:,t,:]

## Loss function 

$$C(x,u)=x^2+x+u^2$$

$$g(x)=x^2-x$$

In [37]:
def step_loss(t,x,model):
    u = model(x)
    return torch.mean(x**2+x+u**2)*dt

def total_loss(model):
    x = x0
    running_cost = 0.0
    for t in range(N):
        u = model(x)
        running_cost = running_cost + step_loss(t,x,model)
        x = update(t,x,u)
    terminal_cost = torch.mean(x**2 - x)
    total_loss = running_cost + terminal_cost
    return total_loss

## Generate the number of samples

In [38]:
num_epochs = 30
learning_rate = 1e-1
optimizer = optim.Adam(model.parameters(), learning_rate)
# optimizer2 = optim.SGD(model.parameters(), learning_rate,momentum=0.9)
for t in range(num_epochs):
    loss = total_loss(model)#.clone().detach().requires_grad_(True)
    optimizer.zero_grad() # Zero the gradients before running the backward pass.
    loss.backward() # Backward pass: compute gradient of the loss with respect to all the learnable parameters
    if t % 2 == 0:
        print(t, loss)
    optimizer.step() # Update the weights and biases using gradient descent. Each parameter is a Tensor. Equivalent to the above three lines, but more efficient.

0 tensor(6.7341, grad_fn=<AddBackward0>)
2 tensor(2.6102, grad_fn=<AddBackward0>)
4 tensor(2.8496, grad_fn=<AddBackward0>)
6 tensor(3.1038, grad_fn=<AddBackward0>)
8 tensor(3.0079, grad_fn=<AddBackward0>)
10 tensor(2.7307, grad_fn=<AddBackward0>)
12 tensor(2.5654, grad_fn=<AddBackward0>)
14 tensor(2.5935, grad_fn=<AddBackward0>)
16 tensor(2.7156, grad_fn=<AddBackward0>)
18 tensor(2.6161, grad_fn=<AddBackward0>)
20 tensor(2.5395, grad_fn=<AddBackward0>)
22 tensor(2.5640, grad_fn=<AddBackward0>)
24 tensor(2.5992, grad_fn=<AddBackward0>)
26 tensor(2.5900, grad_fn=<AddBackward0>)
28 tensor(2.5492, grad_fn=<AddBackward0>)
