Run this notebook to approximate $\sin(x)$ with $y = {a} + {b} x + {c} x^2 + {d} x^3$.

In [158]:
import numpy as np
import time
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import matplotlib.patches as patches
font = font_manager.FontProperties(style='normal', size=20)
plt.rc('text', usetex=True)
plt.rc('text.latex', preamble=r'\usepackage{amsmath}')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.set_default_dtype(torch.float64)
import random

import json

from IPython.display import display, Math, Markdown

import datetime
import os
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
timestamp
version = '_0.1.0'
import math
pi = math.pi

# Shortest exit time

Given a bounded domain $D\subset\mathbb{R}^d$, find 

$\inf_{u}\{t\ge0~:~x_t\not\in D\}$

where ${dx}_t=u_t{{dt}}$ with control $|u_t|\le1$ and $u_t\in\mathbb{R}^d$ and initial position $x_0=x\in D$.


For  $D=[a,b]\subset\mathbb{R}$, the solution is obvious; go full speed to the nearest exit point $a$ or $b$. In higher dimension, the solution is not as simple. 

Here, we use a specific cost function to approximately find the solution of this problem.

# Approximation of velocity with neural networks

## Defining a neural network in torch

In [159]:
num_neurons = 40
velocity= torch.nn.Sequential(
            torch.nn.Linear(1, num_neurons),
            torch.nn.Tanh(),
            torch.nn.Linear(num_neurons, num_neurons),
            torch.nn.Tanh(),
            torch.nn.Linear(num_neurons,1),
        )

## Generate samples for the initial point $x_0$

This helps up solve the problem for the whole interval $[a,b]$ once for all.


In [160]:
M = 10 # number of samples for x0
a = 0 #interval start
b = 1 #interval end
x0 = a + torch.rand([M,1])*(b-a)
T = 1.2 # time horizon (artificial)
N = 10 # number of time steps
delta = T/N # time step

## Dynamic of the state process

In [161]:
def state(x0,model):
    t = torch.linspace(0, T, N+1)
    x = torch.zeros((M, N+1, 1))
    x[:,0,:] = x0
    tmp = x[:,0,:].clone().detach()
    for n in range (1, N+1):
        x[:,n,:] = tmp + torch.minimum(torch.maximum(model(tmp),torch.tensor([-1.]*M).reshape(M,1)),torch.tensor([1.]*M).reshape(M,1)) *delta
        tmp = x[:,0,:].clone().detach()
    return x

## Loss function

Loss function for the problem is the combination of running cost and the terminal function. For this problem:

$C(x,u)=1\!\!1_{\{x\in[a,b]\}}$ and $g(x)=1\!\!1_{\{x\in[a,b]\}}$

In [162]:
def loss(x):
    loss_ = (x[:,N,:]>a).float()*(x[:,N,:]<b).float() + torch.nn.ReLU()(x[:,N,:]) - torch.nn.ReLU()(x[:,N,:])#to make auto-differentiation work. require gradient
    # print(loss_.requires_grad)
    for n in range(0,N):
        loss_ = loss_ + (x[:,n,:]>a).float()*(x[:,n,:]<b).float()*delta #+ torch.nn.ReLU()(x[:,n,:]) - torch.nn.ReLU()(x[:,n,:]) 
        # print(loss_.requires_grad)
    return torch.mean(loss_)
    

## Training

In [163]:
# plt.plot(torch.linspace(0, T, N+1),x[2,:,0].clone().detach().numpy())
# plt.plot(torch.linspace(0, T, N+1),x[0,:,0].clone().detach().numpy(), marker='o')

In [164]:
torch.autograd.set_detect_anomaly(True)
num_epochs = 1000
learning_rate = 1e-1
# optimizer = optim.Adam(velocity.parameters(), learning_rate)
for e in range(num_epochs):
    x = state(x0,velocity)
    # x[:,0,:] = x0
    # xo = x[:,0,:]
    # for n in range (1, N+1):
    #     xo = xo.clone().detach() + velocity(xo.clone().detach())*delta # (+= does not work)
    #     x[:,n,:] = xo
    # # loss = torch.square(y_pred - y).sum().clone().detach().requires_grad_(True)
    cost = loss(x)
    # loss_ = (x[:,N,:]>a).float()*(x[:,N,:]<b).float() + torch.nn.ReLU()(x[:,N,:]) - torch.nn.ReLU()(x[:,N,:])#to make auto-differentiation work. require gradient
    # for n in range(0,N):
    #     loss_ = loss_ + (x[:,n,:]>a).float()*(x[:,n,:]<b).float()*delta + torch.nn.ReLU()(x[:,n,:]) - torch.nn.ReLU()(x[:,n,:]) 
    # cost = torch.mean(loss_)
    # optimizer.zero_grad() # Zero the gradients before running the backward pass.
    velocity.zero_grad() # Zero the gradients before running the backward pass.    
    cost.backward() # Backward pass: compute gradient of the loss with respect to all the learnable parameters
    if e % 100 == 0:
        print(e+1, cost)
    with torch.no_grad(): # Update the weights using gradient descent. Each parameter is a Tensor, so
        for param in velocity.parameters(): # We can access its gradients like we did before.
            param -= learning_rate * param.grad # Update the weights using gradient descent
    # optimizer.step() # Update the weights using gradient descent. Each parameter is a Tensor, so

1 tensor(2.2000, grad_fn=<MeanBackward0>)
101 tensor(2.2000, grad_fn=<MeanBackward0>)
201 tensor(2.2000, grad_fn=<MeanBackward0>)
301 tensor(2.2000, grad_fn=<MeanBackward0>)
401 tensor(2.2000, grad_fn=<MeanBackward0>)
501 tensor(2.2000, grad_fn=<MeanBackward0>)
601 tensor(2.2000, grad_fn=<MeanBackward0>)
701 tensor(2.2000, grad_fn=<MeanBackward0>)
801 tensor(2.2000, grad_fn=<MeanBackward0>)
901 tensor(2.2000, grad_fn=<MeanBackward0>)


The loss function does not change and the model is not properly trained. Why?

In [165]:
def new_loss(x):
    loss_ = torch.nn.ReLU()(x[:,N,:]-a) - 2*torch.nn.ReLU()(x[:,N,:]-(a+b)/2) + torch.nn.ReLU()(x[:,N,:]-b)#to make auto-differentiation work. require gradient
    # print(loss_.requires_grad)
    for n in range(0,N):
        loss_ = loss_ + (torch.nn.ReLU()(x[:,n,:]-a) - 2*torch.nn.ReLU()(x[:,n,:]-(a+b)/2) + torch.nn.ReLU()(x[:,n,:]-b))*delta #+ torch.nn.ReLU()(x[:,n,:]) - torch.nn.ReLU()(x[:,n,:]) 
        # print(loss_.requires_grad)
    return torch.mean(loss_)

In [170]:
torch.autograd.set_detect_anomaly(True)
num_epochs = 1000
learning_rate = 1e-1
# optimizer = optim.Adam(velocity.parameters(), learning_rate)
for e in range(num_epochs):
    x = state(x0,velocity)
    # x[:,0,:] = x0
    # xo = x[:,0,:]
    # for n in range (1, N+1):
    #     xo = xo.clone().detach() + velocity(xo.clone().detach())*delta # (+= does not work)
    #     x[:,n,:] = xo
    # # loss = torch.square(y_pred - y).sum().clone().detach().requires_grad_(True)
    cost = new_loss(x)
    # loss_ = (x[:,N,:]>a).float()*(x[:,N,:]<b).float() + torch.nn.ReLU()(x[:,N,:]) - torch.nn.ReLU()(x[:,N,:])#to make auto-differentiation work. require gradient
    # for n in range(0,N):
    #     loss_ = loss_ + (x[:,n,:]>a).float()*(x[:,n,:]<b).float()*delta + torch.nn.ReLU()(x[:,n,:]) - torch.nn.ReLU()(x[:,n,:]) 
    # cost = torch.mean(loss_)
    # optimizer.zero_grad() # Zero the gradients before running the backward pass.
    velocity.zero_grad() # Zero the gradients before running the backward pass.    
    cost.backward() # Backward pass: compute gradient of the loss with respect to all the learnable parameters
    if e % 100 == 0:
        print(e+1, cost)
    with torch.no_grad(): # Update the weights using gradient descent. Each parameter is a Tensor, so
        for param in velocity.parameters(): # We can access its gradients like we did before.
            param -= learning_rate * param.grad # Update the weights using gradient descent
    # optimizer.step() # Update the weights using gradient descent. Each parameter is a Tensor, so

1 tensor(0.4376, grad_fn=<MeanBackward0>)
101 tensor(0.4376, grad_fn=<MeanBackward0>)
201 tensor(0.4376, grad_fn=<MeanBackward0>)
301 tensor(0.4376, grad_fn=<MeanBackward0>)
401 tensor(0.4376, grad_fn=<MeanBackward0>)
501 tensor(0.4376, grad_fn=<MeanBackward0>)
601 tensor(0.4376, grad_fn=<MeanBackward0>)
701 tensor(0.4376, grad_fn=<MeanBackward0>)
801 tensor(0.4376, grad_fn=<MeanBackward0>)
901 tensor(0.4376, grad_fn=<MeanBackward0>)


In [171]:
velocity(torch.tensor([0.75]))

tensor([2.1534], grad_fn=<ViewBackward0>)

In [172]:
x1 = torch.tensor([0.2]*11).unsqueeze(-1).unsqueeze(0)
print(x1.shape)
new_loss(x1),torch.minimum(torch.maximum(velocity(torch.tensor([0.2])),torch.tensor([-1.])),torch.tensor([1.]))

torch.Size([1, 11, 1])


(tensor(0.4400), tensor([-0.3134], grad_fn=<MinimumBackward0>))