In [3]:
import torch
import numpy as np 
import torch.nn as nn


In [4]:
torch.__version__

'1.7.0+cu101'

In [5]:
!nvidia-smi

Sun Nov 22 18:48:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
x = torch.randint(2,10,size=(2,3))
y = torch.randint(2,10,size=(2,3))

# let us run this cell only if CUDA is available
# We will use ``torch.device`` objects to move tensors in and out of GPU
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x, device=device)  # directly create a tensor on GPU
    x = x.to(device)                       # or just use strings ``.to("cuda")``
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!

tensor([[ 8,  9,  8],
        [ 7,  8, 10]], device='cuda:0')
tensor([[ 8.,  9.,  8.],
        [ 7.,  8., 10.]], dtype=torch.float64)


## From CPU to GPU and vice-versa

In [12]:
# create a tensor
x = torch.rand(3,2)
# copy to GPU
y = x.cuda()
# copy back to CPU
z = y.cpu()
# get CPU tensor as numpy array
# cannot get GPU tensor as numpy array directly
try:
    y.numpy()
    #z.numpy()
except TypeError as e:
    print(e)

can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.


## Cannot mix CPU and GPU operations

In [None]:
x = torch.rand(3,5)  # CPU tensor
y = torch.rand(5,4).cuda()  # GPU tensor
try:
    torch.mm(x,y)  # Operation between CPU and GPU fails
except TypeError as e:
    print(e)

## Memory Error

GPU memory is quite limited. You will frequently run into the following error:
• RuntimeError: CUDA out of memory. Tried to allocate 12.50 MiB (GPU 0; 10.92 GiB total
capacity; 8.57 MiB already allocated; 9.28 GiB free; 4.68 MiB cached)
• When this happens, either reduce the batch size or check if there are any dangling
unused tensors left on the GPU. You can delete tensors on the GPU and free memory
with:

In [None]:
del y
torch.cuda.empty_cache()

You’ll be running into Cuda errors like: RuntimeError: CUDA error: device-side assert triggered
-
-  This can mean many things. For example:
-  You did an operation between CPU and GPU tensors
-  You did GPU operations between tensors of unexpected shape
-  Likely the most common cause
-  Your types were wrong in some weird way
-  Long when it expects a Float or vice versa is most common

### Put tensor on CUDA if available

In [13]:

x = torch.rand(3,2)
if torch.cuda.is_available():
    x = x.cuda()
    print(x, x.dtype)
    
# Do some calculations
y = x ** 2 
print(y)

# Copy to CPU if on GPU
if y.is_cuda:
    y = y.cpu()
    print(y, y.dtype)

tensor([[0.3347, 0.5490],
        [0.5036, 0.1003],
        [0.8164, 0.8855]], device='cuda:0') torch.float32
tensor([[0.1121, 0.3014],
        [0.2536, 0.0101],
        [0.6665, 0.7841]], device='cuda:0')
tensor([[0.1121, 0.3014],
        [0.2536, 0.0101],
        [0.6665, 0.7841]]) torch.float32


### Convinient way to Create Tensor on corresponding Device


In [14]:
x1 = torch.rand(3,2)
x2 = x1.new(1,2)  # create cpu tensor
print(x2)
x1 = torch.rand(3,2).cuda()
x2 = x1.new(1,2)  # create cuda tensor
print(x2)

tensor([[-3.7353,  0.0000]])
tensor([[0.1121, 0.3014]], device='cuda:0')


Calculations executed on the GPU can be many times faster than numpy. However, numpy is still optimized for the CPU and many times faster than python for loops. Numpy calculations may be faster than GPU calculations for small arrays due to the cost of interfacing with the GPU.

# Differentiation 

In [18]:
from timeit import timeit
# Create random data
x = torch.rand(1000,64)
y = torch.rand(64,32)
number = 10000  # number of iterations

def square():
    z=torch.mm(x, y) # dot product (mm=matrix multiplication)

# Time CPU
print('CPU: {}ms'.format(timeit(square, number=number)*1000))
# Time GPU
x, y = x.cuda(), y.cuda()
print('GPU: {}ms'.format(timeit(square, number=number)*1000))

CPU: 983.0039690000376ms
GPU: 115.76144300011038ms


Link : https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
1. If you set its attribute *.requires_grad* as True, it starts to track all operations on it. 
2. When you finish your computation you can call *.backward()* and have all the gradients computed automatically
3. The gradient for this tensor will be accumulated into *.grad* attribute
4. To stop a tensor from tracking history, you can call *.detach()* to detach it from the computation history, and to prevent future computation from being tracked.
5. To prevent tracking history (and using memory), you can also wrap the code block in *with torch.no_grad():*. This can be particularly helpful when evaluating a model because the model may have trainable parameters with requires_grad=True, but for which we don’t need the gradients.
5. Each tensor has a *.grad_fn* attribute that references a Function that has created the Tensor
6. If you want to compute the derivatives, you can call *.backward()* on a Tensor. If Tensor is a scalar (i.e. it holds a one element data), you don’t need to specify any arguments to *backward()*, however if it has more elements, you need to specify a gradient argument that is a tensor of matching shape.




In [21]:
x = torch.ones(2, 2, requires_grad=True)
print(x)
y = x + 2
print(y)
z = y * y * 3
out = z.mean()

print(z, out)


tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)


## Print gradients d(out)/dx

In [22]:
out.backward()
print(x.grad)

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


You should have got a matrix of $4.5 .$ Let's call the out Tensor " ${ }_{0} "$. We have that $o=\frac{1}{4} \Sigma_{i} z_{i}, z_{i}=3\left(x_{i}+2\right)^{2}$ and $\left.z_{i}\right|_{x_{i}=1}=27 .$ Therefore, $\frac{\partial o}{\partial x_{i}}=\frac{3}{2}\left(x_{i}+2\right),$ hence $\left.\frac{\partial o}{\partial x_{i}}\right|_{x_{i}=1}=\frac{9}{2}=4.5 .$
Mathematically, if you have a vector valued function $\vec{y}=f(\vec{x})$, then the gradient of $\vec{y}$ with respect to $\vec{x}$ is a Jacobian matrix:
$$
J=\left(\begin{array}{ccc}
\frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{1}}{\partial x_{n}} \\
\vdots & \ddots & \vdots \\
\frac{\partial y_{m}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}}
\end{array}\right)
$$
Generally speaking, torch.autograd is an engine for computing vector-Jacobian product. That is, given any vector
$v=\left(v_{1} \quad v_{2} \quad \cdots v_{m}\right)^{T},$ compute the product ${ }_{v}{ }^{T} \cdot J .$ If ${ }_{v}$ happens to be the gradient of a scalar function $l=g(\vec{y}),$ that is, $v=\left(\frac{\partial l}{\partial y_{1}} \ldots \frac{\partial l}{\partial y_{m}}\right)^{T},$ then by the chain rule, the vector-Jacobian product would be the gradient of ${ }_{l}$ with respect to $\vec{x}$ :
$$
J^{T} \cdot v=\left(\begin{array}{ccc}
\frac{\partial y_{1}}{\partial x_{1}} & \cdots & \frac{\partial y_{m}}{\partial x_{1}} \\
\vdots & \ddots & \vdots \\
\frac{\partial y_{1}}{\partial x_{n}} & \cdots & \frac{\partial y_{m}}{\partial x_{n}}
\end{array}\right)\left(\begin{array}{c}
\frac{\partial l}{\partial y_{1}} \\
\vdots \\
\frac{\partial l}{\partial y_{m}}
\end{array}\right)=\left(\begin{array}{c}
\frac{\partial l}{\partial x_{1}} \\
\vdots \\
\frac{\partial l}{\partial x_{n}}
\end{array}\right)
$$
(Note that $v^{T} \cdot J$ gives a row vector which can be treated as a column vector by taking $\left.{ J}^{T} \cdot v .\right)$
This characteristic of vector-Jacobian product makes it very convenient to feed external gradients into a model that has non-scalar output.

---

## Here the function is not scalar. I.e in above example,
## f(x) = scalar , here f(x) = vector


In [36]:
x = torch.randn(3, requires_grad=True)

y = x * 2
while y.data.norm() < 1000:
    y = y * 2

print(y)

tensor([ 818.5206, 1263.0530, -894.5714], grad_fn=<MulBackward0>)


In [37]:
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float)
y.backward(v)

print(x.grad)

tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


## with torch.no_grad

In [38]:
print(x.requires_grad)
print((x ** 2).requires_grad)

with torch.no_grad():
    print((x ** 2).requires_grad)

True
True
False


### To Differentiate a vector Add it first

In [29]:
# Create differentiable tensor
x = torch.tensor(torch.arange(0,4,dtype = torch.float), requires_grad=True)
print(x.dtype)
# Calculate y=sum(x**2)
y = x**2
# Calculate gradient (dy/dx=2x)
y.sum().backward()
# Print values
print(x)
print(y)
print(x.grad)

torch.float32
tensor([0., 1., 2., 3.], requires_grad=True)
tensor([0., 1., 4., 9.], grad_fn=<PowBackward0>)
tensor([0., 2., 4., 6.])


  


### Gradient gets accumulated: So,Zero it out first


In [31]:
# Create a variable
x=torch.tensor(torch.arange(0,4,dtype=float), requires_grad=True)
print(x)
# Differentiate
torch.sum(x**2).backward()
print(x.grad)
# Differentiate again (accumulates gradient)
torch.sum(x**2).backward()
print(x.grad)
# Zero gradient before differentiating
x.grad.data.zero_()
torch.sum(x**2).backward()
print(x.grad)

tensor([0., 1., 2., 3.], dtype=torch.float64, requires_grad=True)
tensor([0., 2., 4., 6.], dtype=torch.float64)
tensor([ 0.,  4.,  8., 12.], dtype=torch.float64)
tensor([0., 2., 4., 6.], dtype=torch.float64)


  


## *Tensor with gradfn cannot be converted to numpy*



In [32]:
x=torch.tensor(torch.arange(0,4), requires_grad=True)
x.numpy() # raises an exception

  """Entry point for launching an IPython kernel.


RuntimeError: ignored

The reason is that pytorch remembers the graph of all computations to perform differenciation. To be integrated to this graph the raw data is wrapped internally to the Tensor class (like what was formerly a Variable). You can detach the tensor from the graph using the .detach() method, which returns a tensor with the same data but requires_grad set to False

In [34]:
x=torch.tensor(torch.arange(0,4,dtype=float), requires_grad=True)
y=x**2
z=y**2
z.detach().numpy()

  """Entry point for launching an IPython kernel.


array([ 0.,  1., 16., 81.])

Another reason to use this method is that updating the graph can use a lot of memory. If you are in a context where you have a differentiable tensor that you don't need to differentiate, think of detaching it from the graph.