<a href="https://colab.research.google.com/github/anubhavgupta1/DeepLearning/blob/master/10_PyTorch_Forward_Pass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Outline
* PyTorch
* What are tensors
* Initialising, slicing, reshaping tensors
* Numpy and PyTorch interfacing
* GPU support for PyTorch + Enabling GPUs on Google Colab
* Speed comparisons, Numpy -- PyTorch -- PyTorch on GPU
* Autodiff concepts and application
* Writing a basic learning loop using autograd

In [0]:
"""
Torch is an open-source machine learning library, a scientific computing framework, and a script language based on the Lua programming language.
"""

'\nTorch is an open-source machine learning library, a scientific computing framework, and a script language based on the Lua programming language.\n'

In [0]:
"""
PyTorch is an open source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing.
It is primarily developed by Facebook's AI Research lab (FAIR). It is free and open-source software released under the Modified BSD license.
Pytorch provides two high level features :-
1. Tensor computation (like Numpy) with strong GPU Acceleration. There are native libraries that are written on CUDA which enable it to accelerate it in Nvidia cards.
2. Deep Neural networks built on tape based autodiff system. Also called as AutoGrad with automatic gradient or automatic differentiation which enables us to write 
   relations between tensors functionally 
"""

"\nPyTorch is an open source machine learning library based on the Torch library, used for applications such as computer vision and natural language processing.\nIt is primarily developed by Facebook's AI Research lab (FAIR). It is free and open-source software released under the Modified BSD license.\nPytorch provides two high level features :-\n1. Tensor computation (like Numpy) with strong GPU Acceleration. There are native libraries that are written on CUDA which enable it to accelerate it in Nvidia cards.\n2. Deep Neural networks built on tape based autodiff system. Also called as AutoGrad with automatic gradient or automatic differentiation which enables us to write \n   relations between tensors functionally \n"

In [0]:
import torch
import numpy as np
import matplotlib.pyplot as plt

## Initialise tensors

In [0]:
# All three factory functions below build a float tensor of the same 3 x 2 shape.
shape = (3, 2)

x = torch.ones(shape)   # every entry is 1
print(x)

x = torch.zeros(shape)  # every entry is 0
print(x)

x = torch.rand(shape)   # entries drawn uniformly from [0, 1)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[0.7236, 0.3409],
        [0.6000, 0.2434],
        [0.3376, 0.6391]])


In [0]:
x = torch.empty(3, 2) # allocates a 3 x 2 tensor WITHOUT initialising it: the values are whatever was in memory
print(x)
y = torch.zeros_like(x) # matrix of zeros with the same size (and dtype) as x
print(y)

tensor([[-1.4973e-04,  0.0000e+00],
        [ 4.4842e-44,  0.0000e+00],
        [        nan,  0.0000e+00]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [0]:
x = torch.linspace(0, 1, steps=5) # 1-D tensor of 5 evenly spaced values from 0 to 1 (both ends included)
print(x)

tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])


In [0]:
# Build a tensor from a nested Python list; integer literals give an integer tensor.
x = torch.tensor([[1, 2], 
                 [3, 4], 
                 [5, 6]]) # 3 x 2 matrix
print(x)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


## Slicing tensors

In [0]:
print(x.size()) # shape of the tensor: (rows, columns)
print(x[:, 1]) # every row, column 1 only
print(x[0, :]) # row 0 only, every column

torch.Size([3, 2])
tensor([2, 4, 6])
tensor([1, 2])


In [0]:
# Indexing with one index per dimension selects a single element.
# x must still be the 3 x 2 tensor defined above; a 1-D x would raise IndexError here.
y = x[1, 1] # 0-dimensional tensor holding one value
print(y)
print(y.item()) # .item() converts a one-element tensor to a plain Python number

tensor(4)
4

## Reshaping tensors

In [0]:
print(x)
y = x.view(2, 3) # same 6 elements read in row-major order, reshaped to a 2 x 3 matrix
print(y)

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [0]:
y = x.view(6,-1) # -1 asks PyTorch to infer that dimension; the result is a 6 x 1 column, not a 1-D vector
print(y)
print(y.size())

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])
torch.Size([6, 1])


## Simple Tensor Operations

In [0]:
# Two 3 x 2 tensors of ones; +, - and * all act element-wise on tensors of equal shape.
x = torch.ones(3, 2)
y = torch.ones(3, 2)

# Print the sum, the difference and the element-wise product in turn
# (z ends up bound to the product, as the last value printed).
for z in (x + y, x - y, x * y):
    print(z)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [0]:
z = y.add(x) # out-of-place add: returns a new tensor and does not change y
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [0]:
# The trailing underscore marks an in-place operation.
# NOTE: this cell is not idempotent - every re-run adds x to y again.
z = y.add_(x) # in-place add: y itself is modified (and returned)
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])


## Numpy <> PyTorch

In [0]:
x_np = x.numpy() # tensor -> numpy conversion (for a CPU tensor the array shares memory with x)
print(type(x), type(x_np))
print(x_np)

<class 'torch.Tensor'> <class 'numpy.ndarray'>
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [0]:
a = np.random.randn(5) # 1-D numpy array of 5 standard-normal samples
print(a)
a_pt = torch.from_numpy(a) # numpy array -> torch tensor; keeps numpy's float64 dtype
print(type(a), type(a_pt))
print(a_pt)

[-1.28582369 -0.19582657  0.07433333 -0.8360086  -0.69525436]
<class 'numpy.ndarray'> <class 'torch.Tensor'>
tensor([-1.2858, -0.1958,  0.0743, -0.8360, -0.6953], dtype=torch.float64)


In [0]:
# from_numpy shares memory with the source array, so an in-place change to a is also visible in a_pt.
np.add(a, 1, out=a) # element-wise: adds 1 to every entry of a, writing the result back into a
print(a)
print(a_pt) 

[-0.28582369  0.80417343  1.07433333  0.1639914   0.30474564]
tensor([-0.2858,  0.8042,  1.0743,  0.1640,  0.3047], dtype=torch.float64)


In [0]:
%%time
# Benchmark: 100 products of 100 x 100 matrices with NumPy.
# The timed region also includes generating the random inputs (float64 in NumPy).
# Note that a and b are rebound here, replacing the arrays from earlier cells.
for i in range(100):
  a = np.random.randn(100,100)
  b = np.random.randn(100,100)
  c = np.matmul(a, b)

CPU times: user 143 ms, sys: 98.9 ms, total: 242 ms
Wall time: 153 ms


In [0]:
%%time
# Benchmark: 100 products of 100 x 100 matrices with PyTorch on the CPU.
# The timed region also includes generating the random inputs; torch.randn uses the
# default dtype (float32 unless changed), whereas the NumPy version works in float64.
for i in range(100):
  a = torch.randn([100, 100])
  b = torch.randn([100, 100])
  c = torch.matmul(a, b)

CPU times: user 43.6 ms, sys: 29 ms, total: 72.6 ms
Wall time: 41.1 ms


In [0]:
%%time
# Benchmark: element-wise sum of two 10000 x 10000 arrays with NumPy, repeated 10 times.
# Generating the two random float64 inputs happens inside the timed loop as well.
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  c = a + b

CPU times: user 1min 24s, sys: 204 ms, total: 1min 25s
Wall time: 1min 25s


In [0]:
%%time
# Benchmark: element-wise sum of two 10000 x 10000 tensors with PyTorch on the CPU, repeated 10 times.
# Generating the two random inputs (default dtype, float32 unless changed) is inside the timed loop as well.
for i in range(10):
  a = torch.randn([10000, 10000])
  b = torch.randn([10000, 10000])
  c = a + b

CPU times: user 14.4 s, sys: 9.09 ms, total: 14.4 s
Wall time: 14.4 s


## CUDA support

In [0]:
# CUDA is NVIDIA's parallel computing platform and programming model for running general-purpose code directly on GPUs.

In [0]:
print(torch.cuda.device_count()) # number of CUDA-capable GPUs visible to PyTorch (0 when none is available)

1


In [0]:
print(torch.cuda.device(0)) # context-manager object for selecting device 0; printing it only shows its repr
print(torch.cuda.get_device_name(0)) # human-readable name of GPU 0

<torch.cuda.device object at 0x7fd06ad7f668>
Tesla P100-PCIE-16GB


In [0]:
# Handle for the first GPU; passed as device=... in the cells below.
# Creating the handle does not check that a GPU exists.
# NOTE(review): later cells that allocate on it will fail on a machine without CUDA.
cuda0 = torch.device('cuda:0')

In [0]:
a = torch.ones(3, 2, device=cuda0) # a is allocated directly on the GPU
b = torch.ones(3, 2, device=cuda0) # b is allocated directly on the GPU
c = a + b # both operands live on the GPU, so the addition runs there and c stays on the GPU
print(c)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], device='cuda:0')


In [0]:
print(a) # a is unchanged by the addition; the printed tensor reports its device

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], device='cuda:0')


In [0]:
%%time
# Benchmark: element-wise addition of two 10000 x 10000 arrays with NumPy on the CPU.
# np.add returns a new array (discarded here); random generation is inside the timed region.
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  np.add(b, a)

CPU times: user 1min 23s, sys: 16.3 ms, total: 1min 23s
Wall time: 1min 23s


In [0]:
%%time
# Benchmark: in-place element-wise addition of two 10000 x 10000 tensors with PyTorch on the CPU.
# add_ writes the result into b_cpu; random generation is inside the timed region.
for i in range(10):
  a_cpu = torch.randn([10000, 10000])
  b_cpu = torch.randn([10000, 10000])
  b_cpu.add_(a_cpu)

CPU times: user 14 s, sys: 6.06 ms, total: 14 s
Wall time: 14 s


In [0]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000], device=cuda0)
  b = torch.randn([10000, 10000], device=cuda0)
  b.add_(a)

CPU times: user 614 µs, sys: 2 ms, total: 2.61 ms
Wall time: 2.75 ms


In [0]:
%%time
# Benchmark: product of two 10000 x 10000 matrices with NumPy on the CPU, repeated 10 times.
# The result is discarded; random generation (float64) is inside the timed region.
for i in range(10):
  a = np.random.randn(10000,10000)
  b = np.random.randn(10000,10000)
  np.matmul(b, a)

CPU times: user 17min, sys: 3.33 s, total: 17min 3s
Wall time: 9min 17s


In [0]:
%%time
# Benchmark: product of two 10000 x 10000 matrices with PyTorch on the CPU, repeated 10 times.
# The result is discarded; random generation is inside the timed region.
for i in range(10):
  a_cpu = torch.randn([10000, 10000])
  b_cpu = torch.randn([10000, 10000])
  torch.matmul(a_cpu, b_cpu)

CPU times: user 2min 16s, sys: 139 ms, total: 2min 16s
Wall time: 2min 16s


In [0]:
%%time
for i in range(10):
  a = torch.randn([10000, 10000], device=cuda0)
  b = torch.randn([10000, 10000], device=cuda0)
  torch.matmul(a, b)

CPU times: user 443 µs, sys: 2 ms, total: 2.44 ms
Wall time: 2.1 ms


## Autodiff

In [0]:
x = torch.ones([3, 2], requires_grad=True) # requires_grad=True makes autograd record operations on x, so the derivative of a later result with respect to x can be computed
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [0]:
y = x + 5 # y is derived from x, so it carries a grad_fn (visible in the printed tensor)
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [0]:
z = y*y + 1 # element-wise: z_i = y_i^2 + 1
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [0]:
t = torch.sum(z) # reduce to a scalar so that backward() can be called without arguments
print(t)

tensor(222., grad_fn=<SumBackward0>)


In [0]:
t.backward() # backpropagate from t; the gradient dt/dx is accumulated into x.grad

In [0]:
print(x.grad) # dt/dx_i = 2 * (x_i + 5), i.e. 12 for every entry since x is all ones

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


$t = \sum_i z_i, z_i = y_i^2 + 1, y_i = x_i + 5$

$\frac{\partial t}{\partial x_i} = \frac{\partial z_i}{\partial x_i} = \frac{\partial z_i}{\partial y_i} \frac{\partial y_i}{\partial x_i} = 2y_i \times 1$


At x = 1, y = 6, $\frac{\partial t}{\partial x_i} = 12$

In [0]:
# Same idea with a sigmoid: build a fresh x so that its gradient starts empty.
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y)) # logistic sigmoid of y, written out by hand
print(r)
s = torch.sum(r) # scalar, so backward() needs no argument
s.backward()
print(x.grad) # ds/dx = r * (1 - r), element-wise

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


In [0]:
# Backward from a non-scalar tensor: the upstream gradient must be supplied explicitly.
x = torch.ones([3, 2], requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
a = torch.ones([3, 2]) # upstream gradient ds/dr (all ones, matching s = sum(r)); note this rebinds the name a
r.backward(a) # r is not a scalar, so backward() is given the gradient to propagate
print(x.grad) # same values as in the previous cell

tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


$\frac{\partial{s}}{\partial{x}} = \frac{\partial{s}}{\partial{r}} \cdot \frac{\partial{r}}{\partial{x}}$

For the above code $a$ represents $\frac{\partial{s}}{\partial{r}}$ and then $x.grad$ gives directly $\frac{\partial{s}}{\partial{x}}$



## Autodiff example that looks like what we have been doing

In [0]:
# Synthetic regression data: 20 inputs and noise-free targets on the line y = 3x - 2.
# The data is not something we differentiate with respect to, so it is created without
# requires_grad (as in the training loop further down); only w and b need gradients.
# This also keeps the targets y out of the autograd graph.
x = torch.randn([20, 1])
y = 3*x - 2


In [0]:
# Trainable parameters of the model y_hat = w*x + b, both starting at 1.
w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

# Forward pass: prediction for every sample in x.
y_hat = w * x + b

# Sum-of-squared-errors loss between prediction and target.
residual = y_hat - y
loss = residual.pow(2).sum()

In [0]:
print(loss) # scalar sum of squared errors; its grad_fn shows it is part of the autograd graph

tensor(215.3127, grad_fn=<SumBackward0>)


In [0]:
loss.backward() # fills w.grad and b.grad with d(loss)/dw and d(loss)/db

In [0]:
print(w.grad, b.grad) # gradients of the loss evaluated at w = 1, b = 1


tensor([-54.2859]) tensor([107.3512])


## Do it in a loop

In [0]:
# Gradient descent on y_hat = w*x + b, fitted to data drawn from y = 3x - 2.
learning_rate = 0.01

w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

print(w.item(), b.item())

for i in range(10):
  # Fresh batch of 20 samples and their exact targets for this step.
  x = torch.randn([20, 1])
  y = 3*x - 2

  # Forward pass and sum-of-squared-errors loss.
  y_hat = w*x + b
  loss = torch.sum((y_hat - y)**2)

  # Backward pass: populates w.grad and b.grad.
  loss.backward()

  # The update itself must not be recorded by autograd.
  with torch.no_grad():
    for param in (w, b):
      param.sub_(learning_rate * param.grad)  # in-place gradient step
      param.grad.zero_()                      # gradients accumulate, so clear them for the next step

  print(w.item(), b.item())
  

1.0 1.0
2.4799084663391113 -0.4175025224685669
2.469186782836914 -0.9799753427505493
2.5898096561431885 -1.384779453277588
2.904806137084961 -1.6957833766937256
2.9250242710113525 -1.8144066333770752
2.949782133102417 -1.8890676498413086
2.9769749641418457 -1.9337213039398193
2.979321002960205 -1.9587136507034302
2.9905850887298584 -1.975563406944275
2.9956319332122803 -1.9861747026443481


## Do it for a large problem

In [0]:
%%time
# Timing benchmark on the CPU: the same gradient-descent loop with an N-dimensional weight vector.
# NOTE(review): each step sees a single fresh sample x with N = 1e7 components, so
# learning_rate = 0.001 is very likely far too large for the updates to converge, and the
# progress prints are commented out so this would go unnoticed. Treat this cell as a speed
# measurement only; verify before reusing it as a training loop.
learning_rate = 0.001
N = 10000000
epochs = 200

w = torch.rand([N], requires_grad=True)
b = torch.ones([1], requires_grad=True)

# print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
  x = torch.randn([N])
  # Target from the "true" model: all weights equal to 3, bias -2.
  y = torch.dot(3*torch.ones([N]), x) - 2
  
  y_hat = torch.dot(w, x) + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  # Parameter update, not recorded by autograd.
  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    # Gradients accumulate across backward() calls, so reset them.
    w.grad.zero_()
    b.grad.zero_()

#   print(torch.mean(w).item(), b.item())
  

CPU times: user 24.5 s, sys: 160 ms, total: 24.6 s
Wall time: 24.7 s


In [0]:
%%time
learning_rate = 0.001
N = 10000000
epochs = 200

w = torch.rand([N], requires_grad=True, device=cuda0)
b = torch.ones([1], requires_grad=True, device=cuda0)

# print(torch.mean(w).item(), b.item())

for i in range(epochs):
  
  x = torch.randn([N], device=cuda0)
  y = torch.dot(3*torch.ones([N], device=cuda0), x) - 2
  
  y_hat = torch.dot(w, x) + b
  loss = torch.sum((y_hat - y)**2)
  
  loss.backward()
  
  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad
    
    w.grad.zero_()
    b.grad.zero_()

  #print(torch.mean(w).item(), b.item())
  

CPU times: user 269 ms, sys: 122 ms, total: 390 ms
Wall time: 414 ms
