<a href="https://colab.research.google.com/github/anirbrhm/Deep-Learning/blob/main/IntroductionToPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch 
import numpy as np 
import matplotlib.pyplot as plt 

## Initialize Tensors 

In [2]:
# Three basic constructors; each receives the shape (3 rows, 2 columns).
x = torch.ones((3, 2))    # every entry is 1
print(x)
x = torch.zeros((3, 2))   # every entry is 0
print(x)
x = torch.rand((3, 2))    # random entries (see the printed sample below)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
tensor([[0.2613, 0.3126],
        [0.6808, 0.5001],
        [0.2343, 0.0960]])


In [3]:
# torch.empty only allocates storage; the printed values are whatever
# happened to be in that memory.
x = torch.empty((3, 2))
print(x)
# zeros_like creates a zero tensor that borrows its shape from x.
y = torch.zeros_like(x)
print(y)

tensor([[2.1718e-29, 3.0757e-41],
        [3.3631e-44, 0.0000e+00],
        [       nan, 1.0000e+00]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [4]:
# Five evenly spaced values from 0 to 1, both endpoints included.
x = torch.linspace(start=0, end=1, steps=5)
print(x)

tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000])


In [5]:
# Build a tensor from nested lists: each inner list is one row.
x = torch.tensor([
    [1, 2],
    [3, 4],
    [5, 6],
])
print(x)

tensor([[1, 2],
        [3, 4],
        [5, 6]])


## Slicing of Tensors 

In [6]:
print(x.shape)   # shape of the tensor (same object that x.size() returns)
print(x[:, 1])   # second column: every row, column index 1
print(x[0])      # first row

torch.Size([3, 2])
tensor([2, 4, 6])
tensor([1, 2])


In [7]:
# Indexing a single element still yields a (0-dimensional) tensor ...
y = x[1][1]
print(y)
# ... and .item() unwraps it into a plain Python number.
print(y.item())

tensor(4)
4


## Reshaping Tensors 

In [8]:
print(x)
# Reinterpret the same six elements as 2 rows x 3 columns.
y = x.view((2, 3))
print(y)

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [9]:
# Only one dimension is given; -1 asks PyTorch to infer the other from the
# number of elements.
y = x.view((6, -1))
print(y)

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])


Tensors support the basic element-wise operations `+`, `-` and `*`.

In [10]:
# Two separate 3x2 tensors of ones, used as operands in the cells below.
x, y = torch.ones((3, 2)), torch.ones((3, 2))

In [11]:
# Out-of-place addition: the sum goes into a new tensor z, y is untouched.
z = y + x
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


In [12]:
# The trailing underscore marks an in-place op: y itself is overwritten with
# y + x, and the method hands back that same tensor, so z and y are one object.
y.add_(x)
z = y
print(z)
print(y)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])
tensor([[2., 2.],
        [2., 2.],
        [2., 2.]])


## Numpy <> PyTorch

In [13]:
# Convert the tensor x to a NumPy array and confirm the two types.
x_np = x.numpy() 
print(x_np) 
print(type(x), type(x_np))

[[1. 1.]
 [1. 1.]
 [1. 1.]]
<class 'torch.Tensor'> <class 'numpy.ndarray'>


In [14]:
# Go the other way: wrap a NumPy array as a tensor with torch.from_numpy.
a = np.random.rand(5) 
print(a) 
a_pt = torch.from_numpy(a) 
# NOTE(review): this prints `a` a second time; presumably `a_pt` was meant,
# to show the converted tensor.
print(a) 
print(type(a),type(a_pt)) 

[0.04692011 0.71296832 0.85099411 0.2536531  0.37421834]
[0.04692011 0.71296832 0.85099411 0.2536531  0.37421834]
<class 'numpy.ndarray'> <class 'torch.Tensor'>


In [15]:
# Add 1 to the NumPy array in place (no new array is created).
a += 1
print(a)
# a_pt changes as well: from_numpy did not copy the data, so the tensor and
# the array share the same memory.
print(a_pt)

[1.04692011 1.71296832 1.85099411 1.2536531  1.37421834]
tensor([1.0469, 1.7130, 1.8510, 1.2537, 1.3742], dtype=torch.float64)


In [16]:
%%time 
# Time 100 rounds of: draw two 100x100 NumPy arrays and multiply them.
# Note the measured time includes generating the random inputs.
for i in range(100):
  a = np.random.rand(100,100) 
  b = np.random.rand(100,100) 
  c = a*b # element-wise multiplication; for matrix multiplication use matmul

CPU times: user 26 ms, sys: 1.79 ms, total: 27.8 ms
Wall time: 40 ms


In [17]:
%%time 
for i in range(100):
  a = torch.randn(100,100) 
  b = torch.randn(100,100)
  c = a*b 

CPU times: user 21.2 ms, sys: 128 µs, total: 21.4 ms
Wall time: 23.4 ms


In [18]:
%%time 
# Time 100 rounds of: draw two 1000x1000 NumPy arrays and add them.
# The measured time includes generating the random inputs.
for i in range(100):
  a = np.random.rand(1000,1000) 
  b = np.random.rand(1000,1000) 
  c = a+b 

CPU times: user 1.59 s, sys: 9.74 ms, total: 1.6 s
Wall time: 1.61 s


In [19]:
%%time 
for i in range(100):
  a = torch.randn(1000,1000) 
  b = torch.randn(1000,1000)
  c = a+b 

CPU times: user 1.91 s, sys: 9.86 ms, total: 1.92 s
Wall time: 1.92 s


## CUDA Support

In [20]:
# Number of CUDA devices PyTorch can see.
print(torch.cuda.device_count())

1


In [23]:
gpu_index = 0
# torch.cuda.device(...) returns a device object; printing it only shows its
# repr (class name and memory address), not hardware details.
print(torch.cuda.device(gpu_index))
# The human-readable model name of that GPU.
print(torch.cuda.get_device_name(gpu_index))

<torch.cuda.device object at 0x7fe92dfc1090>
Tesla T4


In [24]:
# Handle for the first GPU, passed as `device=` in the cells below.
# When CUDA is unavailable the handle falls back to the CPU so the later cells
# still run (slowly); a message makes the fallback visible.
if torch.cuda.is_available():
  cuda0 = torch.device("cuda:0")
else:
  cuda0 = torch.device("cpu")
  print("CUDA is not available: `cuda0` falls back to the CPU")

In [25]:
# Create both operands directly on the target device; the result of the
# addition lives on that device too (see `device=` in the printed tensor).
a, b = torch.ones(3, 2, device=cuda0), torch.ones(3, 2, device=cuda0)
c = a + b
print(c)

tensor([[2., 2.],
        [2., 2.],
        [2., 2.]], device='cuda:0')


In [26]:
%%time 
for i in range(10):
  a = np.random.rand(10000,10000) 
  b = np.random.rand(10000,10000) 
  np.add(b,a) 

CPU times: user 16.1 s, sys: 444 ms, total: 16.6 s
Wall time: 16.6 s


In [27]:
%%time 
# Time 10 rounds of: draw two 10000x10000 CPU tensors and add in place.
# The measured time includes generating the random inputs.
for i in range(10):
  a_cpu = torch.rand(10000,10000) 
  b_cpu = torch.rand(10000,10000) 
  b_cpu.add_(a_cpu)

CPU times: user 12.1 s, sys: 179 ms, total: 12.3 s
Wall time: 12.3 s


In [28]:
%%time 
for i in range(10):
  a = torch.rand(10000,10000, device = cuda0) 
  b = torch.rand(10000,10000, device = cuda0) 
  b.add_(a)

CPU times: user 950 µs, sys: 3 ms, total: 3.95 ms
Wall time: 7.06 ms


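Using the GPU can make this computation far faster than on the CPU. Note, however, that CUDA operations run asynchronously: a fair timing must wait for the GPU to finish (`torch.cuda.synchronize()`) before the clock stops. Without that wait the measured time mostly reflects kernel-launch overhead, and an apparent speed-up of tens of thousands of times overstates the real gain.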

## AutoGrad

In [29]:
# requires_grad=True asks autograd to record operations on x so that it can
# be differentiated with respect to later.
x = torch.ones(3, 2, requires_grad=True)
print(x)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]], requires_grad=True)


In [31]:
y = x + 5 
# The printed tensor carries grad_fn=<AddBackward0>: autograd has recorded
# that y was produced from x by an addition.
print(y)

tensor([[6., 6.],
        [6., 6.],
        [6., 6.]], grad_fn=<AddBackward0>)


In [32]:
# z = y^2 + 1, element-wise; with y = 6 every entry is 37.
z = y*y + 1 
print(z)

tensor([[37., 37.],
        [37., 37.],
        [37., 37.]], grad_fn=<AddBackward0>)


In [33]:
# Collapse z to a single scalar so that backward() can be called without
# arguments.
t = z.sum()
print(t)

tensor(222., grad_fn=<SumBackward0>)


We have basically simulated a forward pass

In [34]:
t.backward()  # back-propagate from t; the gradient with respect to x is stored in x.grad

In [35]:
# Derivative of t with respect to x: dt/dx = 2*(x + 5), evaluated at the
# initial value x = 1, i.e. 12 for every element.
print(x.grad)

tensor([[12., 12.],
        [12., 12.],
        [12., 12.]])


In [36]:
# Same pipeline with a hand-written sigmoid: r = 1 / (1 + exp(-(x + 5))).
x = torch.ones(3, 2, requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
print(r)
# Reduce to a scalar, back-propagate, and read ds/dx from x.grad.
s = r.sum()
s.backward()
print(x.grad)

tensor([[0.9975, 0.9975],
        [0.9975, 0.9975],
        [0.9975, 0.9975]], grad_fn=<MulBackward0>)
tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


To call `backward()` directly on a non-scalar tensor, we must pass it a gradient argument of the same shape:

In [38]:
x = torch.ones(3, 2, requires_grad=True)
y = x + 5
r = 1/(1 + torch.exp(-y))
# r is not a scalar, so backward() needs a tensor of the same shape that
# weights each element of r. With all ones this gives the same x.grad as
# summing r first.
a = torch.ones(r.shape)
r.backward(gradient=a)
print(x.grad)

tensor([[0.0025, 0.0025],
        [0.0025, 0.0025],
        [0.0025, 0.0025]])


$ \frac{\partial s}{\partial x} = \frac{\partial s}{\partial r} \cdot \frac{\partial r}{\partial x} $ \
In the code above, $a$ plays the role of $ \frac{\partial s}{\partial r} $: calling `r.backward(a)` stores $\frac{\partial s}{\partial x}$ directly in `x.grad`.

## AutoGrad example that looks like what we have been doing 

In [39]:
# Training data: 20 random inputs and their targets under the true line
# y = 3x - 2. The data is a constant of the problem, not a parameter to learn,
# so it is created without requires_grad (as in the training loop further
# down); only w and b need gradients.
x = torch.randn([20,1])
y = 3*x - 2 # ground truth

In [40]:
# Learnable parameters, both starting at 1 and tracked by autograd.
w = torch.tensor([1.0], requires_grad=True)
b = torch.tensor([1.0], requires_grad=True)

# Hypothesis: a line with slope w and intercept b.
y_hat = w * x + b

# Sum of squared errors between the prediction and the ground truth.
loss = (y_hat - y).pow(2).sum()

In [41]:
# The loss is a scalar tensor that is still attached to the graph (note grad_fn in the output).
print(loss)

tensor(294.6115, grad_fn=<SumBackward0>)


In [42]:
# Back-propagate the loss; this fills in w.grad and b.grad.
loss.backward()

In [43]:
# Gradients of the loss with respect to w and b at their initial values (w = b = 1).
print(w.grad,b.grad)

tensor([-107.9753]) tensor([124.4242])


## Do it in a loop 

In [None]:
learning_rate = 0.01

# Both parameters start at 1; the float literal makes them floating-point
# tensors, which is required for requires_grad=True.
w = torch.tensor([1.0], requires_grad=True)
b = torch.tensor([1.0], requires_grad=True)

print(w.item(),b.item())

for i in range(10):
  # Fresh batch of 20 points on the true line y = 3x - 2.
  x = torch.randn([20,1])
  y = 3*x - 2

  # Forward pass: prediction and sum-of-squares loss.
  y_hat = w*x + b
  loss = (y_hat - y).pow(2).sum()

  # Backward pass: accumulates d(loss)/dw and d(loss)/db into .grad.
  loss.backward()

  # The parameter update itself must not be recorded in the graph: it is a
  # plain in-place variable update, not part of the function being
  # differentiated.
  with torch.no_grad():
    for param in (w, b):
      param.sub_(learning_rate * param.grad)
      # .grad accumulates across backward() calls, so reset it for the next
      # iteration.
      param.grad.zero_()

  print(w.item(),b.item())

## Doing it for a large problem 

Every tensor needs to be created on the GPU for the calculations to be fast.

In [None]:
%%time 
learning_rate = 0.001 
N = 100000
epochs = 200

w = torch.rand([N], requires_grad=True, device=cuda0) 
b = torch.ones([1], requires_grad=True, device=cuda0) 

print(torch.mean(w).item(), b.item())

for i in range(epochs):
  x = torch.randn([N], device=cuda0) 
  y = torch.dot(3*torch.ones([N],device=cuda0), x) - 2 

  y_hat = torch.dot(w,x) + b 
  loss = torch.sum((y_hat-y)**2)

  loss.backward()

  with torch.no_grad():
    w -= learning_rate * w.grad
    b -= learning_rate * b.grad 

    w.grad.zero_()
    b.grad.zero_()

    print(torch.mean(w).item(), b.item())