# Introduction to PyTorch

## torch.cuda

In [None]:
import torch

In [None]:
# Report whether this PyTorch build can see a usable CUDA device.
torch.cuda.is_available()

In [None]:
# 4x4 tensor filled with ones.
x = torch.ones(4, 4)

In [None]:
# Copy x to the GPU when CUDA is available; on a CPU-only machine fall back
# to x itself so the notebook keeps running instead of raising.
y = x.cuda() if torch.cuda.is_available() else x

## torch.autograd

In [None]:
from torch.autograd import Variable

In [None]:
# Wrap a tensor in an autograd Variable. requires_grad is left at its
# default here, so no gradient is tracked for vx.
x = torch.ones(4, 4)
vx = Variable(x)

In [None]:
# Display the tensor that vx wraps.
# NOTE(review): possibly `vx` was meant to be shown here — confirm.
x

In [None]:
# Two single-element leaves, a = 2 and b = 3, for which autograd tracks
# gradients.
a = Variable(torch.FloatTensor([2]), requires_grad=True)
b = Variable(torch.FloatTensor([3]), requires_grad=True)

In [None]:
# c = a * b^2, hence dc/da = b^2 and dc/db = 2 * a * b.
c = a * (b ** 2)

In [None]:
# Backpropagate from c into a.grad and b.grad. retain_graph=True keeps the
# graph alive so that backward() can be called on c again in a later cell.
c.backward(retain_graph=True)

In [None]:
# dc/da = b**2 = 9 after a single backward pass.
a.grad

In [None]:
# dc/db = 2 * a * b = 12 after a single backward pass.
b.grad

In [None]:
# Clear only a.grad, then backpropagate a second time. backward() adds to
# the existing .grad, so (if the cells were run once, in order) a.grad is 9
# again while b.grad, which was not cleared, has grown to 12 + 12 = 24.
a.grad = None
c.backward(retain_graph=True)
a.grad, b.grad

In [None]:
# Fresh 4x4 leaf of ones for which autograd tracks gradients.
x = torch.ones(4, 4)
vx = Variable(x, requires_grad=True)

In [None]:
# z depends on only two entries of vx: vx[0, 0] and vx[0, 1].
z = vx[0, 0] * vx[0, 1]

In [None]:
# z holds a single value, so backward() needs no explicit gradient argument.
z.backward()

In [None]:
# Expect 1 at [0, 0] and [0, 1] (the other factor of the product is 1) and
# 0 for every entry that z does not depend on.
vx.grad

In [None]:
# In-place assignment into a plain tensor.
x = torch.ones(4, 4)
x[0, 0] = 3

In [None]:
vx = Variable(x)

# In-place write on a Variable that does not require grad.
vx[0, 0] = 3

In [None]:
vx = Variable(x, requires_grad=True)

# Same in-place write, now on a leaf that requires grad: autograd rejects
# this with a RuntimeError. The failure is the point of this cell.
vx[0, 0] = 3

See "In-place operations on Variables" (http://pytorch.org/docs/master/autograd.html)
to understand why the in-place assignment works when `requires_grad` is set to `False` but fails when it is `True`.

In [None]:
# Fresh 4x4 leaf of ones for which autograd tracks gradients.
x = torch.ones(4, 4)
vx = Variable(x, requires_grad=True)

In [None]:
# Element-wise square: dz[i, j] / dvx[i, j] = 2 * vx[i, j].
z = vx * vx

In [None]:
# Reduce z to a single value using all-ones weights and backpropagate; this
# adds 2 * vx to vx.grad. retain_graph=True keeps the graph so that z can be
# backpropagated through again afterwards.
s = torch.sum(z * Variable(torch.ones(4, 4)))
s.backward(retain_graph=True)

In [None]:
# Equivalent to the previous cell: the tensor passed to backward() plays the
# role of the all-ones weights. Gradients accumulate, so this adds another
# 2 * vx to vx.grad rather than replacing it.
z.backward(torch.ones(4, 4), retain_graph=True)

In [None]:
# After the two backward passes above (each run once): 2 * vx + 2 * vx, i.e.
# 4 in every entry.
vx.grad

## torch.optim

In [None]:
# Minimise z = x^2 + x*y + (y - 2)^2 with plain SGD, starting from x = y = 3.
x = Variable(torch.FloatTensor([3]), requires_grad=True)
y = Variable(torch.FloatTensor([3]), requires_grad=True)

optimizer = torch.optim.SGD([x, y], lr=0.1)

# One output row per step: x, y and the objective value computed from them
# before the update.
row = "{:8.3}\t{:8.3}\t{:8.3}"

for step in range(100):
    # backward() adds to .grad, so clear the previous step's gradients first.
    optimizer.zero_grad()
    z = x ** 2 + x * y + (y - 2) ** 2
    z.backward()
    optimizer.step()

    values = (x.data[0], y.data[0], z.data[0])
    print(row.format(*values))

## torch.nn

In [None]:
import torch

In [None]:
from torch import nn
from torch.autograd import Variable

In [None]:
class Perceptron(nn.Module):
    """Bias-free linear unit with a learnable 30-element weight vector."""

    def __init__(self):
        super().__init__()

        # Weights start as samples from a standard normal distribution.
        initial = torch.randn(30)
        self.weight = nn.Parameter(initial)

    def forward(self, x):
        """Return the sum of the element-wise product of ``x`` and the weights."""
        weighted = x * self.weight
        return weighted.sum()


# Teacher-student experiment: the student perceptron is trained by SGD to
# reproduce the output of a fixed, randomly initialised teacher.
teacher = Perceptron()
student = Perceptron()

optimizer = torch.optim.SGD(student.parameters(), lr=0.01)

for _ in range(200):
    x = Variable(torch.randn(30))
    # The teacher only supplies the target: detach it so that backward() does
    # not compute (and keep accumulating) gradients for the teacher's weights.
    y_t = teacher(x).detach()
    y_s = student(x)

    # Overwrite the learning rate of every parameter group, replacing the
    # lr=0.01 that was passed to the constructor.
    # NOTE(review): the value never changes; doing this inside the loop is
    # presumably meant to show how lr can be adjusted between steps.
    for param_group in optimizer.param_groups:
        param_group['lr'] = 0.03

    optimizer.zero_grad()
    loss = (y_t - y_s) ** 2
    loss.backward()
    optimizer.step()

    # Q: cosine similarity between the student's and the teacher's weights
    # (1 means the two vectors point in the same direction).
    Q = (student.weight.data * teacher.weight.data).sum() \
        / ((student.weight.data ** 2).sum() * (teacher.weight.data ** 2).sum()) ** 0.5
    # forward() returns a 0-dim tensor, which cannot be indexed with [0];
    # .item() extracts the Python number instead.
    print("{:8.3} \t{:8.3} \t{:8.3}".format(y_t.item(), y_s.item(), float(Q)))
    

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Overlay the teacher's and the student's weight vectors (x axis: weight
# index) so they can be compared visually.
plt.plot(teacher.weight.data.numpy())
plt.plot(student.weight.data.numpy())

In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: 30 inputs -> 31 ReLU hidden units -> 1 output."""

    def __init__(self):
        super().__init__()

        # The hidden layer has a bias term; the output layer does not.
        self.linear1 = nn.Linear(30, 31, bias=True)
        self.linear2 = nn.Linear(31, 1, bias=False)

    def forward(self, x):
        """Apply linear1, a ReLU, then linear2 to ``x`` and return the result."""
        hidden = nn.functional.relu(self.linear1(x))
        return self.linear2(hidden)

    
# Teacher-student experiment with the MLP: the student is trained by SGD on
# batches of 10 random inputs to reproduce the teacher's outputs.
teacher = MLP()
student = MLP()

optimizer = torch.optim.SGD(student.parameters(), lr=0.01)

for _ in range(200):
    x = Variable(torch.randn(10, 30))
    # The teacher only supplies the target: detach it so that backward() does
    # not compute (and keep accumulating) gradients for the teacher's weights.
    y_t = teacher(x).detach()
    y_s = student(x)

    optimizer.zero_grad()
    # Squared error summed over the whole batch.
    loss = torch.sum((y_t - y_s) ** 2)
    loss.backward()
    optimizer.step()

    # Show teacher vs. student output for the first sample of the batch;
    # .item() turns the single-element tensors into Python numbers.
    print("{:8.3} \t{:8.3}".format(y_t[0, 0].item(), y_s[0, 0].item()))
    