**PyTorch** is a framework for deep learning.

PyTorch is a Python machine learning package based on Torch, which is an open-source machine learning package based on the programming language **Lua**. PyTorch has two main features:

* Tensor computation (like NumPy) with strong GPU acceleration (see [What's the difference between a matrix and a tensor?](https://medium.com/@quantumsteinke/whats-the-difference-between-a-matrix-and-a-tensor-4505fbdc576c))
* Automatic differentiation for building and training neural networks

Installation: https://pytorch.org

## PyTorch basics

Tensors are similar to NumPy arrays.

In [None]:
import torch

#### Define tensor:

In [None]:
# Build a 2x3 float tensor from a nested Python list (two rows of three values).
torch.FloatTensor([[20, 30, 40], [90, 60, 70]])

#### Sum:

In [None]:
# Two one-element float tensors; adding them sums element by element.
x, y = torch.FloatTensor([25]), torch.FloatTensor([30])
torch.add(x, y)

#### Transpose:

In [None]:
# 4x5 matrix filled with random values.
matrix = torch.randn(4, 5)
# A bare `matrix` expression in the middle of a cell is evaluated and then
# discarded (only the last expression of a cell is displayed), so print the
# original explicitly to be able to compare it with its transpose.
print(matrix)
# Transposed 5x4 matrix; shown as the cell output.
matrix.t()

#### Automatic differentiation (backward passes):

**.backward()** computes gradients. The gradient for each tensor created with `requires_grad=True` is accumulated in its **.grad** attribute.

In [None]:
# `a` is tracked by autograd; `b` is created without requires_grad.
a = torch.tensor([3.0, 2.0], requires_grad=True)
b = torch.tensor([4.0, 7.0])

ab_sum = torch.add(a, b)
print(ab_sum)

# Reduce to a scalar so that backward() can be called without arguments.
ab_res = torch.sum(ab_sum * 8)
ab_res.backward()
print(ab_res)

# Every element of `a` contributes with factor 8, so a.grad is [8, 8];
# b.grad is None because `b` does not require gradients.
print(a.grad, b.grad)

#### The algorithm to train a neural network:

1. Define the neural network with some learnable parameters, referred to as weights
2. Iterate over a dataset of inputs
3. Process input through the network
4. Compare predicted results to actual values and measure the error
5. Propagate gradients back into the network’s parameters
6. Update the weights of the network using an update rule: $weight = weight - learning\_rate \times gradient$


The **optim** package abstracts the idea of an optimization algorithm and provides implementations of commonly used optimization algorithms such as AdaGrad, RMSProp and Adam.

In [None]:
# Problem sizes: batch size, input width, hidden width, output width.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random inputs and random regression targets, one row per sample.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)

# Mean squared error: ((input - target) ** 2).mean()
loss_fn = torch.nn.MSELoss(reduction='mean')
learning_rate = 0.0001

In [None]:
# Optimizer that updates all parameters of `model` with the Adam rule.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

for t in range(500):
    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()

    # Forward pass: predictions and their loss against the targets.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Backward pass, then one parameter update.
    loss.backward()
    optimizer.step()

#### TODO:
1. save losses
2. write a function to plot losses

#### Define custom module:

In [None]:
class CustomModel(torch.nn.Module):
    """Two linear layers with a clamp-at-zero nonlinearity in between."""

    def __init__(self, D_in, H, D_out):
        """Create the layers: ``D_in -> H`` followed by ``H -> D_out``."""
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features=H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D_out)

    def forward(self, x):
        """Return the prediction of the network for the input batch ``x``."""
        hidden = self.linear1(x)
        # Clamping at zero from below zeroes out the negative activations.
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)


# Problem sizes: batch size, input width, hidden width, output width.
N = 64
D_in, H, D_out = 1000, 100, 10

# Random inputs and random regression targets, one row per sample.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = CustomModel(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.0001)

for t in range(500):
    # Gradients accumulate across backward() calls, so clear them first.
    optimizer.zero_grad()

    # Forward pass and loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Backward pass, then one SGD update.
    loss.backward()
    optimizer.step()

#### TODO:
1. fix train and test data
2. compare different architectures: increase the number of layers, change the sizes of layers, use different nonlinearities: torch.nn.Tanh, torch.nn.ReLU, etc
3. plot graphs

## A real world application

Generating names with a character-level RNN

### Data

In [None]:
from __future__ import unicode_literals, print_function, division
import glob
import os
import unicodedata
import string

import torch
import torch.nn as nn
import random

import time
import math

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [None]:
# Folder that contains the extracted dataset; the name files are looked up
# under DATA_FOLDER/names/*.txt.
DATA_FOLDER = 'data'

In [None]:
# Character vocabulary: ASCII letters plus a few punctuation characters.
all_letters = ''.join((string.ascii_letters, " .,;'-"))
# Vocabulary size; the extra last slot is the EOS marker.
n_letters = 1 + len(all_letters)

def findFiles(path):
    """Return the list of paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    """Strip accents from ``s`` and drop characters not in ``all_letters``.

    The string is NFD-normalized, which splits accented characters into a
    base character plus combining marks; the marks (category ``Mn``) and
    every character outside ``all_letters`` are then removed.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue  # combining mark left over from the decomposition
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)

# Read a file and split into lines
def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    Leading and trailing whitespace of the whole file content is stripped
    before splitting on newlines, and every line is passed through
    ``unicodeToAscii``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one converted string per line of the file.
    """
    # The context manager closes the file as soon as it has been read; a bare
    # open(...).read() leaves the handle open until it is garbage collected.
    with open(filename, encoding='utf-8') as f:
        text = f.read()
    return [unicodeToAscii(line) for line in text.strip().split('\n')]

def read_data():
    """Load every ``names/*.txt`` file found under ``DATA_FOLDER``.

    Returns:
        A ``(category_lines, all_categories)`` tuple: a dict mapping each
        category (the file name without its extension) to the list of lines
        read from that file, and the list of categories in the order the
        files were found.
    """
    pattern = os.path.join(DATA_FOLDER, 'names/*.txt')
    category_lines, all_categories = {}, []
    for path in findFiles(pattern):
        stem, _ = os.path.splitext(os.path.basename(path))
        all_categories.append(stem)
        category_lines[stem] = readLines(path)
    return category_lines, all_categories

In [None]:
# category_lines maps each category to its list of lines;
# all_categories lists the categories in the order their files were found.
category_lines, all_categories = read_data()

In [None]:
n_categories = len(all_categories)

# Fail early, with download instructions, when no category file was found.
if not n_categories:
    raise RuntimeError(
        'Data not found. Make sure that you downloaded data '
        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
        'the current directory.'
    )

print('# categories:', n_categories, all_categories)
# Quick check of the ASCII conversion on a name with accents.
print(unicodeToAscii("O'Néàl"))

In [None]:
# Random item from a list
def randomChoice(l):
    """Return a randomly selected element of the sequence ``l``."""
    position = random.randrange(len(l))
    return l[position]

# Get a random category and random line from that category
def randomTrainingPair():
    """Return a ``(category, line)`` pair picked at random.

    The category is drawn from ``all_categories`` first, then a line is
    drawn from that category's entry in ``category_lines``.
    """
    category = randomChoice(all_categories)
    return category, randomChoice(category_lines[category])

In [None]:
# Show one randomly chosen category on its own ...
category = randomChoice(all_categories)
print(category)
# ... then draw a fresh (category, line) pair; this overwrites `category`.
category, line = randomTrainingPair()
print(category, line)

In [None]:
# One-hot vector for category
def categoryTensor(category):
    """Return a ``(1, n_categories)`` one-hot tensor for ``category``.

    The hot position is the index of ``category`` in ``all_categories``;
    ``list.index`` raises ``ValueError`` for an unknown category.
    """
    one_hot = torch.zeros(1, n_categories)
    one_hot[0, all_categories.index(category)] = 1
    return one_hot

# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    """Encode ``line`` as a ``(len(line), 1, n_letters)`` one-hot tensor.

    Args:
        line: String whose characters must all occur in ``all_letters``.

    Returns:
        A float tensor with one one-hot row per character of ``line``.

    Raises:
        ValueError: If ``line`` contains a character not in ``all_letters``.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        index = all_letters.find(letter)
        if index < 0:
            # str.find() returns -1 for an unknown character; used as an
            # index it would silently set the last slot, which is the EOS
            # marker, so reject the character instead.
            raise ValueError(
                'character {!r} is not in all_letters'.format(letter))
        tensor[position, 0, index] = 1
    return tensor

# LongTensor of second letter to end (EOS) for target
def targetTensor(line):
    """Return the target indices for ``line``: letters 2..end, then EOS.

    Args:
        line: String whose characters (from the second one on) must all
            occur in ``all_letters``.

    Returns:
        A ``torch.LongTensor`` holding the ``all_letters`` index of every
        character after the first, followed by ``n_letters - 1`` (EOS).

    Raises:
        ValueError: If one of those characters is not in ``all_letters``.
    """
    letter_indexes = []
    for letter in line[1:]:
        index = all_letters.find(letter)
        if index < 0:
            # str.find() returns -1 for an unknown character, which is not a
            # valid class index for the loss; fail here with a clear message.
            raise ValueError(
                'character {!r} is not in all_letters'.format(letter))
        letter_indexes.append(index)
    letter_indexes.append(n_letters - 1)  # EOS
    return torch.LongTensor(letter_indexes)

# Make category, input, and target tensors from a random category, line pair
def randomTrainingExample():
    """Return ``(category_tensor, input_line_tensor, target_line_tensor)``.

    The three tensors are built from one random ``(category, line)`` pair.
    """
    category, line = randomTrainingPair()
    return categoryTensor(category), inputTensor(line), targetTensor(line)

In [None]:
## Uncomment to check
# print(category)
# print(categoryTensor(category))
# print(line)
# print(inputTensor(line))

In [None]:
class Model(nn.Module):
    """Recurrent cell that predicts the next letter of a name.

    At every step the category tensor, the current input and the previous
    hidden state are concatenated and fed to two linear layers (next hidden
    state and intermediate output); a third layer combines both, followed by
    dropout and a log-softmax.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers; the category width is the global ``n_categories``."""
        super().__init__()
        self.hidden_size = hidden_size

        # Width of the concatenated (category, input, hidden) vector.
        combined_size = n_categories + input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(p=0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        """Run one step and return ``(output, hidden)``.

        ``output`` is the log-softmax over dimension 1 and ``hidden`` is the
        new hidden state.
        """
        combined = torch.cat([category, input, hidden], dim=1)
        next_hidden = self.i2h(combined)
        raw_output = self.i2o(combined)
        merged = torch.cat([next_hidden, raw_output], dim=1)
        log_probs = self.softmax(self.dropout(self.o2o(merged)))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))

#### Training the Network

In [None]:
criterion = nn.NLLLoss()

learning_rate = 0.0005


def train(model, category_tensor, input_line_tensor, target_line_tensor):
    """Run one manual gradient-descent step on a single training example.

    The line is fed to ``model`` one step at a time, the per-step losses are
    summed, gradients are back-propagated and every parameter is updated
    in place with ``p -= learning_rate * p.grad``.

    Args:
        model: Module called as ``model(category, input, hidden)`` returning
            ``(output, hidden)`` and providing ``initHidden()``.
        category_tensor: Category tensor passed unchanged to every step.
        input_line_tensor: Tensor whose first dimension indexes the steps.
        target_line_tensor: 1-D tensor with one target class per step. It is
            not modified.

    Returns:
        A tuple ``(output, loss)``: the output of the last step and the
        summed loss divided by the number of steps (a Python float).
    """
    # Out-of-place unsqueeze: target_line_tensor[i] gets shape (1,) as the
    # loss expects, without changing the shape of the caller's tensor.
    target_line_tensor = target_line_tensor.unsqueeze(-1)
    hidden = model.initHidden()

    model.zero_grad()

    loss = 0
    for i in range(input_line_tensor.size(0)):
        output, hidden = model(category_tensor, input_line_tensor[i], hidden)
        loss += criterion(output, target_line_tensor[i])

    loss.backward()

    # Plain SGD update, done outside of autograd tracking.
    with torch.no_grad():
        for p in model.parameters():
            if p.grad is None:
                # Parameter did not take part in this loss (e.g. the hidden
                # layer for a one-step line): nothing to update.
                continue
            # Keyword `alpha` replaces the deprecated add_(Number, Tensor)
            # overload.
            p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item() / input_line_tensor.size(0)

In [None]:
def timeSince(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    ``since`` is a timestamp as returned by ``time.time()``. Minutes are
    whole minutes; seconds are the remaining (fractional) seconds.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%sm %ss' % (minutes, seconds)

In [None]:
# Input and output are both one-hot letters; 128 is the hidden state size.
model = Model(n_letters, 128, n_letters)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0  # Reset every plot_every iters

start = time.time()

# The loop variable is named `iteration` rather than `iter`: at module level
# `iter` would shadow the builtin for all the code that runs afterwards.
for iteration in range(1, n_iters + 1):
    output, loss = train(model, *randomTrainingExample())
    total_loss += loss

    if iteration % print_every == 0:
        # Integer arithmetic keeps the percentage exact; a float product
        # truncated by %d can print one percent too little.
        print('%s (%d %d%%) %.4f' % (
            timeSince(start), iteration, 100 * iteration // n_iters, loss))

    if iteration % plot_every == 0:
        # Store the average loss of the last plot_every iterations.
        all_losses.append(total_loss / plot_every)
        total_loss = 0

#### Plotting the Losses

In [None]:
# %matplotlib inline

In [None]:
# Plot the values collected in all_losses in a new figure and display it.
plt.figure()
plt.plot(all_losses)
plt.show()

### Sampling the Network

In [None]:
# Upper bound on the number of letters generated after the starting letter.
max_length = 20

# Sample from a category and starting letter
def sample(model, category, start_letter='A'):
    """Generate one name for ``category`` that begins with ``start_letter``.

    At every step the most likely next letter is appended and fed back as
    the next input. Generation stops at the EOS index or after
    ``max_length`` steps.

    Returns:
        The generated name as a string, including ``start_letter``.
    """
    eos_index = n_letters - 1
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        step_input = inputTensor(start_letter)
        hidden = model.initHidden()

        letters = [start_letter]
        for _ in range(max_length):
            output, hidden = model(category_tensor, step_input[0], hidden)
            # Index of the highest-scoring entry of the output.
            _, best = output.topk(1)
            best_index = best[0][0].item()
            if best_index == eos_index:
                break
            letter = all_letters[best_index]
            letters.append(letter)
            step_input = inputTensor(letter)

        return ''.join(letters)

# Get multiple samples from one category and multiple starting letters
def samples(model, category, start_letters='ABC'):
    """Print one generated name for every character of ``start_letters``."""
    for first_letter in start_letters:
        generated = sample(model, category, first_letter)
        print(generated)

In [None]:
# Print one generated name for each starting letter R, U and S,
# using the 'Russian' category.
samples(model, 'Russian', 'RUS')

# Other category / starting-letter combinations to try:
# samples(model, 'German', 'GER')
# samples(model, 'Spanish', 'SPA')
# samples(model, 'Chinese', 'CHI')

### TODO:
0. Rewrite with an optimizer
1. Try the nn.LSTM and nn.GRU layers
2. Try different architectures and plot results