In [None]:
# !pip install -Uqq fastbook
# import fastbook
# fastbook.setup_book()

In [None]:
from fastai.vision.all import *
from fastbook import *

matplotlib.rc('image', cmap='Greys')

In [None]:
path = untar_data(URLs.MNIST_SAMPLE)

In [None]:
path

In [None]:
Path.BASE_PATH = path

In [None]:
path

In [None]:
path.ls()

In [None]:
(path/'train').ls()

In [None]:
three =(path/'train'/'3').ls().sorted()
seven =(path/'train'/'7').ls().sorted()
three

In [None]:
# type(three)
# three?
# three??

In [None]:
im3_path = three[1]
im3 = Image.open(im3_path)
im3

In [None]:
# type(im3)
# im3?
# im3??

In [None]:
array(im3)[4:10,4:10]

In [None]:
tensor(im3)[4:10,4:10]

In [None]:
im3_t = tensor(im3)
df = pd.DataFrame(im3_t[4:15,4:22])
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('PuBu')

# Baseline - Pixel Similarity

In [None]:
seven_tensors = [tensor(Image.open(o)) for o in seven]
three_tensors = [tensor(Image.open(o)) for o in three]
len(seven_tensors), len(three_tensors)

In [None]:
type(seven_tensors)

In [None]:
len(seven_tensors)

In [None]:
# in computer vision, it's a standard to work with floats between zero and one
stacked_sevens = torch.stack(seven_tensors).float()/255
stacked_threes = torch.stack(three_tensors).float()/255
stacked_threes.shape

In [None]:
stacked_sevens.ndim

In [None]:
# mean(0) averages over axis 0 (the image axis), giving the mean value of each pixel across all images
mean3 = stacked_threes.mean(0)
show_image(mean3);

In [None]:
mean7 = stacked_sevens.mean(0)
show_image(mean7);

# L1 norm and L2 norm

In [None]:
a_3 = stacked_threes[1]
a_7 = stacked_sevens[1]
show_image(a_3);

In [None]:
dist_3_abs = (a_3 - mean3).abs().mean()
dist_3_sqr = ((a_3 - mean3)**2).mean().sqrt()
dist_3_abs, dist_3_sqr

In [None]:
dist_7_abs = (a_3 -mean7).abs().mean()
dist_7_sqr = ((a_3 -mean7)**2).mean().sqrt()
dist_7_abs, dist_7_sqr

In [None]:
# using pytorch for mean absolute value L1 norm and mean square error L2 norm
F.l1_loss(a_3.float(), mean7), F.mse_loss(a_3, mean7).sqrt()

# NumPy Arrays & PyTorch Tensors

In [None]:
# both are faster than python
# numpy is written in c
# pytorch uses GPU for computation
data = [[1, 2, 3],[4,5,6]]
arr = array(data)
tns = tensor(data)

In [None]:
arr

In [None]:
tns

In [None]:
tns[:,1]

In [None]:
tns[1, 1:]

In [None]:
tns+1

In [None]:
tns.type()

In [None]:
tns*1.5

# Computing Metrics Using Broadcasting

In [None]:
valid_3_tens = torch.stack([tensor(Image.open(o)) for o in (path/'valid'/'3').ls()])
valid_3_tens = valid_3_tens.float()/255

valid_7_tens = torch.stack([tensor(Image.open(o)) for o in (path/'valid'/'7').ls()])
valid_7_tens = valid_7_tens.float()/255
                            
valid_3_tens.shape, valid_7_tens.shape

In [None]:
def mnist_distance(a, b):
    """Mean absolute difference between `a` and `b` over the last two axes.

    The subtraction broadcasts, so `a` may be a single 2-D tensor or a
    stack of them compared against one reference `b`; one distance is
    returned per leading index.
    """
    abs_diff = torch.abs(a - b)
    return torch.mean(abs_diff, dim=(-1, -2))
mnist_distance(a_3, mean3)

In [None]:
valid_3_dist = mnist_distance(valid_3_tens, mean3)
valid_3_dist, valid_3_dist.shape

In [None]:
valid_3_dist.shape

In [None]:
mean3.shape

In [None]:
a_3.shape

In [None]:
valid_3_tens.shape

In [None]:
(valid_3_tens-mean3).shape

In [None]:
def is_3(x):
    """Return True where `x` is closer to `mean3` than to `mean7`.

    Distances are measured with `mnist_distance`; `mean3` and `mean7`
    are the notebook-level average images computed earlier.
    """
    dist_to_3 = mnist_distance(x, mean3)
    dist_to_7 = mnist_distance(x, mean7)
    return dist_to_3 < dist_to_7

In [None]:
is_3(a_3), is_3(a_3).float()

In [None]:
is_3(valid_3_tens)

In [None]:
accuracy_3s =      is_3(valid_3_tens).float().mean()
accuracy_7s = (1 - is_3(valid_7_tens).float()).mean()

accuracy_3s,accuracy_7s,(accuracy_3s+accuracy_7s)/2


## broadcasting example

In [None]:
testA = tensor([1,2,3],[4,5,6])

In [None]:
testB = tensor([2,2,2])

In [None]:
testA.shape, testB.shape

In [None]:
testA+testB

In [None]:
(testA+testB).shape

In [None]:
testA*testB

In [None]:
(testA*testB).sum()

# Calculus

In [None]:
def f(x):
    """Square the input: f(x) = x^2 (works for numbers and tensors)."""
    return pow(x, 2)

In [None]:
plot_function(f, 'x', 'x^2')
plt.scatter(-1.5, f(-1.5), color='red');

In [None]:
# ?plt.scatter

In [None]:
xt = tensor(3.).requires_grad_()

In [None]:
yt = f(xt)
yt

In [None]:
# backward refers to backpropagation,
# backward pass equals calculate gradient,
# forward pass equals calculate activation
yt.backward()
yt

In [None]:
xt.grad

# Gradient Descent

In [None]:
time = torch.arange(0,20).float(); time

In [None]:
# f(x) = ax^2 + bx + c
speed = torch.randn(20)*3 + 0.75*(time-9.5)**2 + 1
plt.scatter(time,speed);

In [None]:
# why this function
# because we can't consider every possible function
def f(t, params):
    """Evaluate the quadratic a*t^2 + b*t + c at `t`.

    `params` must unpack into exactly three values (a, b, c).
    """
    quad, lin, const = params
    squared_term = quad * (t ** 2)
    linear_term = lin * t
    return squared_term + linear_term + const

In [None]:
# ?torch.randn - n is for normal distribution

In [None]:
def mse(preds, targets):
    """Mean squared error between `preds` and `targets`.

    No square root is taken; applying `.sqrt()` to the result would give
    the root mean squared error instead.
    """
    residual = preds - targets
    return residual.pow(2).mean()

In [None]:
# gradient descent steps
gv('''
init->predict->loss->gradient->step->stop
step->predict[label=repeat]
''')

## step 1: initialize the parameters

In [None]:
params = torch.randn(3).requires_grad_()

In [None]:
params

In [None]:
orig_params = params.clone()

## step 2: calculate predictions

In [None]:
preds = f(time, params)

In [None]:
# to_np(preds) from tensor to numpy array
def show_preds(preds, ax=None):
    """Scatter the observed `speed` and the predictions `preds` against `time`.

    Draws on `ax` when given, otherwise on the axes of a newly created
    figure. Predictions are converted with `to_np` and plotted in red;
    the y-range is fixed to (-300, 100).
    """
    if ax is None:
        _, ax = plt.subplots()
    ax.scatter(time, speed)
    ax.scatter(time, to_np(preds), color='red')
    ax.set_ylim(-300, 100)

In [None]:
show_preds(preds)

## step 3: calculate loss

In [None]:
# calculating loss using our function
loss = mse(preds, speed)
loss

In [None]:
# calculating the same loss with PyTorch's built-in function;
# F.mse_loss is the mean squared error (no square root), so this
# matches the value returned by our own mse() above
F.mse_loss(preds, speed)

## step 4: calculate the gradients

In [None]:
loss.backward()
params.grad

In [None]:
params.grad * 1e-5

In [None]:
params

## step 5: step the weights

In [None]:
params.data

In [None]:
# w -= gradient(w) * lr
# we update params.data (not params itself) so that the update
# is not tracked by autograd as part of the computation graph
lr = 1e-5
params.data -= lr * params.grad.data
params.grad = None

In [None]:
preds = f(time, params)
mse(preds, speed)

In [None]:
show_preds(preds)

In [None]:
def apply_step(params, prn=True):
    """Run one gradient-descent update on `params` and return the predictions.

    Predicts with `f(time, params)`, measures the loss against `speed`
    with `mse`, back-propagates, and moves `params` by the notebook-level
    learning rate `lr`. The loss is printed when `prn` is true.
    The returned predictions are the ones computed before the update.
    """
    predictions = f(time, params)
    current_loss = mse(predictions, speed)
    current_loss.backward()
    # Step through .data, then drop the gradient so the next backward()
    # call starts from a clean slate.
    params.data -= lr * params.grad.data
    params.grad = None
    if prn:
        print(current_loss.item())
    return predictions

In [None]:
for i in range(10): apply_step(params)

In [None]:
# apply_step(params)
# params.data

In [None]:
#hide
params = orig_params.detach().requires_grad_()

In [None]:
_,axs = plt.subplots(1,4,figsize=(12,3))
for ax in axs: show_preds(apply_step(params, False), ax)
plt.tight_layout()

In [None]:
params.data

## step 7: stop

In [None]:
# we stopped after 10 epochs
# in practice we watch the training and validation losses
# and metrics to decide when to stop

In [None]:
# we start with random weights,
# but in the case of transfer learning, we start from given weights
# comparing the outputs with the targets,
# we have labels, calculating the loss
# then we adjust the weights to improve our prediction
# to find how to change the weight, we use gradient descent
# the learning rate is the step size

# MNIST Loss Function

In [None]:
# convert independent variable from rank 3 (list of matrices)
# to rank 2 (list of vectors) using view; -1 means "make this axis
# as big as necessary to fit all data"
# view is a reshape method
train_x = torch.cat([stacked_threes, stacked_sevens]).view(-1, 28*28)

In [None]:
train_x.ndim

In [None]:
train_x.shape

In [None]:
train_x

In [None]:
len(three)

In [None]:
# train_y contain labels (Dependent)
# train_x vector of images (Independent)
# unsqueeze(1) converts the vector to a single-column matrix
train_y = tensor([1]*len(three) + [0]*len(seven)).unsqueeze(1)
train_x.shape, train_y.shape

In [None]:
train_y

In [None]:
train_y[6131], train_y[6130]

In [None]:
train_y.shape

In [None]:
len(seven)+len(three)

In [None]:
28*28

In [None]:
tensor([1]*len(three) + [0]*len(seven))

In [None]:
tensor([1]*len(three) + [0]*len(seven)).unsqueeze(1)

In [None]:
# zip parallel iterations
dset = list(zip(train_x, train_y))
x, y = dset[0]
x.shape, y

In [None]:
# same as above, but for validation set
valid_x = torch.cat([valid_3_tens, valid_7_tens]).view(-1, 28*28)
valid_y = tensor([1]*len(valid_3_tens) + [0]*len(valid_7_tens)).unsqueeze(1)
valid_dset = list(zip(valid_x, valid_y))

In [None]:
#step 1: init
# std => standard deviation of the random initial values
def init_params(size, std=1.0):
    """Create a gradient-tracking tensor of random initial parameters.

    Values are drawn with `torch.randn(size)` and multiplied by `std`;
    the result has `requires_grad` switched on.
    """
    scaled_noise = torch.randn(size) * std
    return scaled_noise.requires_grad_()
weights = init_params((28*28, 1))

In [None]:
bias = init_params(1)

In [None]:
bias

In [None]:
# similar to y=mx+b, prediction = weights * train_x + bias
# weights and bias are the parameters
(train_x[0]*weights.T).sum() + bias

In [None]:
(train_x[0]*weights).sum() + bias

In [None]:
# matrix multiplication

In [None]:
def linear1(xb):
    """Linear model: matrix-multiply the batch `xb` by `weights`, then add `bias`.

    `weights` and `bias` are the notebook-level parameter tensors.
    """
    activations = torch.matmul(xb, weights)
    return activations + bias
preds = linear1(train_x)
preds

In [None]:
weights.T.shape

In [None]:
train_x[0].shape, train_x.shape, preds.shape

In [None]:
corrects = (preds>0.0).float() == train_y
corrects

In [None]:
corrects.float().mean().item()

In [None]:
trgts = tensor([1,0,1])
prds = tensor([0.9, 0.4, 0.2])

In [None]:
def mnist_loss(predictions, targets):
    """Mean distance of each prediction from its target.

    Where the target is 1 the per-item loss is `1 - prediction`;
    elsewhere it is the prediction itself. The per-item values are
    averaged into a single scalar.
    """
    is_positive = targets == 1
    per_item = torch.where(is_positive, 1 - predictions, predictions)
    return per_item.mean()

In [None]:
# help(torch.where)

$$\text{out}_i = \begin{cases}
\text{x}_i & \text{if } \text{condition}_i \\
\text{y}_i & \text{otherwise} \\
\end{cases}$$

In [None]:
torch.where(trgts==1 , 1-prds, prds)

In [None]:
mnist_loss(tensor([0.9, 0.4, 0.8]), trgts)

## Sigmoid

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise to a tensor."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator

In [None]:
torch.exp(tensor([1.])), math.e

In [None]:
plot_function(torch.sigmoid, title='Sigmoid', min=-4, max=4)

In [None]:
def mnist_loss(predictions, targets):
    """Mean distance of each sigmoid-squashed prediction from its target.

    `predictions` are first passed through a sigmoid. Where the target
    is 1 the per-item loss is `1 - p`; elsewhere it is `p`. The per-item
    values are averaged into a single scalar.
    """
    probs = torch.sigmoid(predictions)
    per_item = torch.where(targets == 1, 1 - probs, probs)
    return per_item.mean()

In [None]:
# shuffling and mini-batch collation
coll = range(15)
dl = DataLoader(coll, batch_size=5, shuffle=True)
list(dl)

In [None]:
# example of a dataset
ds = L(enumerate(string.ascii_lowercase))
ds

In [None]:
dl = DataLoader(ds, batch_size=6, shuffle=True)
list(dl)

## Putting It All Together

In [None]:
# for x, y in dl:
#     pred = model(x)
#     loss = loss_func(pred, y)
#     loss.backward()
#     parameters -= parameters.grad * lr

In [None]:
weights = init_params((28*28, 1))
bias = init_params(1)
weights.shape, bias

In [None]:
# matrix multiplication: first matrix columns should equal second matrix rows
# first returns the first batch
dl = DataLoader(dset, batch_size=256)
xb, yb = first(dl)
xb.shape, yb.shape

In [None]:
valid_dl = DataLoader(valid_dset, batch_size=256)

In [None]:
batch = train_x[:4]
batch.shape

In [None]:
preds = linear1(batch)
preds

In [None]:
loss = mnist_loss(preds, train_y[:4])
loss

In [None]:
loss.backward()
weights.grad.shape, weights.grad.mean(), bias.grad

In [None]:
# put all in a function
# init - [predict - loss - gradient] - step - stop
# step - {repeat} - predict
def calc_grad(xb, yb, model):
    """Compute the `mnist_loss` of `model` on one batch and back-propagate it.

    Nothing is returned; the effect is the `backward()` call on the
    batch loss.
    """
    batch_loss = mnist_loss(model(xb), yb)
    batch_loss.backward()

In [None]:
calc_grad(batch, train_y[:4], linear1)
weights.grad.mean(), bias.grad

In [None]:
# different gradients: this is a problem, and it happens because
# backward() adds the newly calculated gradients to the existing ones
calc_grad(batch, train_y[:4], linear1)
weights.grad.mean(), bias.grad

In [None]:
# solution set current gradients to 0
# _ means in-place operation
weights.grad.zero_()
bias.grad.zero_();

In [None]:
def train_epoch(model, lr, params):
    """Train `model` for one pass over the notebook-level DataLoader `dl`.

    For every batch the gradients are computed with `calc_grad`, then
    each tensor in `params` is moved by `grad * lr` and its gradient is
    zeroed in place.
    """
    for inputs, labels in dl:
        calc_grad(inputs, labels, model)
        for param in params:
            param.data -= param.grad * lr
            # Reset in place so the next batch does not add onto this gradient.
            param.grad.zero_()

In [None]:
(preds>0.0).float() == train_y[:4]

In [None]:
# calculate the accuracy of the batch
def batch_accuracy(xb, yb):
    """Fraction of items in a batch that are classified correctly.

    `xb` holds raw model outputs: they are passed through a sigmoid and
    thresholded at 0.5, then compared element-wise with the labels `yb`.
    """
    probs = torch.sigmoid(xb)
    hits = probs.gt(0.5).eq(yb)
    return hits.float().mean()

In [None]:
batch_accuracy(linear1(batch), train_y[:4])

In [None]:
# stack the batches together
def validation_epoch(model):
    """Average `batch_accuracy` of `model` over the batches of `valid_dl`.

    Each batch contributes equally to the average; the result is a
    Python float rounded to four decimal places.
    """
    per_batch = []
    for xb, yb in valid_dl:
        per_batch.append(batch_accuracy(model(xb), yb))
    mean_acc = torch.stack(per_batch).mean().item()
    return round(mean_acc, 4)

In [None]:
validation_epoch(linear1)

In [None]:
# train for 1 epoch
lr = 1.
params = weights, bias
train_epoch(linear1, lr, params)
validation_epoch(linear1)

In [None]:
for i in range (20):
    train_epoch(linear1, lr, params)
    print(validation_epoch(linear1), end=' ')

# Creating an optimizer

In [None]:
linear_model = nn.Linear(28*28, 1)

In [None]:
# ??nn.Linear

In [None]:
w, b = linear_model.parameters()
w.shape, b.shape

In [None]:
class BasicOptim:
    """Minimal SGD optimizer: `step` applies ``p -= p.grad * lr`` to each parameter.

    Args:
        params: iterable of parameter tensors (a generator such as
            ``model.parameters()`` is fine).
        lr: learning rate used for every step.
    """
    def __init__(self,params,lr):
        # Materialize `params` so a one-shot generator can be walked on every step.
        self.params,self.lr = list(params),lr

    def step(self, *args, **kwargs):
        """Update every parameter that currently has a gradient."""
        for p in self.params:
            # A parameter has no gradient before the first backward(), right
            # after zero_grad(), or when it did not take part in the loss;
            # skip it instead of failing on `None.data`.
            if p.grad is None:
                continue
            p.data -= p.grad.data * self.lr

    def zero_grad(self, *args, **kwargs):
        """Drop the stored gradients so the next backward() starts fresh."""
        for p in self.params:
            p.grad = None

In [None]:
opt = BasicOptim(linear_model.parameters(), lr)

In [None]:
def train_epoch(model):
    """Train `model` for one pass over `dl` using the notebook-level optimizer `opt`.

    Per batch: compute gradients with `calc_grad`, let `opt` take a
    step, then clear the gradients through `opt.zero_grad()`.
    """
    for batch_x, batch_y in dl:
        calc_grad(batch_x, batch_y, model)
        opt.step()
        opt.zero_grad()

In [None]:
validation_epoch(linear_model)

In [None]:
def train_model(model, epochs):
    """Train `model` for `epochs` epochs, printing validation accuracy after each.

    Accuracies are printed on one line, separated by spaces.
    """
    for _ in range(epochs):
        train_epoch(model)
        epoch_acc = validation_epoch(model)
        print(epoch_acc, end=' ')

In [None]:
train_model(linear_model, 20)

In [None]:
# fastai SGD as an alternative to BasicOptim, a.k.a. refactoring
linear_model = nn.Linear(28*28, 1)
opt = SGD(linear_model.parameters(), lr)
train_model(linear_model, 20)

## beginning custom class

In [None]:
class CustomLinear(nn.Module):
    """Hand-written linear layer computing ``xb @ w + b``.

    Args:
        w: number of input features (rows of the weight matrix).
        b: number of output features (columns of the weight matrix and
            length of the bias).
    """
    def __init__(self, w, b):
        super().__init__()
        # The weight has to be a 2-D (inputs, outputs) matrix. With a 1-D
        # weight of length `w`, forward() returns shape (batch,), which then
        # broadcasts against (batch, 1) labels into a (batch, batch) grid and
        # silently corrupts both the loss and the accuracy.
        self.w = nn.Parameter(torch.randn(w, b))
        self.b = nn.Parameter(torch.randn(b))

    def forward(self, xb):
        """Map a (batch, inputs) tensor to (batch, outputs)."""
        return xb@self.w + self.b

In [None]:
model = CustomLinear(28*28, 1)
opt = SGD(model.parameters(), lr)
train_model(model, 20)

## end of custom class

In [None]:
# pass data loaders .train and .valid
dls = DataLoaders(dl, valid_dl)

In [None]:
# learner class: data loader + model + optimization function + loss function + metrics
learn = Learner(dls, nn.Linear(28*28, 1), opt_func=SGD, loss_func=mnist_loss, metrics=batch_accuracy)

In [None]:
learn.fit(10, lr=lr)

# Adding a Nonlinearity

In [None]:
def simple_net(xb):
    """Two linear layers with an element-wise max-with-zero in between.

    Uses the notebook-level parameters `w1`, `b1`, `w2` and `b2`.
    """
    hidden = xb@w1 + b1
    # Element-wise maximum with 0 replaces every negative activation by zero.
    activated = hidden.max(tensor(0.0))
    return activated@w2 + b2

In [None]:
# w1 has 30 output activations
# 28*28 is the number of weight rows, matching the number of pixels per image
# so w2 must take 30 input activations
w1 = init_params((28*28, 30))
b1 = init_params(30)
w2 = init_params((30, 1))
b2 = init_params(1)

In [None]:
plot_function(F.relu)

In [None]:
# linear layer
# nonlinearity known as activation function
# linear layer
simple_net = nn.Sequential(
    nn.Linear(28*28, 30),
    nn.ReLU(),
    nn.Linear(30, 1)
)

In [None]:
learn = Learner(dls, simple_net, opt_func=SGD,
               loss_func=mnist_loss, metrics=batch_accuracy)

In [None]:
learn.fit(40, 0.1)

In [None]:
plt.plot(L(learn.recorder.values).itemgot(2));

In [None]:
learn.recorder.values[-1][2]

In [None]:
m = learn.model
m

In [None]:
w, b = m[0].parameters()

In [None]:
w.shape

In [None]:
show_image(w[0].view(28, 28));

In [None]:
# Rebuild `dls` directly from the dataset folder and rebind `learn` to a
# resnet18-based learner created with pretrained=False, using cross-entropy
# loss and the `accuracy` metric; then train for 1 epoch via fit_one_cycle
# with 0.1 as the second argument.
# NOTE(review): how from_folder derives the train/valid split and labels
# from `path` is not visible here — confirm against the fastai docs.
dls = ImageDataLoaders.from_folder(path)
learn = cnn_learner(dls, resnet18, pretrained=False,
                   loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(1, 0.1)