<a href="https://colab.research.google.com/github/alfazick/AppliedLLMCourse/blob/main/Module3part1LowRankAdaptation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Plan: Low-Rank Adaptation (LoRA)

# 1) Matrix Multiplication in NN


# 2) Motivation: Why LoRA

# 3) Idea of low rank matrices

# Paper: https://arxiv.org/abs/2012.13255
"Intrinsic Dimensionality Explains the Effectiveness of Language Model Fine-Tuning"

# 4) Singular Value Decomposition SVD Intuition


# 5) LoRA Paper - Connecting SVD to Practice
# Paper: https://arxiv.org/abs/2106.09685
"LoRA: Low-Rank Adaptation of Large Language Models"

# 6) Choosing Rank r and alpha
# 7) Implementation Trick, Why You don't need a FULL Matrix


# 8) Practical Assignment: PEFT Library
# https://huggingface.co/docs/peft/en/quicktour
# a) fine-tune using PEFT for one personality (get a book from Project Gutenberg)
# b) create a multi-character web service where a person can choose
# which character to talk to; use a larger base model and switch adapters
# under the hood


In [9]:
# 1) Matrix Multiplication
import torch

# Fix the RNG seed so the random weights (and the printed results) are repeatable
torch.manual_seed(97)

# Inputs: one sample with 6 features, and a 6x4 matrix of standard-normal weights
x = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]])  # shape [1, 6]
W = torch.randn((6, 4))                              # shape [6, 4]

# Method 1: Pure Python

def matrix_multiply(A, B):
    """Multiply two matrices given as nested Python lists.

    A is treated as an (n x m) list of rows and B as an (m x p) list of rows;
    the dimensions are read from the outer list and from the first row of each.

    Returns the (n x p) product as a new list of lists. If A's column count
    differs from B's row count, a message is printed and None is returned.
    """
    n_rows, shared, b_rows, n_cols = len(A), len(A[0]), len(B), len(B[0])

    # Guard: the inner dimensions must agree for the product to be defined.
    if shared != b_rows:
        print("Internal Dimension should Match!!!")
        return None

    product = []
    for i in range(n_rows):
        out_row = []
        for j in range(n_cols):
            # Dot product of row i of A with column j of B, accumulated
            # left to right starting from 0.
            acc = 0
            for k in range(shared):
                acc += A[i][k] * B[k][j]
            out_row.append(acc)
        product.append(out_row)

    return product

# Method 1 in action: hand the tensors over as nested lists, multiply them with
# the pure-Python routine, then wrap the result back into a tensor.
mat = matrix_multiply(x.tolist(), W.tolist())
result_python = torch.tensor(mat)

# Method 2: the same product computed by PyTorch (what the @ operator does)
result_pytorch = torch.matmul(x, W)

# Print both so the two results can be compared line by line.
print("Pure Python:  ", result_python)
print("PyTorch @:    ", result_pytorch)

Pure Python:   tensor([[25.5227, -5.0256,  2.5722, 10.9780]])
PyTorch @:     tensor([[25.5227, -5.0256,  2.5722, 10.9780]])


In [10]:
# ok now let's look at Neural Network Linear Layer

def matrix_multiply(A, B):
    """Pure-Python matrix product of two nested lists (lists of rows).

    Same routine as in the matrix-multiplication cell, repeated here so this
    cell is self-contained. Sizes are taken from the outer lists and from the
    first row of each argument.

    Returns a new list of lists holding A times B, or prints a message and
    returns None when A's column count does not equal B's row count.
    """
    outer_row, inner_col = len(A), len(A[0])
    inner_row, outer_col = len(B), len(B[0])

    if inner_col != inner_row:
        print("Internal Dimension should Match!!!")
        return None

    matrix = []
    for i in range(outer_row):
        new_row = []
        for j in range(outer_col):
            # Accumulate row-i-of-A dot column-j-of-B, starting from 0.
            total = 0
            for k in range(inner_col):
                total += A[i][k] * B[k][j]
            new_row.append(total)
        matrix.append(new_row)

    return matrix

def add_bias(matrix,bias):
    """Add a bias vector to every row of `matrix`, in place.

    bias[c] is added to column c of each row; the number of columns is taken
    from the first row. The same (now modified) matrix object is returned.
    """
    n_cols = len(matrix[0])

    for row in matrix:
        for c in range(n_cols):
            row[c] += bias[c]

    return matrix

def relu(matrix):
    """Apply ReLU element-wise, in place.

    Every entry that is not strictly greater than 0 is replaced by 0; other
    entries are left as they are. The number of columns is taken from the
    first row, and the same (now modified) matrix object is returned.
    """
    n_cols = len(matrix[0])

    for row in matrix:
        for c in range(n_cols):
            value = row[c]
            row[c] = value if value > 0 else 0

    return matrix


import random
import math

class ManualLinear:
    """Pure-Python linear layer computing ``x @ W`` plus an optional bias.

    Attributes:
        in_features: number of input features (rows of ``weight``).
        out_features: number of output features (columns of ``weight``).
        bias_enabled: the ``bias`` flag passed to the constructor.
        weight: nested list laid out as [in_features][out_features]. This is
            the transpose of torch.nn.Linear's weight, which is why the
            comparison cell copies ``W.T`` into the torch layer.
        bias: list of length ``out_features``, or None when bias is disabled.
    """

    def __init__(self,in_features,out_features,bias=True,seed=None):
        """Create the layer with every parameter initialised to zero.

        Args:
            in_features: size of each input row.
            out_features: size of each output row.
            bias: when falsy, no bias vector is created (``self.bias`` is None).
            seed: kept so existing calls that pass it still work; currently
                unused because the zero initialisation draws no random numbers.
        """
        self.in_features = in_features
        self.out_features = out_features
        self.bias_enabled = bias

        # (simple version) all weights and biases start at 0.0
        self.weight = [[0.0 for _ in range(out_features)]
                       for _ in range(in_features)]
        self.bias = [0.0 for _ in range(out_features)] if bias else None


    def forward(self,x):
        """Apply the layer to one sample or to a batch of samples.

        Args:
            x: either a flat list of numbers (a single sample) or a list of
                such lists (a batch). The first element decides which case it is.

        Returns:
            A flat list for a single sample, otherwise a list of rows.
            The multiplication and bias addition are delegated to the
            module-level ``matrix_multiply`` and ``add_bias`` helpers.
        """
        if isinstance(x[0], (int,float)):
            # A flat vector: wrap it so the helpers always see a 2-D batch.
            x_batch = [x]
            single_input = True
        else:
            x_batch = x
            single_input = False

        # x @ W :))
        out = matrix_multiply(x_batch,self.weight)

        if self.bias is not None:
            out = add_bias(out,self.bias)

        if single_input:
            # Undo the wrapping so the caller gets back the shape it passed in.
            return out[0]
        return out


    __call__ = forward # so you can do layer(x) like in PyTorch

    def __repr__(self):
        """Return a PyTorch-style summary of the layer's configuration."""
        return f"ManualLinear(in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None})"



In [20]:
# --- Manual layer: reuse the 6x4 W and the 1x6 x from the matrix-multiplication cell.
# NOTE: a later cell rebinds W to a 100x100 matrix, so run this cell before that one.
layer = ManualLinear(in_features=6, out_features=4, bias=True)
layer.weight = W.tolist()  # already in the [in_features][out_features] layout
x_list = x.tolist()
y_manual = layer.forward(x_list)
y_manual_tensor = torch.tensor(y_manual)

print("y_manual matrix ", y_manual_tensor)


# --- Reference layer: torch.nn.Linear needs the transpose of W, and its bias is
# zeroed so it matches the manual layer's all-zero bias.
linear = torch.nn.Linear(in_features=6, out_features=4, bias=True)

with torch.no_grad():
    linear.bias.zero_()
    linear.weight.copy_(W.t())

y_linear = linear(x)

print("y_nn.linear.    ", y_linear)
print("equal?", torch.allclose(y_manual_tensor, y_linear))


y_manual matrix  tensor([[25.5227, -5.0256,  2.5722, 10.9780]])
y_nn.linear.     tensor([[25.5227, -5.0256,  2.5722, 10.9780]], grad_fn=<AddmmBackward0>)
equal? True


In [22]:
# 2) Motivation: Why LoRA
# Why not full fine-tuning, as you did before?

# Problem #1) You have to update billions of weights,
# and on top of the weights themselves the optimizer keeps even more state per parameter,
# for example AdamW

# Problem #2) Rigid structure and heavy models
# Imagine you have multiple problems for which you want to fine-tune a model.
# Even if you have enough resources to fine-tune all of them,
# you still have the problem of how quickly you can switch the underlying model
# at inference time.
# Say your base model has 7 billion parameters: are you going to swap out and reload
# each of your fine-tuned models every time?

# One of the creators of LoRA
# https://www.youtube.com/watch?v=DhRoTONcyZE
# explains the motivation behind the task he was solving at Microsoft

# Problem #3) "Catastrophic forgetting"



In [26]:
# 3) Idea of low-rank matrices
# In general, the idea is that a big matrix
# can be represented as a product of two smaller matrices
# with no (or only minimal) loss of information


import torch
torch.manual_seed(0)  # reproducible A and B

# Outer dimensions of the big matrix, and the shared inner dimension
# that limits the rank of the product.
d = k = 100
r = 8

# The two thin factors (A is drawn first, then B)
A = torch.randn((d, r))   # [100, 8]
B = torch.randn((r, k))   # [8, 100]

# The big matrix is fully determined by its two factors
W = torch.matmul(A, B)    # [100, 100]

print("A shape:", A.shape)
print("B shape:", B.shape)
print("W shape:", W.shape)
print("W:\n", W)

# So notice: if matrix W holds the actual weights of a model
# and we can find two good smaller matrices,
# we don't need to store 100*100 = 10000 elements;
# we can just keep a total of 800 + 800 = 1600 elements.
# Huge memory win, right?
# OK, now we are left with two questions:
# a) why should this even work for neural networks?
# b) where does the viability of this idea come from?

A shape: torch.Size([100, 8])
B shape: torch.Size([8, 100])
W shape: torch.Size([100, 100])
W:
 tensor([[ 5.3685, -1.0868,  2.2387,  ..., -0.6155,  2.4315, -3.3885],
        [ 3.1883,  0.5887,  2.6259,  ..., -0.9715,  0.0449, -0.7832],
        [ 1.6745,  3.8639, -4.5289,  ...,  2.7822, -7.9605,  4.1718],
        ...,
        [-3.9455,  1.7957, -3.1901,  ...,  1.0807, -0.5203,  0.9297],
        [ 1.7443, -3.5769,  0.9222,  ..., -2.4217,  3.5732, -2.7722],
        [ 0.1791,  1.8260,  1.1715,  ..., -0.4111, -1.5972,  1.5057]])


In [28]:
# a) from paper
# https://arxiv.org/pdf/2012.13255
# I love it when authors write like this
"""Why can we use relatively vanilla gradient descent algorithms (e.g., without strong regularization)
to tune a model with hundreds of millions of parameters on datasets
with only hundreds or thousands of labeled examples? """

# claim

"""We empirically show that common pre-trained models have a very low intrinsic dimension;
in other words, there exists a low dimension reparameterization that is
as effective for fine-tuning as the full parameter space."""

# So try to connect this to our example above:
# if the matrix W is low rank, then instead of storing all of its weights
# we can store them in the form of two smaller matrices



'We empirically show that common pre-trained models have a very low intrinsic dimension; \nin other words, there exists a low dimension reparameterization that is\nas effective for fine-tuning as the full parameter space.'

In [37]:
# 4) Singular Value Decomposition SVD Intuition
# https://www.youtube.com/watch?v=02QCtHM1qb4&list=PLMrJAkhIeNNSVjnsviglFoY2nXildDCcv
# :) videos to watch while falling asleep; I call it "hibernating learning"


# OK, so now we are left with the question:
# what should those two smaller matrices look like?
# Funnily enough, there is an algorithm that finds exactly that

# SVD = Singular Value Decomposition

# Full SVD of W, the 100x100 matrix built in the low-rank cell as A @ B.
# Per the torch.linalg.svd documentation, W = U @ diag(S) @ Vh.
U,S,Vh = torch.linalg.svd(W)
print("U shape:", U.shape)    # [100,100]
print("S shape:", S.shape)    # [100]  (singular values as a 1-D vector)
print("Vh shape:", Vh.shape)  # [100,100]

# Here come our two smaller matrices A and B, read off from the SVD factors

r = 8 # how many leading singular values/vectors to keep (same inner dim used to build W)

U_r = U[:,:r] # [100,r]  first r columns of U
S_r = S[:r] # [r]  first r singular values
Vh_r = Vh[:r,:] # [r,100]  first r rows of Vh

# Broadcasting scales column j of U_r by S_r[j], i.e. this is U_r @ diag(S_r)
# without building the diagonal matrix.
A_svd = U_r * S_r
B_svd = Vh_r

W_approx = A_svd @ B_svd

# How close is this rank-8 approximation to the original W?
# W was constructed as a [100,8] @ [8,100] product, so its rank is at most 8
# and the remaining error should be at floating-point noise level.
rel_error = (W - W_approx).norm() / W.norm()
print("rank-8 relative error:", rel_error.item())
print("A_svd shape:", A_svd.shape)
print("B_svd shape:", B_svd.shape)

# So from the results you can see that SVD is literally an algorithm
# for constructing good "two smaller matrices"

U shape: torch.Size([100, 100])
S shape: torch.Size([100])
Vh shape: torch.Size([100, 100])
rank-8 relative error: 5.648137744174164e-07
A_svd shape: torch.Size([100, 8])
B_svd shape: torch.Size([8, 100])


In [40]:
# 5) LoRA Paper - Connecting SVD to Practice
# Paper: https://arxiv.org/abs/2106.09685
"LoRA: Low-Rank Adaptation of Large Language Models"

# OK, now here is the actual magic jump behind LoRA:
# what if, instead of computing the SVD of the full weight-update matrix (the gradient updates to W),
# we directly learn A and B during fine-tuning? :))
# Welcome to the world where everyone pushes in one direction

"""We propose Low-Rank Adaptation, or LoRA, which freezes the pretrained
model weights and injects trainable rank decomposition matrices into each
layer of the Transformer architecture,
greatly reducing the number of trainable parameters for downstream tasks"""

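# h = W0 x + ΔW x = W0 x + B A x   # slow down and read out loud
# (note: in the paper B is [d, r] and A is [r, k], the reverse of the A @ B naming used in the cells above)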



'We propose Low-Rank Adaptation, or LoRA, which freezes the pretrained \nmodel weights and injects trainable rank decomposition matrices into each\nlayer of the Transformer architecture, \ngreatly reducing the number of trainable parameters for downstream tasks'