In [6]:
import os
import pickle
import random
import time
from time import time
import typing
from typing import Union

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torch.nn import functional as F
from torch.nn.functional import mse_loss, l1_loss, binary_cross_entropy, cross_entropy
from torch.optim import Optimizer



from tqdm import tqdm

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

device(type='cuda')

In [4]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.17.3-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: humanfriendly, coloredlogs, onnxruntime
Successfully installed coloredlogs-15.0.1 humanfriendly-10.0 onnxruntime-1.17.3


In [None]:
import onnxruntime as ort
from collections import deque

> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/tip.svg">
>   <img alt="Tip" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/tip.svg">
> </picture><br>
>
> ### Helpers & middlewares to measure metrics

In [8]:
class FpsWrapper:
    """ Decorator to calculate the frames per second of a function
    """
    def __init__(self, func: typing.Callable):
        self.func = func
        self.fps_list = deque([], maxlen=100)

    def __call__(self, *args, **kwargs):
        start = time.time()
        results = self.func(self.instance, *args, **kwargs)
        self.fps_list.append(1 / (time.time() - start))
        self.instance.fps = np.mean(self.fps_list)
        return results

    def __get__(self, instance, owner):
        self.instance = instance
        return self.__call__.__get__(instance, owner)


class OnnxInferenceModel:
    """ Base class for all inference models that use onnxruntime

    Attributes:
        model_path (str, optional): Path to the model folder. Defaults to "".
        force_cpu (bool, optional): Force the model to run on CPU or GPU. Defaults to GPU.
        default_model_name (str, optional): Default model name. Defaults to "model.onnx".
    """
    def __init__(
        self,
        model_path: str = "",
        force_cpu: bool = False,
        default_model_name: str = "model.onnx",
        *args, **kwargs
        ):
        self.model_path = model_path.replace("\\", "/")
        self.force_cpu = force_cpu
        self.default_model_name = default_model_name

        # check if model path is a directory with os path
        if os.path.isdir(self.model_path):
            self.model_path = os.path.join(self.model_path, self.default_model_name)

        if not os.path.exists(self.model_path):
            raise Exception(f"Model path ({self.model_path}) does not exist")

        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] if ort.get_device() == "GPU" and not force_cpu else ["CPUExecutionProvider"]

        self.model = ort.InferenceSession(self.model_path, providers=providers)

        self.metadata = {}
        if self.model.get_modelmeta().custom_metadata_map:
            # add metadata to self object
            for key, value in self.model.get_modelmeta().custom_metadata_map.items():
                try:
                    new_value = eval(value) # in case the value is a list or dict
                except:
                    new_value = value
                self.metadata[key] = new_value

        # Update providers priority to only CPUExecutionProvider
        if self.force_cpu:
            self.model.set_providers(["CPUExecutionProvider"])

        self.input_shapes = [meta.shape for meta in self.model.get_inputs()]
        self.input_names = [meta.name for meta in self.model._inputs_meta]
        self.output_names = [meta.name for meta in self.model._outputs_meta]

    def predict(self, data: np.ndarray, *args, **kwargs):
        raise NotImplementedError

    @FpsWrapper
    def __call__(self, data: np.ndarray):
        results = self.predict(data)
        return results

In [9]:
class PerformanceEvaluator:
    def __init__(self, model, dataset, device=None, batch_size=32):
        self.model = model.model if hasattr(model, 'model') else model
        self.dataset = dataset
        self.batch_size = batch_size
        self.data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        self.device = device
        self.model.to(self.device)

        # Measured performance metrics
        self.latency = None
        self.throughput = None
        self.model_size = None
        self.target_metrics = None

    def eval(self):

        result = dict(
            latency=self.measure_latency(),
            throughput=self.measure_throughput(),
            model_size=self.measure_model_size(),
        )

        self.report()
        return result

    def measure_latency(self, reps: int = 50):
        timings = np.zeros((reps, 1))
        if torch.cuda.is_available():
            self.warm_up_cuda()
        with torch.no_grad():
            with tqdm(total=reps, desc='Measuring latency', unit='rep') as pbar:
                for rep in range(reps):
                    for inputs, _ in self.data_loader:
                        start_time = time.time()
                        _ = self.model(inputs.to(self.device))
                        end_time = time.time()
                        if torch.cuda.is_available():
                            torch.cuda.synchronize()
                        curr_time = (end_time - start_time) * 1000
                        timings[rep] = curr_time / inputs.size(0)
                        break
                    pbar.update(1)
        self.latency = round(np.mean(timings) / reps, 5)
        return self.latency

    def measure_throughput(self, batches: int = 5):
        total_data_size = 0
        start_time = time.time()
        # measure for n batches
        with torch.no_grad():
            with tqdm(total=batches, desc='Measuring throughput', unit='batch') as pbar:
                for inputs, _ in self.data_loader:
                    inputs = inputs.to(self.device)
                    if batches == 0:
                        break
                    total_data_size += inputs.size(0)
                    _ = self.model(inputs)
                    batches -= 1
                    pbar.update(1)
        if self.device == 'cuda':
            torch.cuda.synchronize()
        total_time = (time.time() - start_time) / 1000
        self.throughput = round(total_data_size / total_time, 0)
        return self.throughput

    def measure_model_size(self):
        if isinstance(self.model, OnnxInferenceModel): # ONNXInferenceModel
            size_all_mb = round(self.model.size(), 3) / 1024 ** 2
        else:
            param_size = 0
            for param in self.model.parameters():
                param_size += param.nelement() * param.element_size()
            buffer_size = 0
            for buffer in self.model.buffers():
                buffer_size += buffer.nelement() * buffer.element_size()

            size_all_mb = (param_size + buffer_size) / 1024 ** 2
        self.model_size = round(size_all_mb, 3)
        return self.model_size

    def warm_up_cuda(self, num_iterations=10):
        """Warm up CUDA by performing some dummy computations"""
        if torch.cuda.is_available():
            for _ in range(num_iterations):
                inputs, _ = next(iter(self.data_loader))
                inputs = inputs.to(self.device)
                _ = self.model(inputs)

    def report(self):
        print(f"Latency: {self.latency} ms/sample with batch_size {self.batch_size}")
        print(f"Throughput: {self.throughput} samples/s with batch_size {self.batch_size}")
        print(f"Model size: {self.model_size} MB")

In [10]:
names = ["year", "month", "day", "dec_year", "sn_value", "sn_error", "obs_num", "unused1"]
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/SN_d_tot_V2.0.csv",
    sep=";",
    header=None,
    names=names,
    na_values=["-1"],
    index_col=False
)

In [11]:
# Data Preprocessing
start_id = max(df[df["obs_num"] == 0].index.tolist()) + 1
df = df[start_id:].copy()
df["sn_value"] = df["sn_value"].astype(float)
df_train = df[df["year"] < 2000]
df_test = df[df["year"] >= 2000]

spots_train = df_train["sn_value"].to_numpy().reshape(-1, 1)
spots_test = df_test["sn_value"].to_numpy().reshape(-1, 1)

scaler = StandardScaler()
spots_train = scaler.fit_transform(spots_train).flatten().tolist()
spots_test = scaler.transform(spots_test).flatten().tolist()

In [12]:
# Sequence Data Preparation
SEQUENCE_SIZE = 10

def to_sequences(seq_size, obs):
    x = []
    y = []
    for i in range(len(obs) - seq_size):
        window = obs[i:(i + seq_size)]
        after_window = obs[i + seq_size]
        x.append(window)
        y.append(after_window)
    return torch.tensor(x, dtype=torch.float32).view(-1, seq_size, 1), torch.tensor(y, dtype=torch.float32).view(-1, 1)

x_train, y_train = to_sequences(SEQUENCE_SIZE, spots_train)
x_test, y_test = to_sequences(SEQUENCE_SIZE, spots_test)

# Setup data loaders for batch
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/check.svg">
>   <img alt="Check" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/check.svg">
> </picture><br>
>
> ## Transformer architecture No Lora

In [13]:
# Positional Encoding for Transformer
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Model definition using Transformer
class TransformerModel(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=4, num_layers=2, dropout=0.2):
        super(TransformerModel, self).__init__()

        self.encoder = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])
        return x

In [12]:
model = TransformerModel().to(device)



In [13]:
# Train the model
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3, verbose=True)

epochs = 1000
early_stop_count = 0
min_val_loss = float('inf')

for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        x_batch, y_batch = batch
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in test_loader:
            x_batch, y_batch = batch
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            val_losses.append(loss.item())

    val_loss = np.mean(val_losses)
    scheduler.step(val_loss)

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        early_stop_count = 0
    else:
        early_stop_count += 1

    if early_stop_count >= 5:
        print("Early stopping!")
        break
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")




Epoch 1/1000, Validation Loss: 0.0499
Epoch 2/1000, Validation Loss: 0.0418
Epoch 3/1000, Validation Loss: 0.0467
Epoch 4/1000, Validation Loss: 0.0420
Epoch 5/1000, Validation Loss: 0.0574
Epoch 6/1000, Validation Loss: 0.0387
Epoch 7/1000, Validation Loss: 0.0369
Epoch 8/1000, Validation Loss: 0.0354
Epoch 9/1000, Validation Loss: 0.0366
Epoch 10/1000, Validation Loss: 0.0414
Epoch 11/1000, Validation Loss: 0.0357
Epoch 12/1000, Validation Loss: 0.0403
Early stopping!


> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/example.svg">
>   <img alt="Example" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/example.svg">
> </picture><br>
>
> ### Measure of metrics no lora metrics

In [14]:
# Evaluation
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        x_batch, y_batch = batch
        x_batch = x_batch.to(device)
        outputs = model(x_batch)
        predictions.extend(outputs.squeeze().tolist())

rmse = np.sqrt(np.mean((scaler.inverse_transform(np.array(predictions).reshape(-1, 1)) - scaler.inverse_transform(y_test.numpy().reshape(-1, 1)))**2))
print(f"Score (RMSE): {rmse:.4f}")

Score (RMSE): 15.0597


In [15]:
pfev_no_lora = PerformanceEvaluator(
    model=model,
    dataset=train_dataset,
    device=device,
)

pfev_no_lora.eval()

Measuring latency: 100%|██████████| 50/50 [00:00<00:00, 563.83rep/s]
Measuring throughput: 100%|██████████| 5/5 [00:00<00:00, 480.56batch/s]

Latency: 0.00085 ms/sample with batch_size 32
Throughput: 9306328.0 samples/s with batch_size 32
Model size: 3.366 MB





{'latency': 0.00085, 'throughput': 9306328.0, 'model_size': 3.366}

> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/solution.svg">
>   <img alt="Solution" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/solution.svg">
> </picture><br>
>
> ### LoRALayer implentation (same from 2.5 Colab)

In [16]:
class LoRALayer():
    def __init__(
        self,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool,
    ):
        self.r = r
        self.lora_alpha = lora_alpha
        # Optional dropout
        if lora_dropout > 0.:
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_dropout = lambda x: x
        # Mark the weight as unmerged
        self.merged = False
        self.merge_weights = merge_weights

class LinearLora(nn.Linear, LoRALayer):
    # LoRA implemented in a dense layer
    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.,
        fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        merge_weights: bool = True,
        **kwargs
    ):
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout,
                           merge_weights=merge_weights)

        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # initialize B the same way as the default for nn.Linear and A to zero
            # this is different than what is described in the paper but should not affect performance
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        nn.Linear.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
                self.merged = True

    def forward(self, x: torch.Tensor):
        def T(w):
            return w.transpose(0, 1) if self.fan_in_fan_out else w
        if self.r > 0 and not self.merged:
            result = F.linear(x, T(self.weight), bias=self.bias)
            result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1) @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return F.linear(x, T(self.weight), bias=self.bias)

In [None]:
> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/complete.svg">
>   <img alt="Complete" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/complete.svg">
> </picture><br>
>
> ### TransformerLora with encoder and decoder Lora Layers

In [17]:
# Model definition using TransformerLora
class TransformerLora(nn.Module):
    def __init__(self, input_dim=1, d_model=64, nhead=4, num_layers=2, dropout=0.2):
        super(TransformerLora, self).__init__()

        self.encoder = LinearLora(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.decoder = LinearLora(d_model, 1)

    def forward(self, x):
        x = self.encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        x = self.decoder(x[:, -1, :])
        return x

In [18]:
model_lora = TransformerLora().to(device)



In [19]:
# Train the model
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_lora.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3, verbose=True)

epochs = 1000
early_stop_count = 0
min_val_loss = float('inf')

for epoch in range(epochs):
    model_lora.train()
    for batch in train_loader:
        x_batch, y_batch = batch
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model_lora(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    # Validation
    model_lora.eval()
    val_losses = []
    with torch.no_grad():
        for batch in test_loader:
            x_batch, y_batch = batch
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model_lora(x_batch)
            loss = criterion(outputs, y_batch)
            val_losses.append(loss.item())

    val_loss = np.mean(val_losses)
    scheduler.step(val_loss)

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        early_stop_count = 0
    else:
        early_stop_count += 1

    if early_stop_count >= 5:
        print("Early stopping!")
        break
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")



Epoch 1/1000, Validation Loss: 0.0458
Epoch 2/1000, Validation Loss: 0.0378
Epoch 3/1000, Validation Loss: 0.0464
Epoch 4/1000, Validation Loss: 0.0390
Epoch 5/1000, Validation Loss: 0.0448
Epoch 6/1000, Validation Loss: 0.0458
Early stopping!


> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/example.svg">
>   <img alt="Example" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/example.svg">
> </picture><br>
>
> ### Measure of metrics with lora metrics

In [20]:
# Evaluation
model_lora.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        x_batch, y_batch = batch
        x_batch = x_batch.to(device)
        outputs = model_lora(x_batch)
        predictions.extend(outputs.squeeze().tolist())

rmse = np.sqrt(np.mean((scaler.inverse_transform(np.array(predictions).reshape(-1, 1)) - scaler.inverse_transform(y_test.numpy().reshape(-1, 1)))**2))
print(f"Score (RMSE): {rmse:.4f}")

Score (RMSE): 15.4552


In [21]:
pfev_lora = PerformanceEvaluator(
    model=model,
    dataset=train_dataset,
    device=device,
)

pfev_lora.eval()

Measuring latency: 100%|██████████| 50/50 [00:00<00:00, 567.24rep/s]
Measuring throughput: 100%|██████████| 5/5 [00:00<00:00, 404.15batch/s]

Latency: 0.00084 ms/sample with batch_size 32
Throughput: 9782205.0 samples/s with batch_size 32
Model size: 3.366 MB





{'latency': 0.00084, 'throughput': 9782205.0, 'model_size': 3.366}

> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/info.svg">
>   <img alt="Info" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/info.svg">
> </picture><br>
>
> #### Compare with Microsoft Lora Implementation same model

In [1]:
!pip install loralib
import loralib as lora

Collecting loralib
  Downloading loralib-0.1.2-py3-none-any.whl (10 kB)
Installing collected packages: loralib
Successfully installed loralib-0.1.2


In [29]:
model_microsoft = TransformerModel().to(device)

In [30]:
# Train the model
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_microsoft.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=3, verbose=True)

epochs = 1000
early_stop_count = 0
min_val_loss = float('inf')

for epoch in range(epochs):
    model_microsoft.train()
    for batch in train_loader:
        x_batch, y_batch = batch
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = model_microsoft(x_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

    # Validation
    model_microsoft.eval()
    val_losses = []
    with torch.no_grad():
        for batch in test_loader:
            x_batch, y_batch = batch
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            outputs = model_microsoft(x_batch)
            loss = criterion(outputs, y_batch)
            val_losses.append(loss.item())

    val_loss = np.mean(val_losses)
    scheduler.step(val_loss)

    if val_loss < min_val_loss:
        min_val_loss = val_loss
        early_stop_count = 0
    else:
        early_stop_count += 1

    if early_stop_count >= 5:
        print("Early stopping!")
        break
    print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}")

Epoch 1/1000, Validation Loss: 0.0365
Epoch 2/1000, Validation Loss: 0.0375
Epoch 3/1000, Validation Loss: 0.0427
Epoch 4/1000, Validation Loss: 0.0365
Epoch 5/1000, Validation Loss: 0.0416
Epoch 6/1000, Validation Loss: 0.0367
Epoch 7/1000, Validation Loss: 0.0389
Epoch 8/1000, Validation Loss: 0.0376
Early stopping!


In [31]:
lora.mark_only_lora_as_trainable(model_microsoft)

In [32]:
pfev_model_microsoft = PerformanceEvaluator(
    model=model_microsoft,
    dataset=train_dataset,
    device=device,
)

pfev_model_microsoft.eval()

Measuring latency: 100%|██████████| 50/50 [00:00<00:00, 558.69rep/s]
Measuring throughput: 100%|██████████| 5/5 [00:00<00:00, 448.08batch/s]

Latency: 0.00085 ms/sample with batch_size 32
Throughput: 9572622.0 samples/s with batch_size 32
Model size: 3.366 MB





{'latency': 0.00085, 'throughput': 9572622.0, 'model_size': 3.366}

In [33]:
lora.mark_only_lora_as_trainable(model_microsoft, bias='all')

In [34]:
pfev_model_microsoft_all = PerformanceEvaluator(
    model=model_microsoft,
    dataset=train_dataset,
    device=device,
)

pfev_model_microsoft_all.eval()

Measuring latency: 100%|██████████| 50/50 [00:00<00:00, 557.96rep/s]
Measuring throughput: 100%|██████████| 5/5 [00:00<00:00, 479.18batch/s]

Latency: 0.00085 ms/sample with batch_size 32
Throughput: 9463282.0 samples/s with batch_size 32
Model size: 3.366 MB





{'latency': 0.00085, 'throughput': 9463282.0, 'model_size': 3.366}

In [35]:
lora.mark_only_lora_as_trainable(model_microsoft, bias='lora_only')

In [36]:
pfev_model_microsoft_lora = PerformanceEvaluator(
    model=model_microsoft,
    dataset=train_dataset,
    device=device,
)

pfev_model_microsoft_lora.eval()

Measuring latency: 100%|██████████| 50/50 [00:00<00:00, 566.65rep/s]
Measuring throughput: 100%|██████████| 5/5 [00:00<00:00, 476.41batch/s]

Latency: 0.00084 ms/sample with batch_size 32
Throughput: 8586088.0 samples/s with batch_size 32
Model size: 3.366 MB





{'latency': 0.00084, 'throughput': 8586088.0, 'model_size': 3.366}

> <picture>
>   <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/light-theme/success.svg">
>   <img alt="Success" src="https://raw.githubusercontent.com/Mqxx/GitHub-Markdown/main/blockquotes/badge/dark-theme/success.svg">
> </picture><br>
>
> # Results

```
# Before
{'latency': 0.00085, 'throughput': 9306328.0, 'model_size': 3.366}
# Own Implementation
{'latency': 0.00084, 'throughput': 9782205.0, 'model_size': 3.366}
# Microsoft implementation default
{'latency': 0.00085, 'throughput': 9572622.0, 'model_size': 3.366}
# Microsoft implementation all
{'latency': 0.00085, 'throughput': 9463282.0, 'model_size': 3.366}
# Microsoft implementation lora_only
{'latency': 0.00084, 'throughput': 8586088.0, 'model_size': 3.366}
```
