In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd
from tqdm import tqdm

import random

# CSIC and Vocab Classes

In [None]:
import os
from typing import List
from collections import Counter

import torch
from torch.utils.data import Dataset

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel

from urllib import parse

class CSICDataset(Dataset):
    def __init__(self, df: pd.DataFrame, vocab=None, vocab_size=1000, min_frequency=1, special_tokens=["[UNK]","[CLS]","[PAD]"], tokenization_algorithm="bpe"):
        self.df = df

        # Export text content to csv for learning tokenization; apply BPE
        path = os.path.join('.', 'tokenization_input')

        self.df.to_csv(path_or_buf=path, columns=['content_for_tokenization'], index=False, header=False)

        if vocab == None:
            vocab = Vocab(vocab_size=vocab_size, min_frequency=min_frequency,
                           special_tokens=special_tokens,
                           tokenization_algorithm=tokenization_algorithm)
            vocab.build(corpus_files=[path])

        self.vocab = vocab

        self.encode_df()


    @staticmethod
    def process_df(df: pd.DataFrame):
        # Pre-process data by dropping rows without POST-Data or GET-Query
        get_mask, post_mask = df['GET-Query'].notna(), df['POST-Data'].notna()

        df.loc[get_mask,"content_for_tokenization"] = df.loc[get_mask,"GET-Query"]
        df.loc[post_mask,"content_for_tokenization"] = df.loc[post_mask,"POST-Data"]

        df = df[get_mask | post_mask]
        df = df.drop(columns=["GET-Query","POST-Data", "Accept-Charset", "Accept-Language", "Accept", "Cache-control", "Pragma", "Content-Type", "Host-Header", "Connection"])

        return df

    def encode_df(self):
        # Tokenize the GET-Query and POST-Data columns according to the subword vocabulary learned from BPE
        self.df["tokenized_ids"] = self.df["content_for_tokenization"].apply(lambda x: self.vocab.words2indices(x))
        self.df["tokenized"] = self.df["content_for_tokenization"].apply(lambda x: self.vocab.tokenize(x))
        self.df = self.df.drop(columns=["content_for_tokenization"])

        self.class_encoder, self.method_encoder = LabelEncoder(), LabelEncoder()
        self.df['Class'], self.df['Method'] = self.class_encoder.fit_transform(self.df['Class']), self.class_encoder.fit_transform(self.df['Method'])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = self.df.iloc[index].drop(['Class', 'User-Agent'])
        label = self.df.iloc[index]['Class']

        return features, label

In [None]:
class Vocab(object):
    def __init__(self, vocab_size=0, min_frequency=0, special_tokens: List[str]=[], unk_token="[UNK]", pad_token="[PAD]", tokenizer=None, tokenization_algorithm="bpe"):
        if tokenizer:
            self.tokenizer = tokenizer

            self.word2id = tokenizer.get_vocab()
            self.id2word = {v: k for k, v in self.word2id.items()}

            self.unk_id = self.word2id[unk_token]

        else:
            assert vocab_size > 0
            assert min_frequency > 0

            self.vocab_size = vocab_size
            self.min_frequency = min_frequency
            self.special_tokens = special_tokens
            self.unk_token = unk_token
            self.pad_token = pad_token
            self.tokenization_algorithm = tokenization_algorithm

    def build(self, corpus_files: List[str]):
        if self.tokenization_algorithm == 'bpe':
            tokenizer = Tokenizer(BPE(unk_token=self.unk_token))
            trainer = BpeTrainer(vocab_size=self.vocab_size, min_frequency=self.min_frequency, special_tokens=self.special_tokens)
            tokenizer.pre_tokenizer = ByteLevel()

            tokenizer.train(corpus_files, trainer)

            self.tokenizer = tokenizer
            self.word2id = tokenizer.get_vocab()
            self.id2word = {v: k for k, v in self.word2id.items()}

        elif self.tokenization_algorithm == 'vocab_map':
            self.word2id, self.id2word = dict(), dict()
            curr_id, self.min_frequency = 1,1
            counter = Counter()

            def add_to_vocab(token: str, ignore_cutoff=False):
                nonlocal curr_id
                if (ignore_cutoff or counter[token] >= self.min_frequency) and token not in self.word2id:
                    self.word2id[token] = curr_id
                    self.id2word[curr_id] = token

                    curr_id += 1

            for file_path in corpus_files:
                with open(file_path, 'r') as file:
                    for line in file:
                        tokens = Vocab.parse_req_body_or_params(line)
                        counter.update(tokens)

            unwanted_tokens = [' ','']
            for token in unwanted_tokens:
                if token in counter:
                    del counter[token]

            for token in self.special_tokens:
                add_to_vocab(token, ignore_cutoff=True)

            for token in set(counter.elements()):
                add_to_vocab(token)

        else:
            raise TypeError("Unsupported tokenization algorithm detected")

        self.unk_id = self.word2id[self.unk_token]
        self.pad_id = self.word2id[self.pad_token]

    def __getitem__(self, word):
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        return word in self.word2id

    def __setitem__(self, key, value):
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        return len(self.word2id)

    def __repr__(self):
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        return self.id2word[wid]

    def save(self, file_path):
        self.tokenizer.save(path=file_path)

    def words2indices(self, content):
        if self.tokenization_algorithm == 'bpe':
            if type(content) == list:
                return [self.tokenizer.encode(row).ids for row in content]
            else:
                return self.tokenizer.encode(content).ids

        elif self.tokenization_algorithm == 'vocab_map':
            if type(content) == list:
                return [[self[token] for token in Vocab.parse_req_body_or_params(line)] for line in content]
            else:
                return [self[token] for token in Vocab.parse_req_body_or_params(content)]
        else:
            raise TypeError("Unsupported tokenization algorithm detected")

    def tokenize(self, content):
        if self.tokenization_algorithm == 'bpe':
            if type(content) == list:
                return [self.tokenizer.encode(row).tokens for row in content]
            else:
                return self.tokenizer.encode(content).tokens

        elif self.tokenization_algorithm == 'vocab_map':
            if type(content) == list:
                return [[token if self.__contains__(token) else self.unk_token for token in Vocab.parse_req_body_or_params(line)] for line in content]
            else:
                return [token if self.__contains__(token) else self.unk_token for token in Vocab.parse_req_body_or_params(content)]

        else:
            raise TypeError("Unsupported tokenization algorithm detected")

    @staticmethod
    def parse_req_body_or_params(line: str):
        parsed_line = parse.parse_qs(parse.unquote_plus(string=line))

        tokens = []
        for k, v in parsed_line.items():
            tokens.append(k)
            tokens.extend(v)

        return tokens

    @staticmethod
    def load(file_path: str):
        return Vocab(tokenizer=Tokenizer.from_file(file_path))

# Data Loading

In [None]:
# Defining global constants
RANDOM_SEED = 42
BATCH_SIZE = 32

torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

In [None]:
# This is how we select a GPU if it's available on your computer or in the Colab environment.
print('Device of execution - ', device)

Device of execution -  cuda


In [None]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('./dataset.csv')
df = CSICDataset.process_df(df)

# # The following two lines are used to load the indices of the training and validation sets
# train_indices = np.load('./dataset/train_indices.npy')
# val_indices = np.load('./dataset/val_indices.npy')

# # The following two lines are used to select the training and validation sets from the dataframe based on the indices loaded above
# train_data = df.loc[train_indices].reset_index(drop=True)
# val_data = df.loc[val_indices].reset_index(drop=True)

train_data, val_data = train_test_split(df, test_size=0.2)

In [None]:
train_dataset = CSICDataset(df=train_data, vocab_size=5000, min_frequency=1, tokenization_algorithm='bpe')
train_vocab = train_dataset.vocab

val_dataset = CSICDataset(df=val_data, vocab=train_vocab)

train_sampler = RandomSampler(train_dataset)
val_sampler   = RandomSampler(val_dataset)

In [None]:
# Check Dataset Lengths
assert len(train_dataset) == 45319, "Training Dataset is of incorrect size"
assert len(val_dataset) == 11330, "Validation Dataset is of incorrect size"

print('Training and Validation dataset sizes match!')

Training and Validation dataset sizes match!


In [None]:
PADDING_VALUE = train_vocab.pad_id

In [None]:
def collate_fn(batch, padding_value=PADDING_VALUE):
    # Batch is of the form List[Tuple(Features(tokenized_ids,...), Labels)]
    sequences = [torch.tensor(sample[0]['tokenized_ids'], dtype=torch.long, device=device) for sample in batch]
    padded_tokens = torch.nn.utils.rnn.pad_sequence(sequences=sequences,batch_first=True, padding_value=padding_value)
    attention_mask = (padded_tokens != padding_value).float()
    labels = torch.tensor([sample[1] for sample in batch])

    return padded_tokens, attention_mask, labels

In [None]:
train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn)
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn)

In [None]:
for tokens, attention_masks, labels in train_iterator:
    print(f'tokens: {tokens.shape}')
    print(f'atteniton masks: {attention_masks.shape}')
    print(f'labels: {labels.shape}')
    break

tokens: torch.Size([32, 172])
atteniton masks: torch.Size([32, 172])
labels: torch.Size([32])


# BERT Model

In [None]:
!pip install transformers



In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import transformers
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

# add python path to include src directory
import sys
sys.path.insert(0, '../src')

# standard library imports
from dataclasses import dataclass
from pathlib import Path
from typing import Tuple
import math

# related third party imports
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader
from transformers import BertTokenizer
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import time

In [None]:
class BertTrainer:
    """ A training and evaluation loop for PyTorch models with a BERT like architecture. """


    def __init__(
        self,
        model,
        tokenizer,
        train_dataloader,
        eval_dataloader=None,
        epochs=1,
        lr=5e-04,
        output_dir='./',
        output_filename='model_state_dict.pt',
        save=False,
        tabular=False,
    ):
        """
        Args:
            model: torch.nn.Module: = A PyTorch model with a BERT like architecture,
            tokenizer: = A BERT tokenizer for tokenizing text input,
            train_dataloader: torch.utils.data.DataLoader =
                A dataloader containing the training data with "text" and "label" keys (optionally a "tabular" key),
            eval_dataloader: torch.utils.data.DataLoader =
                A dataloader containing the evaluation data with "text" and "label" keys (optionally a "tabular" key),
            epochs: int = An integer representing the number epochs to train,
            lr: float = A float representing the learning rate for the optimizer,
            output_dir: str = A string representing the directory path to save the model,
            output_filename: string = A string representing the name of the file to save in the output directory,
            save: bool = A boolean representing whether or not to save the model,
            tabular: bool = A boolean representing whether or not the BERT model is modified to accept tabular data,
        """

        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        self.loss_fn = nn.CrossEntropyLoss()
        self.output_dir = output_dir
        self.output_filename = output_filename
        self.save = save
        self.eval_loss = float('inf')  # tracks the lowest loss so as to only save the best model
        self.epochs = epochs
        self.epoch_best_model = 0  # tracks which epoch the lowest loss is in so as to only save the best model
        self.tabular = tabular

    def train(self, evaluate=False):
        """ Calls the batch iterator to train and optionally evaluate the model."""
        for epoch in range(self.epochs):
            self.iteration(epoch, self.train_dataloader)
            if evaluate and self.eval_dataloader is not None:
                self.iteration(epoch, self.eval_dataloader, train=False)

    def evaluate(self):
        """ Calls the batch iterator to evaluate the model."""
        epoch=0
        self.iteration(epoch, self.eval_dataloader, train=False)

    def iteration(self, epoch, data_loader, train=True):
        """ Iterates through one epoch of training or evaluation"""

        # initialize variables
        loss_accumulated = 0.
        correct_accumulated = 0
        samples_accumulated = 0
        preds_all = []
        labels_all = []

        self.model.train() if train else self.model.eval()

        # progress bar
        mode = "train" if train else "eval"
        batch_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc=f"EP ({mode}) {epoch}",
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        total_comp_time = 0
        # iterate through batches of the dataset
        for i, batch in batch_iter:
            # print("Batch: ", batch)
            # print(len(batch))
            # tokenize data
            # batch_t = self.tokenizer(
            #     batch[0],
            #     padding='max_length',
            #     max_length=512,
            #     truncation=True,
            #     return_tensors='pt',
            # )
            # batch_t = {key: value.to(self.device) for key, value in batch_t.items()}
            # batch_t["input_labels"] = batch["label"].to(self.device)
            # batch_t["tabular_vectors"] = batch["tabular"].to(self.device)
            # batch = {key: value.to(self.device) for key, value in batch.items()}
            token_types = torch.zeros((len(batch[0][0])), dtype=torch.long).to(self.device)
            # forward pass - include tabular data if it is a tabular model
            if self.tabular:
                logits = self.model(
                    input_ids=batch_t["input_ids"],
                    token_type_ids=batch_t["token_type_ids"],
                    attention_mask=batch_t["attention_mask"],
                    tabular_vectors=batch_t["tabular_vectors"],
                )

            else:
                start = time.time()
                logits = self.model(
                    input_ids=batch[0],
                    token_type_ids=token_types,
                    attention_mask=batch[1],
                ).logits
                total_comp_time += time.time() - start

            # calculate loss
            labels = batch[2].to(self.device)
            loss = self.loss_fn(logits, labels)

            # compute gradient and and update weights
            if train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # calculate the number of correct predictions
            preds = logits.argmax(dim=-1)
            correct = preds.eq(labels).sum().item()

            # accumulate batch metrics and outputs
            loss_accumulated += loss.item()
            correct_accumulated += correct
            samples_accumulated += len(batch[2])
            preds_all.append(preds.detach())
            labels_all.append(labels.detach())


        average_comp_time = total_comp_time / len(batch_iter)
        print("Average Comp Time: ", average_comp_time)

        # concatenate all batch tensors into one tensor and move to cpu for compatibility with sklearn metrics
        preds_all = torch.cat(preds_all, dim=0).cpu()
        labels_all = torch.cat(labels_all, dim=0).cpu()

        # metrics
        accuracy = accuracy_score(labels_all, preds_all)
        precision = precision_score(labels_all, preds_all, average='macro')
        recall = recall_score(labels_all, preds_all, average='macro')
        f1 = f1_score(labels_all, preds_all, average='macro')
        avg_loss_epoch = loss_accumulated / len(data_loader)

        # print metrics to console
        print(
            f"samples={samples_accumulated}, \
            correct={correct_accumulated}, \
            acc={round(accuracy, 4)}, \
            recall={round(recall, 4)}, \
            prec={round(precision,4)}, \
            f1={round(f1, 4)}, \
            loss={round(avg_loss_epoch, 4)}"
        )

        # save the model if the evaluation loss is lower than the previous best epoch
        if self.save and not train and avg_loss_epoch < self.eval_loss:

            # create directory and filepaths
            dir_path = Path(self.output_dir)
            dir_path.mkdir(parents=True, exist_ok=True)
            file_path = dir_path / f"{self.output_filename}_epoch_{epoch}.pt"

            # delete previous best model from hard drive
            if epoch > 0:
                file_path_best_model = dir_path / f"{self.output_filename}_epoch_{self.epoch_best_model}.pt"
                !rm -f $file_path_best_model

            # save model
            torch.save({
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict()
            }, file_path)

            # update the new best loss and epoch
            self.eval_loss = avg_loss_epoch
            self.epoch_best_model = epoch

In [None]:
# class BERT(nn.Module):
#     def __init__(self):
#         super(BERT, self).__init__()
#         self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")
#         self.out = nn.Linear(768, 1)

#     def forward(self,ids,mask,token_type_ids):
#         _,o2= self.bert_model(ids,attention_mask=mask,token_type_ids=token_type_ids, return_dict=False)

#         out= self.out(o2)

#         return out

# model=BERT()

# loss_fn = nn.BCEWithLogitsLoss()

# #Initialize Optimizer
# optimizer= optim.Adam(model.parameters(),lr= 0.0001)


# load tokenizer and pretrained model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the latest version of the BERT tokenizer
tokenizer_base = AutoTokenizer.from_pretrained('bert-base-uncased')

# Load the latest version of the BERT model
bert_base = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2  # number of classes
)

# LORA

In [None]:
import math
from typing import List, Tuple


class LinearLoRA(nn.Module):
    """
    A low-rank adapted linear layer.

    Args:
        in_dim: int = An integer representing the input dimension of the linear layer
        out_dim: int = An integer representing the output dimension of the linear layer
        r: int = An integer representing the rank of the low-rank approximated matrices
        lora_alpha: int = An integer representing the numerator of the scaling constant alpha / r
        lora_dropout: float = A float between 0 and 1 representing the dropout probability
    """

    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        r: int = 8,
        lora_alpha: int = 16,
        lora_dropout: float = 0.,
    ):
        super().__init__()
        self.r = r
        self.lora_alpha = lora_alpha
        self.lora_dropout = nn.Dropout(lora_dropout)

        # Check that the rank is at least 1
        assert r > 0, "Variable 'r' is not greater than zero. Choose a rank of 1 or greater."

        # recreate the linear layer and freeze it (the actual weight values will be copied in outside of this class)
        self.pretrained = nn.Linear(in_dim, out_dim, bias=True)
        self.pretrained.weight.requires_grad = False

        # create the low-rank A matrix and initialize with same method as in Hugging Face PEFT library
        self.lora_A = nn.Linear(in_dim, r, bias=False)
        nn.init.kaiming_uniform_(self.lora_A.weight, a=math.sqrt(5))

        # create the low-rank B matrix and initialize to zero
        self.lora_B = nn.Linear(r, out_dim, bias=False)
        nn.init.constant_(self.lora_B.weight, 0)

        # scaling constant
        self.scaling = self.lora_alpha / self.r

    def forward(self, x):
        pretrained_out = self.pretrained(x)
        lora_out = self.lora_dropout(x)
        lora_out = self.lora_A(lora_out)
        lora_out = self.lora_B(lora_out)
        lora_out = lora_out * self.scaling
        return pretrained_out + lora_out


def freeze_model(model):
    """Freezes all layers except the LoRa modules and classifier."""
    for name, param in model.named_parameters():
        if "lora" not in name and "classifier" not in name:
            param.requires_grad = False


def create_lora(module, r, lora_dropout, lora_alpha):
    """Converts a linear module to a LoRA linear module."""
    k, d = module.weight.shape  # pytorch nn.Linear weights are transposed, that is why shape is (k, d) and not (d, k)
    lora = LinearLoRA(d, k, r, lora_dropout=lora_dropout, lora_alpha=lora_alpha)
    with torch.no_grad():
        lora.pretrained.weight.copy_(module.weight)
        lora.pretrained.bias.copy_(module.bias)
    return lora


def add_lora_layers(
    model,
    module_names: Tuple=("query", "value"),
    r: int=8,
    lora_alpha: float=16,
    lora_dropout: float=0.1,
    ignore_layers: List[int]=[]
):
    """
        Replaces chosen linear modules with LoRA equivalents.

        Args:
            model: torch.nn.Module = The PyTorch model to be used
            module_names: Tuple = A tuple containing the names of the linear layers to replace
                Ex. ("query") to replace the linear modules with "query" in the name --> bert.encoder.layer.0.attention.self.query
            r: int =
            lora_alpha: int = An integer representing the numerator of the scaling constant alpha / r
            lora_dropout: float = A float between 0 and 1 representing the dropout probability
            ignore_layers: list = A list with the indices of all BERT layers NOT to add LoRA modules
        """
    module_types: Tuple=(nn.Linear,)

    # disable dropout in frozen layers
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = 0.0
    # replace chosen linear modules with lora modules
    for name, module in model.named_children():
        print("Name: ", name)
        if isinstance(module, module_types) and name in module_names:
            temp_lora = create_lora(module, r=r, lora_dropout=lora_dropout, lora_alpha=lora_alpha)
            setattr(model, name, temp_lora)
        else:
            ignore_layers_str = [str(i) for i in ignore_layers]
            if name not in ignore_layers_str:
                add_lora_layers(module, module_names, r, lora_dropout, lora_alpha, ignore_layers)


def unfreeze_model(model):
    """Unfreezes all parameters in a model by setting requires_grad to True."""
    for name, param in model.named_parameters():
        param.requires_grad = True


def create_linear(module):
    """Converts a LoRA linear module back to a linear module."""
    k, d = module.pretrained.weight.shape  # pytorch nn.Linear weights are transposed, that is why variables are k, d and not d, k
    linear = nn.Linear(d, k, bias=True)

    with torch.no_grad():
        linear.weight.copy_(module.pretrained.weight + (module.lora_B.weight @ module.lora_A.weight) * module.scaling)
        linear.bias.copy_(module.pretrained.bias)

    return linear


def merge_lora_layers(model, module_names: Tuple=("query", "value"), dropout=0.1):
    """
        Replaces LoRA modules with their original linear equivalents.

        Args:
            model: torch.nn.Module = The PyTorch model to be used
            module_names: Tuple = A tuple containing the names of the LoRA layers to replace
                Ex. ("query") to replace the LoRA modules with "query" in the name --> bert.encoder.layer.0.attention.self.query
            r: int =
            dropout: float = A float between 0 and 1 representing the dropout probability
        """
    # enable dropout in frozen layers
    for module in model.modules():
        if isinstance(module, nn.Dropout):
            module.p = dropout
    # replace chosen linear modules with lora modules
    for name, module in model.named_children():
        if name in module_names and hasattr(module, "pretrained"):
            temp_linear = create_linear(module)
            setattr(model, name, temp_linear)
        else:
            merge_lora_layers(module, module_names=module_names, dropout=0.1)


In [None]:
add_lora_layers(bert_base, r=1, lora_alpha=2)  # inject the LoRA layers into the model
freeze_model(bert_base)  # freeze the non-LoRA parameters

Name:  bert
Name:  embeddings
Name:  word_embeddings
Name:  position_embeddings
Name:  token_type_embeddings
Name:  LayerNorm
Name:  dropout
Name:  encoder
Name:  layer
Name:  0
Name:  attention
Name:  self
Name:  query
Name:  lora_dropout
Name:  pretrained
Name:  lora_A
Name:  lora_B
Name:  key
Name:  value
Name:  lora_dropout
Name:  pretrained
Name:  lora_A
Name:  lora_B
Name:  dropout
Name:  output
Name:  dense
Name:  LayerNorm
Name:  dropout
Name:  intermediate
Name:  dense
Name:  intermediate_act_fn
Name:  output
Name:  dense
Name:  LayerNorm
Name:  dropout
Name:  1
Name:  attention
Name:  self
Name:  query
Name:  lora_dropout
Name:  pretrained
Name:  lora_A
Name:  lora_B
Name:  key
Name:  value
Name:  lora_dropout
Name:  pretrained
Name:  lora_A
Name:  lora_B
Name:  dropout
Name:  output
Name:  dense
Name:  LayerNorm
Name:  dropout
Name:  intermediate
Name:  dense
Name:  intermediate_act_fn
Name:  output
Name:  dense
Name:  LayerNorm
Name:  dropout
Name:  2
Name:  attention
Name:

In [None]:
n_params = 0
n_trainable_params = 0

# count the number of trainable parameters
for n, p in bert_base.named_parameters():
    n_params += p.numel()
    if p.requires_grad:
        n_trainable_params += p.numel()

print(f"Total parameters: {n_params}")
print(f"Trainable parameters: {n_trainable_params}")
print(f"Percentage trainable: {round(n_trainable_params / n_params * 100, 2)}%")

Total parameters: 109520642
Trainable parameters: 38402
Percentage trainable: 0.04%


In [None]:

#bert base lora all r = 8
trainer_bert_base_lora = BertTrainer(
    bert_base,
    tokenizer_base,
    lr=5e-04,
    epochs=10,
    train_dataloader=train_iterator,
    eval_dataloader=val_iterator,
    output_dir='../models/bert_base_fine_tuned_lora_r8',
    output_filename='bert_base_lora_r8',
    save=True,
)

trainer_bert_base_lora.train(evaluate=True)

NameError: name 'train_iterator' is not defined

In [None]:
torch.save(trainer_bert_base_lora, "./bert_lora_weights")

# BERT w/

In [1]:
# load tokenizer and pretrained model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the latest version of the BERT model
bert_base = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2  # number of classes
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

checkpoint = torch.load('bert_lora_weights')
# model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
bert_base.eval()

In [None]:
total_comp_time = 0
count = 0
for tokens, attention_masks, labels in val_iterator:
    # print(f'tokens: {tokens.shape}')
    # print(f'atteniton masks: {attention_masks.shape}')
    # print(f'labels: {labels.shape}')
    start = time.time()
    output = model(tokens)
    total_comp_time += time.time() - start
    print(total_comp_time)
    count += 1
print(count)
print(len(val_iterator))
print("Average Time: ", total_comp_time/len(val_iterator))