## Select GPU with highest VRAM

In [1]:
import torch
print(torch.__version__)

if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("CUDA is available. Using GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

2.3.1+cu121
CUDA is available. Using GPU.


In [2]:
import pynvml

def get_memory_free_MiB(gpu_index):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(int(gpu_index))
    mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return mem_info.free // 1024 ** 2

In [3]:
# I write this myself to select the GPU with highest available VRAM on puffer

total_gpus = torch.cuda.device_count()
largest_vram = 0
gpu_index = 0

for i in range(total_gpus):
    new_vram = get_memory_free_MiB(i)
    if new_vram > largest_vram:
        largest_vram = new_vram
        gpu_index = i
    print(f'GPU {i}: {torch.cuda.get_device_name(i)}')
    print(f'available memory of GPU {i}: {new_vram} MiB \n')

print(f'GPU {gpu_index} has the largest available VRAM: {largest_vram} MiB')

GPU 0: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 0: 11000 MiB 

GPU 1: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 1: 2696 MiB 

GPU 2: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 2: 11000 MiB 

GPU 3: NVIDIA GeForce RTX 2080 Ti
available memory of GPU 3: 11000 MiB 

GPU 0 has the largest available VRAM: 11000 MiB


In [4]:
torch.cuda.set_device(gpu_index)
print(f'current cuda device is set to: {torch.cuda.current_device()}')

current cuda device is set to: 0


In [5]:
tensor = torch.randn(3, 3, device=device)

print(f"Tensor is on: {tensor.device}")

Tensor is on: cuda:0


# [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108)

In this lecture, we will explore the architecture of DistilBERT, its key components, and how it can be utilized for various natural language processing tasks. Additionally, we'll discuss its advantages, limitations, and provide hands-on examples to showcase its effectiveness.

Reference : [The Theory](https://towardsdatascience.com/distillation-of-bert-like-models-the-code-73c31e8c2b0a) | [Code](https://towardsdatascience.com/distillation-of-bert-like-models-the-theory-32e19a02641f)

In [6]:
SEED = 69
torch.manual_seed(seed=SEED)
torch.backends.cudnn.deterministic=True

## Load dataset

In [7]:
import pandas as pd

df = pd.read_csv('davidson/labeled_data.csv')
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


### Understanding the dataset

The dataset has the following classes.
- 0: hate_speech
- 1: offensive_language
- 2: neither

In [8]:
df = df.rename(columns={'class': 'labels', 'tweet': 'text'})
df

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,labels,text
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


## Preprocessing data, Tokenization and Formatting

In [9]:
num_labels = len(df['labels'].unique())
num_labels

3

In [10]:
from datasets import Dataset, DatasetDict
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
    
teacher_id = 'bert-base-uncased'
max_length = 128

def tokenize_function(examples, teacher_id=teacher_id, max_length=max_length):
    tokenizer = BertTokenizer.from_pretrained(teacher_id)
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)


def preprocess_data(df):

    df = df[['text','labels']]

    # make sure the labels are integers
    df['labels'] = df['labels'].astype(int)

    # split datasets into train, test, validation
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        df['text'].tolist(), df['labels'].tolist(), test_size=0.3, random_state=SEED
    )

    valid_texts, test_texts, valid_labels, test_labels = train_test_split(
        temp_texts, temp_labels, test_size=0.5, random_state=SEED
    )

    dataset_dict = DatasetDict({
        'train': Dataset.from_dict({'text': train_texts, 'labels': train_labels}).map(tokenize_function, batched=True),
        'valid': Dataset.from_dict({'text': valid_texts, 'labels': valid_labels}).map(tokenize_function, batched=True),
        'test': Dataset.from_dict({'text': test_texts, 'labels': test_labels}).map(tokenize_function, batched=True),
    })


    return dataset_dict

dataset_dict = preprocess_data(df)
dataset_dict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['labels'] = df['labels'].astype(int)


Map:   0%|          | 0/17348 [00:00<?, ? examples/s]

Map:   0%|          | 0/3717 [00:00<?, ? examples/s]

Map:   0%|          | 0/3718 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17348
    })
    valid: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3717
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3718
    })
})

In [11]:
train_dataset = dataset_dict['train']
valid_dataset = dataset_dict['valid']
test_dataset = dataset_dict['test']

## Model & Tokenization

Load teacher model

In [12]:
from transformers import BertForSequenceClassification
teacher_model = BertForSequenceClassification.from_pretrained(teacher_id, num_labels=num_labels)
teacher_model = teacher_model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training

### Create training arguments

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./saved_models/lora",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs"
)



In [14]:
from transformers import Trainer

def train_model(model, train_dataset, valid_dataset, training_args):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset
    )
    
    trainer.train()
    return trainer

## Implement Lora with 6 layer architecture

In [15]:
from peft import get_peft_model, LoraConfig, TaskType

teacher_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
student_layers = [1, 3, 5, 7, 9, 11]

teacher_model.bert.encoder.layer = torch.nn.ModuleList([
    teacher_model.bert.encoder.layer[i] for i in student_layers
])

teacher_model.config.num_hidden_layers = len(student_layers)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
student_lora = get_peft_model(teacher_model, lora_config).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
trainer_lora = train_model(student_lora, train_dataset, valid_dataset, training_args)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mst124974[0m ([33mbinit-ait[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,0.7178,0.584811
2,0.5193,0.440932
3,0.4465,0.4204




### Save trained models

In [17]:
trainer_lora.save_model('saved_models/lora')

## Evaluate a model on the test dataset

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_model(trainer, test_dataset):
    results = trainer.evaluate(test_dataset)
    predictions = trainer.predict(test_dataset)
    
    preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).cpu().numpy()
    labels = torch.tensor(test_dataset["labels"]).cpu().numpy()
    
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    
    results["test_accuracy"] = accuracy
    results["test_precision"] = precision
    results["test_recall"] = recall
    results["test_f1"] = f1
    
    return results

In [19]:
lora_results = evaluate_model(trainer_lora, test_dataset)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [20]:
def get_train_loss(trainer):
    for log in reversed(trainer.state.log_history):
        if "loss" in log:
            return log["loss"]
    return None

In [21]:
lora_train_loss = get_train_loss(trainer_lora)

### Show the results inside a Pandas `DataFrame`

In [22]:
import pandas as pd
results_df = pd.DataFrame({
    "Model": ["LoRA"],
    "Train Loss": [lora_train_loss],
    "Validation Loss": [lora_results["eval_loss"]],
    "Test Accuracy": [lora_results["test_accuracy"]],
    "Test F1-Score": [lora_results["test_f1"]],
})
results_df

Unnamed: 0,Model,Train Loss,Validation Loss,Test Accuracy,Test F1-Score
0,LoRA,0.4465,0.434195,0.842926,0.811601


In [23]:
results_df.to_csv('results/lora.csv')