#### Double checking we are using the GPU on the VSC

In [2]:
import torch

# Prefer the CUDA device whenever PyTorch reports one as usable; otherwise
# fall back to the CPU so the notebook still runs.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [98]:
import transformers
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling

from peft import get_peft_model, LoraConfig, TaskType

from datasets import load_dataset, DatasetDict

import numpy as np

import torch

from sklearn.model_selection import KFold, cross_val_predict, GridSearchCV

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

#### Import the dataset

In [54]:
# Dataset identifier handed to `load_dataset`
# (presumably a Hugging Face Hub repo id, given the `user/name` form).
dataset_name = "helena-balabin/pereira_fMRI_sentences"

# Directory passed to `load_dataset` as `cache_dir`
save_path = "./data"

# Load the dataset; `cache_dir=save_path` keeps the downloaded files under
# ./data so that they can be reused instead of fetched again.
pereira_dataset = load_dataset(dataset_name, cache_dir=save_path)

In [55]:
# Inspect which container type `load_dataset` returned.
type(pereira_dataset)

datasets.dataset_dict.DatasetDict

In [56]:
# Checkpoint name; it is used to load both the tokenizer and the model, and it
# is also interpolated into the training output directory.
modelname = "bert-base-uncased"
# Alternative checkpoint (currently disabled):
# modelname = "gpt2"

In [57]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(modelname)

loading configuration file config.json from cache at C:\Users\alito/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64c8a40076\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\alito/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64c8a40076\vocab.t

In [58]:
# Raise the transformers logger to INFO level.
transformers.logging.set_verbosity_info()

##### Preprocessing Function 1 - Map the data to the tokenizer function

In [59]:
def preprocess_function(tokenizer, examples):
    """Tokenize a batch whose "sentences" column holds sequences of strings.

    Each entry of ``examples["sentences"]`` is joined with single spaces into
    one text, and the resulting list of texts is passed to ``tokenizer`` in a
    single call.

    Args:
        tokenizer: Callable that accepts a list of strings.
        examples: Mapping with a "sentences" key whose value is an iterable of
            string sequences.

    Returns:
        Whatever ``tokenizer`` returns for the list of joined texts.
    """
    joined_texts = []
    for sentence_parts in examples["sentences"]:
        joined_texts.append(" ".join(sentence_parts))
    return tokenizer(joined_texts)

In [60]:
# `partial` pre-binds the tokenizer so that the resulting callable only needs
# the batch argument supplied by `map`.
from functools import partial

partial_tokenize_function = partial(preprocess_function, tokenizer)

# Tokenize in batches with 4 worker processes. The original columns of the
# 'train' split are removed so that only the tokenizer outputs remain.
tokenized_pereira = pereira_dataset.map(
    partial_tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=pereira_dataset['train'].column_names,
)

In [81]:
# Display the tokenized dataset (splits, features, number of rows).
tokenized_pereira

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8
    })
})

##### Preprocessing Function 2 - Concatenate the tokenized texts and split them into blocks of `block_size` tokens. If the total length is not evenly divisible by the block size, the remainder is dropped.

In [70]:
def group_texts(examples, block_size=128):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Every column of the batch is flattened into one long token list, which is
    then split into consecutive chunks of ``block_size`` tokens. A trailing
    remainder shorter than ``block_size`` is dropped. The one exception is a
    batch whose total length is below ``block_size``: it is returned as a
    single short chunk (no padding is applied here).

    Args:
        examples: Mapping from column name to a list of per-example token
            lists. It must contain an ``"input_ids"`` column.
        block_size (int): Number of tokens per block. Defaults to 128, the
            value that used to be hard-coded, so ``map(group_texts, ...)``
            behaves as before.

    Returns:
        dict: The chunked columns plus a ``"labels"`` column that is a copy of
        the chunked ``"input_ids"`` list.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    # Imported locally so this cell does not depend on the notebook's import cell.
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size!r}")

    # Flatten every column in linear time. The previous ``sum(lists, [])``
    # re-copied the accumulated list on each addition (quadratic behaviour).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Round down to a whole number of blocks. Batches shorter than one block
    # are left untouched and come out as one short chunk.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split each flattened column into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For language modeling the targets start out as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [71]:
# Regroup the tokenized rows into fixed-size blocks, batch-wise, with 4 worker processes.
preprocessed_dataset = tokenized_pereira.map(group_texts, batched=True, num_proc=4)

##### Data collator for (causal) language modeling. It prepares the labels so that, for each token, the target is the token that follows it.

In [72]:
# mlm=False turns off masked-language-model masking in the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

#### Configure LoRA from the PEFT library: set its parameters and load the model wrapped with LoRA adapters

In [73]:
# LoRA adapter configuration that is later passed to `get_peft_model`.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # adapter is set up for a causal-LM task
    inference_mode=False,  # not inference-only: the adapter is meant to be trained
    r=8,
    lora_alpha=32,
    lora_dropout=0.1)

We can see the reduced number of parameters below

In [74]:
# Load the base checkpoint with a causal-LM head.
# NOTE(review): with modelname = "bert-base-uncased" this resolves to
# BertLMHeadModel (see the module tree printed further down). BERT checkpoints
# are normally bidirectional, so confirm whether the config needs
# `is_decoder=True` for a genuinely causal objective.
model_without_peft = AutoModelForCausalLM.from_pretrained(modelname)
# model_without_peft = DebertaV2ForMaskedLM.from_pretrained(modelname)

# Wrap the base model with the LoRA adapters described by `peft_config`.
model = get_peft_model(model_without_peft, peft_config)

# Report trainable vs. total parameter counts, then the device of the first parameter.
model.print_trainable_parameters()
print(next(model.parameters()).device)

loading configuration file config.json from cache at C:\Users\alito/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64c8a40076\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.35.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at C:\Users\alito/.cache\huggingface\hub\models--bert-base-uncased\snapshots\1dbc166cf8765166998eff31ade2eb64

trainable params: 294,912 || all params: 109,809,210 || trainable%: 0.26856763653977656
cpu


If the tokenizer doesn't have a padding token by default, use the end-of-sequence (EOS) token instead. If it doesn't have that either, fall back to a separator (`[SEP]`) or classification (`[CLS]`) token.

In [86]:
# Fallback assignments, left disabled; enable one only if the tokenizer
# defines no padding token of its own.
# tokenizer.pad_token = tokenizer.cls_token
# tokenizer.pad_token = tokenizer.eos_token

# Display the padding token currently configured on the tokenizer.
tokenizer.pad_token

'[PAD]'

Ensure that we are running the model on the GPU and not on the CPU.

In [87]:
# Print the device of the model's first parameter.
# NOTE(review): the recorded output is cuda:0 although `model.to(device)` only
# appears in the next cell. This is presumably left over from an earlier
# execution; confirm with Restart & Run All.
print(next(model.parameters()).device)

cuda:0


In [88]:
# Move the model to the selected device. As the last expression of the cell,
# the returned module is also displayed.
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BertLMHeadModel(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_f

In [89]:
# Re-check the device of the first parameter after the move.
print(next(model.parameters()).device)

cuda:0


In [91]:
import random

def train_test_split(dataset, test_size=0.2, seed=None):
    """
    Splits a Hugging Face dataset into training and testing sets.

    The row order is always shuffled before splitting. The shuffle uses a
    private random generator, so calling this function never reseeds or
    advances Python's module-level ``random`` state.

    Args:
    dataset (Dataset): The dataset to split. It must support ``len()`` and
        ``select(indices)``.
    test_size (float): The proportion of the dataset to include in the test split (between 0 and 1).
    seed (int, optional): A seed for random shuffling for reproducibility.
        With the same seed the same split is produced on every call; with
        ``None`` the shuffle is seeded from system entropy.

    Returns:
    tuple: Two datasets, the first being the training set and the second the testing set.

    Raises:
    ValueError: If ``test_size`` is not within [0, 1].
    """
    # Reject proportions that would otherwise turn into a negative or
    # oversized slice index and silently yield a nonsensical split.
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    num_rows = len(dataset)

    # A dedicated generator seeded with `seed` yields the same permutation as
    # `random.seed(seed)` followed by `random.sample(...)`, without touching
    # the global RNG that other cells may rely on.
    rng = random.Random(seed)
    shuffled_indices = rng.sample(range(num_rows), num_rows)

    # Number of rows that go to the training set (truncated towards zero).
    split_index = int(num_rows * (1 - test_size))

    train_indices = shuffled_indices[:split_index]
    test_indices = shuffled_indices[split_index:]

    train_dataset = dataset.select(train_indices)
    test_dataset = dataset.select(test_indices)

    return train_dataset, test_dataset

In [84]:
# Display the block-grouped dataset (features and row count).
preprocessed_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 336
    })
})

In [94]:
# Split the grouped 'train' rows 80/20 with a fixed seed so that the split is reproducible.
train_set, test_set = train_test_split(preprocessed_dataset["train"], test_size=0.2, seed=42)

# Create a new DatasetDict with the new splits
final_dataset = DatasetDict({
    'train': train_set,
    'test': test_set
})

In [96]:
# Display the resulting train/test splits and their row counts.
final_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 268
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 68
    })
})

#### Set the Training Arguments

In [85]:
# Training configuration.
# NOTE(review): the number of epochs and the batch size are not set here; the
# training log reports 3 epochs and a per-device batch size of 8.
training_args = TrainingArguments(
    output_dir=f"mymodels/{modelname}-conference",  # checkpoint directory, named after the checkpoint
    evaluation_strategy="epoch",  # evaluate on the eval set once per epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    report_to="all",
    logging_dir='./logs',            
    logging_steps=100,
)

PyTorch: setting up devices


#### Finally create the Trainer class and train the model

In [99]:
# Assemble the Trainer from the PEFT-wrapped model, the training arguments,
# the two splits of `final_dataset`, the LM data collator and the tokenizer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["test"],  # the 'test' split doubles as the evaluation set
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [101]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 268
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 102
  Number of trainable parameters = 294,912


  0%|          | 0/102 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 68
  Batch size = 8


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 6.518098831176758, 'eval_runtime': 0.415, 'eval_samples_per_second': 163.86, 'eval_steps_per_second': 21.687, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 68
  Batch size = 8


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 6.1253662109375, 'eval_runtime': 0.4222, 'eval_samples_per_second': 161.064, 'eval_steps_per_second': 21.317, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 68
  Batch size = 8


{'loss': 6.463, 'learning_rate': 3.921568627450981e-07, 'epoch': 2.94}


  0%|          | 0/9 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 6.023798942565918, 'eval_runtime': 0.3833, 'eval_samples_per_second': 177.424, 'eval_steps_per_second': 23.483, 'epoch': 3.0}
{'train_runtime': 12.2454, 'train_samples_per_second': 65.657, 'train_steps_per_second': 8.33, 'train_loss': 6.464234408210306, 'epoch': 3.0}


TrainOutput(global_step=102, training_loss=6.464234408210306, metrics={'train_runtime': 12.2454, 'train_samples_per_second': 65.657, 'train_steps_per_second': 8.33, 'train_loss': 6.464234408210306, 'epoch': 3.0})

In [102]:
# Upload the saved model files to the Hugging Face Hub; the recorded cell
# output is the URL of the resulting repository.
trainer.push_to_hub()

Saving model checkpoint to mymodels/bert-base-uncased-conference
tokenizer config file saved in mymodels/bert-base-uncased-conference\tokenizer_config.json
Special tokens file saved in mymodels/bert-base-uncased-conference\special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{}


adapter_model.safetensors:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

'https://huggingface.co/alitolga/bert-base-uncased-conference/tree/main/'

In [103]:
# Hub repository id used by the cells that follow.
# NOTE(review): this differs from the repository reported by the push above
# (".../bert-base-uncased-conference"); confirm which one is intended.
repo_name = "alitolga/bert-base-uncased-large-peft"
# Alternative repository for the GPT-2 variant (currently disabled):
# repo_name = "alitolga/gpt2-large-peft"