In [12]:
import torch
import os

# os.environ["WANDB_DISABLED"] = "true"

# Import the Datasets of HuggingFace
from datasets import load_dataset

# Import Transformers Library and Models of HuggingFace
import transformers
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import DebertaV2ForSequenceClassification, DebertaV2ForMaskedLM, DebertaV2ForQuestionAnswering
from transformers import AutoModelForCausalLM, AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM
from transformers import EncoderDecoderModel, AutoModelForSequenceClassification, AutoModelForTokenClassification

# Import Data Collators
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, DataCollatorForTokenClassification
from transformers import DataCollatorForPermutationLanguageModeling, DataCollatorWithPadding, default_data_collator
from transformers import DataCollatorForSOP, DataCollatorForWholeWordMask

# transformers.logging.set_verbosity_info()

# Other utility libraries
from functools import partial
from peft import LoraConfig, TaskType, get_peft_model 



In [13]:
save_path = "./data"

bookcorpus_dataset = load_dataset("bookcorpus", split="train[:5000]", cache_dir=save_path)
bookcorpus_dataset = bookcorpus_dataset.train_test_split(test_size=0.2)

bookcorpus_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [14]:
# modelname = "microsoft/deberta-v3-base"
modelname = "google/electra-base-generator"

# modelname = "bert-base-uncased"
# modelname = "gpt2"
# modelname = "roberta-base"
# modelname = "microsoft/deberta-base"
# modelname = "facebook/bart-base"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [16]:
def preprocess_function(tokenizer, examples):
    return tokenizer([" ".join(x) for x in examples["text"]])

partial_tokenize_function = partial(preprocess_function, tokenizer)

tokenized_bookcorpus = bookcorpus_dataset.map(
    partial_tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=bookcorpus_dataset["train"].column_names,
)

In [17]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False, 
    r=128,
    lora_alpha=32, 
    lora_dropout=0.1,
)

In [18]:
# model_without_peft = AutoModelForCausalLM.from_pretrained(modelname)
model_without_peft = DebertaV2ForMaskedLM.from_pretrained(modelname)

model = get_peft_model(model_without_peft, peft_config)

model.print_trainable_parameters()
print(next(model.parameters()).device)

You are using a model of type electra to instantiate a model of type deberta-v2. This is not supported for all configurations of models and can yield errors.
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at google/electra-base-generator and are newly initialized: ['encoder.layer.3.attention.self.query_proj.weight', 'encoder.layer.3.attention.self.key_proj.weight', 'encoder.layer.10.output.dense.weight', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.2.attention.self.key_proj.weight', 'encoder.layer.4.intermediate.dense.weight', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.6.attention.self.value_proj.weight', 'encoder.layer.10.attention.self.query_proj.weight', 'encoder.layer.2.attention.output.dense.weight', 'encoder.layer.8.attention.self.value_proj.bias', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.5.attention.output.LayerNorm.weight', 'cls.predictions.decoder.bias', 'encoder.layer.4.output.dense.weight', 'enc

trainable params: 1,572,864 || all params: 35,312,186 || trainable%: 4.454167748210207
cpu


In [19]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# tokenizer.pad_token = tokenizer.cls_token
# tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token

'[PAD]'

In [20]:
print(next(model.parameters()).device)

# Check if CUDA is available and set the device to GPU if it is
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)
print(next(model.parameters()).device)

cpu
Using device: cuda
cuda:0


In [21]:
training_args = TrainingArguments(
    output_dir=f"mymodels/{modelname}-Rank128",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    report_to="all",
    logging_dir='./logs',
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_bookcorpus['train'],
    eval_dataset=tokenized_bookcorpus['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  0%|          | 0/1500 [00:00<?, ?it/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 10.1557, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}
{'loss': 9.7731, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}
{'loss': 9.244, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.6}
{'loss': 8.8771, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}
{'loss': 8.6941, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 8.575545310974121, 'eval_runtime': 2.7267, 'eval_samples_per_second': 366.745, 'eval_steps_per_second': 45.843, 'epoch': 1.0}
{'loss': 8.6035, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 8.5423, 'learning_rate': 1.0666666666666667e-05, 'epoch': 1.4}
{'loss': 8.5017, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}
{'loss': 8.4845, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.8}
{'loss': 8.4518, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 8.388410568237305, 'eval_runtime': 3.0041, 'eval_samples_per_second': 332.881, 'eval_steps_per_second': 41.61, 'epoch': 2.0}
{'loss': 8.4531, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.2}
{'loss': 8.4484, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}
{'loss': 8.4268, 'learning_rate': 2.666666666666667e-06, 'epoch': 2.6}
{'loss': 8.4503, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.8}
{'loss': 8.424, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 8.362646102905273, 'eval_runtime': 2.9569, 'eval_samples_per_second': 338.194, 'eval_steps_per_second': 42.274, 'epoch': 3.0}
{'train_runtime': 99.5439, 'train_samples_per_second': 120.55, 'train_steps_per_second': 15.069, 'train_loss': 8.768690185546875, 'epoch': 3.0}


TrainOutput(global_step=1500, training_loss=8.768690185546875, metrics={'train_runtime': 99.5439, 'train_samples_per_second': 120.55, 'train_steps_per_second': 15.069, 'train_loss': 8.768690185546875, 'epoch': 3.0})

In [22]:
trainer.push_to_hub()

'https://huggingface.co/alitolga/electra-base-generator-Rank128/tree/main/'