In [None]:
!pip install transformers datasets accelerate torch wandb

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
# Shell out to `nvidia-smi` to show which GPU is attached to this runtime and
# how much of its memory is already in use before training starts.
!nvidia-smi

Mon Dec 16 01:48:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              50W / 400W |  40511MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Force a full garbage-collection pass; as the last expression of the cell,
# the number of unreachable objects found is displayed as the cell output.
# NOTE(review): presumably meant to release memory left over from an earlier
# run. gc.collect() only reclaims Python objects — confirm whether GPU memory
# also needs to be released explicitly.
import gc

gc.collect()

19221

In [2]:
import os
import gc
from google.colab import drive
import wandb
from datasets import load_from_disk, Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, get_linear_schedule_with_warmup

# Mount Google Drive; every path configured below points inside this mount.
_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

# Single Drive folder that holds the dataset, checkpoints, final model and logs.
_NOTEBOOKS_ROOT = f"{_DRIVE_MOUNT_POINT}/My Drive/Colab Notebooks"

# Hugging Face model id of the base model to fine-tune.
MODEL_NAME = "gpt2-medium"
# Holds the tokenized dataset (read) and the final model/tokenizer (written).
TRAINED_MODELS_DIR = f"{_NOTEBOOKS_ROOT}/trained_models"
# Trainer output directory: intermediate `checkpoint-*` folders are written here.
FINE_TUNING_OUTPUT_DIR = f"{_NOTEBOOKS_ROOT}/fine_tuning_output"
# Passed to TrainingArguments as `logging_dir`.
LOG_DIR = f"{_NOTEBOOKS_ROOT}/logs"
# Per-device training batch size.
BATCH_SIZE = 64


def get_last_checkpoint(output_dir):
    """Return the path of the most recent checkpoint inside ``output_dir``.

    An entry counts as a checkpoint when its name starts with ``"checkpoint"``
    and the text after the last ``-`` is a decimal step number (for example
    ``checkpoint-7100``). Entries without a numeric suffix (``checkpoint``,
    ``checkpoint-tmp``, ...) are ignored instead of raising ``ValueError``.

    Args:
        output_dir: Directory to scan for checkpoint entries.

    Returns:
        ``os.path.join(output_dir, name)`` for the entry with the highest step
        number, or ``None`` when ``output_dir`` is not an existing directory
        or contains no valid checkpoint entry.
    """
    # isdir (not exists): a plain file at this path would make listdir raise.
    if not os.path.isdir(output_dir):
        return None

    def _step_of(name):
        # Text after the last '-' is the step counter; None if not numeric.
        suffix = name.rsplit('-', 1)[-1]
        return int(suffix) if suffix.isdecimal() else None

    steps_by_name = {
        name: _step_of(name)
        for name in os.listdir(output_dir)
        if name.startswith("checkpoint")
    }
    valid = {name: step for name, step in steps_by_name.items() if step is not None}
    if not valid:
        return None

    # Compare numerically so that checkpoint-100 ranks above checkpoint-20.
    latest = max(valid, key=valid.get)
    return os.path.join(output_dir, latest)


def train_fine_tune():
    """Fine-tune ``MODEL_NAME`` with LoRA adapters and save the result.

    Steps, in order:

    1. Log in to Weights & Biases using the ``WANDB_KEY`` environment variable.
    2. Load the tokenizer for ``MODEL_NAME`` (using EOS as the padding token)
       and the pre-tokenized dataset from ``{TRAINED_MODELS_DIR}/dataset``.
    3. Load the base causal-LM and wrap it with a LoRA configuration.
    4. Start a W&B run and build the ``Trainer``.
    5. Resume from the latest checkpoint in ``FINE_TUNING_OUTPUT_DIR`` when one
       exists, otherwise train from scratch.
    6. Save the model to ``{TRAINED_MODELS_DIR}/model/`` and the tokenizer to
       ``{TRAINED_MODELS_DIR}/tokenizer/``.

    The W&B run is always closed with ``wandb.finish()``, including when
    training raises, so a retry by the caller starts a fresh run instead of
    inheriting a dangling one.

    Returns:
        None. Results are written to disk and reported to W&B.

    Raises:
        Any exception raised while loading, training or saving is propagated
        unchanged after the W&B run has been closed.
    """
    print("Login no WandB")
    # Log in once, up front; the key comes from the environment so it is never
    # hard-coded in the notebook. (A second, redundant login was removed.)
    wandb.login(key=os.getenv('WANDB_KEY'))

    print("Carregando o tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    tokenized_dataset = load_from_disk(f"{TRAINED_MODELS_DIR}/dataset")
    print("Tokenizer carregado!")

    print("Carregando o modelo pré-treinado...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=32,
        lora_alpha=32,
        lora_dropout=0.1
    )
    # From here on `model` is the PEFT-wrapped model, not the bare base model.
    model = get_peft_model(model, config)
    print("Modelo carregado com sucesso!")

    print("Inicializando o projeto no WandB...")
    wandb.init(project="postech03", name="gpt2-medium-fine-tuning")
    print("Projeto inicializado no WandB.")

    try:
        print("Configurando os argumentos de treinamento...")
        training_args = TrainingArguments(
            output_dir=FINE_TUNING_OUTPUT_DIR,
            overwrite_output_dir=True,
            eval_strategy="no",
            learning_rate=3e-5,
            per_device_train_batch_size=BATCH_SIZE,
            warmup_steps=100,
            save_steps=100,
            save_total_limit=3,
            logging_dir=LOG_DIR,
            logging_steps=100,
            push_to_hub=False,
            report_to="wandb",
            fp16=True,
            gradient_accumulation_steps=4,
            dataloader_num_workers=2,
            optim="adamw_torch",
        )
        print("Argumentos de treinamento configurados!")

        print("Inicializando o trainer...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,
            processing_class=tokenizer,
        )
        print("Trainer inicializado com sucesso!")

        print("Iniciando o treinamento...")
        last_checkpoint = get_last_checkpoint(FINE_TUNING_OUTPUT_DIR)

        if last_checkpoint:
            print(f"Encontrado checkpoint: {last_checkpoint}")
            trainer.train(resume_from_checkpoint=last_checkpoint)
        else:
            print("Começando do inicio, nenhum checkpoint encontrado")
            trainer.train()

        print("Treinamento concluído!")

        print("Salvando o modelo e o tokenizer...")
        # NOTE(review): `model` is the PEFT wrapper, so this presumably writes
        # only the LoRA adapter weights rather than a full model — confirm.
        model.save_pretrained(f"{TRAINED_MODELS_DIR}/model/")
        tokenizer.save_pretrained(f"{TRAINED_MODELS_DIR}/tokenizer/")
        print("Modelo e tokenizer salvos com sucesso!")
    finally:
        # Always close the run started by wandb.init above, success or failure.
        wandb.finish()

# Run the fine-tuning with a bounded number of retries. Each failed attempt is
# reported and followed by a garbage-collection pass before trying again.
# Success is only announced when an attempt actually completed; if every
# attempt fails, the cell fails with the last error chained as the cause
# instead of printing a misleading success message.
MAX_ATTEMPTS = 3

attempt = 0
last_error = None
while attempt < MAX_ATTEMPTS:
    attempt += 1
    try:
        train_fine_tune()
    except Exception as e:
        # `e` is unbound once the except block exits, so keep a reference.
        last_error = e
        print(f"Erro treinando {e} na tentativa nº {attempt}, limpando memória e tentando novamente.")
        gc.collect()
    else:
        last_error = None
        break

if last_error is not None:
    raise RuntimeError(
        f"Treinamento falhou após {MAX_ATTEMPTS} tentativas"
    ) from last_error

print("Treinamento concluído com sucesso!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Login no WandB


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Carregando o tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer carregado!
Carregando o modelo pré-treinado...


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33manibalmf1[0m ([33mpapismunano[0m). Use [1m`wandb login --relogin`[0m to force relogin


Modelo carregado com sucesso!
Fazendo login no WandB...
Inicializando o projeto no WandB...


Projeto inicializado no WandB.
Configurando os argumentos de treinamento...
Argumentos de treinamento configurados!
Inicializando o trainer...
Trainer inicializado com sucesso!
Iniciando o treinamento...
Encontrado checkpoint: /content/drive/My Drive/Colab Notebooks/fine_tuning_output/checkpoint-7100


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
7200,5.4698
7300,5.4449
7400,5.456
7500,5.4223
7600,5.4225
7700,5.4194
7800,5.4068
7900,5.4123
8000,5.4384
8100,5.4292


Step,Training Loss
7200,5.4698
7300,5.4449
7400,5.456
7500,5.4223
7600,5.4225
7700,5.4194
7800,5.4068
7900,5.4123
8000,5.4384
8100,5.4292


Treinamento concluído!
Salvando o modelo e o tokenizer...




Modelo e tokenizer salvos com sucesso!
Login no WandB
Carregando o tokenizer...
Tokenizer carregado!
Carregando o modelo pré-treinado...




Modelo carregado com sucesso!
Fazendo login no WandB...
Inicializando o projeto no WandB...


0,1
train/epoch,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▅▁▂▆▇█▁▂▂▂█▄▁▁▂▂▄▁▁▂▃▄▅▂▂▄▃▃▂▃▁▂▅▂▁▃▃▂▄▃
train/learning_rate,███▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,▆▂▂▄▇▅▅▅▅▁█▄▄▅▃▅▄▆▃▆▂▄▇▄▄▃▅▂▂█▃▆▃▄▃▃▁▁▅▂

0,1
total_flos,9.602596959934218e+17
train/epoch,2.99972
train/global_step,15990.0
train/grad_norm,1.78113
train/learning_rate,0.0
train/loss,5.4064
train_loss,3.01608
train_runtime,9617.9087
train_samples_per_second,425.649
train_steps_per_second,1.663


Projeto inicializado no WandB.
Configurando os argumentos de treinamento...
Argumentos de treinamento configurados!
Inicializando o trainer...
Trainer inicializado com sucesso!
Iniciando o treinamento...
Encontrado checkpoint: /content/drive/My Drive/Colab Notebooks/fine_tuning_output/checkpoint-15990


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Step,Training Loss


Treinamento concluído!
Salvando o modelo e o tokenizer...




Modelo e tokenizer salvos com sucesso!
Login no WandB
Carregando o tokenizer...
Tokenizer carregado!
Carregando o modelo pré-treinado...




Modelo carregado com sucesso!
Fazendo login no WandB...
Inicializando o projeto no WandB...


0,1
train/epoch,▁
train/global_step,▁

0,1
total_flos,9.602596959934218e+17
train/epoch,2.99972
train/global_step,15990.0
train_loss,0.0
train_runtime,0.0149
train_samples_per_second,275302316.63
train_steps_per_second,1075290.134


Projeto inicializado no WandB.
Configurando os argumentos de treinamento...
Argumentos de treinamento configurados!
Inicializando o trainer...
Trainer inicializado com sucesso!
Iniciando o treinamento...
Encontrado checkpoint: /content/drive/My Drive/Colab Notebooks/fine_tuning_output/checkpoint-15990


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Step,Training Loss


Treinamento concluído!
Salvando o modelo e o tokenizer...
Modelo e tokenizer salvos com sucesso!
Treinamento concluído com sucesso!
