In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
model_name = "HooshvareLab/gpt2-fa-poetry"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [4]:
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(42001, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=42001, bias=False)
)

In [5]:
with open("p.txt", "r", encoding="utf-8") as f:
    poems = f.read().split('\n')

In [6]:
poems = [poem.strip() for poem in poems if poem.strip()]

In [7]:
dataset = Dataset.from_dict({"text": poems})

In [8]:
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map: 100%|██████████████████████████████████████████████████████████████| 10686/10686 [00:03<00:00, 3022.11 examples/s]


In [9]:
training_args = TrainingArguments(
    output_dir="./gpt2-fa-poetry-finetuned",  
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4, 
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",  
    logging_steps=100, 
    prediction_loss_only=True,
    remove_unused_columns=True,
    fp16=True,
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [None]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.6425
200,0.1518
300,0.1454
400,0.1412
500,0.1386
600,0.1352
700,0.136
800,0.1323
900,0.135
1000,0.132


In [None]:
trainer.save_model("./gpt2-fa-poetry-finetuned")
tokenizer.save_pretrained("./gpt2-fa-poetry-finetuned")