Install dependency

In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

Import library and model

In [18]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
from sklearn.model_selection import train_test_split

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)




Load dataset

In [54]:
# Load dataset
dataset = load_dataset('json', data_files='modified_data.json', split='train')

# Split dataset to train and validation
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

Generating train split: 0 examples [00:00, ? examples/s]

Preprocess data

In [55]:
# Preprocess data function
def preprocess_function(examples):
    inputs = examples['prompt']
    targets = examples['response']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Tokenisasi output and add as 'labels'
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Preprocess dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]



Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Training argument

In [65]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,  # Rentang: 1e-5 hingga 5e-6
    per_device_train_batch_size=4,  # Rentang: 1 hingga 4
    per_device_eval_batch_size=4,  # Rentang: 1 hingga 4
    num_train_epochs=3,  # Rentang: 3 hingga 10
    weight_decay=0.001,  # Rentang: 0.01 hingga 0.1
    save_total_limit=2,
    fp16=True,  # Mixed precision
    gradient_accumulation_steps=4,  # Rentang: 4 hingga 16
    logging_dir='./logs',
    logging_steps=10,
    warmup_steps=500,  # Rentang: 0 hingga 1000
    report_to = "none"
)




Fine tune model

In [66]:
# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args
)

trainer.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
0,No log,2.43281
2,2.677400,2.429806


TrainOutput(global_step=18, training_loss=2.7131500244140625, metrics={'train_runtime': 29.6746, 'train_samples_per_second': 10.211, 'train_steps_per_second': 0.607, 'total_flos': 13012906798080.0, 'train_loss': 2.7131500244140625, 'epoch': 2.769230769230769})

Save model

In [67]:
model.save_pretrained('fine_tuned_t5-arifian-ai')
tokenizer.save_pretrained('fine_tuned_t5-arifian-ai')


('fine_tuned_t5-arifian-ai/tokenizer_config.json',
 'fine_tuned_t5-arifian-ai/special_tokens_map.json',
 'fine_tuned_t5-arifian-ai/spiece.model',
 'fine_tuned_t5-arifian-ai/added_tokens.json')

Test model

In [68]:
from transformers import pipeline

generator = pipeline('text2text-generation', model='fine_tuned_t5-arifian-ai', tokenizer='fine_tuned_t5-arifian-ai', device=0)

prompt = "Pertanyaan: Arifian, apa yang bisa kamu ceritakan tentang dirimu?"
result = generator(prompt, max_length=50)
print('Generated : ', result[0]['generated_text'])


Generated :  Saya melihatkan teknologi yang paling adalah berbagi pengetahuan yang paling yang paling yang paling ber
