Install Dependencies

In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:0

Import Libraries

In [None]:
import torch
from datasets import load_dataset
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.nn import CrossEntropyLoss

Load Model and Tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer and the model.
model_name = "EleutherAI/gpt-neo-1.3B"

# Fetch the tokenizer files, then the causal-LM weights, for that checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]



Prepare Dataset

In [None]:
# Load the raw records from the JSON file
data = load_dataset('json', data_files='/content/data.json')

# Take the train split of the dataset
train_data = data['train']

# Reuse the tokenizer loaded in the "Load Model and Tokenizer" cell instead of
# downloading it a second time. Add a pad token only if none is defined (EOS is reused).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def preprocess_function(examples):
    """Join each prompt with its response and tokenize the combined text.

    Args:
        examples: A batch as passed by `Dataset.map(batched=True)`; the
            'prompt' and 'response' columns are read.

    Returns:
        The tokenizer output for the joined texts, truncated to at most 256
        tokens. Sequences are intentionally left unpadded so the data collator
        can pad each batch to its own longest sequence.
    """
    # Combine prompt and response into one training text per example
    full_texts = [f"{prompt} {response}" for prompt, response in zip(examples['prompt'], examples['response'])]

    # Tokenize without padding; padding to a fixed 256 tokens here would defeat
    # the collator's dynamic padding and waste memory on pad positions.
    return tokenizer(full_texts, truncation=True, max_length=256)


# Tokenize the whole train split in batches
tokenized_data = train_data.map(preprocess_function, batched=True)

# Data collator for dynamic padding; mlm=False selects causal-LM style labels
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

Generating train split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Training Arguments

In [None]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # No evaluation strategy is requested: the Trainer is created without an
    # eval_dataset, so per-epoch evaluation would fail. The default is no evaluation.
    learning_rate=1e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_total_limit=2,
    fp16=True,
    gradient_accumulation_steps=1,  # Try lowering this if memory is tight
    # The recorded run hit CUDA out-of-memory on a ~15 GiB GPU. Recompute
    # activations in the backward pass and use Adafactor, whose optimizer state
    # is much smaller than AdamW's, to cut peak memory.
    # NOTE(review): expected to fit, but confirm on the target GPU.
    gradient_checkpointing=True,
    optim="adafactor",
    logging_dir='./logs',
    logging_steps=10,
    dataloader_num_workers=2,  # Speeds up data loading
)




Trainer Setup

In [None]:
# Custom Trainer class to handle loss calculation
class CustomTrainer(Trainer):
    """Trainer that computes next-token cross-entropy, skipping masked positions."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the shifted causal language-modeling loss for one batch.

        Args:
            model: The model being trained; called with the batch tensors.
            inputs: Mapping of batch tensors. `labels` is used when present
                (positions equal to -100 are ignored); otherwise labels are
                derived from `input_ids`, masking positions where
                `attention_mask` is 0.
            return_outputs: When True, return `(loss, outputs)` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it; not used here.

        Returns:
            The scalar loss, or `(loss, outputs)` if `return_outputs` is True.
        """
        # Shallow copy so the caller's batch mapping is not mutated by pop().
        inputs = dict(inputs)

        # Prefer the collator's labels: they already mark padding with -100.
        # Removing them from the model call also avoids a second, unused loss
        # computation inside the model's forward pass.
        labels = inputs.pop("labels", None)
        if labels is None:
            labels = inputs["input_ids"]
            attention_mask = inputs.get("attention_mask")
            if attention_mask is not None:
                # Exclude padded positions from the loss.
                labels = labels.masked_fill(attention_mask == 0, -100)

        outputs = model(**inputs)
        logits = outputs.logits

        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # Flatten the tokens; -100 targets contribute nothing to the loss
        loss_fct = CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Initialize the custom trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Start Training!

In [None]:
# Start fine-tuning by calling train() on the `trainer` object built in the Trainer Setup cell.
trainer.train()


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 15.06 MiB is free. Process 5758 has 14.73 GiB memory in use. Of the allocated memory 14.51 GiB is allocated by PyTorch, and 86.45 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Save the Model

In [None]:
# Write the model first, then the tokenizer, into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("arifian-ai")

Testing the Fine-Tuned Model

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the checkpoint saved under "arifian-ai",
# paired with the tokenizer object already in memory.
generator = pipeline(
    'text-generation',
    model="arifian-ai",
    tokenizer=tokenizer,
)

# Generate a single continuation of the prompt (max_length=50) and print its text.
prompt = "Nama saya"
result = generator(prompt, num_return_sequences=1, max_length=50)
first_sequence = result[0]
print(first_sequence['generated_text'])
