In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
!wget https://files.catbox.moe/s33mcj.parquet -O kmeans.parquet

--2025-04-14 14:28:01--  https://files.catbox.moe/s33mcj.parquet
Resolving files.catbox.moe (files.catbox.moe)... 108.181.20.35
Connecting to files.catbox.moe (files.catbox.moe)|108.181.20.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5039601 (4.8M) [application/octet-stream]
Saving to: ‘kmeans.parquet’


2025-04-14 14:28:08 (730 KB/s) - ‘kmeans.parquet’ saved [5039601/5039601]



In [3]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
import torch

In [4]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset('parquet',
                       data_files='kmeans.parquet',
                       split = 'train')

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
dataset

Dataset({
    features: ['dataset_name', 'subset_name', 'prompt_id', 'template_name', 'dataset_key', 'input', 'output', 'cluster'],
    num_rows: 10000
})

In [6]:
def pre_process_conversation(examples):
    conversations = [
        [
            {"role": "user", "content": input_text},
            {"role": "assistant", "content": output_text}
        ]
        for input_text, output_text in zip(examples["input"], examples["output"])
    ]

    return {"conversations": conversations}

def check_valid_conversations(examples):
    return [all(msg["content"] is not None for msg in conv)
            for conv in examples["conversations"]]

dataset = dataset.map(
    pre_process_conversation,
    batched=True,
    batch_size=1000,
    num_proc=10,
    remove_columns=dataset.column_names
)

Map (num_proc=10):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
dataset

Dataset({
    features: ['conversations'],
    num_rows: 10000
})

In [8]:
from google.colab import userdata
import wandb
wandb.login(key=userdata.get("WANDB_KEY"))
wandb.init(project="fp-kcv", name=f"fp-kcv-wak-report")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdan-hafidz[0m ([33mabdan-hafidz-institut-teknologi-sepuluh-nopember[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
class Config:
  def __init__(self):
    self.LORA_R = 8
    self.LORA_ALPHA = 32
    self.LORA_DROPOUT = 0.05
    self.LEARNING_RATE = 2e-4
    self.BATCH_SIZE = 8
    self.EPOCHS = 3
    self.CUTOFF_LEN = 512
    self.OUTPUT_DIR = "olmo-lora"

In [10]:
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-1B-hf")
tokenizer.pad_token = tokenizer.eos_token

olmo_model = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMo-1B-hf",
    torch_dtype=torch.float16,
    device_map="auto"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [11]:
olmo_model.to("cuda")

OlmoForCausalLM(
  (model): OlmoModel(
    (embed_tokens): Embedding(50304, 2048, padding_idx=1)
    (layers): ModuleList(
      (0-15): 16 x OlmoDecoderLayer(
        (self_attn): OlmoAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): OlmoMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): OlmoLayerNorm()
        (post_attention_layernorm): OlmoLayerNorm()
      )
    )
    (norm): OlmoLayerNorm()
    (rotary_emb): OlmoRotaryEmbedding()
  )
  (lm_head): 

In [12]:
message = ["What's a Bird?", "What a bird can does?"]
inputs = tokenizer(message,
                   return_tensors='pt',
                   return_token_type_ids=False,
                   padding=True)

inputs = {k: v.to('cuda') for k,v in inputs.items()}

response = olmo_model.generate(**inputs,
                               max_new_tokens=100,
                               do_sample=True,
                               top_k=50,
                               top_p=0.95)

print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])

What's a Bird?Greetings Birdies! I have to share this news with all my birdies -- I had the best Valentine's date ever last night and I'll share a little something I wrote with my blog later on in the week.
The other day I came across some of my old dating profiles written by me years ago, when I was just getting my feet wet with online dating. I know they were written in the early '90s, and a few I had never shared, but the majority


In [13]:
config_params = Config()
config = LoraConfig(
    r=config_params.LORA_R,
    lora_alpha=config_params.LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Adjust these for OLMo
    lora_dropout=config_params.LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

In [14]:
def format_conversation(example):
    conversation = example['conversations']
    formatted_text = ""
    for message in conversation:
        if message['role'] == 'user':
            formatted_text += f"User: {message['content']}\n"
        else:
            formatted_text += f"Assistant: {message['content']}\n"
    return formatted_text.strip()

In [15]:
train_model = get_peft_model(olmo_model, config)

In [16]:
train_dataset = dataset.map(
        lambda x: {'formatted_text': format_conversation(x)},
        remove_columns=dataset.column_names
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [17]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7aa6a7ebda051a018cb4a5e5a671c9f6b80800b21e538b21bde0d099a8ad5695
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.3 rouge_score-0.1.2


In [18]:
from transformers import Trainer, TrainingArguments

from transformers import Trainer, TrainingArguments
import evaluate

# Load metrik dari evaluate
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Untuk BLEU, bentuknya: list of tokens
    bleu_preds = [pred.strip().split() for pred in decoded_preds]
    bleu_labels = [[label.strip().split()] for label in decoded_labels]

    bleu = bleu_metric.compute(predictions=bleu_preds, references=bleu_labels)

    # Untuk ROUGE, langsung pakai string
    rouge = rouge_metric.compute(
        predictions=[pred.strip() for pred in decoded_preds],
        references=[label.strip() for label in decoded_labels],
        use_stemmer=True
    )

    return {
        "bleu": bleu["bleu"],
        "rouge1": rouge["rouge1"],
        "rouge2": rouge["rouge2"],
        "rougeL": rouge["rougeL"],
        "rougeLsum": rouge["rougeLsum"],
    }

# TrainingArguments tetap
training_args = TrainingArguments(
    output_dir=config_params.OUTPUT_DIR,
    num_train_epochs=config_params.EPOCHS,
    per_device_train_batch_size=config_params.BATCH_SIZE,
    save_steps=50,
    logging_steps=10,
    eval_steps=100,
    learning_rate=config_params.LEARNING_RATE,
    fp16=False,
    optim="adamw_torch",
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    dataloader_num_workers=11,
    report_to="wandb",
)

# Preprocess dataset (dengan tokenizer)
def preprocess_function(examples):
    tokenized = tokenizer(
        examples["formatted_text"],
        truncation=True,
        max_length=config_params.CUTOFF_LEN,
        padding="max_length",
    )
    tokenized["labels"] = tokenized["input_ids"].copy()  # ← tambahkan ini
    return tokenized




Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [19]:
tokenized_dataset = train_dataset.map(preprocess_function, remove_columns=["formatted_text"])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [20]:
trainer = Trainer(
    model=train_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=11,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=100,
eval_strategy=IntervalStrategy.NO,
eval_us

In [22]:
train_model.print_trainable_parameters()

trainable params: 2,097,152 || all params: 1,178,861,568 || trainable%: 0.1779


In [23]:

trainer.train()




Step,Training Loss
10,6.9758
20,6.4282
30,2.9591
40,1.5608
50,1.4802
60,1.5349
70,1.4507
80,1.272
90,1.2642
100,1.4326


TrainOutput(global_step=3750, training_loss=1.150361833445231, metrics={'train_runtime': 822.1255, 'train_samples_per_second': 36.491, 'train_steps_per_second': 4.561, 'total_flos': 9.914932002816e+16, 'train_loss': 1.150361833445231, 'epoch': 3.0})

In [24]:
trainer.save_model(f'{config_params.OUTPUT_DIR}-final-model')
train_model.save_pretrained(f'{config_params.OUTPUT_DIR}-final-model')

In [25]:
!pip install requests



In [26]:
import os
import zipfile
import requests

def upload_to_catbox(file_path):
    url = 'https://catbox.moe/user/api.php'
    with open(file_path, 'rb') as f:
        response = requests.post(url, data={'reqtype': 'fileupload'}, files={'fileToUpload': f})
    if response.status_code == 200:
        return response.text.strip()
    else:
        raise Exception(f"Gagal upload {file_path}. Status: {response.status_code}, Respon: {response.text}")

def zip_directory(source_dir, zip_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(source_dir):
            for file in files:
                full_path = os.path.join(root, file)
                arcname = os.path.relpath(full_path, start=source_dir)
                zipf.write(full_path, arcname)

def upload_zipped_model(base_output_dir):
    final_model_dir = f"{base_output_dir}-final-model"
    zip_filename = f"{final_model_dir}.zip"

    if not os.path.exists(final_model_dir):
        raise FileNotFoundError(f"Direktori {final_model_dir} tidak ditemukan.")

    print(f"📦 Membuat zip dari direktori: {final_model_dir} -> {zip_filename}")
    zip_directory(final_model_dir, zip_filename)

    print(f"☁️ Mengupload zip ke Catbox: {zip_filename}")
    try:
        url = upload_to_catbox(zip_filename)
        print(f"✅ Upload berhasil! Link: {url}")
        return zip_filename, url
    except Exception as e:
        print(f"❌ Upload gagal: {e}")
        return zip_filename, None


# Jalankan
zip_name, link = upload_zipped_model(f'{config_params.OUTPUT_DIR}')

# Print hasil akhir
print("\n=== Hasil Upload ===")
if link:
    print(f"{zip_name}: {link}")
else:
    print("Tidak ada file yang berhasil diupload.")


📦 Membuat zip dari direktori: olmo-lora-final-model -> olmo-lora-final-model.zip
☁️ Mengupload zip ke Catbox: olmo-lora-final-model.zip
✅ Upload berhasil! Link: https://files.catbox.moe/f2qoxh.zip

=== Hasil Upload ===
olmo-lora-final-model.zip: https://files.catbox.moe/f2qoxh.zip


In [28]:
import torch
from transformers import AutoTokenizer
import time
import sys

class OLMoPEFTInference:
    def __init__(self, model, tokenizer_name="allenai/OLMo-1B-hf"):
        """
        Inisialisasi untuk inference dengan model hasil get_peft_model.
        """
        # Tokenizer setup
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token  # Jaga-jaga biar gak error padding

        # Model hasil training (get_peft_model)
        self.model = model
        self.model.eval()  # Mode eval wajib
        self.device = next(self.model.parameters()).device

    def _prepare_input(self, user_text, history=None):
        """
        Format input jadi prompt. History bisa ditambahkan untuk chat mode.
        """
        prompt = ""
        if history:
            for turn in history:
                role, content = turn["role"], turn["content"]
                prompt += f"{role.capitalize()}: {content}\n"
        prompt += f"User: {user_text}\nAssistant:"
        return prompt

    def generate(self, user_input, history=None, stream_output=True):
        """
        Generate output dari prompt atau percakapan.
        """
        formatted = self._prepare_input(user_input, history)

        # Tokenisasi prompt
        encoded = self.tokenizer(
            formatted,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.device)

        # Generate dengan model
        with torch.no_grad():
            output = self.model.generate(
                **encoded,
                max_new_tokens=300,
                # temperature=0.3,
                # top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Decode hasil
        decoded = self.tokenizer.decode(output[0], skip_special_tokens=True)
        response = decoded.split("Assistant:")[-1].strip()

        if stream_output:
            self._stream_response(response)
        else:
            print(response)

        return response

    def _stream_response(self, response):
        """
        Print response secara streaming seperti chatbot beneran.
        """
        for char in response:
            sys.stdout.write(char)
            sys.stdout.flush()
            time.sleep(0.015)
        print("\n")

# =====================================
# ✅ Contoh penggunaan
# =====================================
if __name__ == "__main__":
    from peft import get_peft_model, LoraConfig
    from transformers import AutoModelForCausalLM

    # Load base dan adapter model
    base = AutoModelForCausalLM.from_pretrained(
        "allenai/OLMo-1B-hf",
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Misal kita sudah punya config LoRA-nya (bisa dari PeftConfig atau manual)
    config = LoraConfig.from_pretrained("olmo-lora/checkpoint-1550")  # Ganti path sesuai model kamu

    # Inisialisasi inferencer
    infer = OLMoPEFTInference(train_model)

    # Jalankan inference (mode chat)
    chat = [
        {"role": "user", "content": "Halo siapa kamu?"},
        {"role": "assistant", "content": "Saya Cendol, model bahasa yang cerdas."}
    ]
    infer.generate("Apa itu sayuran?")#, dauistory=chat)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Saradikannya mengundang kaum manusia untuk menjadi sayur - sayur yang paling mendapat kesempatan dalam segmen, seperti luntungan atau gugah. Sayura tidak pernah sampai di Jepang sebagai sediminar dan keindahan-keindaan semesta ini adalah karena dirinya akan bercita-cita oleh saudara, kerajinan suku bule atau penuaian tanpa fasa agar lebih besar diberi kepada manusia tersebut. Seseorong cinta dimakan atau ketemu dengan siapa mereka. Dalam bidang hiburan, cerintah dibaca secara otentisitas sehingga dia juga berbincang dan membawanya apabila anda membalas jiwa atau pengacuan selagi belajar di tiongkok telah melestarikan pelayanan komunikasi dan mewujudkan hidup mereka tetapi ia berubah setelah berpeluang kuliah dengan begitu baiknya lagenda baru ditambah temen.

