## install libraries

In [1]:
!pip install -U unsloth


























## Imports

In [2]:
from unsloth import FastLanguageModel
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import torch, random, os
from typing import Dict, List, Any
from transformers.data.data_collator import DefaultDataCollator
SEED = 42


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


INFO 09-05 10:18:48 [__init__.py:235] Automatically detected platform cuda.


🦥 Unsloth Zoo will now patch everything to make training faster!


## Load model and Tokenizer

In [3]:
from huggingface_hub import login

# Replace 'your_hf_token' with your actual token
# You can get your token from: https://huggingface.co/settings/tokens
login(token="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")

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    "./Qwen3-4B-Instruct-2507",
    max_seq_length = 12000,
    dtype = torch.float16,
    load_in_4bit = True,
)

FastLanguageModel.for_training(model)
model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    lora_alpha=128,
    lora_dropout=0.0,
    use_rslora=True,
)


==((====))==  Unsloth 2025.9.1: Fast Qwen3 patching. Transformers: 4.53.3. vLLM: 0.10.0.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.559 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|                                                              | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:  33%|██████████████████                                    | 1/3 [00:06<00:13,  6.89s/it]

Loading checkpoint shards:  67%|████████████████████████████████████                  | 2/3 [00:15<00:07,  7.84s/it]

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████| 3/3 [00:15<00:00,  4.34s/it]

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████| 3/3 [00:15<00:00,  5.19s/it]




Unsloth 2025.9.1 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [5]:

RESPONSE_TAG = "<|response|>"
tokenizer.add_special_tokens({"additional_special_tokens": [RESPONSE_TAG]})
model.resize_token_embeddings(len(tokenizer))
response_token_id = tokenizer.convert_tokens_to_ids(RESPONSE_TAG)

## Format function

In [6]:
SPECIAL_TOKENS = ["<|response|>", "<|analysis|>", "<|forecast|>"]
num_added = tokenizer.add_special_tokens({"additional_special_tokens": SPECIAL_TOKENS})
if num_added > 0:
    model.resize_token_embeddings(len(tokenizer))

RESPONSE_TAG = "<|response|>"
ANALYSIS_TAG = "<|analysis|>"
FORECAST_TAG = "<|forecast|>"

response_token_id = tokenizer.convert_tokens_to_ids(RESPONSE_TAG)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"
tokenizer.padding_side     = "right"


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
import json


def normalize_output_to_commas(output: str) -> str:
    txt = str(output).strip()
    if txt.startswith("[") and txt.endswith("]"):
        try:
            arr = json.loads(txt)
            return ",".join(str(x).strip() for x in arr)
        except Exception:
            pass
    return ",".join([t.strip() for t in txt.split(",")])

def make_brief_analysis(thinking: str, limit_chars: int = 200) -> str:
    t = (thinking or "").strip()
    if not t:
        return "brief outlook based on the provided data"
    return t[:limit_chars].replace("\n", " ")

def format_chat(ex):
    instruction = ex.get("__index_level_0__", "") or ""
    user_input  = ex.get("__index_level_1__", "") or ""
    thinking    = ex.get("custom_bitcoin_dataset", "") or ""
    output      = ex.get("__index_level_2__", "") or ""

    post_input_directive = (
        "Please analyze it first and then give me 10 next day prices separated by comma."
    )
    user_block = f"{user_input}\n\n{post_input_directive}"

    brief_note = make_brief_analysis(thinking, limit_chars=20000000)
    normalized_output = normalize_output_to_commas(output)

    assistant_payload = (
        f"{RESPONSE_TAG}\n"
        f"{ANALYSIS_TAG} Analysis: {brief_note}\n\n"
        f"{FORECAST_TAG}\n{normalized_output}"
    )

    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": user_block},
        {"role": "assistant", "content": assistant_payload},
    ]
    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}

raw = load_dataset("tahamajs/not_clearned_bitcoin_dataset")
train_data = raw["train"].map(format_chat, remove_columns=raw["train"].column_names)

def tokenize_fn(ex):
    return tokenizer(ex["text"], truncation=True, max_length=12000, padding=False)

train_tok = train_data.map(tokenize_fn, batched=True, remove_columns=["text"])
train_tok = train_tok.shuffle(seed=SEED)



## Data callector

In [8]:
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from torch.nn.utils.rnn import pad_sequence
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.truncation_side = "left"
tokenizer.padding_side     = "right"

from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from torch.nn.utils.rnn import pad_sequence
import torch
from typing import Dict, List, Any

def _find_subsequence(haystack: torch.Tensor, needle: torch.Tensor) -> int:
    if needle.numel() == 0 or haystack.numel() < needle.numel():
        return -1
    for i in range(haystack.numel() - needle.numel() + 1):
        if torch.equal(haystack[i:i+needle.numel()], needle):
            return i
    return -1

class DataCollatorMaskResponse:
    def __init__(self, tokenizer: PreTrainedTokenizerBase, response_token_id: int):
        self.tokenizer = tokenizer
        self.response_token_id = response_token_id

        assistant_start_str = tokenizer.apply_chat_template(
            [{"role":"assistant","content":""}],
            tokenize=False, add_generation_prompt=True
        )

        self.assistant_start_ids = torch.tensor(
            tokenizer(assistant_start_str, add_special_tokens=False)["input_ids"],
            dtype=torch.long
        )

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        input_ids_list      = [torch.tensor(f["input_ids"], dtype=torch.long) for f in features]
        attention_mask_list = [torch.tensor(f["attention_mask"], dtype=torch.long) for f in features]

        input_ids = pad_sequence(input_ids_list, batch_first=True,
                                 padding_value=self.tokenizer.pad_token_id)
        attention_mask = pad_sequence(attention_mask_list, batch_first=True, padding_value=0)

        labels = input_ids.clone()

        for i in range(labels.size(0)):
            row = input_ids[i]

            pos = (row == self.response_token_id).nonzero(as_tuple=True)
            start_idx = -1
            if len(pos[0]) > 0:
                start_idx = int(pos[0][0].item())

            if start_idx < 0 and self.assistant_start_ids.numel() > 0:
                j = _find_subsequence(row, self.assistant_start_ids)
                if j >= 0:
                    start_idx = j + self.assistant_start_ids.numel() - 1

            if start_idx >= 0 and start_idx + 1 < row.numel():
                labels[i, : start_idx + 1] = -100
            else:
                keep = min(64, row.numel())
                labels[i, : row.numel() - keep] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
collator = DataCollatorMaskResponse(tokenizer, response_token_id)



## Training Argumenst

In [9]:
args = TrainingArguments(
    output_dir="qwen_bitcoin_chat_fast1_long",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    logging_steps=2,
    save_steps=200,
    bf16=False,
    fp16=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    gradient_checkpointing=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    tokenizer=tokenizer,
    data_collator=collator,
)

  trainer = Trainer(


In [10]:
trainer.train()




==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,843 | Num Epochs = 10 | Total steps = 4,610
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 132,120,576 of 4,153,912,832 (3.18% trained)


Step,Training Loss
2,69.3411
4,35.001
6,13.0242
8,2.3876
10,2.3097
12,2.3123
14,2.2392
16,2.0719
18,1.8432
20,1.7787


Unsloth: Will smartly offload gradients to save VRAM!






In [None]:
print("=== Sample formatted texts ===")
for i in range(2):
    print(train_data[i]["text"])
    print("="*80)

sample_batch = [train_tok[i] for i in range(2)]
batch = collator(sample_batch)

print("\n=== Tokenized input_ids ===")
print(batch["input_ids"][0][:4000])
print("\nDecoded back:\n", tokenizer.decode(batch["input_ids"][0][:80]))

print("\n=== Labels ===")
print(batch["labels"][0][:4000])

masked_decoded = [
    tok if lab != -100 else "[MASK]"
    for tok, lab in zip(batch["input_ids"][0][:4000].tolist(), batch["labels"][0][:4000].tolist())
]
print("\nMasked Decoded (first 80 tokens):")
print(masked_decoded)


## Save model

In [None]:
trainer.model.save_pretrained("qwen_bitcoin_chat_fast/lora_adapter")
tokenizer.save_pretrained("qwen_bitcoin_chat_fast")


In [None]:
from huggingface_hub import HfApi, create_repo

# 1. Initialize the API
api = HfApi()
local_folder = "qwen_bitcoin_chat_fast" # <--- Path to your checkpoint
repo_id = "tahamajs/qwen-bitcoin-chat-v1" # <--- Change this to your username and model name
create_repo(
    repo_id=repo_id,
    repo_type="model",
    private=False,  # Set to True if you want a private model
    exist_ok=True
)

# 3. Upload the entire folder
# This will create the repo if it doesn't exist and upload all contents.
print(f"Uploading folder '{local_folder}' to '{repo_id}'...")
api.upload_folder(
    folder_path=local_folder,
    repo_id=repo_id,
    repo_type="model", # Can be "dataset" or "space"
    commit_message="Uploading full model checkpoint"
)

print("Upload complete!")


In [None]:
import matplotlib.pyplot as plt

# --- Get the training logs ---
training_logs = trainer.state.log_history

# --- Extract loss and steps ---
# We need to filter out the final summary entry that doesn't have a 'loss' key.
train_losses = [log['loss'] for log in training_logs if 'loss' in log]
train_steps = [log['step'] for log in training_logs if 'loss' in log]

# --- Plot the loss ---
plt.figure(figsize=(10, 6))
plt.plot(train_steps, train_losses, label='Training Loss')

# --- Add labels and title for clarity ---
plt.title('Training Loss Curve')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# --- Get the training logs ---
training_logs = trainer.state.log_history

# --- Convert to a pandas DataFrame ---
log_df = pd.DataFrame(training_logs)

# --- Filter for training loss entries ---
# The 'loss' column will have NaN for evaluation logs, so we can drop them.
train_loss_df = log_df.dropna(subset=['loss'])

# --- Plot using Seaborn ---
plt.figure(figsize=(12, 7))
sns.lineplot(data=train_loss_df, x='step', y='loss')

# --- Add labels and title ---
plt.title('Training Loss Curve (using Seaborn)')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

In [None]:
# Assuming you already have the log_df from Method 2
# log_df = pd.DataFrame(trainer.state.log_history)

# --- Separate training and validation logs ---
train_df = log_df.dropna(subset=['loss'])
eval_df = log_df.dropna(subset=['eval_loss'])

# --- Plot both curves ---
plt.figure(figsize=(12, 7))
plt.plot(train_df['step'], train_df['loss'], label='Training Loss')
plt.plot(eval_df['step'], eval_df['eval_loss'], label='Validation Loss', linestyle='--')

plt.title('Training vs. Validation Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()
```If the validation loss starts to increase while the training loss continues to decrease, your model is overfitting.

In [None]:
from huggingface_hub import login

login()
