<a href="https://colab.research.google.com/github/adhamhelmy/llm-fine-tuning/blob/main/unsloth/Ministral_3_(3B)_Reinforcement_Learning_Sudoku_Game.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Goal: Make Ministral generate trading strategies with Reinforcement Learning

Our goal is to make Ministral learn to generate trading strategies using reinforcement learning (GRPO).
The model will devise a strategy, and we'll reward it for valid strategies and returns.

# Installation
We'll be using [Unsloth](https://github.com/unslothai/unsloth) to do RL on Ministral. Unsloth saves 70% VRAM usage and makes reinforcement learning 2 to 6x faster.

In [None]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Install transformers branch for Ministral
!pip install git+https://github.com/huggingface/transformers.git@bf3f0ae70d0e902efab4b8517fce88f6697636ce
!pip install --no-deps trl==0.22.2

### Unsloth

In [None]:
from unsloth import FastVisionModel
import torch
max_seq_length = 4096 # Total sequence budget; can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

# Reference list of available checkpoints. It is not read by the call below:
# the model to load is selected by the `model_name` argument.
ministral_models = [
    "unsloth/Ministral-3-3B-Instruct-2512", # Ministral instruct models
    "unsloth/Ministral-3-8B-Instruct-2512",
    "unsloth/Ministral-3-14B-Instruct-2512",

    "unsloth/Ministral-3-3B-Reasoning-2512", # Ministral reasoning models
    "unsloth/Ministral-3-8B-Reasoning-2512",
    "unsloth/Ministral-3-14B-Reasoning-2512",

    "unsloth/Ministral-3-3B-Base-2512", # Ministral base models
    "unsloth/Ministral-3-8B-Base-2512",
    "unsloth/Ministral-3-14B-Base-2512",
] # More models at https://huggingface.co/unsloth

# Load the 3B instruct checkpoint; the second return value is bound to `tokenizer`.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name = "unsloth/Ministral-3-3B-Instruct-2512",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = False, # vLLM fast inference is disabled here; set to True to enable it
)

To do efficient RL, we will use [LoRA](https://arxiv.org/abs/2106.09685), which allows us to only add 1 to 5% of extra weights to the model for finetuning purposes. This allows us to save memory usage by over 60%, and yet it retains good accuracy.

In [None]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projection layers.
model = FastVisionModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank*2, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407, # Fixed seed
)

# Backtest Implementation

In [None]:
%pip install backtrader
%pip install alpaca_trade_api

In [None]:
# Load API Key ID and Secret Key from Colab secrets
# Read the Alpaca credentials from environment variables so they are never
# committed with the notebook. In Colab, copy your secrets into the environment
# before running this cell, e.g.
#   os.environ["APCA_API_KEY_ID"] = userdata.get("APCA_API_KEY_ID")
# Both values fall back to an empty string when the variable is not set.
import os

API_KEY_ID = os.environ.get("APCA_API_KEY_ID", "")
API_SECRET_KEY = os.environ.get("APCA_API_SECRET_KEY", "")

In [None]:
from alpaca_trade_api.rest import REST, TimeFrame

rest_api = REST(API_KEY_ID, API_SECRET_KEY, 'https://paper-api.alpaca.markets')

In [None]:
import backtrader as bt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 140 # chart resolution

def run_backtest(strategy, symbols, start, end, timeframe=TimeFrame.Day, cash=10000):
    '''Backtest a Backtrader strategy on Alpaca bars and report the result.

    params:
        strategy: the strategy you wish to backtest, a backtrader.Strategy subclass
                  (it is handed to cerebro.addstrategy)
        symbols: the symbol (str) or a list/tuple/set of symbols you wish to backtest on
        start: start date of backtest in format 'YYYY-MM-DD'
        end: end date of backtest in format: 'YYYY-MM-DD'
        timeframe: the timeframe the strategy trades on (size of bars) -
                   1 min: TimeFrame.Minute, 1 day: TimeFrame.Day, 5 min: TimeFrame(5, TimeFrameUnit.Minute)
        cash: the starting cash of backtest

    returns:
        (final_portfolio_value, sharpe_ratio) where sharpe_ratio is the value the
        SharpeRatio analyzer stores under 'sharperatio'

    raises:
        TypeError: if symbols is neither a str nor a list/tuple/set
        ValueError: if symbols is an empty collection
    '''

    # Normalise `symbols` up front so one code path fetches the data and an
    # unsupported input fails with a clear message instead of running a
    # backtest that has no data feed attached.
    if isinstance(symbols, str):
        symbol_list = [symbols]
    elif isinstance(symbols, (list, tuple, set)):
        symbol_list = list(symbols)
    else:
        raise TypeError(
            f"symbols must be a str or a list/tuple/set of str, got {type(symbols).__name__}"
        )
    if not symbol_list:
        raise ValueError("symbols must contain at least one symbol")

    # initialize backtrader broker
    cerebro = bt.Cerebro(stdstats=True)
    cerebro.broker.setcash(cash)

    # add strategy
    cerebro.addstrategy(strategy)

    # add analytics
    cerebro.addanalyzer(bt.analyzers.SharpeRatio, _name='mysharpe')

    # historical data request: one feed per symbol
    for symbol in symbol_list:
        alpaca_data = rest_api.get_bars(symbol, timeframe, start, end, adjustment='all').df
        data = bt.feeds.PandasData(dataname=alpaca_data, name=symbol)
        cerebro.adddata(data)

    # run
    initial_portfolio_value = cerebro.broker.getvalue()
    print(f'Starting Portfolio Value: {initial_portfolio_value}')
    results = cerebro.run()
    final_portfolio_value = cerebro.broker.getvalue()
    print(f'Final Portfolio Value: {final_portfolio_value} ---> Return: {(final_portfolio_value/initial_portfolio_value - 1)*100}%')

    strat = results[0]
    sharpe_ratio = strat.analyzers.mysharpe.get_analysis()
    print('Sharpe Ratio:', sharpe_ratio['sharperatio'])

    # plot (non-interactive backend) -> save figures and display in notebook
    import matplotlib.pyplot as plt
    from IPython.display import Image, display

    cerebro.plot(iplot=False)  # creates matplotlib figures when using Agg
    for i, fig_num in enumerate(plt.get_fignums(), start=1):
        plt.figure(fig_num)
        filename = f'backtest_plot_{i}.png'
        plt.savefig(filename, dpi=140, bbox_inches='tight')
        display(Image(filename))
    # release the figures so repeated backtests do not accumulate them
    plt.close('all')

    return final_portfolio_value, sharpe_ratio['sharperatio']

Execute strategies with time limits to prevent infinite loops.

In [None]:
from typing import Callable
from unsloth import execute_with_time_limit

To allow longer strategies for Reinforcement Learning, we shall allow a 10 second timer.

In [None]:
@execute_with_time_limit(10)
def execute_strategy(strategy, symbols, start, end, timeframe=TimeFrame.Day, cash=10000):
    """Run `run_backtest` under the 10 second limit applied by the decorator.

    Every argument is forwarded unchanged, and whatever `run_backtest`
    returns is handed back to the caller.
    """
    backtest_result = run_backtest(
        strategy, symbols, start, end, timeframe=timeframe, cash=cash
    )
    return backtest_result

Test with a simple strategy:

In [None]:
class SmaCross(bt.Strategy):
    """Moving-average crossover: buy when the fast SMA crosses above the slow SMA,
    close the position when it crosses back below."""

    # Configurable look-back windows for the two moving averages
    params = dict(
        pfast=13,  # fast moving average period
        pslow=25,  # slow moving average period
    )

    def __init__(self):
        fast_ma = bt.ind.SMA(period=self.p.pfast)
        slow_ma = bt.ind.SMA(period=self.p.pslow)
        # Positive on an upward cross of fast over slow, negative on a downward cross
        self.crossover = bt.ind.CrossOver(fast_ma, slow_ma)

    def next(self):
        if self.position:
            # Already in the market: exit on a cross to the downside
            if self.crossover < 0:
                self.close()
        elif self.crossover > 0:
            # Flat: enter on a cross to the upside
            self.buy()


# Smoke-test the time-limited backtest with SmaCross on AAPL daily bars.
# Only a timeout is handled here; any other exception propagates.
try:
    final_value, sharp_ratio = execute_strategy(SmaCross, 'AAPL', '2019-01-01', '2021-11-01', timeframe=TimeFrame.Day, cash=10000)
    print("Backtest completed successfully.")
except TimeoutError as e:
    print(f"Timed out: {e}")

# Code Execution

To execute and create a new Python function, we first have to check if the function does not call other global variables or cheat. This is called `countering reward hacking` since we don't want the function to cheat.

For example the below piece of code is fine, since it only imports Python level functions. We use `check_python_modules`:

In [None]:
from unsloth import check_python_modules, create_locked_down_function

# Test safe code
sample = """
def strategy(board, initial):
    for r in range(9):
        for c in range(9):
            if board[r][c] == 0:
                return (r, c, 1)
    return (0, 0, 1)
"""

ok, info = check_python_modules(sample)
print("Safe Python code?", ok)
print(info)


For the below piece of code, since we import `numpy`, we should not allow the execution:

In [None]:
sample = """
def strategy(board, initial):
    import numpy as np
    return (0, 0, 1)
"""

ok, info = check_python_modules(sample)
print("Safe Python code?", ok)
print(info)

# Data & RL task setup

Create the prompt that instructs the model to generate a trading strategy. You can customize this prompt to set up a different RL task.


In [None]:
prompt = """
Create a trading strategy that is fully compatible with the following backtesting setup:

- Framework: Backtrader
- Strategy must subclass bt.Strategy
- The strategy will be passed directly into:
  run_backtest(StrategyClass, symbols, start, end, timeframe, cash)

STRICT RULES:
1. Output ONLY a single Python class definition (no explanations, no markdown, no comments outside the class).
2. The class MUST be named Strategy.
3. Do NOT include imports (bt is already available).
4. Do NOT reference external data, files, APIs, or indicators outside Backtrader.
5. The strategy MUST work for:
   - Single-symbol strategies
   - Multi-symbol strategies (using self.datas)
6. All indicators must be created in __init__.
7. Trading logic must be implemented in next().
8. Orders must use only:
   - self.buy()
   - self.sell()
   - self.close()
   - self.order_target_percent()
9. No plotting, printing, logging, or analyzers.
10. Strategy must be deterministic and backtest-safe (no lookahead bias).

OUTPUT FORMAT:
Return ONLY the Python class exactly like this structure:

class Strategy(bt.Strategy):

    params = dict(
        # parameters here
    )

    def __init__(self):
        # indicator definitions

    def next(self):
        # trading logic

DO NOT output anything else.

""".strip()

print(prompt)

First, let's prompt the model without RL and see how it goes:

In [None]:
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    tokenize = False,
    add_generation_prompt = True,
)

from transformers import TextStreamer
print("=" * 50)
print("BASE MODEL OUTPUT (before RL training):")
print("=" * 50)
_ = model.generate(
    **tokenizer(images=None,text=text, return_tensors = "pt").to("cuda"),
    temperature = 1.0,
    max_new_tokens = 512,
    streamer = TextStreamer(tokenizer, skip_prompt = False),
)

# Reward functions

We now design an `extract_function` helper which simply extracts the `Strategy` class wrapped in triple backticks.

And 3 reward functions:

1. `function_works` which rewards the model if the extracted strategy is valid Python code.
2. `no_cheating` which checks if the function imported other modules, and if it did, we penalize it.
3. `strategy_succeeds` which backtests the auto-generated strategy and uses the resulting Sharpe ratio as the reward.

In [None]:
def extract_function(text):
    """Extract the generated ``Strategy`` class source from a model response.

    Two response shapes are accepted:

    * fenced: the code sits inside the first pair of triple backticks
      (an optional ``python`` language tag is dropped);
    * bare: the response has no backticks at all, which is what the prompt
      asks for ("no markdown"), so the whole response is treated as code.

    A response with a single, unclosed fence is rejected.

    Args:
        text: the raw completion text.

    Returns:
        The source starting at ``class Strategy(bt.Strategy):``, or ``None``
        when no such class definition can be found.
    """
    fence_count = text.count("```")
    if fence_count >= 2:
        first = text.find("```") + 3
        second = text.find("```", first)
        candidate = text[first:second].strip()
        candidate = candidate.removeprefix("python\n")
    elif fence_count == 0:
        candidate = text.strip()
    else:
        # Exactly one fence: the code block was never closed
        return None

    # Drop anything that precedes the class definition; handle "not found"
    # explicitly rather than slicing with the -1 that find() returns.
    class_start = candidate.find("class Strategy")
    if class_start == -1:
        return None
    candidate = candidate[class_start:]

    if candidate.startswith("class Strategy(bt.Strategy):"):
        return candidate
    return None

**Reward 1: Function Works**

Checks if the generated code is valid Python and can be executed.

In [None]:
def function_works(completions, **kwargs):
    """Score each completion on whether usable strategy code could be extracted.

    A completion earns 1.0 when `extract_function` finds a strategy and the
    `check_python_modules` report for it has no "error" entry; otherwise it
    earns -2.0. The extracted code and the check result are printed.
    """
    rewards = []
    for completion in completions:
        candidate = extract_function(completion[0]["content"])
        print("Extracted Function:\n", candidate)

        if candidate is None:
            rewards.append(-2.0)  # Nothing could be extracted
            continue

        ok, info = check_python_modules(candidate)
        print("Safe Python code?", ok)
        print(info)

        # An "error" entry in the report marks the code as invalid
        rewards.append(-2.0 if "error" in info else 1.0)
    return rewards

**Reward 2: No Cheating**

Penalizes functions that import external libraries.

In [None]:
def no_cheating(completions, **kwargs):
    """Score each completion on the verdict of `check_python_modules`.

    Returns one score per completion: 1.0 when the check passes, -20.0 when
    it does not, and -1.0 when no strategy could be extracted at all.
    """
    def _score(completion):
        candidate = extract_function(completion[0]["content"])
        if candidate is None:
            return -1.0  # Failed to create function
        ok, _info = check_python_modules(candidate)
        # Heavy penalty when the check fails
        return 1.0 if ok else -20.0

    return [_score(completion) for completion in completions]

**Reward 3: Strategy Succeeds**

Rewards strategies according to the Sharpe ratio they achieve in the backtest.

In [None]:
import numpy as np

PRINTER = 0  # Number of completions scored so far; every 5th one is printed in full

def strategy_succeeds(completions, **kwargs):
    """Reward each completion with the Sharpe ratio of its backtested strategy.

    The extracted source is compiled into a real ``Strategy`` class before it
    is handed to ``execute_strategy``; the backtester needs a class, not the
    source text.

    Scores, one per completion:
        0             no strategy extracted, or the module check reported an error
        Sharpe ratio  the backtest ran and produced a ratio
        -2.0          the backtest ran but produced no Sharpe ratio (None)
        -1.0          the backtest hit the time limit
        -3.0          the code failed to compile, did not define a
                      ``bt.Strategy`` subclass named ``Strategy``, or raised
                      during the backtest
    """
    global PRINTER
    scores = []

    for completion in completions:
        printed = False
        response = completion[0]["content"]
        function = extract_function(response)

        if PRINTER % 5 == 0:
            printed = True
            print("\n" + "=" * 60)
            print(function)
            print("=" * 60)
        PRINTER += 1

        if function is not None:
            ok, info = check_python_modules(function)

        if function is None or "error" in info:
            scores.append(0)
            continue

        try:
            # SECURITY: this executes model-generated code in the notebook
            # process. Only `bt` is exposed by name, but exec() is not a
            # sandbox -- run this only in a disposable environment.
            # The class body runs here, outside the backtest time limit.
            namespace = {"bt": bt}
            exec(function, namespace)
            strategy_class = namespace.get("Strategy")
            if not (isinstance(strategy_class, type) and issubclass(strategy_class, bt.Strategy)):
                raise TypeError("generated code did not define a bt.Strategy subclass named Strategy")

            final_value, sharp_ratio = execute_strategy(strategy_class, symbols='AAPL', start='2019-01-01', end='2021-11-01', timeframe=TimeFrame.Day, cash=10000)

            print(f"\n Final Value: {final_value}, Sharpe Ratio: {sharp_ratio}")

            if not printed:
                print("Strategy:")
                print(function[:200] + "..." if len(function) > 200 else function)

            scores.append(sharp_ratio if sharp_ratio is not None else -2.0)

        except TimeoutError:
            print("Timeout")
            scores.append(-1.0)
        except Exception as e:
            print(f"Exception: {str(e)[:100]}")
            scores.append(-3.0)

    return scores

# Dataset Preparation

Create the training dataset.

In [None]:
from datasets import Dataset

# Every row is the same chat-formatted prompt repeated 1000 times;
# "answer" is a constant 0 in every row.
dataset = Dataset.from_list([
    {
        "prompt": [{"role": "user", "content": prompt.strip()}],
        "answer": 0,
    }
] * 1000)

# Length of the chat-templated prompt.
# NOTE(review): this is a token count only if apply_chat_template returns token
# ids by default for this tokenizer/processor; if it returns a string, this is
# a character count -- TODO confirm.
maximum_length = len(tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    add_generation_prompt=True
))

print(f"Maximum prompt length: {maximum_length}")
print("\nDataset sample:")
print(dataset[0])

## Train the model

Now set up the GRPO Trainer and all configurations! We also support GSPO, GAPO, Dr GRPO and more! Go to the Unsloth [Reinforcement Learning Docs](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) for more options.

In [None]:
max_prompt_length = maximum_length + 1 # + 1 just in case!
max_completion_length = max_seq_length - max_prompt_length

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    temperature = 1.0,
    learning_rate = 5e-5,
    weight_decay = 0.001,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_completion_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 100,
    save_steps = 100,
    report_to = "none", # Can use Weights & Biases, TrackIO
    output_dir = "outputs",

    # For optional training + evaluation
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)

And let's run the trainer! If you scroll up, you'll see a table of rewards. The goal is to see the `reward` column increase!

You might have to wait 150 to 200 steps for any action. You'll probably get low reward for the first 100 steps. Please be patient!

| Step | Training Loss | reward    | reward_std | completion_length | kl       |
|------|---------------|-----------|------------|-------------------|----------|
| 1    | 0.000000      | 0.125000  | 0.000000   | 200.000000        | 0.000000 |
| 2    | 0.000000      | 0.072375  | 0.248112   | 200.000000        | 0.000000 |
| 3    | 0.000000      | -0.079000 | 0.163776   | 182.500000        | 0.000005 |


In [None]:
# For optional training + evaluation
# new_dataset = dataset.train_test_split(test_size = 0.01)

trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        function_works,
        no_cheating,
        strategy_succeeds,
    ],
    args = training_args,
    train_dataset = dataset,

    # For optional training + evaluation
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)

And let's train the model!

**NOTE** A T4 free GPU might take 5 minutes for one generation sadly since it's an old GPU - A100 or H100 will be much faster!

In [None]:
trainer.train()

In [None]:
# Test the function_works with properly formatted completions
test_completions = [[{"content": """```class Strategy(bt.Strategy):
    params = dict(
        lookback=40,
        volume_scale=1.0,
    )

    def __init__(self):
        self.ata = bt.indicators.SMA(self.data, period=10)
        self.atb = bt.indicators.SMA(self.data, period=20)
        self.atc = bt.indicators.EMA(self.data.close, period=25)
        self.ratio = self.atb / self.ata if hasattr(self, 'ata') and hasattr(self, 'atb') else 1.0

    def next(self):
        if self.position:
            target_diff = (self.atb - self.atc) / self.ata * self.p.volume_scale
            self.order_target_percent(target=target_diff)
        else:
            if self.data.close > self.atb and self.data.close > self.atc:
                self.buy()
            elif self.data.close < self.atc:
                self.order_target_percent(target=-0.05)
```"""}]]

function_works(test_completions)

Now let's use the LoRA we just trained with GRPO - but first, we save it!


In [None]:
model.save_pretrained("grpo_saved_lora")  # Local saving
tokenizer.save_pretrained("grpo_saved_lora")

Verify LoRA is actually trained!


In [None]:
from safetensors import safe_open

# Verify that every saved LoRA tensor (both the A and the B matrices) holds at
# least one non-zero value. Offending tensors are collected first so the
# failure message names all of them at once.
all_zero_keys = []
with safe_open("grpo_saved_lora/adapter_model.safetensors", framework = "pt") as f:
    for key in f.keys():
        tensor = f.get_tensor(key)
        if tensor.numel() > 0 and not bool((tensor != 0).any()):
            all_zero_keys.append(key)

assert not all_zero_keys, (
    f"{len(all_zero_keys)} LoRA tensor(s) are entirely zero, e.g. {all_zero_keys[:5]}"
)

<a name="Inference"></a>
# Inference
Now let's try the model we just trained!

In [None]:
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    tokenize = False,
    add_generation_prompt = True,
)

from transformers import TextStreamer

_ = model.generate(
    **tokenizer(images=None,text=text, return_tensors = "pt").to("cuda"),
    temperature = 1.0,
    max_new_tokens = 512,
    streamer = TextStreamer(tokenizer, skip_prompt = False),
)

<a name="Save"></a>
### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Merge to 16bit (disabled: flip `if False` to `if True` to run)
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit (disabled)
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
# NOTE(review): unlike the branches above, this one is enabled, so it runs on
# every execution of the cell. It saves to the local "model" directory and also
# pushes to the hard-coded "adhamhelmy/model" repo with an empty token string --
# confirm the upload is intended before sharing the notebook.
if True:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")

    model.push_to_hub("adhamhelmy/model", token = "")
    tokenizer.push_to_hub("adhamhelmy/model", token = "")


### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

[**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)

In [None]:
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "",
    )

## Load and Use the Trained Model

Load the saved LoRA model for inference on new trading strategy prompts.


In [None]:
from unsloth import FastVisionModel
from peft import PeftModel, set_peft_model_state_dict


# Load the saved weights
from safetensors.torch import load_file
state_dict = load_file("model/adapter_model.safetensors")

# The adapter file stores its keys without the adapter name, so a plain
# `model.load_state_dict(state_dict, strict=False)` matches nothing and silently
# loads no weights. `set_peft_model_state_dict` maps the keys back onto the
# active adapter before loading.
load_result = set_peft_model_state_dict(model, state_dict)

# Report keys that could not be placed instead of claiming success unconditionally
unexpected_keys = list(getattr(load_result, "unexpected_keys", None) or [])
if unexpected_keys:
    print(f"WARNING: {len(unexpected_keys)} adapter tensor(s) were not loaded, e.g. {unexpected_keys[:5]}")
else:
    print("✓ Model loaded successfully!")


In [None]:
FastVisionModel.for_inference(model)

# Generate a trading strategy
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    tokenize = False,
    add_generation_prompt = True,
)

from transformers import TextStreamer
print("\n" + "=" * 60)
print("TRAINED MODEL OUTPUT:")
print("=" * 60)
_ = model.generate(
    **tokenizer(images=None, text=text, return_tensors = "pt").to("cuda"),
    temperature = 1.0,
    max_new_tokens = 512,
    streamer = TextStreamer(tokenizer, skip_prompt = False),
)
