## Prepare dataset for fine-tuning

### Finance-Alpaca dataset

In [None]:
from datasets import load_dataset

# Load the train split of the Finance-Alpaca dataset from the Hugging Face Hub
alpaca_data = load_dataset(
    "gbharti/finance-alpaca",
    split="train",
)

In [None]:
import json

def convert_to_jsonl(example):
    """Turn one Alpaca-style record into a chat-format training sample.

    Args:
        example: Mapping with "instruction", "input" and "output" entries.

    Returns:
        A dict with a single "messages" key holding, in order, the fixed
        system prompt, the user turn and the assistant turn. The user turn
        is the instruction alone when "input" is falsy; otherwise the input
        is appended to the instruction under an "Input:" line.
    """
    extra_context = example["input"]
    prompt = example["instruction"]
    if extra_context:
        # Keep the optional context visually separated from the instruction.
        prompt = f"{prompt}\nInput:\n{extra_context}"

    turns = [
        {"role": "system", "content": "You are a financial expert specialized in SEC filings and market analysis."},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": example["output"]},
    ]
    return {"messages": turns}

# Convert every record and store the result as JSONL: one JSON object per line.
with open("financial_alpaca.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(convert_to_jsonl(record)) + "\n" for record in alpaca_data
    )

In [None]:
# Validate dataset: run the mistral-finetune validation script on the generated JSONL file.
# NOTE(review): upstream mistral-finetune documents this step as
# `python -m utils.validate_data --train_yaml <config>.yaml`, run from the repo root —
# confirm this checkout accepts a positional JSONL path when invoked as a script.
# NOTE(review): the conversion cell writes to the relative path "financial_alpaca.jsonl";
# confirm the notebook's working directory is the directory referenced below.
!python ~/mulkooo/mistral-finetune/utils/validate_data.py ~/mulkooo/sj_Trading/content/alpaca/financial_alpaca.jsonl

In [None]:
# Reformat the dataset if needed: run the mistral-finetune reformat script on the same JSONL file.
# NOTE(review): presumably the script rewrites the file in place — confirm, and keep a copy
# of the original if it must be preserved.
!python ~/mulkooo/mistral-finetune/utils/reformat_data.py ~/mulkooo/sj_Trading/content/alpaca/financial_alpaca.jsonl

In [None]:
# Run fine-tuning with the Alpaca data.
# The shell command below is meant to be run from the mistral-finetune directory:

# python -m torch.distributed.run --nproc_per_node=1 finetune.py financial_alpaca.jsonl \
#     --model_name_or_path mistralai/Mistral-7B-Instruct-v0.3 \
#     --max_steps 1000 \
#     --learning_rate 2e-4 \
#     --seq_len 4096