# Fine-tuning Open Source Model
Fine-tune the Qwen 2.5 3B model on up to 10,000 prompts built from S&P 500 securities that do not appear in the Dow Jones Index.

In [1]:
%package install trl

Running: micromamba install trl --yes --quiet --log-level=error

Note: Packages not from Bloomberg channels are not vetted by Bloomberg.
[93mPlease restart the Jupyter kernel if you run into any issues after installing or updating packages via %package.[0m



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from tqdm import tqdm
from datasets import load_dataset
import random
import bql
import json

from company_data import SecurityData

import importlib
import company_data
import utils.model_helper as mh

In [3]:
# Re-import company_data so edits made to the module on disk are picked up without restarting the kernel.
importlib.reload(company_data)
# Create the BQL service handle that the EPS lookups further down send their requests through.
bq = bql.Service()

### Set up training data

The securities that will be used to test the strategy must be removed from the training data so that the two datasets do not contaminate each other. The training data uses the S&P 500 index as its universe, with the Dow Jones securities removed.

In [4]:
# Load the Dow Jones dataset; its securities are the ones held out of training.
# NOTE(review): 'tmp/fs' is relative to the working directory, unlike the absolute '/tmp/...' paths used later — confirm this is intended.
dow_data = company_data.SecurityData('tmp/fs','data_quarterly_pit_indu_refresh_blended.json')
dow_secs = dow_data.get_unique_securities()

# Load the S&P dataset that supplies the training universe.
spx_data = company_data.SecurityData('tmp/fs','data_quarterly_pit_spx_refresh_blended.json')  # alternative file tried earlier: 'data_annual_pit_spx.json'
spx_secs = spx_data.get_unique_securities()

In [5]:
# STEP 2: the training universe is every S&P security that is not also a Dow Jones security.
training_secs = [sec for sec in spx_secs if sec not in dow_secs]

In [6]:
# STEP 3: select random securities and dates for the training set.
# Fetch the dates available in the S&P dataset; they are put in random order before sampling.
dates = spx_data.get_dates()

In [7]:
# Seed the RNG so the date order, and therefore the sampled prompts, are reproducible.
random.seed(21)
# Shuffle a copy rather than `dates` itself: an in-place shuffle made this cell
# non-idempotent, because a re-run started from the already-shuffled order and
# sampled a different set of prompts despite the fixed seed.
shuffled_dates = list(dates)
random.shuffle(shuffled_dates)

training_system_prompt = """You are a financial analyst. Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score."""

# Assemble at most `count` prompt records, walking the shuffled dates in order.
prompts = []
count = 10000
# Set membership keeps the per-security check cheap; assumes securities are hashable
# identifiers such as ticker strings — TODO confirm.
training_sec_lookup = set(training_secs)
for date in shuffled_dates:
    # Stop walking dates once the cap is reached. Previously the `break` only left the
    # inner security loop, so every remaining date was still fetched for nothing.
    if len(prompts) >= count:
        break
    # Pull out the securities reporting on that date
    securities = spx_data.get_securities_reporting_on_date(date)
    for security in securities:
        if len(prompts) >= count:
            break
        # Skip anything outside the training universe (i.e. the held-out securities).
        if security not in training_sec_lookup:
            continue
        # presumably get_prompt returns a message list whose second entry carries the
        # financial-statement text — verify against company_data.
        prompt = spx_data.get_prompt(date, security, training_system_prompt)
        record = {
            'security': security,
            'date': date,
            'prompt': training_system_prompt + prompt[1]['content'] + "\nThe next period EPS is ",
        }
        prompts.append(record)

In [8]:

def get_eps_values(security:str, as_of_date:str) -> tuple[float, float]:
    """Return the first two Basic EPS values BQL reports for a security as of a date.

    The request asks for ``is_basic_eps_cont_ops`` over the fiscal-period-offset
    range ``'0Q'`` to ``'+1Q'`` with ``fa_act_est_data='AE'``. The first row of the
    result is returned as the "actual" value and the second as the "estimate"
    (presumably the current and the next fiscal period — confirm against the BQL
    field documentation).

    Args:
        security: Security identifier passed straight to ``bql.Request``.
        as_of_date: Point-in-time date passed as the field's ``dates`` argument.

    Returns:
        ``(actual_value, estimate_value)`` taken from rows 0 and 1 of the ``eps`` column.

    Raises:
        IndexError: If the response holds fewer than two rows.
    """
    field = {'eps': bq.data.is_basic_eps_cont_ops(dates=as_of_date,
                               fpo=bq.func.range('0Q','+1Q'),
                               currency='USD',
                               fpt='LTM',
                               fa_period_year_end='C1231',
                               fa_period_type_source='Q',
                               fa_act_est_data='AE')}
    # Request the data and unpack to a DataFrame
    req = bql.Request(security,field)
    dt = bq.execute(req)
    eps = dt[0].df()['eps']
    # Select by position explicitly. Plain `eps[0]` is a label lookup; on a frame that is
    # not integer-indexed it only worked through pandas' deprecated positional fallback.
    actual_value = eps.iloc[0]
    estimate_value = eps.iloc[1]
    return actual_value, estimate_value

In [9]:
# Attach the training target to every record: the second value returned by
# get_eps_values, rounded to three decimals. The first value is not needed here.
for record in prompts:
    _, next_period_eps = get_eps_values(record['security'], record['date'])
    record['eps'] = round(next_period_eps, 3)

In [11]:
# Persist the prompt records as they stand at this point so they can be reloaded without rebuilding them.
with open('/tmp/training_prompts2.json', 'w') as f:
    json.dump(prompts, f)

### Load the LLM for fine tuning

In [12]:
# To skip rebuilding `prompts`, reload a previously saved copy instead:
#with open('Data/training_prompts2.json', 'rb') as f:
#    prompts = json.load(f)

def _to_training_example(record, marker='Balance Sheet:', template="The next period EPS is "):
    """Turn one prompt record into a prompt/completion pair.

    The prompt is cut just before ``marker`` and ``template`` is appended, so the
    model sees everything that precedes the balance sheet. When ``marker`` is absent
    the text is kept whole. ``str.find`` returns -1 in that case, and slicing with it
    would silently drop the last character and then append the template a second time.

    Args:
        record: Mapping with a ``'prompt'`` string and an ``'eps'`` value.
        marker: Text at which the prompt is truncated.
        template: Trailing text every prompt must end with.

    Returns:
        ``{'prompt': str, 'completion': str}``.
    """
    text = record['prompt']
    cut = text.find(marker)
    if cut != -1:
        prompt_text = text[:cut] + template
    elif text.endswith(template):
        # Nothing to truncate and the template is already in place.
        prompt_text = text
    else:
        prompt_text = text + template
    return {'prompt': prompt_text, 'completion': str(record['eps'])}

# Convert every record into the prompt/completion layout used for training.
data_set_for_training = [_to_training_example(record) for record in prompts]

# Despite the .jsonl extension this is a single JSON object with the examples under
# 'data'; the loading step reads it with field='data', so the layout is kept as is.
with open('/tmp/training_set.jsonl', 'w') as f:
    json.dump({'data': data_set_for_training}, f)

In [31]:

# Load the examples stored under the 'data' key and hold out 20% as a test split.
# The fixed seed makes the split reproducible; without one, every kernel restart
# trained on a different 80% of the examples.
dataset = load_dataset("json", data_files="/tmp/training_set.jsonl", field='data', split='train').train_test_split(test_size=0.2, seed=21)

In [32]:
# Unpack the two splits produced by train_test_split.
train_dataset = dataset['train']
# NOTE(review): test_dataset is not referenced again in the visible cells — confirm it is used elsewhere.
test_dataset = dataset['test']

In [33]:
# Load the base model through the project's ModelHelper.
# NOTE(review): the arguments look like a storage root, a model key and a device map ('auto') — confirm in utils.model_helper.
model_loader = mh.ModelHelper('tmp/fs')
model = model_loader.load_model('qwen3b','auto')

qwen3b


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [34]:
# presumably removes the locally staged 'qwen3b' files now that the model is loaded — verify against utils.model_helper.
model_loader.clear_folder('qwen3b')

In [35]:
# Load the tokenizer for the Qwen 2.5 3B instruct checkpoint.
# NOTE(review): trust_remote_code=True allows the hub repository to run its own Python
# on load — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B-Instruct', trust_remote_code=True)
# Only borrow EOS as the pad token when the tokenizer ships without one. Overwriting an
# existing pad token with EOS makes the two ids identical, and a language-modelling
# collator that ignores pad positions in the labels then ignores EOS as well.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so real tokens keep their positions from the start of the sequence.
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [36]:
# Keep the model's generation config in step with the tokenizer's pad token id.
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [37]:
# Restrict the loss to what follows the response template, so the bulk of the prompt is masked.
from trl import DataCollatorForCompletionOnlyLM

template = "The next period EPS is "
# The first two template tokens are dropped before matching.
# NOTE(review): presumably because they tokenize differently when preceded by other text — confirm.
response_token_ids = tokenizer.encode(template, add_special_tokens=False)[2:]
collator = DataCollatorForCompletionOnlyLM(response_token_ids, tokenizer=tokenizer)

In [38]:
# LoRA adapter settings: rank 4 with alpha 8, applied to the q/k/v/o projection modules.
lora_parameters = LoraConfig(
    task_type='CAUSAL_LM',
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    bias='none',
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj'],
)

In [39]:
# Supervised fine-tuning configuration, grouped by concern.
train_parameters = SFTConfig(
    # --- run identity and outputs ---
    run_name='Earnings',
    output_dir='/tmp/qwen_trained',
    push_to_hub=False,
    # --- data ---
    # NOTE(review): only the 'prompt' column is named as the text field; confirm that the
    # installed TRL version also feeds the 'completion' column into the training text.
    dataset_text_field='prompt',
    max_seq_length=7000,
    group_by_length=True,
    # --- schedule ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    # --- optimisation ---
    optim='paged_adamw_32bit',
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- evaluation, logging, checkpoints ---
    eval_strategy='no',
    logging_steps=5,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=10,
)

In [41]:
# Wire the base model, LoRA settings, training split and completion-only collator into the trainer.
# NOTE(review): only the training split is supplied; no evaluation dataset is passed.
fine_tuning = SFTTrainer(
    model=model,
    args=train_parameters,
    train_dataset=train_dataset,
    data_collator=collator,
    peft_config=lora_parameters,
    tokenizer=tokenizer,
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [40]:
# Inspect the first training example to check the prompt/completion layout.
train_dataset[0]

{'completion': '34.777',
 'prompt': 'You are a financial analyst.Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score.Income Statement:                                                        t           t-1           t-2           t-3           t-4           t-5\nitems                                                                                                                          \nRevenue                                      1.440986e+10  1.405686e+10  1.373781e+10  1.353268e+10  1.332756e+10  1.286484e+10\nCost of Revenue                              7.028154e+09  6.794572e+09  6.592245e+09  6.445449e+09  6.307614e+09  6.107292e+09\nGross Profit                                 7.381706e+09  7.262288e+09  7.145566e+09  7.087226e+09  7.019949e+09  6.757551e+09\nOperating

In [41]:
# Tokenize the first training prompt so its length can be checked.
tokens = tokenizer.tokenize(train_dataset[0]['prompt'])

In [42]:
# Number of tokens in that prompt, for comparison with the configured max_seq_length.
len(tokens)

1714

In [42]:
# Run the fine-tuning; the training loss is logged every `logging_steps` steps.
fine_tuning.train()

Step,Training Loss
5,4.8279
10,4.8563
15,4.8208
20,4.7883
25,4.5891
30,4.0814
35,3.434
40,3.5042
45,2.873
50,3.0552


TrainOutput(global_step=8000, training_loss=0.7448820173591375, metrics={'train_runtime': 19414.1536, 'train_samples_per_second': 0.412, 'train_steps_per_second': 0.412, 'total_flos': 3.0943070324834304e+17, 'train_loss': 0.7448820173591375, 'epoch': 1.0})

In [28]:
# Model memory footprint scaled by 1e9 (GB, assuming get_memory_footprint reports bytes — confirm).
model.get_memory_footprint() / 1e9

2.640265472

In [28]:
# Save the fine-tuned model to the local 'fine_tuned' directory.
# NOTE(review): this comment previously said "s3", but no S3 upload is visible here — confirm where the upload happens.
fine_tuning.model.save_pretrained('fine_tuned')