# Fine-tuning Open Source Model
Fine-tune the Qwen 2.5 3B model on up to 10,000 prompts built from S&P 500 securities that do not appear in the Dow Jones index.

In [3]:
%package install trl

Running: micromamba install trl --yes --quiet --log-level=error

Note: Packages not from Bloomberg channels are not vetted by Bloomberg.
[93mPlease restart the Jupyter kernel if you run into any issues after installing or updating packages via %package.[0m



In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from tqdm import tqdm
from datasets import load_dataset
import random
import bql
import json

from company_data import SecurityData

import importlib
import company_data
import utils.model_helper as mh

In [2]:
# Reload company_data so edits to the module are picked up without restarting the kernel.
importlib.reload(company_data)
# BQL service handle; get_eps_values below issues its requests through `bq`.
bq = bql.Service()

### Set up training data

The securities that will be used to test the strategy must be excluded from the training data so that the two datasets are not contaminated. The training data uses the S&P 500 index as its universe, with the Dow Jones securities removed.

In [4]:
# Both datasets are read from the same folder.
DATA_FOLDER = 'tmp/fs'

# Dow Jones dataset: its securities are the ones held out of training.
dow_data = company_data.SecurityData(DATA_FOLDER, 'data_quarterly_pit_indu_refresh_blended.json')
dow_secs = dow_data.get_unique_securities()

# S&P dataset: the source of the training universe.
# (An annual file, 'data_annual_pit_spx.json', was used as an alternative at some point.)
spx_data = company_data.SecurityData(DATA_FOLDER, 'data_quarterly_pit_spx_refresh_blended.json')
spx_secs = spx_data.get_unique_securities()

In [5]:
# STEP 2: keep only the S&P securities that are not in the Dow Jones set, so the
# securities reserved for testing never appear in the training universe.
# A set gives a constant-time membership test instead of rescanning dow_secs
# for every S&P security.
dow_sec_lookup = set(dow_secs)
training_secs = [sec for sec in spx_secs if sec not in dow_sec_lookup]

In [6]:
# STEP 3: fetch the dates available in the S&P dataset; the next cell shuffles
# them and draws the training prompts from them.
dates = spx_data.get_dates()

In [7]:
# Shuffle a copy of the dates: shuffling `dates` in place meant that re-running
# this cell started from an already shuffled list and produced a different
# sample even though the seed was reset.
random.seed(21)
shuffled_dates = list(dates)
random.shuffle(shuffled_dates)

training_system_prompt = """You are a financial analyst. Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score."""

# Instruction appended after the financial statements in every prompt.
prompt_suffix = "\nAnswer in JSON format with the next period EPS, the direction, the magnitude and a confidence."

# Set lookup keeps the per-security membership test constant-time.
training_sec_lookup = set(training_secs)

# Assemble at most `count` prompt records, walking the shuffled dates in order.
prompts = []
count = 10000
for date in shuffled_dates:
    # Once the cap is reached there is nothing left to collect, so stop
    # querying the remaining dates.
    if len(prompts) >= count:
        break
    # Securities reporting on that date
    for security in spx_data.get_securities_reporting_on_date(date):
        # Skip anything outside the training universe
        if security not in training_sec_lookup:
            continue
        if len(prompts) >= count:
            break
        prompt = spx_data.get_prompt(date, security, training_system_prompt)
        # prompt[1]['content'] is used as the statement text.
        # NOTE(review): presumably the second message of a chat-style list — confirm against SecurityData.get_prompt.
        record = {
            'security': security,
            'date': date,
            'prompt': training_system_prompt + prompt[1]['content'] + prompt_suffix,
        }
        prompts.append(record)

In [20]:
# Get data from BQL for the actual and estimated EPS for each period
def get_eps_values(security:str, as_of_date:str) -> tuple[float, float]:
    """Return the first two basic-EPS values BQL reports for a security as of a date.

    Issues one BQL request for ``is_basic_eps_cont_ops`` over the fiscal period
    offsets ``0Q`` to ``+1Q`` and reads the first two rows of the ``eps`` column.

    Args:
        security: Security identifier passed to ``bql.Request``.
        as_of_date: Point-in-time date passed to BQL as the ``dates`` argument.

    Returns:
        ``(actual_value, estimate_value)``: the first and second ``eps`` rows.
        NOTE(review): this assumes BQL returns the ``0Q`` row before the
        ``+1Q`` row — confirm against the BQL response ordering.

    Raises:
        ValueError: If the response holds fewer than two ``eps`` rows.
    """
    field = {'eps': bq.data.is_basic_eps_cont_ops(dates=as_of_date,
                               fpo=bq.func.range('0Q','+1Q'),
                               currency='USD',
                               fpt='LTM',
                               fa_period_year_end='C1231',
                               fa_period_type_source='Q',
                               fa_act_est_data='AE')}
    # Request the data and unpack to a DataFrame
    request = bql.Request(security, field)
    response = bq.execute(request)
    eps = response[0].df()['eps']
    if len(eps) < 2:
        raise ValueError(
            f"Expected two EPS rows for {security} as of {as_of_date}, got {len(eps)}"
        )
    # .iloc is positional whatever the index labels are; plain [0] / [1] falls
    # back to positional access only through a deprecated pandas code path when
    # the index is not integer-based.
    actual_value = eps.iloc[0]
    estimate_value = eps.iloc[1]
    return actual_value, estimate_value

In [16]:
def construct_prompt_response(actual_value, est_value):
    """Build the JSON completion describing the move from actual to estimated EPS.

    Args:
        actual_value: EPS of the current period (the base of the comparison).
        est_value: EPS of the next period.

    Returns:
        A JSON string with three keys:
        ``EPS`` (``est_value`` rounded to 3 decimals),
        ``earnings`` (``'INCREASING'`` when ``est_value > actual_value``,
        otherwise ``'DECREASING'`` — equal values are therefore labelled
        ``'DECREASING'``), and
        ``magnitude`` (bucket of the absolute relative change:
        ``FLAT`` < 2%, ``SMALL`` <= 5%, ``LARGE`` <= 10%, else ``VERY LARGE``).
    """
    earnings = 'INCREASING' if est_value > actual_value else 'DECREASING'
    if actual_value == 0:
        # A relative change against a zero base is undefined: treat "no move"
        # as flat and any move away from zero as unbounded, rather than
        # dividing by zero.
        magnitude = 0.0 if est_value == 0 else float('inf')
    else:
        magnitude = est_value / actual_value - 1
    change = abs(magnitude)
    if change < 0.02:
        mag_descr = 'FLAT'
    elif change <= 0.05:
        mag_descr = 'SMALL'
    elif change <= 0.1:
        mag_descr = 'LARGE'
    else:
        mag_descr = 'VERY LARGE'
    return json.dumps({'EPS': round(est_value,3), 'earnings': earnings, 'magnitude':mag_descr})



In [18]:
# Sanity check: an estimate one third below the actual value is labelled DECREASING / VERY LARGE.
construct_prompt_response(3,2)

'{"EPS": 2, "earnings": "DECREASING", "magnitude": "VERY LARGE"}'

In [21]:
# Attach the JSON completion to every prompt record, stored under the 'eps' key.
# Each record triggers one get_eps_values call for its security and date.
for record in prompts:
    eps_pair = get_eps_values(record['security'], record['date'])
    record['eps'] = construct_prompt_response(*eps_pair)

In [10]:
# Inspect the first assembled record (security, date and prompt text).
prompts[0]

{'security': 'CRL UN Equity',
 'date': '2024-08-07',
 'prompt': 'You are a financial analyst. Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score.Income Statement:                                                        t           t-1           t-2           t-3           t-4           t-5\nitems                                                                                                                          \nRevenue                                      4.077776e+09  4.111596e+09  4.129409e+09  4.215776e+09  4.178310e+09  4.091504e+09\nCost of Revenue                              2.653295e+09  2.641851e+09  2.626853e+09  2.678421e+09  2.632572e+09  2.588017e+09\nGross Profit                                 1.424481e+09  1.469745e+09  1.502556e+09  1.537355e+09  1.545738

In [11]:
# Persist the prompt records; the fine-tuning section reloads them from this file.
with open('/tmp/training_prompts3.json', 'w') as prompts_file:
    prompts_file.write(json.dumps(prompts))

### Load the LLM for fine tuning

In [3]:
# Reload the prompt records written earlier in the notebook.
with open('/tmp/training_prompts3.json', 'rb') as f:
    prompts = json.load(f)

# Build the prompt/completion pairs used for fine-tuning.
# Each prompt is cut just before its 'Balance Sheet:' section and the answer
# instruction is appended again. str.partition keeps the whole prompt when the
# marker is missing; slicing with find() would use its -1 result and silently
# drop the last character instead.
truncated_prompt_suffix = "Answer in JSON format with the next period EPS, the direction, the magnitude and a confidence:"
data_set_for_training = [
    {
        'prompt': record['prompt'].partition('Balance Sheet:')[0] + truncated_prompt_suffix,
        'completion': str(record['eps']),
    }
    for record in prompts
]

# Despite the .jsonl extension this is a single JSON document with the rows
# under the 'data' key; load_dataset reads it with field='data'.
with open('/tmp/training_set.jsonl', 'w') as f:
    json.dump({'data': data_set_for_training}, f)

In [4]:

# Load the rows stored under the 'data' key and hold out 20% for testing.
# A fixed seed keeps the train/test membership identical across kernel restarts.
dataset = load_dataset("json", data_files="/tmp/training_set.jsonl", field='data', split='train').train_test_split(test_size=0.2, seed=21)

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
# Unpack the two halves of the split.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [6]:
# Load the base model through the project helper in utils.model_helper.
# NOTE(review): 'tmp/fs' is presumably the helper's storage folder and 'auto' a device-map setting — confirm against ModelHelper.
model_loader = mh.ModelHelper('tmp/fs')
model = model_loader.load_model('qwen3b','auto')

qwen3b


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [7]:
# NOTE(review): presumably removes the helper's local copy of the 'qwen3b' files now that the model is in memory — confirm against ModelHelper.clear_folder.
model_loader.clear_folder('qwen3b')

In [8]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-3B-Instruct', trust_remote_code=True)
# Fall back to EOS for padding only when the tokenizer ships without a pad token.
# Language-modelling collators set the label of every pad-token id to -100, so
# reusing EOS as the pad token also masks real end-of-sequence tokens and the
# model is never trained to stop generating.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

In [9]:
# Make generation use the same padding id as the tokenizer.
model.generation_config.pad_token_id = tokenizer.pad_token_id

## Set up for training

In [15]:
# with open('pass.txt') as p:
#     wandb_login = p.read()
    
# wandb_login = wandb_login[wandb_login.find('WANDB_KEY=')+10:wandb_login.find('\n',wandb_login.find('WANDB_KEY='))]
# os.environ['WANDB_API_KEY'] = wandb_login
# os.environ['WANDB_PROJECT'] = 'Earnings'
# os.environ['WANDB_LOG_MODEL'] = "checkpoint"
# os.environ['WANDB_WATCH'] = "gradients"
# wandb.login()
#wandb_login

In [10]:
# Completion-only collator: the intent is to mask the prompt so that the loss
# is computed only on the tokens that follow the response template.
from trl import DataCollatorForCompletionOnlyLM
# NOTE(review): the training prompts end with "a confidence:" (no trailing space),
# while this template is "confidence: " — confirm that the template token ids
# actually occur in the tokenized examples, otherwise nothing is left unmasked.
template = "confidence: "
# [2:] drops the first two token ids of the encoded template.
# NOTE(review): presumably to skip tokens whose encoding depends on the preceding
# context — verify what remains with the Qwen tokenizer.
collator = DataCollatorForCompletionOnlyLM(tokenizer.encode(template, add_special_tokens = False)[2:], tokenizer=tokenizer)

In [11]:
lora_parameters = LoraConfig(
    task_type='CAUSAL_LM',
    # Low-rank adapter size, scaling and dropout
    r=4,
    lora_alpha=8,
    lora_dropout=0.1,
    # Adapters are attached to the q/v/k/o projection modules only
    target_modules=['q_proj','v_proj','k_proj', 'o_proj'],
    bias='none',
)

In [32]:
train_parameters = SFTConfig(
    # --- run identity and outputs ---
    run_name='Earnings',
    output_dir='/tmp/qwen_trained',
    push_to_hub=False,
    # --- schedule and batching ---
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimiser ---
    optim='paged_adamw_32bit',
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- evaluation, logging and checkpoints ---
    eval_strategy='no',
    logging_steps=5,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=10,
    # --- data handling ---
    # NOTE(review): only the 'prompt' column is named as the text field; confirm
    # that the 'completion' column reaches the training text in the installed TRL version.
    dataset_text_field='prompt',
    max_seq_length=3000,
    remove_unused_columns=False,
)

In [13]:
# Build the supervised fine-tuning trainer: base model + LoRA config, the
# training split, and the completion-only collator defined earlier.
# NOTE(review): no eval_dataset is passed here, so only train_dataset is
# prepared by the trainer at construction time.
# NOTE(review): the config sets dataset_text_field='prompt'; confirm that the
# 'completion' column is part of the text the model is trained on.
fine_tuning = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=lora_parameters,
    tokenizer=tokenizer,
    args=train_parameters,
    data_collator=collator
)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
# Inspect one training row (prompt text plus its JSON completion).
train_dataset[0]

{'completion': '{"EPS": 5.41, "earnings": "INCREASING", "magnitude": "FLAT"}',
 'prompt': 'You are a financial analyst. Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score.Income Statement:                                                        t           t-1           t-2           t-3           t-4           t-5\nitems                                                                                                                          \nRevenue                                      1.669395e+09  1.662830e+09  1.653395e+09  1.637330e+09  1.606814e+09  1.563227e+09\nOperating Expenses                           1.028000e+06  2.692000e+06  2.891000e+06  2.896000e+06  2.132000e+06  2.870000e+05\nOperating Income or Losses                   5.251040e+08  5.284900e+08  5.253140e+

In [15]:
# Tokenize one training prompt to check its length against max_seq_length (3000).
tokens = tokenizer.tokenize(train_dataset[0]['prompt'])

In [16]:
# Token count of that prompt.
len(tokens)

1461

In [50]:
#fine_tuning.train()

In [28]:
# Memory footprint reported by the model, divided by 1e9 (bytes -> GB).
model.get_memory_footprint() / 1e9

2.640265472

In [18]:
# Save the trainer's model to the 'fine_tuned_json' directory, relative to the working directory.
# NOTE(review): nothing in this cell uploads to s3 — confirm where this directory is expected to live.
# NOTE(review): the fine_tuning.train() call above is commented out, so on a fresh
# Restart & Run All this saves a model that has not been trained.
fine_tuning.model.save_pretrained('fine_tuned_json')

In [55]:
# Attach the held-out split to the existing trainer.
# NOTE(review): the rows are assigned as-is (raw prompt/completion strings);
# confirm whether the trainer expects a tokenized dataset here.
fine_tuning.eval_dataset = test_dataset

In [27]:
# Inspect the trainer's compute_metrics attribute; the cell produced no output.
fine_tuning.compute_metrics

In [33]:
# Assign the current SFTConfig to the already-built trainer instead of rebuilding it.
# NOTE(review): the execution counts suggest the config cell was re-run after the
# trainer was created; on a fresh top-to-bottom run this assignment changes nothing.
fine_tuning.args = train_parameters

In [56]:
# NOTE(review): this cell raised a RuntimeError (parameters found on cuda:1 while
# cuda:0 was expected). evaluation_loop is the Trainer's internal loop and takes a
# DataLoader plus a description string; the public entry point is evaluate().
# Rework this cell before relying on its result.
fine_tuning.evaluation_loop(test_dataset, "test")

RuntimeError: module must have its parameters and buffers on device cuda:0 (device_ids[0]) but found one of them on device: cuda:1

In [38]:
# Inspect one held-out row (prompt text plus its JSON completion).
test_dataset[0]

{'completion': '{"EPS": 2.713, "earnings": "INCREASING", "magnitude": "SMALL"}',
 'prompt': 'You are a financial analyst. Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score.Income Statement:                                                        t           t-1           t-2           t-3           t-4           t-5\nitems                                                                                                                          \nRevenue                                      1.173602e+10  1.206899e+10  1.211215e+10  1.211157e+10  1.174396e+10  1.156111e+10\nCost of Revenue                              1.013180e+10  1.050052e+10  1.051190e+10  1.053492e+10  1.021493e+10  1.001821e+10\nGross Profit                                 1.604226e+09  1.568467e+09  1.600252

In [43]:
# Wrap the first ten held-out rows: each entry stores the row's prompt and
# completion strings in a dict under the 'input_ids' key.
eval_dataset = [
    {'input_ids': {key: test_dataset[idx][key] for key in ('prompt', 'completion')}}
    for idx in range(10)
]

In [44]:
# Inspect the first wrapped evaluation entry.
eval_dataset[0]

{'input_ids': {'prompt': 'You are a financial analyst. Use the following income statement, balance sheet to estimate the Basic EPS for the next fiscal period. Use only the data in the prompt. Provide a confidence score for how confident you are of the decision. If you are not confident then lower the confidence score.Income Statement:                                                        t           t-1           t-2           t-3           t-4           t-5\nitems                                                                                                                          \nRevenue                                      1.173602e+10  1.206899e+10  1.211215e+10  1.211157e+10  1.174396e+10  1.156111e+10\nCost of Revenue                              1.013180e+10  1.050052e+10  1.051190e+10  1.053492e+10  1.021493e+10  1.001821e+10\nGross Profit                                 1.604226e+09  1.568467e+09  1.600252e+09  1.576646e+09  1.529031e+09  1.542897e+09\nOperating Expenses 