# Llama 3 fine-tuning for finance prediction

In [1]:
!pip install finnhub-python yfinance transformers bitsandbytes accelerate datasets peft wandb --quiet

## Collect data

### Libraries


In [2]:
import os
import re
import csv
import math
import time
import json
import random
import pandas as pd
from tqdm import tqdm
from functools import partial
from datetime import datetime
from collections import defaultdict

In [3]:
import finnhub
import yfinance as yf

import datasets
from datasets import Dataset

import wandb
import torch
import transformers
from torch.optim import AdamW
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    GenerationConfig, pipeline, Trainer, TrainingArguments, DataCollatorForSeq2Seq
)

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from google.colab import userdata

2024-07-03 21:51:04.741007: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-03 21:51:04.741120: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-03 21:51:04.883236: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Get news and stocks of companies

In [4]:
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

In [None]:
finnhub_client = finnhub.Client(api_key=userdata.get("FINNHUB_API_KEY"))

In [None]:
def bin_mapping(ret):
    up_down = 'U' if ret >= 0 else 'D'
    integer = math.ceil(abs(100 * ret))
    return up_down + (str(integer) if integer <= 5 else '5+')

In [None]:
def get_returns(stock_ticker, start_date, end_date):
    stock_data = yf.download(stock_ticker, start=start_date, end=end_date)

    weekly_data = stock_data['Adj Close'].resample('W').ffill()
    weekly_returns = weekly_data.pct_change()[1:]
    weekly_start_prices = weekly_data[:-1]
    weekly_end_prices = weekly_data[1:]

    weekly_data = pd.DataFrame({
        'start_date': weekly_start_prices.index,
        'start_price': weekly_start_prices.values,
        'end_date': weekly_end_prices.index,
        'end_price': weekly_end_prices.values,
        'weekly_returns': weekly_returns.values
    })

    weekly_data['bin_label'] = weekly_data['weekly_returns'].map(bin_mapping)

    return weekly_data

In [None]:
def get_news(ticker, data):
    news_list = []

    for _, row in data.iterrows():
        start_date = row['start_date'].strftime('%Y-%m-%d')
        end_date = row['end_date'].strftime('%Y-%m-%d')
        time.sleep(1) # control qpm
        weekly_news = finnhub_client.company_news(ticker, _from=start_date, to=end_date)
        weekly_news = [
            {
                "date": datetime.fromtimestamp(n['datetime']).strftime('%Y%m%d%H%M%S'),
                "headline": n['headline'],
                "summary": n['summary'],
            } for n in weekly_news
        ]
        weekly_news.sort(key=lambda x: x['date'])
        news_list.append(json.dumps(weekly_news))

    data['news'] = news_list

    return data

In [None]:
def get_basics(ticker, data, start_date, always=False):
    basic_financials = finnhub_client.company_basic_financials(ticker, 'all')

    final_basics, basic_list, basic_dict = [], [], defaultdict(dict)

    for metric, value_list in basic_financials['series']['quarterly'].items():
        for value in value_list:
            basic_dict[value['period']].update({metric: value['v']})

    for k, v in basic_dict.items():
        v.update({'period': k})
        basic_list.append(v)

    basic_list.sort(key=lambda x: x['period'])

    for i, row in data.iterrows():

        start_date = row['end_date'].strftime('%Y-%m-%d')
        last_start_date = start_date if i < 2 else data.loc[i-2, 'start_date'].strftime('%Y-%m-%d')

        used_basic = {}
        for basic in basic_list[::-1]:
            if (always and basic['period'] < start_date) or (last_start_date <= basic['period'] < start_date):
                used_basic = basic
                break
        final_basics.append(json.dumps(used_basic))

    data['basics'] = final_basics

    return data

In [None]:
def prep_data_for_ticker(ticker, data_dir, start_date, end_date):

    _ = get_returns(ticker, start_date, end_date)
    data = get_news(ticker, _)

    data = get_basics(ticker, data, start_date)
    data.to_csv(f"{data_dir}/{ticker}_{start_date}_{end_date}.csv")

### CSV

In [None]:
def append_to_csv(filename, input_data, output_data):
    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_data, output_data])

In [None]:
def initialize_csv(filename):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["prompt", "answer"])

### Prompts

In [None]:
def create_company_profile(ticker):
    profile = finnhub_client.company_profile2(symbol=ticker)
    company_template = "[Company Introduction]:\n\n{name} is a leading entity in the {finnhubIndustry} sector. " \
                      "Incorporated and publicly traded since {ipo}, the company has established its reputation " \
                      "as one of the key players in the market. \n\n{name} operates primarily in the {country}, " \
                      "trading under the ticker {ticker} on the {exchange}. As a dominant force in the {finnhubIndustry} space, " \
                      "the company continues to innovate and drive progress within the industry."

    formatted_str = company_template.format(**profile)

    return formatted_str

In [None]:
def map_bin_label(bin_lb):
    lb = bin_lb.replace('U', 'up by ')
    lb = lb.replace('D', 'down by ')
    lb = lb.replace('1', '0-1%')
    lb = lb.replace('2', '1-2%')
    lb = lb.replace('3', '2-3%')
    lb = lb.replace('4', '3-4%')
    if lb.endswith('+'):
        lb = lb.replace('5+', 'more than 5%')
    else:
        lb = lb.replace('5', '4-5%')

    return lb

In [None]:
def sample_news(news, n=5):
    if not 0 <= n <= len(news):
        raise ValueError(f"Bad N")
    sampled_indices = random.sample(range(len(news)), n)
    return [news[i] for i in sampled_indices]

In [None]:
def get_prompt_by_row(ticker, row):

    start_date = row['start_date'].strftime('%Y-%m-%d') if isinstance(row['start_date'], datetime) else str(row['start_date'])
    end_date = row['end_date'].strftime('%Y-%m-%d') if isinstance(row['end_date'], datetime) else str(row['end_date'])

    term = 'increased' if row['end_price'] > row['start_price'] else 'decreased'
    head = f"From {start_date} to {end_date}, {ticker}'s stock price {term} " \
           f"from {row['start_price']:.2f} to {row['end_price']:.2f}. News during this period are listed below:\n\n"

    news = json.loads(row["news"])
    news = [f"[Headline]: {n['headline']}\n[Summary]: {n['summary']}\n"
            for n in news
            if n['date'][:8] <= end_date.replace('-', '')
            and not n['summary'].startswith("Looking for stock market analysis and research with proves results?")]

    basics = json.loads(row['basics'])
    if not basics:
        basics_str = "[Basic Financials]:\n\nNo basic financial reported."
    else:
        basics_str = f"Some recent basic financials of {ticker}, reported at {basics['period']}, are presented below:\n\n[Basic Financials]:\n\n"
        basics_str += "\n".join(f"{k}: {v}" for k, v in basics.items() if k != 'period')

    return head, news, basics_str

In [None]:
def build_prompt(ticker, row, prev_rows, max_weeks, info_prompt):
    prompt = ""
    if prev_rows:
        num_prev_rows = min(random.choice(range(1, max_weeks+1)), len(prev_rows))
        for i in range(-num_prev_rows, 0):
            prompt += f"\n{prev_rows[i][0]}"  # Add Price Movement (Head)
            sampled_news = sample_news(prev_rows[i][1], min(5, len(prev_rows[i][1])))
            if sampled_news:
                prompt += "\n".join(sampled_news)
            else:
                prompt += "\nNo relative news reported."

    head, news, basics = get_prompt_by_row(ticker, row)
    prev_rows.append((head, news, basics))

    if len(prev_rows) > max_weeks:
        prev_rows.pop(0)

    if not prompt:
        return ""

    prediction = map_bin_label(row['bin_label'])
    prompt = f"{info_prompt}\n{prompt}\n{basics}"

    instruction_prompt = "\n\nBased on all the information before {start_date}, let's first analyze the positive developments and potential concerns for {ticker}. " \
                          "Come up with 2-4 most important factors respectively and keep them concise. Most factors should be inferred from company related news. " \
                          "Then let's assume your prediction for next week ({start_date} to {end_date}) is {prediction}. " \
                          "Provide a summary analysis to support your prediction. The prediction result need to be inferred from your analysis at the end, " \
                          "and thus not appearing as a foundational factor of your analysis."

    prompt += instruction_prompt.format(
        start_date=row['start_date'],
        end_date=row['end_date'],
        ticker=ticker,
        prediction=prediction,
    )

    return prompt.strip()

In [None]:
def create_prompts(ticker, data_dir, start_date, end_date, max_weeks):
  print("CREATING PROMPTS")
  df = pd.read_csv(f'{data_dir}/{ticker}_{start_date}_{end_date}.csv')

  info_prompt = create_company_profile(ticker)
  prev_rows = []
  all_prompts = []

  for _, row in df.iterrows():
    prompt = build_prompt(ticker, row, prev_rows, max_weeks, info_prompt)
    if prompt:
      all_prompts.append(prompt)

  print("CREATING PROMPTS END")

  return all_prompts

### Llama3

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [10]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
generation_config = GenerationConfig.from_pretrained(model_id)
generation_config.temperature = 0.001

stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

llama3 = pipeline(
  "text-generation",
  model=model,
  tokenizer=tokenizer,
  model_kwargs={"torch_dtype": torch.bfloat16},
  num_return_sequences=1,
  generation_config=generation_config,
  eos_token_id=stop_token_ids,
)

### Llama 3 completion

In [None]:
def get_completion(messages):
  prompt = tokenizer.apply_chat_template(
      messages,
      tokenize=False
  )

  outputs = llama3(prompt)
  return outputs

def get_assistant_response(output):
  gen_text = output[0]['generated_text']
  assistant_text = gen_text.rsplit('<|eot_id|>', 1)[1]
  response = assistant_text.replace('assistant\n\n', "", 1)

  return response

In [None]:
def llama3_completion(tickers, data_dir, start_date, end_date, max_weeks=3):

    for ticker in tqdm(tickers):

        print("Processing ticker:", ticker)

        csv_file = f'{data_dir}/{ticker}_{start_date}_{end_date}_llama3.csv'

        if not os.path.exists(csv_file):
            initialize_csv(csv_file)
            pre_done = 0
        else:
            df = pd.read_csv(csv_file)
            pre_done = len(df)

        prompts = create_prompts(ticker, data_dir, start_date, end_date, max_weeks)
        system_prompt = "You are a seasoned stock market analyst. " \
                        "Your task is to list the positive developments and potential " \
                        "concerns for companies based on relevant news and basic financials from the past weeks, " \
                        "then provide an analysis and prediction for the companies' stock price movement for the upcoming week. " \
                        "Your answer format should be as follows: " \
                        "\n\n[Positive Developments]:\n1. ..." \
                        "\n\n[Potential Concerns]:\n1. ..." \
                        "\n\n[Prediction & Analysis]:\n...\n"

        for i, prompt in enumerate(prompts):
            if i < pre_done:
                continue

            completion = get_completion(
                  messages=[
                      {"role": "system", "content": system_prompt},
                      {"role": "user", "content": prompt}
                    ]
            )

            answer = get_assistant_response(completion)
            append_to_csv(csv_file, prompt, answer)

## Prepare data for training

In [None]:
TICKERS = [
    "AXP", "AMGN", "AAPL", "BA", "CAT", "CSCO", "CVX", "GS", "HD", "HON",
    "IBM", "INTC", "JNJ", "KO", "JPM", "MCD", "MMM", "MRK", "MSFT", "NKE",
    "PG", "TRV", "UNH", "CRM", "VZ", "V", "WBA", "WMT", "DIS", "DOW"
    
    ## With my account there is access only to the US tickers
    
    # "ADS.DE", "ADYEN.AS", "AD.AS", "AI.PA", "AIR.PA", "ALV.DE",
    # "ABI.BR", "ASML.AS", "CS.PA", "BAS.DE", "BAYN.DE", "BBVA.MC",
    # "SAN.MC", "BMW.DE", "BNP.PA", "BN.PA", "DAI.DE", "DPW.DE", "DTE.DE",
    # "ENEL.MI", "ENGI.PA", "EL.PA", "FRE.DE", "IBE.MC", "ITX.MC", "IFX.DE",
    # "INGA.AS", "ISP.MI", "KER.PA", "AD.AS", "PHIA.AS", "OR.PA", "LIN.DE",
    # "MC.PA", "MUV2.DE", "NOKIA.SE", "ORA.PA", "RI.PA", "SAF.PA", "SAN.PA",
    # "SAP.DE", "SU.PA", "SIE.DE", "GLE.PA", "STM.PA", "TEF.MC", "TTE.PA",
    # "UNA.AS", "DG.PA", "VOW3.DE"
]

In [None]:
START_DATE = "2023-09-01"
END_DATE = "2024-06-01"

DATA_DIR = f"./llama_{START_DATE}_{END_DATE}"
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
for ticker in TICKERS:
    prep_data_for_ticker(ticker, DATA_DIR, START_DATE, END_DATE)

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*********************100%%**********************]  1 of 1 completed

[*******************

In [None]:
llama3_completion(TICKERS, DATA_DIR, START_DATE, END_DATE)

  0%|          | 0/30 [00:00<?, ?it/s]

Processing ticker: AXP

CREATING PROMPTS


  7%|▋         | 2/30 [00:00<00:08,  3.20it/s]

CREATING PROMPTS END

Processing ticker: AMGN

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: AAPL

CREATING PROMPTS


 13%|█▎        | 4/30 [00:00<00:05,  5.18it/s]

CREATING PROMPTS END

Processing ticker: BA

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: CAT

CREATING PROMPTS


 20%|██        | 6/30 [00:01<00:03,  6.54it/s]

CREATING PROMPTS END

Processing ticker: CSCO

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: CVX

CREATING PROMPTS


 27%|██▋       | 8/30 [00:01<00:02,  7.47it/s]

CREATING PROMPTS END

Processing ticker: GS

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: HD

CREATING PROMPTS


 33%|███▎      | 10/30 [00:01<00:02,  7.44it/s]

CREATING PROMPTS END

Processing ticker: HON

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: IBM

CREATING PROMPTS


 40%|████      | 12/30 [00:02<00:02,  7.16it/s]

CREATING PROMPTS END

Processing ticker: INTC

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: JNJ

CREATING PROMPTS


 47%|████▋     | 14/30 [00:02<00:02,  7.56it/s]

CREATING PROMPTS END

Processing ticker: KO

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: JPM

CREATING PROMPTS


 53%|█████▎    | 16/30 [00:02<00:01,  7.98it/s]

CREATING PROMPTS END

Processing ticker: MCD

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: MMM

CREATING PROMPTS


 60%|██████    | 18/30 [00:02<00:01,  7.95it/s]

CREATING PROMPTS END

Processing ticker: MRK

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: MSFT

CREATING PROMPTS


 67%|██████▋   | 20/30 [00:03<00:01,  8.01it/s]

CREATING PROMPTS END

Processing ticker: NKE

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: PG

CREATING PROMPTS


 73%|███████▎  | 22/30 [00:03<00:00,  8.58it/s]

CREATING PROMPTS END

Processing ticker: TRV

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: UNH

CREATING PROMPTS


 80%|████████  | 24/30 [00:03<00:00,  8.80it/s]

CREATING PROMPTS END

Processing ticker: CRM

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: VZ

CREATING PROMPTS


 87%|████████▋ | 26/30 [00:03<00:00,  8.71it/s]

CREATING PROMPTS END

Processing ticker: V

CREATING PROMPTS

CREATING PROMPTS END

Processing ticker: WBA

CREATING PROMPTS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


CREATING PROMPTS END

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


 90%|█████████ | 27/30 [09:19<08:20, 166.93s/it]

COMPLETION FINISHED

EXTRACTING RESPONSE

Processing ticker: WMT

CREATING PROMPTS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


CREATING PROMPTS END

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


 93%|█████████▎| 28/30 [36:38<20:17, 608.63s/it]

COMPLETION FINISHED

EXTRACTING RESPONSE

Processing ticker: DIS

CREATING PROMPTS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


CREATING PROMPTS END

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


 97%|█████████▋| 29/30 [1:04:19<15:24, 924.34s/it]

COMPLETION FINISHED

EXTRACTING RESPONSE

Processing ticker: DOW

CREATING PROMPTS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


CREATING PROMPTS END

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


COMPLETION FINISHED

EXTRACTING RESPONSE

COMPLETION IN PROCESS


100%|██████████| 30/30 [1:31:59<00:00, 183.97s/it] 

COMPLETION FINISHED

EXTRACTING RESPONSE





## Transform for training

### Methods

In [None]:
def gen2train(ticker, data_dir):
    csv_file = f'{data_dir}/{ticker}.csv'
    df = pd.read_csv(csv_file)
    prompts, answers, periods, labels = [], [], [], []

    for i, row in df.iterrows():
        prompt, answer = row['prompt'], row['answer']
        res = re.search(r"Then let's assume your prediction for next week \((.*)\) is ((:?up|down) by .*%).", prompt)
        period, label = res.group(1), res.group(2)
        prompt = re.sub(
            r"Then let's assume your prediction for next week \((.*)\) is (up|down) by ((:?.*)%). Provide a summary analysis to support your prediction. The prediction result need to be inferred from your analysis at the end, and thus not appearing as a foundational factor of your analysis.",
            f"Then make your prediction of the {ticker} stock price movement for next week ({period}). Provide a summary analysis to support your prediction.",
            prompt
        )
        answer = re.sub(
            r"\[Prediction & Analysis\]:\s*",
            f"[Prediction & Analysis]:\nPrediction: {label.capitalize()}\nAnalysis: ",
            answer
        )

        system_prompt = "You are a seasoned stock market analyst. " \
                        "Your task is to list the positive developments and potential " \
                        "concerns for companies based on relevant news and basic financials from the past weeks, " \
                        "then provide an analysis and prediction for the companies' stock price movement for the upcoming week. " \
                        "Your answer format should be as follows: " \
                        "\n\n[Positive Developments]:" \
                        "\n1. ..." \
                        "\n\n[Potential Concerns]:" \
                        "\n1. ..." \
                        "\n\n[Prediction & Analysis]:" \
                        "\nPrediction: ..." \
                        "\nAnalysis: ...\n"

        messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": prompt}
        ]
        train_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

        prompts.append(train_prompt)
        answers.append(answer)
        periods.append(period)
        labels.append(label)

    return {
        "prompt": prompts,
        "answer": answers,
        "period": periods,
        "label": labels,
    }

In [None]:
def transform2train(tickers, data_dir):
  train_list, test_list = [], []
  train_size = 0.8

  for ticker in tickers:
    data_dict = gen2train(ticker, data_dir)

    dataset = Dataset.from_dict(data_dict)
    train_split = round(train_size * len(dataset))

    train_list.append(dataset.select(range(train_split)))
    test_list.append(dataset.select(range(train_split, len(dataset))))

  train_dataset = datasets.concatenate_datasets(train_list)
  test_dataset = datasets.concatenate_datasets(test_list)

  dataset = datasets.DatasetDict({
      'train': train_dataset,
      'test': test_dataset
  })

  return dataset

### Transform

In [None]:
TICKERS = [
    "AXP", "AMGN", "AAPL", "BA", "CAT", "CSCO", "CVX", "GS", "HD", "HON",
    "IBM", "INTC", "JNJ", "KO", "JPM", "MCD", "MMM", "MRK", "MSFT", "NKE",
    "PG", "TRV", "UNH", "CRM", "VZ", "V", "WBA", "WMT", "DIS", "DOW"
]

In [None]:
DATA_DIR = "./llama_2023-09-01_2024-06-01"

In [None]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
llama3_dataset = transform2train(TICKERS, DATA_DIR)

In [None]:
llama3_dataset['train'][0]

{'prompt': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies' stock price movement for the upcoming week. Your answer format should be as follows: \n\n[Positive Developments]:\n1. ...\n\n[Potential Concerns]:\n1. ...\n\n[Prediction & Analysis]:\nPrediction: ...\nAnalysis: ...<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n[Company Introduction]:\n\nAmerican Express Co is a leading entity in the Financial Services sector. Incorporated and publicly traded since 1977-05-18, the company has established its reputation as one of the key players in the market. \n\nAmerican Express Co operates primarily in the US, trading under the ticker AXP on the NEW YORK STOCK EXCHANGE, INC.. As a dominant force in the Financial Services space

In [None]:
llama3_dataset.save_to_disk('./fin-prediction-2023-09-01_2024-06-01-llama3')

Saving the dataset (0/1 shards):   0%|          | 0/900 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

## Fine-tuning

### Libraries

In [7]:
os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")
os.environ['WANDB_PROJECT'] = 'llama3-fin-pred'

### Llama 3

In [8]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
torch.cuda.get_device_capability()

(7, 5)

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", attn_implementation="eager")

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [12]:
model = prepare_model_for_kbit_training(model)

### Dataset

In [15]:
def tokenize(tokenizer, feature):
    prompt = feature['prompt'].strip()
    answer = feature['answer'].strip()

    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False, truncation=True, max_length=2048)
    answer_ids = tokenizer.encode(answer, add_special_tokens=False, truncation=True, max_length=2048)

    input_ids = prompt_ids + answer_ids
    too_big = len(input_ids) >= 4096

    if input_ids[-1] != tokenizer.eos_token_id and not too_big:
        input_ids.append(tokenizer.eos_token_id)

    label_ids = [tokenizer.pad_token_id] * len(prompt_ids) + input_ids[len(prompt_ids):]

    return {
        "input_ids": input_ids,
        "labels": label_ids,
        "is_too_big": too_big
    }

In [16]:
dataset_name = "./fin-prediction-2023-09-01-2024-06-01-llama3"
dataset = datasets.load_from_disk(dataset_name)

In [17]:
tokenized_dataset = dataset.map(partial(tokenize, tokenizer))
print('original dataset length: ', len(dataset['train']))
tokenized_dataset = tokenized_dataset.filter(lambda x: not x['is_too_big'])
print('filtered dataset length: ', len(dataset['train']))
tokenized_dataset = tokenized_dataset.remove_columns(
    ['prompt', 'answer', 'label', 'period', 'is_too_big']
)

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

original dataset length:  900


Filter:   0%|          | 0/240 [00:00<?, ? examples/s]

filtered dataset length:  900


In [46]:
len(tokenized_dataset['train'][0]['labels'])

1210

### Params

In [18]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(model, lora_config)

In [20]:
current_time = datetime.now().strftime('%Y%m%d-%H-%M')

optimizer = AdamW(
    peft_model.parameters(),
    lr=6e-5,
    betas=(0.9, 0.97),
    eps=1e-8,
    weight_decay=0.005
)

training_args = TrainingArguments(
    output_dir=f"./results_{current_time}",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    log_level='info',
    gradient_accumulation_steps=32,
    eval_strategy='steps',
    save_strategy="steps",
    fp16=True,
    logging_steps=1,
    save_total_limit=3,
    save_steps=10,
    eval_steps=10,
    remove_unused_columns=False,
    report_to='wandb',
    run_name="llama3_fin_pred_run",
    max_grad_norm=0.3,
    warmup_ratio=0.03,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(
      tokenizer, padding=True,
      return_tensors="pt"
    ),
    optimizers=(optimizer, None),  # (optimizer, scheduler)
)

trainer.is_model_parallel = True

peft_model.gradient_checkpointing_enable()
peft_model.enable_input_require_grads()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend


### Train

In [21]:
trainer.train(resume_from_checkpoint=True)

trainer.save_model(training_args.output_dir)

Loading model from ./results_20240702-22-31/checkpoint-50.
***** Running training *****
  Num examples = 900
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 32
  Total optimization steps = 56
  Number of trainable parameters = 20,971,520
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 50
  Will skip the first 1 epochs then the first 704 batches in the first epoch.
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33maskador[0m ([33maskador-Kharkiv National University of Radio Electronics[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./results_20240702-22-31
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  

P.S. First part of the training:

```
Step	Training Loss	Validation Loss
10	2.051600	1.447994
20	0.303400	0.276916
30	0.109400	0.109720
40	0.083600	0.084822
50	0.079100	0.073767
***** Running Evaluation *****
  Num examples = 240
  Batch size = 1
Saving model checkpoint to ./results_20240702-22-31/checkpoint-10
/opt/conda/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./results_20240702-22-31/checkpoint-10/tokenizer_config.json
Special tokens file saved in ./results_20240702-22-31/checkpoint-10/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 240
  Batch size = 1
Saving model checkpoint to ./results_20240702-22-31/checkpoint-20
/opt/conda/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./results_20240702-22-31/checkpoint-20/tokenizer_config.json
Special tokens file saved in ./results_20240702-22-31/checkpoint-20/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 240
  Batch size = 1
Saving model checkpoint to ./results_20240702-22-31/checkpoint-30
/opt/conda/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./results_20240702-22-31/checkpoint-30/tokenizer_config.json
Special tokens file saved in ./results_20240702-22-31/checkpoint-30/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 240
  Batch size = 1
Saving model checkpoint to ./results_20240702-22-31/checkpoint-40
/opt/conda/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128009,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./results_20240702-22-31/checkpoint-40/tokenizer_config.json
Special tokens file saved in ./results_20240702-22-31/checkpoint-40/special_tokens_map.json
Deleting older checkpoint [results_20240702-22-31/checkpoint-10] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 240
  Batch size = 1
```

### Using pretrained

In [5]:
model_path = '/kaggle/input/fine-tuned-llama3-fin-pred-model/transformers/fin-pred-1/1'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config, device_map="auto", attn_implementation="eager")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [52]:
generation_config = GenerationConfig.from_pretrained(model_path)
generation_config.temperature = 0.001

stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

llama3 = pipeline(
  "text-generation",
  model=model,
  tokenizer=tokenizer,
  model_kwargs={"torch_dtype": torch.float16},
  generation_config=generation_config,
  eos_token_id=stop_token_ids,
)

In [8]:
def get_assistant_response(output):
  gen_text = output[0]['generated_text']
  assistant_text = gen_text.rsplit('<|eot_id|>', 1)[1]
  response = assistant_text.replace('assistant\n\n', "", 1)

  return response

In [53]:
system_prompt = "You are a seasoned stock market analyst. " \
                "Your task is to list the positive developments and potential " \
                "concerns for companies based on relevant news and basic financials from the past weeks, " \
                "then provide an analysis and prediction for the companies' stock price movement for the upcoming week. " \
                "Your answer format should be as follows: " \
                "\n\n[Positive Developments]:\n1. ..."\
                "\n\n[Potential Concerns]:\n1. ..." \
                "\n\n[Prediction & Analysis]" \
                "\nPrediction: ..." \
                "\nAnalysis: ..."


messages = [
    {
        'role': 'system',
        'content': system_prompt
    },
    {
        'role': 'user',
        'content': '''[Company Introduction]:

Apple Inc is a leading entity in the Technology sector. Incorporated and publicly traded since 1980-12-12, the company has established its reputation as one of the key players in the market. As of today, Apple Inc has a market capitalization of 3397265.94 in USD, with 15334.08 shares outstanding.

Apple Inc operates primarily in the US, trading under the ticker AAPL on the NASDAQ NMS - GLOBAL MARKET. As a dominant force in the Technology space, the company continues to innovate and drive progress within the industry.

From 2024-06-12 to 2024-06-20, AAPL's stock price decreased from 213.07 to 209.68. Company news during this period are listed below:

[Headline]: Is Apple Stock a Buy at $214? 1 Wall Street Analyst Thinks So
[Summary]: Trading at 33 times earnings, Apple will need to deliver a lot of growth to justify its current stock price.

[Headline]: Mar Vista Q1 2024 Strategic Growth Commentary
[Summary]: 

[Headline]: VGT: Last Year, IYW Was The Better Choice, But Now, SCHG Offers Safer Tech Exposure
[Summary]: Both VGT and IYW's 1-year results are largely due to NVDA's extreme surge. Read why SCHG would offer the best tech exposure going forward.

[Headline]: I Am Loading Up On These 8.5% Yielding REITs
[Summary]: 

[Headline]: This Undervalued Stock Could Join Nvidia in the $3 Trillion Club
[Summary]: Here's why Alphabet has what it takes to push its market cap above $3 trillion.

From 2024-06-20 to 2024-06-26, AAPL's stock price increased from 209.68 to 213.25. Company news during this period are listed below:

[Headline]: Nvidia, Microsoft, or Apple: Which Will Be the First to Reach a $4 Trillion Market Cap?
[Summary]: These AI-fueled companies are all within striking distance, but their paths to $4 trillion could look very different.

[Headline]: EU targets Apple's App Store with first charges using new digital competition rules
[Summary]: LONDON — European Union regulators on Monday leveled their first charges under the bloc’s new digital competition rulebook, accusing Apple of preventing app makers from pointing users to cheaper...

[Headline]: Nvidia, EU targets Microsoft, reactions to Yellen: Morning Brief
[Summary]: It's Tuesday in the final trading week of June 2024 and stocks (^DJI, ^IXIC, ^GSPC) are roaring to give it the old college try after tech-heavy indexes have been dragged down by Nvidia's (NVDA) recent sell-off. The Morning Brief Co-Hosts Seana Smith and Brad Smith guide investors through the market open, reporting on the leading business stories and stock trends of the day. Kace Capital Advisors Founder and CEO Kenny Polcari believes Nvidia's downturn and the market's reaction to it is "way overblown," telling the Morning Brief team that investors should focus on the chip company's long-term story. Bank of America Securities Managing Director and Chief US Economist Michael Gapen sits down to talk about US Secretary of Treasury Janet Yellen's comments on the Federal Reserve's inflation target and the likelihood of a recession made in an exclusive interview with Yahoo Finance. European Union regulators are targeting yet another tech giant, this time it's Microsoft (MSFT), alleging the company breached antitrust laws by bundling its Microsoft Teams software with other programs. Other stocks trending on Yahoo Finance include Boeing (BA) and Spirit AeroSystems (SPR), Airbus (AIR.PA, EADSY), and Carnival Corporation (CCL) as the cruise line operator reports fiscal second-quarter earnings results. Watch Yahoo Finance's full, exclusive interview with US Secretary of the Treasury Janet Yellen. This post was written by Luke Carberry Mogan.

[Headline]: Prediction: 3 Unstoppable Artificial Intelligence (AI) Stocks Will Be Bigger Than Nvidia in 2030
[Summary]: Nvidia is the fastest-growing artificial intelligence (AI) company right now, but it could run out of steam in the coming years.

[Headline]: Dow Jones Falls; Tesla Rival Rivian Soars 30% On Volkswagen Investment
[Summary]: Stock Market Today: The Dow Jones dropped Wednesday after key housing data. Rivian soared 30% on a $5 billion Volkswagen investment.

From 2024-06-26 to 2024-07-02, AAPL's stock price increased from 213.25 to 220.27. Company news during this period are listed below:

[Headline]: Apple to bring self-service repair program to Canada next year
[Summary]: TORONTO — Apple says Canadians will soon get access to a program providing them with the parts, tools and manuals they need to fix their own devices.The tech giant says the self-service repair...

[Headline]: Big Tech Stocks Are Picking Up Nvidia’s Slack. That’s a Positive for the Market Rally.
[Summary]: Microsoft, Amazon.com, and Google parent Alphabet have all been in the black since June 18 , while Apple and Facebook parent Meta Platforms recorded only modest declines.

[Headline]: If We Could Only Buy 5 Funds Today For Next 5 Years
[Summary]: Discover a 5-fund portfolio for income investors with exposure to various asset classes and potential for 7% income over the next five years. Click for the picks!

[Headline]: The U.S. State with the Fastest-Growing Economy
[Summary]: In this article, we will take a look at the U.S. State with the Fastest-Growing Economy. We have also compiled a full free list of the 20 U.S. States with the Fastest-Growing Economies. Navigating Global Economic Challenges Post-COVID-19: Insights and Trends Post the COVID-19 pandemic, the global economy is currently in a recovery phase after […]

[Headline]: Wedbush: Tech stocks should rise another 15% in 2H24
[Summary]: Wedbush analysts projected a 25% surge in tech stocks entering 2024, viewing the AI revolution as a key catalyst for a burgeoning tech bull market.

Some recent basic financials of AAPL, reported at 2024-03-30, are presented below:

[Basic Financials]:

assetTurnoverTTM: 1.1073
bookValue: 74194
cashRatio: 0.2640483920466476
currentRatio: 1.0371
ebitPerShare: 1.8041
eps: 1.5284
ev: 2719868.8
fcfMargin: 0.228
fcfPerShareTTM: 6.645
grossMargin: 0.4658
inventoryTurnoverTTM: 30.2839
longtermDebtTotalAsset: 0.2722
longtermDebtTotalCapital: 0.5136
longtermDebtTotalEquity: 1.2377
netDebtToTotalCapital: 0.4021
netDebtToTotalEquity: 0.969
netMargin: 0.2604
operatingMargin: 0.3074
payoutRatioTTM: 0.1508
pb: 35.6899
peTTM: 26.3771
pfcfTTM: 25.9812
pretaxMargin: 0.3092
psTTM: 6.9387
quickRatio: 0.9868
receivablesTurnoverTTM: 19.1901
roaTTM: 0.2913
roeTTM: 1.4833
roicTTM: 0.5706
rotcTTM: 0.6721
salesPerShare: 5.8684
sgaToSale: 0.5342
totalDebtToEquity: 1.4097
totalDebtToTotalAsset: 0.31
totalDebtToTotalCapital: 0.585
totalRatio: 1.2819'''
    }
]

In [64]:
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [65]:
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies\' stock price movement for the upcoming week. Your answer format should be as follows: \n\n[Positive Developments]:\n1. ...\n\n[Potential Concerns]:\n1. ...\n\n[Prediction & Analysis]\nPrediction: ...\nAnalysis: ...<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n[Company Introduction]:\n\nApple Inc is a leading entity in the Technology sector. Incorporated and publicly traded since 1980-12-12, the company has established its reputation as one of the key players in the market. As of today, Apple Inc has a market capitalization of 3397265.94 in USD, with 15334.08 shares outstanding.\n\nApple Inc operates primarily in the US, trading under the ticker AAPL on the NASDAQ NMS - G

In [66]:
completion = llama3(prompt)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [69]:
completion[0]['generated_text'][len(prompt):]

"**Positive Developments**:\n\n1. Apple's stock price has increased from 209.68 to 220.27 over the past week, indicating a positive trend.\n2. The company's self-service repair program is expanding to Canada, which may lead to increased customer satisfaction and loyalty.\n3. Big Tech stocks, including Apple, Microsoft, and Alphabet, have been performing well, indicating a positive trend in the market.\n\n**Potential Concerns**:\n\n1. Apple's stock price has been trading at a high valuation, with a PE ratio of 26.3771, which may indicate a potential for a correction.\n2. The company's reliance on a single product category, such as iPhones, may lead to volatility in its stock price.\n3. The European Union's charges against Apple's App Store may lead to regulatory scrutiny and potential fines.\n\n**Prediction & Analysis**:\n\nPrediction: Up by 5-7%\n\nAnalysis: Based on the positive developments and the company's strong financials, I predict that Apple's stock price will continue to rise 