# Process Data

#### To Do:
- Process to save the model - cannot download from Huggingface each time
- How to store the results - what format should this be in for Equity Signal Lab
- Base case, look at following the trend.
- Backtesting process to ensure point in time. Will need to request on daily basis for the proper test

In [1]:
import json
import boto3
from s3fs import S3FileSystem
import os
import json

import transformers
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

import pandas as pd

from IPython.display import Markdown, display

from helper import get_s3_folder
import s3_model
from s3_model import S3ModelHelper

In [None]:
import importlib
importlib.reload(s3_model)

## Bring in Financial Data from S3

In [3]:
## Load from S3
user_bucket_name = os.environ['BQUANT_SANDBOX_USER_BUCKET']
bqnt_username = os.environ['BQUANT_USERNAME']

path_to_s3 = f's3://{user_bucket_name}/{bqnt_username}/tmp/fs/data.json'
s3 = S3FileSystem()

all_data = {}
with s3.open(path_to_s3, 'rb') as f:
    all_data = json.load(f)


In [4]:
all_data.keys()

dict_keys(['2023-12-31', '2022-12-31', '2021-12-31', '2020-12-31', '2019-12-31', '2018-12-31', '2017-12-31', '2016-12-31', '2015-12-31', '2014-12-31', '2013-12-31', '2012-12-31', '2011-12-31', '2010-12-31', '2009-12-31', '2008-12-31', '2007-12-31'])

## Check the data and reformat

In [5]:
fin_data = all_data['2007-12-31']
#fin_data
date_is_all = pd.DataFrame(fin_data['is']).set_index(['ID', 'level_1'])
date_bs_all = pd.DataFrame(fin_data['bs']).set_index(['ID', 'level_1'])
date_is_all

Unnamed: 0_level_0,Unnamed: 1_level_0,t,t-1,t-2,t-3,t-4,t-5
ID,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0111145D UN Equity,01 Revenue (Adj),2.960000e+09,3.357800e+09,2.739700e+09,2.662700e+09,1.897400e+09,2.366300e+09
0111145D UN Equity,02 Sales and Services Revenues (Adj),0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
0111145D UN Equity,03 Financing Revenue (Adj),0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
0111145D UN Equity,04 Other Revenue (Adj),0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
0111145D UN Equity,05 Cost of Revenue (Adj),1.743700e+09,2.212400e+09,1.695000e+09,1.692700e+09,9.701000e+08,1.477500e+09
...,...,...,...,...,...,...,...
ZION UW Equity,47 Basic EPS from Continuing Operations,5.700000e+00,5.310000e+00,4.550000e+00,3.770000e+00,3.680000e+00,3.260000e+00
ZION UW Equity,48 Diluted Weighted Average Shares,1.080280e+08,9.299400e+07,9.088200e+07,9.073400e+07,9.207900e+07,9.217400e+07
ZION UW Equity,49 Diluted EPS,5.360000e+00,5.160000e+00,4.470000e+00,3.720000e+00,2.780000e+00,3.070000e+00
ZION UW Equity,50 Diluted EPS from Continuing Operations,5.360000e+00,5.160000e+00,4.470000e+00,3.740000e+00,3.440000e+00,3.150000e+00


In [6]:
# Load a single security
def get_securities(df):
    return df.reset_index(inplace=False)['ID'].drop_duplicates(inplace=False)

securities = get_securities(date_is_all)
sec_test = date_is_all.loc[securities[0]]

In [7]:
# remove all non-zero values
test_sec = sec_test.loc[(sec_test!=0).any(axis=1)]
test_sec

Unnamed: 0_level_0,t,t-1,t-2,t-3,t-4,t-5
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01 Revenue (Adj),2960000000.0,3357800000.0,2739700000.0,2662700000.0,1897400000.0,2366300000.0
05 Cost of Revenue (Adj),1743700000.0,2212400000.0,1695000000.0,1692700000.0,970100000.0,1477500000.0
08 Gross Profit (Adj),1216300000.0,1145400000.0,1044700000.0,970000000.0,927300000.0,888800000.0
10 Operating Expenses (Adj),1010700000.0,974000000.0,874400000.0,780600000.0,733900000.0,685700000.0
14 Operating Income or Losses (Adj),205600000.0,171400000.0,170300000.0,189400000.0,193400000.0,203100000.0
17 Interest Expense (Adj),49100000.0,46800000.0,41200000.0,37300000.0,38500000.0,46000000.0
18 Interest Income (Adj),9000000.0,6000000.0,2300000.0,0.0,0.0,0.0
"21 Pretax Income (Loss), Adjusted (Adj)",174100000.0,171000000.0,105300000.0,169400000.0,185600000.0,181200000.0
22 Abnormal Losses (Gains),3100000.0,-47300000.0,32600000.0,-23700000.0,0.0,0.0
"28 Pretax Income (Loss), GAAP",174100000.0,171000000.0,105300000.0,169400000.0,185600000.0,181200000.0


## Set up the LLM

To download the llama model, run the following code to login with an access token and then run the code to download and access the model. This requires a login from Huggingface. We use the Llama model (1B parameters but can try this with the 70B parameters too)

In [None]:
from huggingface_hub import login

with open('pass.txt') as p:
    hf_login = p.read()
    
hf_login = hf_login[hf_login.find('=')+1:]
login(hf_login, add_to_git_credential=False)

In [None]:
# Execute with a small model first`

# Llama Small model
#model_id = "meta-llama/Llama-3.2-3B-Instruct"#"meta-llama/Llama-3.3-70B-Instruct"

# Qwen2 72B Insturct
model_id = "Qwen/Qwen2.5-7B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



Check out: https://huggingface.co/docs/transformers/conversations

In [2]:
# Check if the model is in S3 or download from scratch
USE_HF = False

model_id = "Qwen/Qwen2.5-7B-Instruct" #"meta-llama/Llama-3.2-7B-Instruct"

if USE_HF:
   
    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )

    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.bfloat16 )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    qwen = s3_model.S3ModelHelper(s3_sub_folder='tmp/fs')
    model = qwen.load_model('qwen')
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )
    qwen.clear_folder()
    
    

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [8]:
qwen.clear_folder('qwen')

In [9]:
#messages = [
#    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
#    {"role": "user", "content": "where do you live?"}
#]

messages = [
    {"role": "system", "content": "You are a financial analyst and must make a buy, sell or hold decision on a company based only on the provided datasets. \
        Compute common financial ratios and then determine the buy sell decision. Explain your reasons and answer in a format that compiles to a JSON object.\
        Answer as a JSON string with the following example format: \
        {'Investment Decision': BUY, 'Reason': 'Gross profit and EPS have both increased over time'}"},
    {"role": "user", "content": test_sec.to_string()}

]


In [10]:
formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("Chat: ", formatted_chat)

Chat:  <|im_start|>system
You are a financial analyst and must make a buy, sell or hold decision on a company based only on the provided datasets.         Compute common financial ratios and then determine the buy sell decision. Explain your reasons and answer in a format that compiles to a JSON object.        Answer as a JSON string with the following example format:         {'Investment Decision': BUY, 'Reason': 'Gross profit and EPS have both increased over time'}<|im_end|>
<|im_start|>user
                                                           t           t-1           t-2           t-3           t-4           t-5
level_1                                                                                                                           
01 Revenue (Adj)                                2.960000e+09  3.357800e+09  2.739700e+09  2.662700e+09  1.897400e+09  2.366300e+09
05 Cost of Revenue (Adj)                        1.743700e+09  2.212400e+09  1.695000e+09  1.692700e+09  9.70

## Get the output from the LLM

In [11]:
outputs = pipeline(
    messages,
    max_new_tokens=500,
)

test_output = outputs[0]['generated_text'][-1]

In [12]:
display(Markdown(test_output['content'])) #[8:-4]

```json
{
  "Investment Decision": "BUY",
  "Reason": "The company shows a steady increase in revenue and gross profit over the past five quarters. Additionally, the basic and diluted earnings per share (EPS) have shown improvement, indicating better profitability for shareholders."
}
```

In [None]:
# Needed for Llama 3.2 model - not needed for qwen
def format_json(llm_output):
    form = llm_output['content'].replace('\n','')
    eoj = form.find('}```')
    additional = form[eoj + 4:]
    json_obj = json.loads(form[7:eoj + 1])
    json_obj['AdditionalContext'] = additional
    return json_obj



In [None]:
obj = format_json(test_output)

In [None]:
model.save_pretrained('qwen')

In [13]:
## Inference set with Llama/ Qwen

#Loop through each security, 

#extract the IS and the BS, combine into a single prompt

# convert to json

# store in file with the date and security name

# upload to cloud

#change

## Unused code

In [None]:
# username = os.environ['BQUANT_USERNAME']
# username_folder = 'tmp/fs'

# def save_folder_to_s3(name):
#     client = boto3.client("s3")
#     bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
    
#     files = os.listdir(name)
#     for file in files:
#         local_path = f'{name}/{file}'
#         obj_name = f'{username}/{username_folder}/{name}/{file}'
#         res = client.upload_file(local_path, bucket, obj_name)
#     print(res)
    
# # Need to clear the files from local drive after downloading the model
# def clear_folder(name):
#     for root, dirs, files in os.walk(name, topdown=False):
#         for name in files:
#             os.remove(os.path.join(root, name))
#         for name in dirs:
#             os.rmdir(os.path.join(root, name))
            
# def list_model_files(model_name):
#     client = boto3.client("s3")
#     bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
#     folder = f'{username}/{username_folder}/{model_name}'
    
#     files = []
#     for file in client.list_objects(Bucket=bucket, Prefix=folder)['Contents']:
#         key = file['Key']
#         files.append(key)
#     return files

                      
# # re-load the model from s3
# def load_model(model_name):
#     client = boto3.client("s3")
#     bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
#     folder = f'{username}/{username_folder}/{model_name}'
    
#     if not os.path.exists(model_name):
#         os.makedirs(model_name)
        
#     for file in client.list_objects(Bucket=bucket, Prefix=folder)['Contents']:
#         key = file['Key']
#         file_name = model_name + '/' + key[key.find(model_name + '/') + len(model_name) + 1:]
#         client.download_file(bucket, key, file_name)
#     return AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', torch_dtype=torch.bfloat16 )

# def delete_model_in_s3(model_name):
#     pass