# Process Data

#### To Do:
- Process to save the model - cannot download from Huggingface each time
- How to store the results - what format should this be in for Equity Signal Lab
- Base case, look at following the trend.
- Backtesting process to ensure point in time. Will need to request on daily basis for the proper test

In [24]:
import json
import boto3
from s3fs import S3FileSystem
import os
import json

import transformers
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

import pandas as pd

from IPython.display import Markdown, display

from helper import get_s3_folder
import s3_model
from s3_model import S3ModelHelper

In [25]:
import importlib
importlib.reload(s3_model)

<module 's3_model' from '/project/s3_model.py'>

## Bring in Financial Data from S3

In [None]:
## Load from S3
user_bucket_name = os.environ['BQUANT_SANDBOX_USER_BUCKET']
bqnt_username = os.environ['BQUANT_USERNAME']

path_to_s3 = f's3://{user_bucket_name}/{bqnt_username}/tmp/fs/data.json'
s3 = S3FileSystem()

all_data = {}
with s3.open(path_to_s3, 'rb') as f:
    all_data = json.load(f)


In [None]:
all_data.keys()

## Check the data and reformat

In [None]:
fin_data = all_data['2007-12-31']
#fin_data
date_is_all = pd.DataFrame(fin_data['is']).set_index(['ID', 'level_1'])
date_bs_all = pd.DataFrame(fin_data['bs']).set_index(['ID', 'level_1'])
date_is_all

In [None]:
# Load a single security
def get_securities(df):
    return df.reset_index(inplace=False)['ID'].drop_duplicates(inplace=False)

securities = get_securities(date_is_all)
sec_test = date_is_all.loc[securities[0]]

In [None]:
# remove all non-zero values
test_sec = sec_test.loc[(sec_test!=0).any(axis=1)]
test_sec

## Set up the LLM

To download the llama model, run the following code to login with an access token and then run the code to download and access the model. This requires a login from Huggingface. We use the Llama model (1B parameters but can try this with the 70B parameters too)

In [2]:
from huggingface_hub import login

with open('pass.txt') as p:
    hf_login = p.read()
    
hf_login = hf_login[hf_login.find('=')+1:]
login(hf_login, add_to_git_credential=False)

In [3]:
# Execute with a small model first`

# Llama Small model
#model_id = "meta-llama/Llama-3.2-3B-Instruct"#"meta-llama/Llama-3.3-70B-Instruct"

# Qwen2 72B Insturct
model_id = "Qwen/Qwen2.5-7B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)



config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


Check out: https://huggingface.co/docs/transformers/conversations

In [None]:
# Check if the model is in S3 or download from scratch
USE_HF = True

if USE_HF:
    model_id = "meta-llama/Llama-3.2-7B-Instruct"
    

    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
    )

    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.bfloat16 )
    tokenizer = AutoTokenizer.from_pretrained(model_id )
else:
    model_id = 'llama'
    

In [4]:
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.bfloat16 )
tokenizer = AutoTokenizer.from_pretrained(model_id )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
#messages = [
#    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
#    {"role": "user", "content": "where do you live?"}
#]

messages = [
    {"role": "system", "content": "You are a financial analyst and must make a buy, sell or hold decision on a company based only on the provided datasets. \
        Compute common financial ratios and then determine the buy sell decision. Explain your reasons and answer in a format that compiles to a JSON object.\
        Answer as a JSON string with the following example format: \
        {'Investment Decision': BUY, 'Reason': 'Gross profit and EPS have both increased over time'}"},
    {"role": "user", "content": test_sec.to_string()}

]


In [None]:
formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print("Chat: ", formatted_chat)

## Get the output from the LLM

In [None]:
outputs = pipeline(
    messages,
    max_new_tokens=500,
)

test_output = outputs[0]['generated_text'][-1]

In [None]:
display(Markdown(test_output['content'])) #[8:-4]

In [None]:
def format_json(llm_output):
    form = llm_output['content'].replace('\n','')
    eoj = form.find('}```')
    additional = form[eoj + 4:]
    json_obj = json.loads(form[7:eoj + 1])
    json_obj['AdditionalContext'] = additional
    return json_obj

In [None]:
obj = format_json(test_output)

In [5]:
model.save_pretrained('qwen')

In [None]:
clear_folder('llama')

In [6]:
username = os.environ['BQUANT_USERNAME']
username_folder = 'tmp/fs'

def save_folder_to_s3(name):
    client = boto3.client("s3")
    bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
    
    files = os.listdir(name)
    for file in files:
        local_path = f'{name}/{file}'
        obj_name = f'{username}/{username_folder}/{name}/{file}'
        res = client.upload_file(local_path, bucket, obj_name)
    print(res)
    
# Need to clear the files from local drive after downloading the model
def clear_folder(name):
    for root, dirs, files in os.walk(name, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))
        for name in dirs:
            os.rmdir(os.path.join(root, name))
            
def list_model_files(model_name):
    client = boto3.client("s3")
    bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
    folder = f'{username}/{username_folder}/{model_name}'
    
    files = []
    for file in client.list_objects(Bucket=bucket, Prefix=folder)['Contents']:
        key = file['Key']
        files.append(key)
    return files

                      
# re-load the model from s3
def load_model(model_name):
    client = boto3.client("s3")
    bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
    folder = f'{username}/{username_folder}/{model_name}'
    
    if not os.path.exists(model_name):
        os.makedirs(model_name)
        
    for file in client.list_objects(Bucket=bucket, Prefix=folder)['Contents']:
        key = file['Key']
        file_name = model_name + '/' + key[key.find(model_name + '/') + len(model_name) + 1:]
        client.download_file(bucket, key, file_name)
    return AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', torch_dtype=torch.bfloat16 )

def delete_model_in_s3(model_name):
    pass
    

In [7]:
save_folder_to_s3('qwen')

None


In [27]:
qwen = s3_model.S3ModelHelper(s3_sub_folder='tmp/fs')
qwen.load_model('qwen')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm):

In [28]:
qwen.clear_folder('qwen')

In [8]:
model = load_model('llama')
clear_folder('llama')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
clear_folder('qwen')

In [14]:
list_model_files('llama')

['bclarke16/tmp/fs/llama/config.json',
 'bclarke16/tmp/fs/llama/generation_config.json',
 'bclarke16/tmp/fs/llama/model-00001-of-00002.safetensors',
 'bclarke16/tmp/fs/llama/model-00002-of-00002.safetensors',
 'bclarke16/tmp/fs/llama/model.safetensors.index.json']

In [None]:
client = boto3.client("s3")
s3 = boto3.resource("s3")

In [None]:
bucket = os.environ['BQUANT_SANDBOX_USER_BUCKET']
folder = f'{username}/{username_folder}/llama'

my_bucket = s3.Bucket(bucket)

for file in client.list_objects(Bucket=bucket, Prefix=folder)['Contents']:
    key = file['Key']
    file_name = key[key.find('llama' + '/') + len('llama') + 1:]
    print(file_name)

In [None]:
ls = client.list_objects(Bucket=bucket, Prefix=folder)

In [None]:
client.download_file(bucket, 'bclarke16/tmp/fs/llama/config.json', 'config.json')

In [None]:
## Inference set with Llama/ Qwen

#Loop through each security, 

#extract the IS and the BS, combine into a single prompt

# convert to json

# store in file with the date and security name

# upload to cloud

#change