In [1]:
import sys
import os
from pathlib import Path

curr_path = Path(os.getcwd())
project_root = curr_path.parent.parent.parent
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from datasets import load_from_disk
from hybrid_stock_system import HybridStockSystem
import pandas as pd
import torch
import numpy as np
import pickle
# from configs.config import FUSION_DATA_DIR
FUSION_DATA_DIR = os.path.join(project_root, "data/fusion")


  from .autonotebook import tqdm as notebook_tqdm


# Create Fusion DS for fusion training

##### Already done


In [2]:
data_text = load_from_disk(os.path.join(FUSION_DATA_DIR, "data_text.arrow"))
data_price = pd.read_csv(os.path.join(FUSION_DATA_DIR, "data_price.csv"))

In [3]:
data_text

Dataset({
    features: ['Date', 'Label', 'input_ids', 'attention_mask', 'embedding'],
    num_rows: 1734
})

# Instanciate Hybrid_stock_system

## (Optional) Load TRAINED Sklearn model for indicators data

In [4]:
model = "checkpoints/12-09_13-01-06_RandomForestClassifier/model.pkl"

In [5]:
with open(os.path.join(project_root, model), 'rb') as f:
    model_price = pickle.load(f)

## (Optional) Load TRAINED torch model for news

In [6]:
model = "checkpoints/12-12_13-40-32_attention_text_0.521613832853026"

In [7]:
model_text = torch.load(os.path.join(project_root, model, "model.pt"), weights_only=False, map_location=torch.device('cpu'))

## Instanciate and preparing data

In [8]:
stock_system = HybridStockSystem()

In [9]:
y = data_price["target"]
data_price = data_price.drop(columns=["target", "Date"], errors='ignore')


In [10]:
tensor_text = torch.tensor(data_text["embedding"], dtype=torch.float32)
array_price = data_price.values

In [11]:
y = (y+1)/2 
y

0       0.0
1       1.0
2       1.0
3       0.0
4       0.0
       ... 
1729    0.0
1730    1.0
1731    1.0
1732    1.0
1733    1.0
Name: target, Length: 1734, dtype: float64

# Split data Train / test

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [13]:
price_train, price_test, y_train, y_test = train_test_split(np.array(array_price), np.array(y), test_size=0.2, shuffle=False)

text_train, text_test = train_test_split(np.array(tensor_text), test_size=0.2, shuffle=False)

In [14]:
print("train data") 
print(price_train.shape)
print(text_train.shape)
print(y_train.shape)
print()
print("test data")
print(price_test.shape)
print(text_test.shape)
print(y_test.shape)

train data
(1387, 42)
(1387, 25, 768)
(1387,)

test data
(347, 42)
(347, 25, 768)
(347,)


In [15]:
stock_system.fit(price_train, text_train, y_train)

Training Branch A (Numerical Model) on Technicals
Training Branch B (Text Model) on News
  Epoch 1/100 | Train Loss: 0.6955 | Val Loss: 0.6877
  Epoch 2/100 | Train Loss: 0.6884 | Val Loss: 0.6895
  Epoch 3/100 | Train Loss: 0.6810 | Val Loss: 0.6947
  Epoch 4/100 | Train Loss: 0.6731 | Val Loss: 0.6873
  Epoch 5/100 | Train Loss: 0.6707 | Val Loss: 0.6853
  Epoch 6/100 | Train Loss: 0.6550 | Val Loss: 0.6903
  Epoch 7/100 | Train Loss: 0.6475 | Val Loss: 0.6930
  Epoch 8/100 | Train Loss: 0.6439 | Val Loss: 0.6785
  Epoch 9/100 | Train Loss: 0.6276 | Val Loss: 0.6770
  Epoch 10/100 | Train Loss: 0.6144 | Val Loss: 0.7218
  Epoch 11/100 | Train Loss: 0.6041 | Val Loss: 0.7054
  Epoch 12/100 | Train Loss: 0.5940 | Val Loss: 0.6882
  Epoch 13/100 | Train Loss: 0.5903 | Val Loss: 0.7418
  Epoch 14/100 | Train Loss: 0.5836 | Val Loss: 0.6776
  Epoch 15/100 | Train Loss: 0.5704 | Val Loss: 0.6824
  Epoch 16/100 | Train Loss: 0.5590 | Val Loss: 0.6990
  Epoch 17/100 | Train Loss: 0.5518 | Va

#### Prediction

In [16]:
print()
prediction = stock_system.predict(price_test, text_test)




In [17]:
acc = accuracy_score(y_test, prediction)

In [18]:
acc

0.5360230547550432

In [19]:
prec = precision_score(y_test, prediction)
rec = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")


Accuracy: 0.5360, Precision: 0.5333, Recall: 0.6818, F1: 0.5985


In [29]:
pd.Series(y_test).value_counts(normalize=True)

target
1.0    0.507205
0.0    0.492795
Name: proportion, dtype: float64

# SAVE MODELS

In [21]:
price_model = stock_system.numerical_model
text_model = stock_system.text_model
meta_learner = stock_system.meta_learner





In [30]:
path_to_save = os.path.join(project_root, "checkpoints/hybrid_system")
if not os.path.exists(path_to_save):
    os.mkdir(path_to_save)

In [32]:
with open(os.path.join(path_to_save, "price_model.pkl"), "wb") as f:
    pickle.dump(price_model, f)

In [33]:
with open(os.path.join(path_to_save, "meta_learner.pkl"), "wb") as f:
    pickle.dump(meta_learner, f)

In [34]:
torch.save(text_model.state_dict(), os.path.join(path_to_save, "text_model.pt"))

# Re try with saved models

In [6]:
model_path = "checkpoints/hybrid_system"

with open(os.path.join(project_root, model_path, "price_model.pkl"), 'rb') as f:
    model_price = pickle.load(f)

with open(os.path.join(project_root, model_path, "meta_learner.pkl"), 'rb') as f:
    meta_learner = pickle.load(f)

model_text = torch.load(os.path.join(project_root, model, "text_model.pt"), weights_only=False, map_location=torch.device('cpu'))

In [8]:
hybrid_stock_system_v2 = HybridStockSystem(numerical_model=model_price, text_model=model_text, meta_learner=meta_learner)

In [16]:
prediction = hybrid_stock_system_v2.predict(price_test, text_test)

In [17]:
acc = accuracy_score(y_test, prediction)

In [18]:
acc

0.5360230547550432