In [None]:
from google.colab import drive
drive.mount('/content/drive')
cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
%cd /content/drive/MyDrive/NewsTrading/trading_bot
%pip install -r requirements_clean.txt


In [None]:
# Fetch the rapidsai-csp-utils repository and run the Colab install script it ships.
# NOTE(review): the clone is not pinned to a tag/commit, so the script may change between runs.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [20]:
# Imported here as well: this cell sits above the main import cell, so on a
# fresh kernel (Restart & Run All) `torch` would otherwise be undefined.
import torch

# Report the installed PyTorch version as a quick sanity check.
pt_version = torch.__version__
print(f"[INFO] Current PyTorch version: {pt_version} (should be 2.x+)")

[INFO] Current PyTorch version: 2.1.0+cu121 (should be 2.x+)


In [12]:
import pandas as pd
import torch
import torch.nn as nn
import yaml
from dotmap import DotMap
from torch.optim import AdamW
from transformers import BertTokenizerFast, get_linear_schedule_with_warmup
from src.model.data_loading import create_dataloaders, get_text_and_labels, get_data_loader_from_dataset

from src.model.neural_network import (
    TRANSFORMER_HF_ID,
    MyBertModel,
    WeightedSquaredLoss,
    embed_inputs,
    train,
)

# Load the project configuration. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open until garbage collection.
with open("src/config_gcs.yaml") as config_file:
    config = DotMap(yaml.safe_load(config_file), _dynamic=False)

# Column names for the model input and regression target, as given by the config.
input_col_name = config.model.input_col_name
target_col_name = config.model.target_col_name

# Settings
FROM_SCRATCH = True  # when False, the training cell loads previously saved weights first
batch_size = 16
epochs = 3
# Tokenizer matching the Hugging Face model id used by the project's network module.
tokenizer = BertTokenizerFast.from_pretrained(TRANSFORMER_HF_ID)


In [13]:
# Load the merged dataset from the parquet location given by config.data.merged
dataset = pd.read_parquet(config.data.merged)

In [14]:
# Number of rows before filtering
len(dataset)

419094

In [15]:
# Drop penny stocks: keep only rows whose unadjusted open is at least 2
dataset = dataset.loc[dataset["unadj_open"].ge(2)]

In [16]:
# Number of rows left after the penny-stock filter
len(dataset)

388700

In [17]:
# Summary statistics of the filtered dataset (sanity check for outliers and missing values)
dataset.describe()

Unnamed: 0,r,r_spy,r_mkt_adj,std_252,dollar_volume,r_intra_(t-1),unadj_open
count,388700.0,388700.0,388700.0,370265.0,388700.0,388418.0,388700.0
mean,0.041601,0.00039,-0.041211,1.075474,311256.3,0.999981,48.207447
std,3.39326,0.007372,3.393262,72.026015,825653.4,0.026138,109.845293
min,-0.721714,-0.078634,-310.516494,0.022605,0.0,0.350619,2.0
25%,-0.010409,-0.002804,-0.010158,0.217405,17615.68,0.990991,14.5
50%,0.000509,0.000573,-0.000142,0.294301,69173.11,1.0,32.34
75%,0.011783,0.004102,0.009687,0.424232,274500.7,1.009047,58.38
max,310.5,0.061387,0.721219,9998.941949,95845020.0,3.69697,33750.0


In [18]:
# Arguments common to both splits; only the split name differs between the calls.
shared_loader_args = dict(
    dataset=dataset,
    tokenizer=tokenizer,
    batch_size=batch_size,
)

# Validation loader first, then training loader (same order as before).
# Each call receives its own fresh, empty data_loader_kwargs dict.
validation_dataloader = get_data_loader_from_dataset(
    split="validation",
    data_loader_kwargs={},
    **shared_loader_args,
)

train_dataloader = get_data_loader_from_dataset(
    split="training",
    data_loader_kwargs={},
    **shared_loader_args,
)


func:'embed_inputs' took: 82.8882 sec


  input_tensor = torch.tensor(inputs)
  mask_tensor = torch.tensor(masks)


func:'embed_inputs' took: 323.5941 sec


In [22]:
# Single source of truth for the checkpoint location (read when resuming, written after training).
MODEL_PATH = "data/model"

model: nn.Module = MyBertModel()
if not FROM_SCRATCH:
    # Use latest iteration of the model for training.
    # map_location="cpu" lets a checkpoint that was saved from a GPU run load on a
    # CPU-only runtime as well; model.to(device) below moves the weights afterwards.
    model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

# .compile currently isn't supported for Windows
# model = torch.compile(model)

if __name__ == "__main__":
    # Release cached GPU memory left over from earlier cells before training starts.
    torch.cuda.empty_cache()

    # Prefer the GPU when one is attached, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using GPU.")
    else:
        print("No GPU available, using the CPU instead.")
        device = torch.device("cpu")
    model.to(device)

    # Optimizer, scheduler and loss function
    optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

    # Linear decay with no warmup over the whole run.
    # Assumes train() steps the scheduler once per batch - TODO confirm in train().
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    loss_function = nn.MSELoss()

    # Training
    # NOTE(review): clip_value is presumably the gradient-clipping threshold - confirm in train().
    model, training_stats = train(model,
                                  optimizer,
                                  scheduler,
                                  loss_function,
                                  epochs,
                                  train_dataloader,
                                  validation_dataloader,
                                  device,
                                  clip_value=2)

    # Per-epoch statistics returned by train(), shown as a table.
    df_stats = pd.DataFrame(data=training_stats)
    print(df_stats)

    # Store Model
    torch.save(model.state_dict(), MODEL_PATH)

Using GPU.

Average training loss: 0.00
Training epoch took: 0:00:02

Average training loss: 0.00
Training epoch took: 0:11:35

Average training loss: 0.00
Training epoch took: 0:23:08
   epoch  Training Loss  Valid. Loss Training Time  Valid.MAE  Valid.RW-MAE  \
0      1   3.696175e-07     0.195532       0:00:02   0.046325      0.027847   
1      2   2.329255e-07     0.195243       0:11:35   0.034689      0.027847   
2      3   6.162801e-08     0.195126       0:23:08   0.032609      0.027847   

   Valid.Accuracy  
0        0.497331  
1        0.502451  
2        0.504211  


In [None]:
# Ask Colab to unassign this runtime now that the cells above have finished.
# NOTE(review): presumably this ends the session and discards in-memory state - confirm against the Colab docs.
from google.colab import runtime
runtime.unassign()