In [None]:
from google.colab import drive
drive.mount('/content/drive')
cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
%cd /content/drive/MyDrive/NewsTrading/trading_bot
%pip install -r requirements_clean.txt


In [None]:
# Fetch the rapidsai-csp-utils helper repository and run its Colab pip installer.
# NOTE(review): presumably this installs the RAPIDS GPU libraries into the
# current runtime -- confirm, since nothing else in this notebook imports them.
# Re-running this cell makes `git clone` complain that the directory exists.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [4]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizerFast, get_linear_schedule_with_warmup
from src.model.data_loading import get_data_loader_from_dataset
from src.config import config, MODEL_CONFIG
from src.model.neural_network import (
    BERTRegressor,
    train,
)

# Column names are taken from the shared model configuration.
input_col_name = MODEL_CONFIG.input_col_name
target_col_name = MODEL_CONFIG.target_col_name

# Settings
SEED = 42  # fixed seed so shuffling and weight initialisation are repeatable
torch.manual_seed(SEED)

bert_model_name = MODEL_CONFIG.transformer_hugface_id
FROM_SCRATCH = True  # False -> continue training from the stored checkpoint
batch_size = 16
epochs = 1
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
loss_function = nn.MSELoss()


In [10]:
# Sanity check: report which PyTorch build this runtime is using.
pt_version = torch.__version__
print(
    f"[INFO] Current PyTorch version: {pt_version} (should be 2.x+)"
)

[INFO] Current PyTorch version: 2.1.1+cu118 (should be 2.x+)


In [11]:
# Read the merged dataset from the parquet file configured at `config.data.merged`
# (this is a local read, not a download) and show how many rows it holds.
dataset = pd.read_parquet(config.data.merged)
len(dataset)

419094

In [12]:
# Filter out stocks we do not want to train on, then drop incomplete rows.
MIN_UNADJ_OPEN = 2           # below this: penny stocks
MIN_DOLLAR_VOLUME = 30_000   # below this: illiquid stocks  TODO: this has look-ahead bias
MAX_STALENESS = 0.9          # above this: repeat news

keep_row = (
    (dataset["unadj_open"] >= MIN_UNADJ_OPEN)
    & (dataset["dollar_volume"] >= MIN_DOLLAR_VOLUME)
    & (dataset["staleness"] <= MAX_STALENESS)
)
dataset = dataset[keep_row]
print(dataset.shape[0])

# Re-bind the result instead of calling dropna(inplace=True): mutating a frame
# that was produced by boolean indexing can raise SettingWithCopyWarning, and
# `inplace` offers no benefit here.
dataset = dataset.dropna()
print(dataset.shape[0])

254166

In [13]:
# Show summary statistics of the current `dataset` as a quick range/outlier check.
dataset.describe()

Unnamed: 0,r,r_spy,r_mkt_adj,std_252,dollar_volume,r_intra_(t-1),unadj_open
count,254166.0,254166.0,254166.0,247271.0,254166.0,253934.0,254166.0
mean,0.00641,0.000382,-0.006028,0.307716,469405.9,1.00009,64.103408
std,0.399078,0.007528,0.399001,0.230862,985009.5,0.023624,123.876843
min,-0.688716,-0.078634,-149.176953,0.022605,30000.35,0.350619,2.0
25%,-0.009266,-0.002865,-0.009156,0.201406,73012.04,0.992041,28.12
50%,0.00086,0.000573,-0.000314,0.260276,179574.7,1.000312,46.49
75%,0.010995,0.004144,0.008413,0.356565,464506.1,1.00843,73.6475
max,149.176471,0.061387,0.683067,31.557108,95845020.0,3.69697,33750.0


In [17]:
# Build the DataLoader for the "training" split; the extra kwargs are handed
# through to the loader (shuffled batches, pinned host memory).
train_dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    split="training",
    tokenizer=tokenizer,
    batch_size=batch_size,
    data_loader_kwargs={"shuffle": True, "pin_memory": True},
)

func:'embed_inputs' took: 697.7111 sec


In [18]:
# Build the DataLoader for the "validation" split with the same loader options
# as the training split.
validation_dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    split="validation",
    tokenizer=tokenizer,
    batch_size=batch_size,
    data_loader_kwargs={"shuffle": True, "pin_memory": True},
)

func:'embed_inputs' took: 178.6951 sec


In [20]:
# Build the BERT regressor, optionally resume from the stored checkpoint,
# train it for `epochs` epochs and write the resulting weights back to disk.
MODEL_PATH = "data/model"  # checkpoint location, used for both resume and save

model: nn.Module = BERTRegressor(bert_model_name)
if not FROM_SCRATCH:
    # Use latest iteration of the model for training.
    # map_location="cpu" lets a checkpoint that was saved on a GPU be restored
    # on a CPU-only runtime; the model is moved to the chosen device below.
    model.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))

# NOTE: torch.compile(model) is intentionally not used here because .compile
# currently isn't supported for Windows.

if __name__ == "__main__":
    torch.cuda.empty_cache()

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print("Using GPU.")
    else:
        print("No GPU available, using the CPU instead.")
        device = torch.device("cpu")
    model.to(device)

    # Optimizer, scheduler and loss function
    optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

    # One scheduler step per batch: linear decay over the whole run, no warm-up.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    # Training
    # NOTE(review): clip_value is presumably the gradient-clipping threshold
    # applied inside `train` -- confirm in src.model.neural_network.
    model, training_stats = train(model,
                                  optimizer,
                                  scheduler,
                                  loss_function,
                                  epochs,
                                  train_dataloader,
                                  validation_dataloader,
                                  device,
                                  clip_value=2)

    df_stats = pd.DataFrame(data=training_stats)
    print(df_stats)

    # Store Model
    torch.save(model.state_dict(), MODEL_PATH)

Using GPU.


KeyboardInterrupt: 

In [None]:
# Final cell: hand the Colab runtime back once everything above has finished.
# NOTE(review): presumably `runtime.unassign()` disconnects and frees the
# VM/GPU so no further compute is billed -- confirm against the google.colab docs.
from google.colab import runtime
runtime.unassign()