In [1]:
# Environment toggle: set to True to run the Colab-specific setup below
# (and the runtime release in the last cell of the notebook).
google_colab = False
if google_colab:
    # Mount Google Drive under /content/drive so the project folder is reachable.
    from google.colab import drive
    drive.mount('/content/drive')
    # NOTE(review): `cwd` holds the same path that `%cd` repeats literally on the
    # next line; it is not referenced again in the visible cells.
    cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
    %cd /content/drive/MyDrive/NewsTrading/trading_bot
    # Install the dependencies listed in requirements_clean.txt into the kernel environment.
    %pip install -r requirements_clean.txt
    # Clone the rapidsai-csp-utils repository and run its Colab pip-install script.
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
    !python rapidsai-csp-utils/colab/pip-install.py

In [26]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizerFast, get_linear_schedule_with_warmup
from src.model.data_loading import get_data_loader_from_dataset
from src.config import config, MODEL_CONFIG
from src.model.neural_network import train, MyBertModule, evaluate
from sklearn.metrics import accuracy_score, balanced_accuracy_score

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [4]:
# Settings

# --- values read from the shared model configuration ---
target_col_name = MODEL_CONFIG.target_col_name
loss_function = MODEL_CONFIG.loss
bert_model_name = MODEL_CONFIG.transformer_hugface_id
# Tokenizer is loaded for the same Hugging Face model id as the classifier.
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)

# --- notebook-local training knobs ---
batch_size = 4
epochs = 3
learning_rate = 0.01  # 5e-5 (slow) for bert, 0.3 (fast) for new feed forward
is_deactivated_bert_learning = True

# --- metrics tracked during evaluation ---
tracking_metrics = [
    accuracy_score,
    balanced_accuracy_score,
]

In [5]:
# Report the installed PyTorch build, then show whether a CUDA device is usable.
pt_version = torch.__version__
print(
    f"[INFO] Current PyTorch version: {pt_version} (should be 2.x+)"
)
cuda_ready = torch.cuda.is_available()
cuda_ready

[INFO] Current PyTorch version: 2.1.1+cu118 (should be 2.x+)


True

In [6]:
# Load the merged dataset from parquet, keeping only its last 40000 rows
n_tail_rows = 40000
dataset = pd.read_parquet(config.data.merged).iloc[-n_tail_rows:]
len(dataset)

40000

In [7]:
# Filter out Stocks... TODO: put this into filter interface and make configurable in model_config
# Each condition is named so the intent of every threshold is visible.
is_not_penny_stock = dataset["unadj_open"] >= 2
is_liquid = dataset["dollar_volume"] >= 30_000  # TODO: this has look-ahead bias
dataset = dataset[is_not_penny_stock & is_liquid]

# TODO: Staleness has yet to be calculated
if "staleness" in dataset.columns:
    is_not_repeat_news = dataset["staleness"] <= 0.9
    dataset = dataset[is_not_repeat_news]

# Row count before and after dropping rows that contain any missing value.
print(dataset.shape[0])
# Reassign instead of `inplace=True`: the frame above is the result of boolean
# filtering, and mutating such a result in place is the pattern pandas flags
# with SettingWithCopyWarning. The resulting rows are the same.
dataset = dataset.dropna()
print(dataset.shape[0])

21535
20743


In [8]:
# Let the configured splitter assign each row to a split; later cells select
# rows through the resulting `split` column.
splitter = MODEL_CONFIG.splitter
dataset: pd.DataFrame = splitter.add_splits(dataset)

14520 samples in training set.
 4148 samples in validation set.
 2075 samples in testing set.


In [9]:
# Loader over the rows tagged "training"; batches are drawn in shuffled order.
# presumably `data_loader_kwargs` is forwarded to the underlying DataLoader — confirm.
train_loader_kwargs = {"shuffle": True, "pin_memory": True}
train_dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    split="training",
    batch_size=batch_size,
    label_col=target_col_name,
    data_loader_kwargs=train_loader_kwargs,
)

dataset.index.name=None


In [10]:
# Loader over the rows tagged "validation".
# Validation batches are only scored, never trained on, so shuffling adds
# nothing and makes runs harder to compare; PyTorch Lightning also recommends
# turning shuffling off for val/test dataloaders.
# presumably `data_loader_kwargs` is forwarded to the underlying DataLoader — confirm.
validation_dataloader = get_data_loader_from_dataset(dataset=dataset,
                                                     split="validation",
                                                     batch_size=batch_size,
                                                     label_col=target_col_name,
                                                     data_loader_kwargs=dict(shuffle=False,
                                                                             pin_memory=True))

dataset.index.name=None


In [2]:
%load_ext tensorboard
%tensorboard --logdir lighning_logs

Launching TensorBoard...

In [None]:
# Share of each target class within the validation split.
validation_rows = dataset[dataset["split"] == "validation"]
validation_rows[target_col_name].value_counts() / len(validation_rows)

In [None]:
# Share of each target class within the training split.
training_rows = dataset[dataset["split"] == "training"]
training_rows[target_col_name].value_counts() / len(training_rows)

In [None]:
from src.model.neural_network import BERTClassifier
import pytorch_lightning as pl

# Build a fresh classifier. The class is imported as `BERTClassifier`; the
# previous `BertClassifier` spelling raised a NameError.
# NOTE(review): the positional arguments are kept exactly as before —
# presumably the number of target classes and a flag that freezes the BERT
# weights (cf. `is_deactivated_bert_learning` in the settings cell); confirm
# against the BERTClassifier signature.
model: nn.Module = BERTClassifier(bert_model_name, 3, True)

# To continue from weights stored by a previous run, load the state dict that
# the `torch.save` call below writes (it is a plain state dict, not a
# Lightning checkpoint, so `load_from_checkpoint` would not accept it):
# model.load_state_dict(torch.load(MODEL_CONFIG.output_params_path))

# Honour the `epochs` setting; without `max_epochs` the trainer falls back to
# its library default epoch limit instead of the value chosen in the settings cell.
trainer = pl.Trainer(max_epochs=epochs)

trainer.fit(model, train_dataloader, validation_dataloader)

# Store Model
torch.save(model.state_dict(), MODEL_CONFIG.output_params_path)

In [13]:
# On Colab, release the runtime once the notebook has finished.
if google_colab:
    from google.colab import runtime as colab_runtime

    colab_runtime.unassign()