In [31]:
google_colab = False
if google_colab:
    from google.colab import drive
    drive.mount('/content/drive')
    cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
    %cd /content/drive/MyDrive/NewsTrading/trading_bot
    %pip install -r requirements_clean.txt
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
    !python rapidsai-csp-utils/colab/pip-install.py

In [32]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizerFast
from src.model.data_loading import get_data_loader_from_dataset
from src.config import config, MODEL_CONFIG
import pytorch_lightning as pl

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [34]:
# Settings
target_col_name = MODEL_CONFIG.target_col_name
bert_model_name = MODEL_CONFIG.transformer_hugface_id
batch_size = 4
epochs = 1
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
learning_rate = 0.01 # 5e-5 (slow) for bert, 0.3 (fast) for new feed forward
is_deactivated_bert_learning = True

In [35]:
pt_version = torch.__version__
print(f"[INFO] Current PyTorch version: {pt_version} (should be 2.x+)")
torch.cuda.is_available()

[INFO] Current PyTorch version: 2.1.1+cu118 (should be 2.x+)


True

In [36]:
# Download dataset
dataset = pd.read_parquet(config.data.merged).iloc[-20000:]
dataset.shape[0]

20000

In [37]:
# Filter out Stocks... TODO: put this into filter interface and make configurable in model_config
dataset = dataset[
    (dataset["unadj_open"] >= 2) &          # penny stocks
    (dataset["dollar_volume"] >= 30_000)  # illiquid stocks TODO: this has look-ahead bias
                  ]
# TODO: Staleness has yet to be calculated
if "staleness" in dataset.columns:
    dataset = dataset[(dataset["staleness"] <= 0.9)] # repeat news

print(dataset.shape[0])
dataset.dropna(inplace=True)
print(dataset.shape[0])

10819
10413


In [38]:
dataset: pd.DataFrame = MODEL_CONFIG.splitter.add_splits(dataset)

7289 samples in training set.
 2082 samples in validation set.
 1042 samples in testing set.


In [39]:
train_dataloader = get_data_loader_from_dataset(dataset=dataset,
                                                split="training",
                                                batch_size=batch_size,
                                                label_col=target_col_name,
                                                data_loader_kwargs=dict(shuffle=True,
                                                                        pin_memory=True))

dataset.index.name=None


In [40]:
validation_dataloader = get_data_loader_from_dataset(dataset=dataset,
                                                split="validation",
                                                batch_size=batch_size,
                                                label_col=target_col_name,
                                                data_loader_kwargs=dict(shuffle=True,
                                                                        pin_memory=True))

dataset.index.name=None


In [41]:
dataset.loc[dataset.split == "validation", target_col_name].value_counts() / dataset[dataset.split == "validation"].shape[0]

z_score_class
0    0.718060
2    0.154179
1    0.127762
Name: count, dtype: float64

In [54]:
weights = dataset.loc[dataset.split == "training", target_col_name].value_counts() / dataset[dataset.split == "training"].shape[0]

In [55]:
weights = (1/ weights).values
weights

array([1.48876634, 5.87348912, 6.32725694])

In [62]:
%load_ext autoreload
%autoreload 2
from src.model.neural_network import BERTClassifier

ckpt = None#"lightning_logs/version_3/checkpoints/epoch=2-step=5469.ckpt"
if ckpt:
    model = BERTClassifier.load_from_checkpoint(ckpt, deactivate_bert_learning=False)
else:
    model: nn.Module = BERTClassifier(bert_model_name=bert_model_name, 
                                    num_classes=3, 
                                    deactivate_bert_learning=True,
                                    learning_rate=0.01,
                                    class_weights=weights)

trainer = pl.Trainer(num_sanity_val_steps=2,
                     max_epochs=10)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [63]:
trainer.fit(model, 
            train_dataloader, 
            validation_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type               | Params
------------------------------------------------------
0 | train_accuracy | MulticlassAccuracy | 0     
1 | val_accuracy   | MulticlassAccuracy | 0     
2 | train_f1_score | MulticlassF1Score  | 0     
3 | val_f1_score   | MulticlassF1Score  | 0     
4 | bert           | BertModel          | 109 M 
5 | dropout        | Dropout            | 0     
6 | ff_layer       | Sequential         | 15.9 K
------------------------------------------------------
15.9 K    Trainable params
109 M     Non-trainable params
109 M     Total params
439.071   Total estimated model params size (MB)


Epoch 1:  60%|██████    | 1101/1823 [02:01<01:19,  9.09it/s, v_num=3, train_loss (weighted)_step=1.090, train_loss_step=1.070, train_f1_score_step=0.500, train_accuracy_step=0.500, train_loss (weighted)_epoch=1.060, train_loss_epoch=0.962, train_f1_score_epoch=0.569, train_accuracy_epoch=0.581]

In [None]:
if google_colab:
    from google.colab import runtime
    runtime.unassign()