In [None]:
from google.colab import drive
drive.mount('/content/drive')
cwd="/content/drive/MyDrive/NewsTrading/trading_bot"
%cd /content/drive/MyDrive/NewsTrading/trading_bot
%pip install -r requirements_clean.txt


In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizerFast, get_linear_schedule_with_warmup
from src.model.data_loading import get_data_loader_from_dataset
from src.config import config, MODEL_CONFIG
from src.model.neural_network import train

# Settings
# Hugging Face model id taken from the project model config; it is used for the
# tokenizer below and again when the model itself is constructed.
bert_model_name = MODEL_CONFIG.transformer_hugface_id
# True: skip loading a checkpoint. False: restore the state dict stored at
# "data/model" before training.
FROM_SCRATCH = True
# Batch size handed to both the training and the validation data loader.
batch_size = 16
# Number of epochs; also used to size the learning-rate schedule.
epochs = 1
tokenizer = BertTokenizerFast.from_pretrained(bert_model_name)
# Loss function comes from the project model config and is passed to train().
loss_function = MODEL_CONFIG.loss_function
learning_rate = 5e-5 # 5e-5 (slow) when training BERT itself, 0.3 (fast) for a new feed-forward head

In [None]:
# Report the installed PyTorch version so a mismatched runtime is noticed early.
# This only prints; nothing is enforced if the version is older than 2.x.
pt_version = torch.__version__
print(f"[INFO] Current PyTorch version: {pt_version} (should be 2.x+)")

In [None]:
# Read the merged dataset (parquet) from the location configured in
# config.data.merged, then show how many rows were loaded.
dataset = pd.read_parquet(config.data.merged)
len(dataset)

In [None]:
# Filter out Stocks... TODO: put this into filter interface and make configurable in model_config
# Thresholds are named so they can be tuned in one place until they move into
# the model config.
MIN_UNADJ_OPEN = 2          # below this unadjusted open price: penny stocks
MIN_DOLLAR_VOLUME = 30_000  # below this dollar volume: illiquid stocks TODO: this has look-ahead bias
MAX_STALENESS = 0.9         # above this staleness score: repeat news

# Row mask: a row is kept only when it passes all three thresholds. Comparisons
# against NaN evaluate to False, so rows missing any of these columns are
# already excluded here.
keep_row = (
    (dataset["unadj_open"] >= MIN_UNADJ_OPEN)
    & (dataset["dollar_volume"] >= MIN_DOLLAR_VOLUME)
    & (dataset["staleness"] <= MAX_STALENESS)
)
dataset = dataset[keep_row]
print(dataset.shape[0])  # rows remaining after the threshold filters

# Rebind instead of dropna(inplace=True): avoids mutating the frame produced by
# boolean indexing in place and keeps every step of the cell a plain
# "input -> new frame" transformation.
dataset = dataset.dropna()
print(dataset.shape[0])  # rows remaining after dropping rows with any missing value

In [None]:
# Let the splitter from the model config add split information to the frame;
# the data loaders below select rows with split="training" / split="validation".
# NOTE(review): presumably this adds a split-label column rather than removing
# rows — confirm in the splitter implementation.
dataset: pd.DataFrame = MODEL_CONFIG.splitter.add_splits(dataset)

In [None]:
# Data loader over the "training" split. The options request shuffling and
# pinned memory.
# NOTE(review): assumes data_loader_kwargs is forwarded to the underlying
# torch DataLoader — confirm in src.model.data_loading.
train_loader_options = {"shuffle": True, "pin_memory": True}
train_dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    split="training",
    tokenizer=tokenizer,
    batch_size=batch_size,
    data_loader_kwargs=train_loader_options,
)

In [None]:
# Data loader over the "validation" split.
# Shuffling is switched off: evaluation does not benefit from a random batch
# order, and a fixed order keeps per-batch validation output comparable between
# runs.
# NOTE(review): assumes data_loader_kwargs is forwarded to the underlying
# torch DataLoader and that train() evaluates on every validation batch —
# confirm in src.model.data_loading / src.model.neural_network.
validation_dataloader = get_data_loader_from_dataset(
    dataset=dataset,
    split="validation",
    tokenizer=tokenizer,
    batch_size=batch_size,
    data_loader_kwargs=dict(shuffle=False, pin_memory=True),
)

In [None]:
# Build the model, optionally resume from the stored checkpoint, train it and
# write the resulting weights back to "data/model".
model: nn.Module = MODEL_CONFIG.BertClass(bert_model_name)
if not FROM_SCRATCH:
    # map_location="cpu" lets a checkpoint that was saved from a GPU run be
    # restored on a CPU-only runtime as well; the model is moved to the target
    # device further down, so GPU runs are unaffected.
    model.load_state_dict(torch.load("data/model", map_location="cpu"))

# NOTE(review): presumably freezes the BERT encoder so that only the remaining
# layers are trained — confirm in the BertClass implementation.
model.deactivate_learning_for_layer(model.bert)

# .compile currently isn't supported for Windows
# model = torch.compile(model)

# The guard is always true when run as a notebook cell; it is kept so the code
# behaves the same if the cell is exported and run as a script.
if __name__ == "__main__":
    torch.cuda.empty_cache()

    # Prefer the GPU when one is available, otherwise fall back to the CPU.
    use_gpu = torch.cuda.is_available()
    if use_gpu:
        print("Using GPU.")
    else:
        print("No GPU available, using the CPU instead.")
    device = torch.device("cuda" if use_gpu else "cpu")
    model.to(device)

    # Optimizer, scheduler and loss function
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

    # One scheduler step per training batch over all epochs; with zero warm-up
    # steps the learning rate decays linearly from `learning_rate` to 0.
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    # Training
    # NOTE(review): clip_value is presumably the gradient-clipping threshold
    # applied inside train() — confirm in src.model.neural_network.
    model, training_stats = train(
        model,
        optimizer,
        scheduler,
        loss_function,
        epochs,
        train_dataloader,
        validation_dataloader,
        device,
        clip_value=2,
    )

    # Tabulate whatever statistics train() returned and print them.
    df_stats = pd.DataFrame(data=training_stats)
    print(df_stats)

    # Store Model
    torch.save(model.state_dict(), "data/model")

In [None]:
# Ask Colab to unassign (release) this runtime now that the notebook has run.
# NOTE(review): anything not already persisted (e.g. to Drive) is presumably
# lost once the runtime is released — keep this as the very last cell.
from google.colab import runtime
runtime.unassign()