In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

In [5]:
import os
import sys

notebook_dir = os.getcwd()

root_dir = os.path.abspath(os.path.join(notebook_dir, "../../"))
sys.path.append(root_dir)

In [6]:
from dotenv import load_dotenv

load_dotenv()

from service import EnvironmentSecretService

In [7]:
from core.models.exchange import ExchangeType
from core.models.lookback import Lookback
from core.models.timeframe import Timeframe
from exchange import ExchangeFactory

In [8]:
DEFAULT_EXCHANGE = ExchangeType.BYBIT
DEFAULT_TIMEFRAME = Timeframe.ONE_HOUR
DEFAULT_LOOKBACK = Lookback.SIX_MONTH

from datetime import datetime

emb_filename = f'ocean_emb_{datetime.now().strftime('%Y-%m-%d')}.npy'
volume_df_filename = f'ocean_ft_{datetime.now().strftime('%Y-%m-%d')}.csv'

In [9]:
exchange_factory = ExchangeFactory(EnvironmentSecretService())
exchange = exchange_factory.create(DEFAULT_EXCHANGE)

In [10]:
symbols = exchange.fetch_future_symbols()

data = []
for symbol in symbols:
    for ohlcv in exchange.fetch_ohlcv(symbol, DEFAULT_TIMEFRAME, DEFAULT_LOOKBACK):
        _timestamp, _open, _high, _low, _close, _volume = ohlcv
        data.append([symbol.name, _timestamp, _open, _high, _low, _close, _volume])

symbols_df = pd.DataFrame(data, columns=['Symbol', 'Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'])
symbols_df['Timestamp'] = pd.to_datetime(symbols_df['Timestamp'], unit='ms')
symbols_df.sort_values(by=['Symbol', 'Timestamp'], inplace=True)

In [7]:
def volume_features(df):
    df = df.copy()

    df['Date'] = df['Timestamp'].dt.date

    df['ADV'] = df.groupby(['Symbol', 'Date'])['Volume'].transform('mean')

    df['Typical Price'] = (df['High'] + df['Low'] + df['Close']) / 3
    df['Cumulative Price * Volume'] = df['Typical Price'] * df['Volume']

    df['Cum_Volume'] = df.groupby(['Symbol', 'Date'])['Volume'].cumsum()
    df['Cum_Price_Volume'] = df.groupby(['Symbol', 'Date'])['Cumulative Price * Volume'].cumsum()
    df['VWAP'] = df['Cum_Price_Volume'] / df['Cum_Volume']

    df['Price Change'] = df.groupby(['Symbol', 'Date'])['Close'].diff()

    def obv(group):
        obv = (group['Volume'].where(group['Price Change'] > 0, -group['Volume'])
               .where(group['Price Change'] != 0, 0).cumsum())
        return obv

    df['OBV'] = df.groupby(['Symbol', 'Date']).apply(obv, include_groups=False).reset_index(drop=True)

    df.drop(columns=['Date', 'Typical Price', 'Cum_Volume', 'Cum_Price_Volume', 'Cumulative Price * Volume', 'Price Change'], inplace=True)

    return df

In [14]:
volume_df = volume_features(symbols_df)
volume_df.to_csv(volume_df_filename, index=False)

In [11]:
volume_df = pd.read_csv(volume_df_filename)
volume_df.head()

Unnamed: 0,Symbol,Timestamp,Open,High,Low,Close,Volume,ADV,VWAP,OBV
0,10000000AIDOGEUSDT,2024-03-28 16:00:00,0.006066,0.006093,0.005933,0.00596,51224500.0,26869925.0,0.005995,-51224500.0
1,10000000AIDOGEUSDT,2024-03-28 17:00:00,0.00596,0.00598,0.005906,0.00594,20305900.0,26869925.0,0.00598,-71530400.0
2,10000000AIDOGEUSDT,2024-03-28 18:00:00,0.00594,0.005989,0.005927,0.005966,14693700.0,26869925.0,0.005977,-56836700.0
3,10000000AIDOGEUSDT,2024-03-28 19:00:00,0.005966,0.00601,0.005945,0.006003,17056200.0,26869925.0,0.005978,-39780500.0
4,10000000AIDOGEUSDT,2024-03-28 20:00:00,0.006003,0.006033,0.005968,0.005983,24093500.0,26869925.0,0.005981,-63874000.0


In [20]:
features = volume_df[['ADV', 'VWAP', 'OBV']].to_numpy()

scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

X = torch.tensor(normalized_features, dtype=torch.float32)

batch_size = 2
dataset = torch.utils.data.TensorDataset(X)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [22]:
class Autoencoder(nn.Module):
    def __init__(self, feature_dim, output_dim, dropout_prob=0.2):
        super(Autoencoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(feature_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(output_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Dropout(dropout_prob),
            nn.Linear(128, feature_dim),
        )

        self.residual = nn.Linear(feature_dim, feature_dim)

    def forward(self, x):
        encoded = self.encoder(x)

        decoded = self.decoder(encoded)

        residual_output = self.residual(x)

        output = decoded + residual_output

        return output

    def get_latent(self, x):
        return self.encoder(x)

In [23]:
def train_autoencoder(model, dataloader, epochs=50, lr=1e-3, device=None):
    if device is None:
        device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    model.to(device)

    model.train()

    for epoch in range(epochs):
        running_loss = 0.0

        for _batch_idx, (data,) in enumerate(dataloader):
            data = data.to(device)

            optimizer.zero_grad()

            outputs = model(data)

            loss = criterion(outputs, data)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {running_loss / len(dataloader):.4f}")

    print("Training Complete")

In [24]:
def extract_embeddings(model, dataloader, device=None):
    if device is None:
        device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    model.eval()
    model.to(device)

    embeddings = []

    with torch.no_grad():
        for _batch_idx, (data,) in enumerate(dataloader):
            data = data.to(device)
            latent = model.get_latent(data)
            embeddings.append(latent.cpu())

    embeddings = torch.vstack(embeddings)

    return embeddings

In [25]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

input_dim = X.shape[1]
output_dim = 10

model = Autoencoder(feature_dim=input_dim, output_dim=output_dim)

train_autoencoder(model, dataloader, epochs=100, lr=1e-3, device=device)

mps


In [None]:
embeddings = extract_embeddings(model, dataloader, device=device)

np.save(emb_filename, embeddings.numpy())