In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import faiss
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
import sys
import os

notebook_dir = os.getcwd()

root_dir = os.path.abspath(os.path.join(notebook_dir, "../../"))
sys.path.append(root_dir)

In [3]:
from dotenv import load_dotenv

load_dotenv()

from service import EnvironmentSecretService

In [4]:
from core.models.exchange import ExchangeType
from exchange import ExchangeFactory
from core.models.timeframe import Timeframe
from core.models.lookback import Lookback
from core.models.cap import CapType

In [5]:
DEFAULT_EXCHANGE = ExchangeType.BYBIT
DEFAULT_TIMEFRAME = Timeframe.ONE_HOUR
DEFAULT_LOOKBACK = Lookback.SIX_MONTH

from datetime import datetime
emb_filename = f'ocean_emb_{datetime.now().strftime('%Y-%m-%d')}.npy'
volume_df_filename = f'ocean_ft_{datetime.now().strftime('%Y-%m-%d')}.csv'

In [6]:
exchange_factory = ExchangeFactory(EnvironmentSecretService())
exchange = exchange_factory.create(DEFAULT_EXCHANGE)

In [10]:
symbols = exchange.fetch_future_symbols()

data = []
for symbol in symbols:
    for ohlcv in exchange.fetch_ohlcv(symbol, DEFAULT_TIMEFRAME, DEFAULT_LOOKBACK):
        _timestamp, _open, _high, _low, _close, _volume = ohlcv
        data.append([symbol.name, _timestamp, _open, _high, _low, _close, _volume])

symbols_df = pd.DataFrame(data, columns=['Symbol', 'Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'])
symbols_df['Timestamp'] = pd.to_datetime(symbols_df['Timestamp'], unit='ms')
symbols_df.sort_values(by=['Symbol', 'Timestamp'], inplace=True)

In [7]:
def volume_features(df):
    df = df.copy()

    df['Date'] = df['Timestamp'].dt.date

    df['ADV'] = df.groupby(['Symbol', 'Date'])['Volume'].transform('mean')

    df['Typical Price'] = (df['High'] + df['Low'] + df['Close']) / 3
    df['Cumulative Price * Volume'] = df['Typical Price'] * df['Volume']

    df['Cum_Volume'] = df.groupby(['Symbol', 'Date'])['Volume'].cumsum()
    df['Cum_Price_Volume'] = df.groupby(['Symbol', 'Date'])['Cumulative Price * Volume'].cumsum()
    df['VWAP'] = df['Cum_Price_Volume'] / df['Cum_Volume']

    df['Price Change'] = df.groupby(['Symbol', 'Date'])['Close'].diff()
    
    def obv(group):
        obv = (group['Volume'].where(group['Price Change'] > 0, -group['Volume'])
               .where(group['Price Change'] != 0, 0).cumsum())
        return obv

    df['OBV'] = df.groupby(['Symbol', 'Date']).apply(obv, include_groups=False).reset_index(drop=True)

    df.drop(columns=['Date', 'Typical Price', 'Cum_Volume', 'Cum_Price_Volume', 'Cumulative Price * Volume', 'Price Change'], inplace=True)

    return df

In [14]:
volume_df.to_csv(volume_df_filename, index=False)

In [9]:
volume_df = pd.read_csv(volume_df_filename)
volume_df.head()

Unnamed: 0,Symbol,Timestamp,Open,High,Low,Close,Volume,ADV,VWAP,OBV
0,10000000AIDOGEUSDT,2024-03-28 16:00:00,0.006066,0.006093,0.005933,0.00596,51224500.0,26869925.0,0.005995,-51224500.0
1,10000000AIDOGEUSDT,2024-03-28 17:00:00,0.00596,0.00598,0.005906,0.00594,20305900.0,26869925.0,0.00598,-71530400.0
2,10000000AIDOGEUSDT,2024-03-28 18:00:00,0.00594,0.005989,0.005927,0.005966,14693700.0,26869925.0,0.005977,-56836700.0
3,10000000AIDOGEUSDT,2024-03-28 19:00:00,0.005966,0.00601,0.005945,0.006003,17056200.0,26869925.0,0.005978,-39780500.0
4,10000000AIDOGEUSDT,2024-03-28 20:00:00,0.006003,0.006033,0.005968,0.005983,24093500.0,26869925.0,0.005981,-63874000.0


In [10]:
features = volume_df[['ADV', 'VWAP', 'OBV']].to_numpy()

scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

batch_size = 2
dataset = tf.data.Dataset.from_tensor_slices(normalized_features)
dataset = dataset.shuffle(buffer_size=len(normalized_features)).batch(batch_size)

In [14]:
class Autoencoder(tf.keras.Model):
    def __init__(self, feature_dim, output_dim):
        super(Autoencoder, self).__init__()

        self.encoder = tf.keras.Sequential([
            tf.keras.Input(shape=(feature_dim,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(output_dim)
        ])

        self.decoder = tf.keras.Sequential([
            tf.keras.Input(shape=(output_dim,)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(feature_dim)
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [15]:
input_dim = normalized_features.shape[1]
output_dim = 10
autoencoder = Autoencoder(feature_dim=input_dim, output_dim=output_dim)

criterion = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

autoencoder.compile(optimizer=optimizer, loss=criterion)

In [16]:
num_epochs = 100

for epoch in range(num_epochs):
    for step, input_data in enumerate(dataset):
        with tf.GradientTape() as tape:
            decoded = autoencoder(input_data)
            loss = criterion(input_data, decoded)
        gradients = tape.gradient(loss, autoencoder.trainable_variables)
        optimizer.apply_gradients(zip(gradients, autoencoder.trainable_variables))
    
    print(f"Epoch {epoch + 1}, Loss: {loss.numpy()}")

In [None]:
encoder_model = autoencoder.encoder
embeddings = encoder_model(normalized_features)

np.save(emb_filename, embeddings.numpy())