In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [9]:
# Raw depth CSV for BTCUSDT (file dated 20220519 in its name).
# NOTE: this is an absolute, machine-specific path; it must be adjusted
# before the notebook can be run from another checkout.
RAW_DEPTH_CSV = '/Users/fipm/code/abefarkas/Thalassa_Regime_Classifier/raw_data/BTCUSDT_S_DEPTH_20220519.csv'

data = pd.read_csv(RAW_DEPTH_CSV)
# Preview the first four rows only.
data.head(4)

Unnamed: 0,symbol,ts,last_update_id,bp1,bs1,bp2,bs2,bp3,bs3,bp4,...,ap16,as16,ap17,as17,ap18,as18,ap19,as19,ap20,as20
0,BTCUSDT,1652918400006,1519899788740,28700.6,0.155,28699.1,0.019,28699.0,2.043,28697.4,...,28704.8,0.801,28704.9,0.001,28705.1,0.04,28705.2,0.001,28705.3,0.192
1,BTCUSDT,1652918400385,1519899930450,28700.6,0.046,28699.1,0.016,28699.0,1.393,28697.4,...,28704.5,0.002,28704.8,0.801,28704.9,0.001,28705.1,0.04,28705.3,0.192
2,BTCUSDT,1652918400707,1519900081311,28700.6,0.046,28699.1,0.016,28699.0,1.393,28697.4,...,28704.5,0.002,28704.8,0.801,28704.9,0.001,28705.1,0.04,28705.3,0.192
3,BTCUSDT,1652918401133,1519900269449,28700.6,0.046,28699.1,0.016,28699.0,1.393,28697.4,...,28704.3,0.135,28704.4,0.001,28704.5,0.002,28704.8,0.151,28704.9,0.001


In [75]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

class FinancialFeatures(TransformerMixin, BaseEstimator):
    """Add order-book derived feature columns to a depth-snapshot DataFrame.

    The input frame must carry the book columns read below (``bp<k>``/``bs<k>``
    and ``ap<k>``/``as<k>``; presumably bid/ask price and size at level ``k``
    -- confirm against the data source).

    Nothing is learned from the data: ``fit``, ``transform`` and
    ``fit_transform`` all run the same feature steps on the frame they are
    given.  The frame is modified **in place** (new columns are written onto
    the caller's object) and a reference to it is kept on ``self.data``; code
    elsewhere in the notebook relies on that side effect.

    Every feature method writes its column(s) onto ``self.data`` and returns
    ``self.data``.
    """

    # (EWM span, numeric suffix used in the output column name).
    # NOTE(review): the suffix does not equal the span that is applied
    # (e.g. '<col>_trend5' is computed with span=2).  The names are kept
    # unchanged because downstream code may select columns by name.
    _TREND_SPANS = ((2, 5), (5, 10), (10, 20), (20, 50),
                    (50, 100), (100, 200), (200, 1000))

    # Book levels summed by the "full depth" features.
    _BOOK_LEVELS = range(1, 21)

    def __init__(self):
        # Frame most recently passed to fit/transform (None until then).
        self.data = None

    # ------------------------------------------------------------------
    # Price / volatility features
    # ------------------------------------------------------------------
    def WAP(self):
        """Write 'WAP': size-weighted average of the first two bid and ask prices."""
        d = self.data
        d['WAP'] = (d['bp1']*d['bs1']
                    + d['bp2']*d['bs2']
                    + d['ap1']*d['as1']
                    + d['ap2']*d['as2'])/(d['bs1'] + d['bs2'] + d['as1'] + d['as2'])
        return self.data

    def build_df(self):
        """Return the frame currently held by the transformer."""
        return self.data

    def spread(self):
        """Write 'spread': level-1 ask price relative to level-1 bid price, minus 1."""
        self.data['spread'] = ((self.data['ap1']/self.data['bp1']) - 1)
        return self.data

    def log_price(self):
        """Write 'log_price': natural log of 'WAP' (requires ``WAP()`` first)."""
        self.data['log_price'] = np.log(self.data['WAP'])
        return self.data

    def log_returns(self):
        """Write 'log_returns': row-to-row difference of 'log_price'."""
        self.data['log_returns'] = self.data.log_price.diff()
        return self.data

    def volatility_df(self):
        """Write 'realized_volatility' from the standard deviation of 'log_returns'.

        NOTE(review): ``np.std`` reduces the whole column to one number, so
        every row receives the same value.  If a per-row (rolling) volatility
        was intended this needs a window -- confirm with the author.
        """
        self.data['realized_volatility'] = np.std(self.data.log_returns)
        return self.data

    def volatility_next_period(self):
        """Write 'volatility_t+1': 'realized_volatility' shifted up by one row."""
        self.data['volatility_t+1'] = self.data['realized_volatility'].shift(-1)
        return self.data

    def dropping_columns(self):
        """Drop the 'Unnamed: 0' and 'realized_volatility' columns, in place.

        Columns that are already absent are skipped, so the step can be run
        more than once on the same frame without raising ``KeyError``.
        """
        self.data.drop(columns=['Unnamed: 0', 'realized_volatility'],
                       inplace=True, errors='ignore')
        return self.data

    # ------------------------------------------------------------------
    # Depth features
    # ------------------------------------------------------------------
    def first2_bid_depth(self):
        """Write 'first2_bid_depth': bs1 + bs2."""
        self.data['first2_bid_depth'] = self.data[['bs1', 'bs2']].sum(axis=1)
        return self.data

    def first2_ask_depth(self):
        """Write 'first2_ask_depth': as1 + as2."""
        self.data['first2_ask_depth'] = self.data[['as1', 'as2']].sum(axis=1)
        return self.data

    def _level_columns(self, prefix):
        """Return the column names '<prefix>1' .. '<prefix>20'."""
        return [f'{prefix}{level}' for level in self._BOOK_LEVELS]

    def full_bid_depth(self):
        """Write 'full_bid_depth': sum of bs1 .. bs20."""
        self.data['full_bid_depth'] = self.data[self._level_columns('bs')].sum(axis=1)
        return self.data

    def full_ask_depth(self):
        """Write 'full_ask_depth': sum of as1 .. as20."""
        self.data['full_ask_depth'] = self.data[self._level_columns('as')].sum(axis=1)
        return self.data

    # ------------------------------------------------------------------
    # Imbalance features
    # ------------------------------------------------------------------
    def BBAOFI(self):
        """Write 'BBAOFI': (bs1 - as1) / (bs1 + as1)."""
        self.data['BBAOFI'] = (self.data['bs1']-self.data['as1'])/(self.data['bs1']+self.data['as1'])
        return self.data

    def first2_OFI(self):
        """Write 'First2OFI': the same imbalance ratio over the first two levels."""
        bid = self.data['bs1'] + self.data['bs2']
        ask = self.data['as1'] + self.data['as2']
        self.data['First2OFI'] = (bid - ask) / (bid + ask)
        return self.data

    def FDOFI(self):
        """Write 'FDOFI': imbalance ratio over full depth.

        Requires ``full_bid_depth()`` and ``full_ask_depth()`` first.
        """
        self.data['FDOFI'] = (self.data['full_bid_depth']-self.data['full_ask_depth'])/(self.data['full_bid_depth']+self.data['full_ask_depth'])
        return self.data

    def set_index(self):
        """Make the 'primary_key' column the index of the held frame, in place.

        ``DataFrame.set_index`` returns a new frame unless ``inplace=True``;
        the previous code discarded that result, so the index never changed.
        NOTE(review): 'primary_key' is not among the columns visible in the
        notebook's data preview -- confirm it exists before enabling this step.
        """
        self.data.set_index('primary_key', inplace=True)
        return self.data

    # ------------------------------------------------------------------
    # Exponentially weighted trends
    # ------------------------------------------------------------------
    def _add_ewm_trends(self, column):
        """Write '<column>_trend<suffix>' EWM means for every entry of _TREND_SPANS."""
        for span, suffix in self._TREND_SPANS:
            self.data[f'{column}_trend{suffix}'] = self.data[column].ewm(span=span).mean()
        return self.data

    def WPA_trend(self):
        """Write the 'WAP_trend*' columns (requires ``WAP()`` first)."""
        return self._add_ewm_trends('WAP')

    def first2_OFI_trend(self):
        """Write the 'First2OFI_trend*' columns (requires ``first2_OFI()`` first)."""
        return self._add_ewm_trends('First2OFI')

    def FDOFI_trend(self):
        """Write the 'FDOFI_trend*' columns (requires ``FDOFI()`` first)."""
        return self._add_ewm_trends('FDOFI')

    # ------------------------------------------------------------------
    # scikit-learn interface
    # ------------------------------------------------------------------
    def _build_features(self, data):
        """Bind ``data`` to ``self.data`` and run the enabled feature steps on it.

        Only the price/volatility steps are enabled.  The remaining feature
        methods (dropping_columns, first2_bid_depth, first2_ask_depth,
        full_bid_depth, full_ask_depth, BBAOFI, first2_OFI, FDOFI, set_index,
        WPA_trend, first2_OFI_trend, FDOFI_trend) are available but are not
        part of the default run.
        """
        self.data = data
        self.WAP()
        self.spread()
        self.log_price()
        self.log_returns()
        self.volatility_df()
        self.volatility_next_period()
        return self.data

    def fit(self, data, y=None):
        """Compute the features on ``data`` (in place) and return ``self``.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        self._build_features(data)
        return self

    def transform(self, data):
        """Compute the features on the frame that is passed in and return it.

        Previously this ignored ``data`` and returned whatever frame had been
        seen by ``fit``, so transforming new data returned the fitted data.
        """
        return self._build_features(data)

    def fit_transform(self, data, y=None, **fit_params):
        """Same result as ``fit(data).transform(data)``, computed once."""
        return self._build_features(data)
        

In [80]:
class my_model(TransformerMixin, BaseEstimator):
    """Pipeline step wrapping a ``LinearRegression`` of 'realized_volatility' on 'log_price'.

    ``transform`` hands back the estimator itself instead of transformed data,
    which lets a caller chain ``.predict(...)`` onto the value returned by
    ``fit_transform``.
    """

    def __init__(self):
        # The wrapped regressor; fitted by ``fit`` and queried by ``predict``.
        self.model = LinearRegression()

    def fit(self, data, y=None):
        """Fit the regressor on columns of ``data`` and return ``self``.

        The ``y`` argument is ignored: the target is always taken from the
        'realized_volatility' column of ``data``.
        """
        self.model.fit(data[['log_price']], data['realized_volatility'])
        return self

    def transform(self, data):
        """Return this estimator; ``data`` is not used."""
        return self

    def predict(self, data):
        """Return the regressor's predictions for the 'log_price' column of ``data``."""
        return self.model.predict(data[['log_price']])
        
        
        
        

In [81]:
# Two-step pipeline: feature construction followed by the regression wrapper.
preproc = Pipeline(steps=[
    ('FinFeatures', FinancialFeatures()),
    ('my_model', my_model()),
])
# Display the pipeline's repr as the cell output.
preproc

In [82]:
# The last pipeline step's transform() returns the estimator itself, so
# fit_transform() on the pipeline yields the fitted my_model instance.
fitted_model = preproc.fit_transform(data)
# `data` already carries 'log_price' here because the feature step wrote its
# columns onto this same DataFrame object during fitting.
fitted_model.predict(data)



array([8.57849646e-05, 8.57849646e-05, 8.57849646e-05, ...,
       8.57849646e-05, 8.57849646e-05, 8.57849646e-05])