In [1]:
import sys

# Show which interpreter runs this kernel, then its module search path
# (one value per line).
print(sys.executable, sys.path, sep="\n")



/Users/arthur/opt/anaconda3/envs/informer_env/bin/python
['/Users/arthur/opt/anaconda3/envs/informer_env/lib/python36.zip', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6/lib-dynload', '', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6/site-packages', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6/site-packages/IPython/extensions', '/Users/arthur/.ipython']


In [2]:
import sklearn

# Report the scikit-learn release that this kernel imports.
print(f"{sklearn.__version__}")

0.21.3


In [3]:
# `sys` is imported here as well so that this cell runs on its own; it used to
# rely on an earlier cell having imported it.
import sys

import numpy as np
import pandas as pd
import sklearn
import torch
import matplotlib
import iisignature

# Record the exact library versions used for this run.
print("Python Version:", sys.version)
print("Numpy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("Scikit-learn Version:", sklearn.__version__)
print("Torch Version:", torch.__version__)
print("Matplotlib Version:", matplotlib.__version__)
print("iisignature Version:", iisignature.__version__)


Python Version: 3.6.13 |Anaconda, Inc.| (default, Feb 23 2021, 12:58:59) 
[GCC Clang 10.0.0 ]
Numpy Version: 1.19.4
Pandas Version: 0.25.1
Scikit-learn Version: 0.21.3
Torch Version: 1.8.0
Matplotlib Version: 3.1.1
iisignature Version: 0.24


In [4]:
import numpy as np
import pandas as pd
import iisignature
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import itertools

class Augment(BaseEstimator, TransformerMixin):
    """Stateless augmentations applied to a batch of paths.

    ``transform`` indexes its input as ``X[batch, time, channel]``. The enabled
    steps always run in this order: normalise, lead-lag, add time, basepoint.

    Parameters
    ----------
    normalize : bool, default=False
        Divide every channel of every path by that channel's maximum over the
        time axis.
    add_time : bool, default=True
        Prepend one channel holding evenly spaced values from ``t0`` to ``t1``.
    lead_lag : bool, default=False
        For every entry of ``lags``, append a copy of all channels delayed by
        that many time steps. The first ``lag`` steps of a delayed copy repeat
        the first observation.
    basepoint : bool, default=True
        Prepend one all-zero time step to every path.
    t0, t1 : float, default=0.0 and 1.0
        End points of the time channel.
    lags : sequence of int, default=[1]
        Delays (each >= 1) used when ``lead_lag`` is enabled.
    """

    def __init__(self, normalize=False, add_time=True, lead_lag=False, basepoint=True, t0=0.0, t1=1.0, lags=[1]):
        self.normalize = normalize
        self.add_time = add_time
        self.lead_lag = lead_lag
        self.basepoint = basepoint
        self.t0, self.t1 = t0, t1
        # The default list object is shared by all instances; this class only
        # ever reads it.
        self.lags = lags

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is accepted and ignored so that the scikit-learn calling
        convention ``fit(X, y)`` works as well as ``fit(X)``.
        """
        return self

    def transform(self, X):
        """Return the augmented copy of ``X``; the input array is left untouched.

        Parameters
        ----------
        X : ndarray indexed as ``[batch, time, channel]``

        Returns
        -------
        ndarray
            Same batch size. The time axis grows by one when ``basepoint`` is
            set; the channel axis grows with ``lead_lag`` and ``add_time``.

        Raises
        ------
        ValueError
            If ``lead_lag`` is enabled and an entry of ``lags`` is below 1.
        """
        if self.normalize:
            # Out-of-place division: `X /= ...` would overwrite the caller's
            # array and cannot store the result in an integer array.
            X = X / X.max(axis=1)[:, None, :]
        if self.lead_lag:
            first_step = X[:, :1, :]
            channel_groups = [X]
            for lag in self.lags:
                if lag < 1:
                    # lag == 0 makes `X[:, :-lag]` empty, and a negative lag
                    # wraps around instead of delaying the path.
                    raise ValueError(
                        "lags must be positive integers, got {!r}".format(lag)
                    )
                delayed = np.zeros_like(X)
                delayed[:, lag:, :] = X[:, :-lag, :]
                # Pad the start of the delayed copy with the first observation.
                delayed[:, :lag, :] = first_step
                channel_groups.append(delayed)
            X = np.concatenate(channel_groups, axis=-1)
        if self.add_time:
            time = np.linspace(self.t0, self.t1, X.shape[1])
            X = np.concatenate((np.tile(time[None, :, None], [X.shape[0], 1, 1]), X), axis=-1)
        if self.basepoint:
            X = np.concatenate((np.zeros_like(X[:, :1]), X), axis=1)
        return X


class FeatureExtraction(Augment):
    """Global and local path-signature features at randomly sampled rows.

    ``transform`` takes a 2-D array whose first ``4 * levels`` columns hold four
    columns per order-book level and whose last column is the mid-price. For
    each sampled row index ``tk`` and each level it computes two signatures of
    the 5-channel path (4 level columns + mid-price):

    * global: rows ``[0, tk)``
    * local:  rows ``[tk - 1, tk)``

    Each segment is augmented by ``Augment.transform`` before the signature is
    taken.

    Parameters
    ----------
    depth : int, default=2
        Truncation depth passed to ``iisignature.sig``.
    t0, t1, lags, normalize, add_time, lead_lag, basepoint
        Forwarded unchanged to ``Augment``.
    levels : int, default=1
        Number of order-book levels to use (>= 1).
    num_samples : int, default=100
        Number of row indices to draw (>= 1). Indices are drawn with
        replacement, so the same row can appear more than once.
    random_state : None, int or numpy.random.RandomState, default=None
        Source of randomness for the row sampling. ``None`` uses NumPy's global
        generator, so results then depend on ``np.random.seed``.
    """

    def __init__(self,
                 depth=2,
                 t0=0.0,
                 t1=1.0,
                 lags=[1],
                 levels=1,
                 normalize=False,
                 add_time=True,
                 lead_lag=False,
                 basepoint=True,
                 num_samples=100,
                 random_state=None):
        super().__init__(normalize=normalize, add_time=add_time, lead_lag=lead_lag,
                         basepoint=basepoint, t0=t0, t1=t1, lags=lags)
        self.depth = depth
        self.levels = levels
        self.num_samples = num_samples
        self.random_state = random_state

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def _window_signature(self, segment):
        """Augment one ``(time, channel)`` segment and return its signature with a leading batch axis of 1."""
        batch = segment[np.newaxis, :, :]
        if self.normalize:
            # Defensive copy: guarantees that normalisation inside the
            # augmentation step can never write back into the per-level array
            # that the next window is sliced from.
            batch = batch.copy()
        augmented = super().transform(batch)
        return iisignature.sig(augmented, self.depth)

    def transform(self, X):
        """Compute the signature feature table.

        Parameters
        ----------
        X : array-like of shape (n_rows, n_columns)
            At least ``4 * levels`` columns; the last column is used as the
            mid-price.

        Returns
        -------
        pandas.DataFrame
            One row per sampled index, sorted by index. Column ``time_step``
            holds the sampled index ``tk``; it is followed, level by level, by
            the ``sig_global_*`` and then the ``sig_local_*`` columns.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has fewer than two rows or fewer than
            ``4 * levels`` columns, or if ``levels`` or ``num_samples`` is
            below 1.
        """
        X = np.asarray(X)
        num_features_per_level = 4  # bid price, ask price, bid volume, ask volume
        total_columns = num_features_per_level * self.levels

        if X.ndim != 2:
            raise ValueError("X must be 2-D (rows, columns), got {} dimension(s)".format(X.ndim))
        n_rows, n_columns = X.shape
        if self.levels < 1:
            raise ValueError("levels must be at least 1, got {!r}".format(self.levels))
        if self.num_samples < 1:
            raise ValueError("num_samples must be at least 1, got {!r}".format(self.num_samples))
        if n_columns < total_columns:
            raise ValueError(
                "X has {} columns but levels={} needs at least {}".format(
                    n_columns, self.levels, total_columns)
            )
        if n_rows < 2:
            # Indices are drawn from [1, n_rows), which is empty otherwise.
            raise ValueError("X needs at least 2 rows to sample from, got {}".format(n_rows))

        if self.random_state is None:
            rng = np.random  # NumPy's global generator
        elif isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)
        sampled_timesteps = np.sort(rng.randint(1, n_rows, size=self.num_samples))

        X_midprice = X[:, -1:]  # Include midprice

        # Levels form the outer loop so that each level's 5-column path is
        # built once instead of once per sampled row.
        level_blocks = []
        for level in range(self.levels):
            start_idx = level * num_features_per_level
            end_idx = start_idx + num_features_per_level
            level_path = np.concatenate([X[:, start_idx:end_idx], X_midprice], axis=1)

            rows = []
            for tk in sampled_timesteps:
                # Global signature over rows [0, tk).
                sig_global = self._window_signature(level_path[:tk, :])
                # Local signature over rows [tk - 1, tk).
                # NOTE(review): this slice holds the single row tk - 1, so the
                # only increment in the path comes from the zero basepoint.
                # Confirm whether a two-row window was intended.
                sig_local = self._window_signature(level_path[tk - 1:tk, :])
                rows.append(np.concatenate([sig_global, sig_local], axis=1))
            level_blocks.append(np.vstack(rows))

        # Row layout: [time_step, level 0 global, level 0 local, level 1 global, ...]
        time_step_col = sampled_timesteps.reshape(-1, 1)
        features = np.concatenate([time_step_col] + level_blocks, axis=1)

        # Column naming: each level block is half global, half local.
        sig_length = level_blocks[0].shape[1] // 2
        feature_columns = ["time_step"]
        for level in range(self.levels):
            feature_columns.extend(
                f"sig_global_level_{level}_depth_{self.depth}_element_{i}"
                for i in range(sig_length)
            )
            feature_columns.extend(
                f"sig_local_level_{level}_depth_{self.depth}_element_{i}"
                for i in range(sig_length)
            )

        return pd.DataFrame(features, columns=feature_columns)


In [5]:
# Data Processing Pipeline
leave_out_start = 500
leave_out_end = 500
nrows = 500000

# Read only the first `nrows` rows of the gzip-compressed CSV.
data = pd.read_csv(
    '/Users/arthur/Documents/STUDY/Imperial/rough paths /salvi notebook/data.csv.gz',
    compression='gzip',
    nrows=nrows,
)
M, N = data.shape

# The target is the last column of the raw file; take it now, because the
# frame is reduced to the feature columns further down.
y = data.iloc[:, -1].values

# Data Preprocessing
# Sizes: a missing value is replaced by 0.0.
askSize_columns = [f'askSize{i}' for i in range(15)]
bidSize_columns = [f'bidSize{i}' for i in range(15)]
size_columns = askSize_columns + bidSize_columns
data[size_columns] = data[size_columns].fillna(0.0)

# Rates: forward-fill first, then back-fill whatever is still missing.
askRate_columns = [f'askRate{i}' for i in range(15)]
bidRate_columns = [f'bidRate{i}' for i in range(15)]
rate_columns = askRate_columns + bidRate_columns
data[rate_columns] = data[rate_columns].ffill().bfill()

# Mid-price from the index-0 ask and bid rates.
data['midprice'] = (data['askRate0'] + data['bidRate0']) * 0.5

# Column order: (askRate, bidRate, askSize, bidSize) for index 0, then
# index 1, ... and finally the mid-price.
selected_columns = list(
    itertools.chain(*zip(askRate_columns, bidRate_columns, askSize_columns, bidSize_columns))
) + ['midprice']
data = data[selected_columns]

X = data.values

In [6]:
# Signature Extraction
SEED = 42  # fixes which rows get sampled, so a rerun reproduces the saved files

final_dataset = []
depths = [2]
levels = 15
max_depths = max(depths)  # used in the output file name; does not rely on `depths` being sorted

for depth in tqdm(depths):
    # FeatureExtraction, as constructed here, draws its rows from NumPy's
    # global generator. Re-seeding per depth makes every depth use the same
    # sampled rows.
    np.random.seed(SEED)

    feature_extraction = FeatureExtraction(depth=depth, levels=levels)
    X_transformed = feature_extraction.fit_transform(X)
    print(X_transformed.shape)

    # Pair each feature row with the target of the last data row inside its
    # window: FeatureExtraction.transform builds the window for time_step tk
    # from rows [0, tk), so that row is tk - 1. Taking y[:n] instead would
    # attach the first n targets to features from unrelated, randomly sampled
    # rows.
    target_rows = X_transformed["time_step"].values.astype(int) - 1
    # `assign` returns a new frame, so the transformer's output is not modified.
    df = X_transformed.assign(y=y[target_rows])

    filename = f"final_dataset_depth-{depth}.csv"
    df.to_csv(filename, index=False)
    final_dataset.append(df)

# NOTE(review): the feature column names contain the depth, so stacking frames
# for several different depths along axis 0 yields the union of all columns
# with NaN where a depth has no such column. Confirm this is intended before
# adding more depths.
final_df = pd.concat(final_dataset, axis=0)
filename = f"Test_Sig_global_local_level_by_level_depth-{max_depths}.csv"
final_df.to_csv(filename, index=False)
# Report the file that was actually written.
print(f"Final dataset saved as {filename}")


100%|██████████| 1/1 [01:27<00:00, 87.13s/it]

(100, 1261)
Final dataset saved as complete_signature_dataset.csv



