## Classes for computing linear signature features

In [9]:
import numpy as np
import pandas as pd
import iisignature
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import itertools

class Augment(BaseEstimator, TransformerMixin):
    """Stateless path-augmentation transformer.

    ``transform`` expects a 3-D array indexed as ``X[path, step, channel]``
    and applies, in this order, each augmentation that is switched on:
    normalisation, lagged copies, a time channel, and a zero basepoint.

    Parameters
    ----------
    normalize : bool
        Divide every channel of every path by its maximum over the step axis.
    add_time : bool
        Prepend a channel holding ``np.linspace(t0, t1, n_steps)``.
    lead_lag : bool
        Append, for every entry of ``lags``, a copy of the path delayed by
        that many steps (the first value is repeated to fill the gap).
    basepoint : bool
        Prepend an all-zero step to every path.
    t0, t1 : float
        End points of the time channel.
    lags : sequence of int
        Positive delays used when ``lead_lag`` is true.
    """

    # ``lags`` defaults to a tuple rather than a list so that the default
    # object cannot be mutated and shared between instances.
    def __init__(self, normalize=False, add_time=True, lead_lag=False, basepoint=True, t0=0.0, t1=1.0, lags=(1,)):
        self.normalize = normalize
        self.add_time = add_time
        self.lead_lag = lead_lag
        self.basepoint = basepoint
        self.t0, self.t1 = t0, t1
        self.lags = lags

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return the augmented copy of ``X``.

        Parameters
        ----------
        X : ndarray, indexed as ``X[path, step, channel]``

        Returns
        -------
        ndarray
            Same number of paths; the step axis grows by one when
            ``basepoint`` is set and the channel axis grows with
            ``lead_lag`` / ``add_time``.

        Raises
        ------
        ValueError
            If ``lead_lag`` is set and a lag is smaller than 1.
        """
        if self.normalize:
            # Out-of-place on purpose: ``X /= ...`` would overwrite the
            # caller's data (callers may hand in a view of a larger array)
            # and fails outright for integer arrays.
            X = X / X.max(axis=1)[:, None, :]
        if self.lead_lag:
            X_list = [X]
            for lag in self.lags:
                if lag < 1:
                    raise ValueError(f"lags must be positive integers, got {lag!r}")
                X_shifted = np.zeros_like(X)
                # Delayed copy: step s of the shifted path is step s - lag of X.
                X_shifted[:, lag:, :] = X[:, :-lag, :]
                # The first `lag` steps have no predecessor; hold the initial value.
                X_shifted[:, :lag, :] = np.expand_dims(X[:, 0, :], axis=1)
                X_list.append(X_shifted)
            X = np.concatenate(X_list, axis=-1)
        if self.add_time:
            time = np.linspace(self.t0, self.t1, X.shape[1])
            # Broadcast the same time grid to every path and put it first.
            X = np.concatenate((np.tile(time[None, :, None], [X.shape[0], 1, 1]), X), axis=-1)
        if self.basepoint:
            X = np.concatenate((np.zeros_like(X[:, :1]), X), axis=1)
        return X

class FeatureExtraction(Augment):
    """Signature features of a 2-D series at randomly sampled time steps.

    For every sampled step ``tk`` two signatures are computed with
    ``iisignature.sig`` after applying the ``Augment`` transformations:
    a "global" one over rows ``[0, tk)`` and a "local" one over the row
    slice ``[tk - 1, tk)``.

    Parameters
    ----------
    depth : int
        Truncation depth passed to ``iisignature.sig``.
    levels : int
        Number of leading 4-column groups of ``X`` that are kept; the last
        column of ``X`` is always appended.
    num_samples : int
        Number of time steps drawn (with replacement) per ``transform`` call.
    random_state : None, int or numpy.random.RandomState
        ``None`` (default) draws from NumPy's global random state, exactly
        as before this parameter existed. An int seeds a fresh
        ``RandomState`` on every ``transform`` call so results are
        reproducible; a ``RandomState`` instance is used as is.
    t0, t1, lags, normalize, add_time, lead_lag, basepoint
        Forwarded unchanged to ``Augment``.
    """

    def __init__(self,
                 depth=2,
                 t0=0.0,
                 t1=1.0,
                 lags=(1,),
                 levels=1,
                 normalize=False,
                 add_time=True,
                 lead_lag=False,
                 basepoint=True,
                 num_samples=5000,
                 random_state=None):
        super().__init__(normalize=normalize, add_time=add_time, lead_lag=lead_lag,
                         basepoint=basepoint, t0=t0, t1=t1, lags=lags)
        self.depth = depth
        self.levels = levels
        self.num_samples = num_samples
        self.random_state = random_state

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def _segment_signature(self, segment):
        """Augment one 2-D segment as a batch of one path and return its signature."""
        augmented = super().transform(segment[np.newaxis, :, :])
        return iisignature.sig(augmented, self.depth)

    def transform(self, X):
        """Build the signature feature table.

        Parameters
        ----------
        X : ndarray indexed as ``X[row, column]``
            The first ``4 * levels`` columns and the last column are used.

        Returns
        -------
        pandas.DataFrame
            One row per sampled step with columns ``time_step``,
            ``sig_global_depth_<depth>_element_<i>`` and
            ``sig_local_depth_<depth>_element_<i>``.

        Raises
        ------
        ValueError
            If ``num_samples`` is smaller than 1 or ``X`` has fewer than
            two rows (no valid step can be drawn).
        """
        if self.num_samples < 1:
            raise ValueError(f"num_samples must be at least 1, got {self.num_samples!r}")

        num_features_per_level = 4  # askRate, bidRate, askSize, bidSize
        total_columns = num_features_per_level * self.levels

        # Keep the requested order-book levels plus the last column (midprice).
        X_truncated = np.concatenate([X[:, :total_columns], X[:, -1:]], axis=1)

        n_rows = X_truncated.shape[0]
        if n_rows < 2:
            raise ValueError(f"X needs at least 2 rows to sample a time step, got {n_rows}")

        if self.random_state is None:
            rng = np.random  # legacy behaviour: global NumPy random state
        elif isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)
        sampled_timesteps = np.sort(rng.randint(1, n_rows, size=self.num_samples))

        # NOTE(review): every iteration recomputes the signature of the whole
        # prefix X[:tk], so the run time grows with num_samples * len(X).
        signature_features = []
        for tk in sampled_timesteps:
            sig_global = self._segment_signature(X_truncated[:tk, :])
            # NOTE(review): this slice holds the single row tk - 1, so the
            # "local" path has one point before augmentation. If an interval
            # between two consecutive rows was intended, the slice needs two
            # rows -- confirm before changing, since it alters the features.
            sig_local = self._segment_signature(X_truncated[tk - 1:tk, :])

            time_step_col = np.full((sig_global.shape[0], 1), tk)
            signature_features.append(
                np.concatenate([time_step_col, sig_global, sig_local], axis=1)
            )

        features = np.vstack(signature_features)
        sig_length = features.shape[1] - 1  # everything except time_step
        half = sig_length // 2

        feature_columns = ["time_step"]
        feature_columns += [f"sig_global_depth_{self.depth}_element_{i}" for i in range(half)]
        feature_columns += [f"sig_local_depth_{self.depth}_element_{i}" for i in range(sig_length - half)]

        return pd.DataFrame(features, columns=feature_columns)

In [10]:
# Data Processing Pipeline
# Load the first `nrows` rows of the gzip-compressed CSV, fill the gaps in the
# size and rate columns, derive a midprice column, and expose the reordered
# table as the array X.
leave_out_start = 500
leave_out_end = 500
nrows = 500000

data = pd.read_csv(
    '/Users/arthur/Documents/STUDY/Imperial/rough paths /salvi notebook/data.csv.gz',
    compression='gzip',
    nrows=nrows,
)
M, N = data.shape

# The target is the last column of the raw file; read it before the frame is
# reduced to the selected columns below.
y = data.iloc[:, -1].values

# Sizes: a missing value is replaced by 0.0.
askSize_columns = [f'askSize{i}' for i in range(15)]
bidSize_columns = [f'bidSize{i}' for i in range(15)]
size_columns = askSize_columns + bidSize_columns
data[size_columns] = data[size_columns].fillna(0.0)

# Rates: forward-fill first, then back-fill whatever is still missing.
askRate_columns = [f'askRate{i}' for i in range(15)]
bidRate_columns = [f'bidRate{i}' for i in range(15)]
rate_columns = askRate_columns + bidRate_columns
data[rate_columns] = data[rate_columns].ffill().bfill()

# Midprice is the average of the level-0 ask and bid rates.
data['midprice'] = (data['askRate0'] + data['bidRate0']) * 0.5

# Column order: the four fields of level 0, then level 1, ..., then midprice.
selected_columns = list(itertools.chain.from_iterable(
    (f'askRate{i}', f'bidRate{i}', f'askSize{i}', f'bidSize{i}') for i in range(15)
))
selected_columns.append('midprice')
data = data[selected_columns]

X = data.values

In [12]:
# Signature Extraction
# For every truncation depth: compute the signature feature table, attach the
# target, write it to its own CSV, then write all tables stacked together.
final_dataset = []
depths = [1, 2, 3]
levels = 10

for depth in tqdm(depths):
    feature_extraction = FeatureExtraction(depth=depth, levels=levels)
    X_transformed = feature_extraction.fit_transform(X)
    print(X_transformed.shape)

    df = pd.DataFrame(X_transformed)
    # Each feature row belongs to a randomly sampled time step, so the target
    # must be looked up at that step. `time_step` stores tk, the exclusive end
    # of the prefix X[:tk]; the last row that entered the features is tk - 1.
    # (The previous `y[:df.shape[0]]` paired the rows with the first N targets
    # regardless of which steps were sampled.)
    last_observed_rows = df["time_step"].to_numpy().astype(int) - 1
    df["y"] = y[last_observed_rows]

    filename = f"final_dataset_depth-{depth}.csv"
    df.to_csv(filename, index=False)
    final_dataset.append(df)

# NOTE(review): the per-depth tables have different, depth-specific column
# names, so stacking them row-wise yields the union of all columns with NaN
# where a depth lacks a column -- confirm this layout is what is wanted.
final_df = pd.concat(final_dataset, axis=0)
final_df.to_csv("complete_signature_dataset.csv", index=False)
print("Final dataset saved as complete_signature_dataset.csv")


  0%|          | 0/3 [00:00<?, ?it/s]

(5000, 85)


 33%|███▎      | 1/3 [16:01<32:02, 961.09s/it]

(5000, 3613)


 67%|██████▋   | 2/3 [15:45:17<7:52:38, 28358.73s/it]


KeyboardInterrupt: 