In [1]:
import sys
print(sys.executable)
print(sys.path)
import numpy as np
import pandas as pd
import iisignature
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm
import itertools
import sklearn
print(sklearn.__version__)
import numpy as np
import pandas as pd
import sklearn
import torch
import matplotlib
import iisignature

# Record the interpreter and library versions this notebook was run with.
print("Python Version:", sys.version)
for _label, _module in (
    ("Numpy Version:", np),
    ("Pandas Version:", pd),
    ("Scikit-learn Version:", sklearn),
    ("Torch Version:", torch),
    ("Matplotlib Version:", matplotlib),
    ("iisignature Version:", iisignature),
):
    # Look the attribute up inside the loop so earlier lines are still printed
    # if a later module does not expose ``__version__``.
    print(_label, _module.__version__)


/Users/arthur/opt/anaconda3/envs/informer_env/bin/python
['/Users/arthur/opt/anaconda3/envs/informer_env/lib/python36.zip', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6/lib-dynload', '', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6/site-packages', '/Users/arthur/opt/anaconda3/envs/informer_env/lib/python3.6/site-packages/IPython/extensions', '/Users/arthur/.ipython']
0.24.2
Python Version: 3.6.13 |Anaconda, Inc.| (default, Feb 23 2021, 12:58:59) 
[GCC Clang 10.0.0 ]
Numpy Version: 1.19.4
Pandas Version: 1.1.5
Scikit-learn Version: 0.24.2
Torch Version: 1.8.0
Matplotlib Version: 3.1.2
iisignature Version: 0.24


In [2]:


class Augment(BaseEstimator, TransformerMixin):
    """Stateless augmentations applied to a batch of paths.

    ``transform`` indexes its input as ``X[batch, time, channel]`` and applies
    the enabled steps in this order:

    1. ``normalize``: divide each channel of each path by its maximum over time.
    2. ``lead_lag``: for every entry of ``lags``, append a copy of the channels
       delayed by that many steps; the first observation pads the start.
    3. ``add_time``: prepend a channel running linearly from ``t0`` to ``t1``.
    4. ``basepoint``: prepend one all-zero observation to every path.

    Parameters
    ----------
    normalize, add_time, lead_lag, basepoint : bool
        Switches for the four steps above.
    t0, t1 : float
        End points of the added time channel.
    lags : sequence of int
        Non-negative delays used by the lead-lag step. The default is an
        immutable tuple so instances never share one mutable default object.
    """

    def __init__(self, normalize=False, add_time=True, lead_lag=False, basepoint=True, t0=0.0, t1=1.0, lags=(1,)):
        self.normalize = normalize
        self.add_time = add_time
        self.lead_lag = lead_lag
        self.basepoint = basepoint
        self.t0, self.t1 = t0, t1
        self.lags = lags

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``.

        ``y`` is accepted (and ignored) so that ``fit(X, y)`` and
        ``fit_transform(X, y)`` work.
        """
        return self

    def transform(self, X):
        """Return the augmented copy of ``X``; the input array is not modified.

        Raises
        ------
        ValueError
            If ``lead_lag`` is enabled and ``lags`` contains a negative value.
        """
        if self.normalize:
            # Out-of-place on purpose: ``X /= ...`` would overwrite the caller's
            # array (or a view into it) and fails outright for integer input.
            X = X / X.max(axis=1)[:, None, :]
        if self.lead_lag:
            n_steps = X.shape[1]
            X_list = [X]
            for lag in self.lags:
                if lag < 0:
                    raise ValueError(f"lags must be non-negative, got {lag}")
                X_shifted = np.zeros_like(X)
                # Explicit end index: ``X[:, :-lag]`` is empty when lag == 0,
                # which used to make this assignment fail.
                X_shifted[:, lag:, :] = X[:, :max(n_steps - lag, 0), :]
                X_shifted[:, :lag, :] = np.expand_dims(X[:, 0, :], axis=1)
                X_list.append(X_shifted)
            X = np.concatenate(X_list, axis=-1)
        if self.add_time:
            time = np.linspace(self.t0, self.t1, X.shape[1])
            X = np.concatenate((np.tile(time[None, :, None], [X.shape[0], 1, 1]), X), axis=-1)
        if self.basepoint:
            # Added after the time channel, so the basepoint's time value is 0 too.
            X = np.concatenate((np.zeros_like(X[:, :1]), X), axis=1)
        return X


class FeatureExtraction(Augment):
    """Rolling "global + local" signature features.

    For every step ``t`` in ``range(min_history, len(X))`` one output row is
    built from truncated signatures (via ``iisignature.sig``) of two segments
    of each path:

    * the prefix ``path[:t]`` ("global"), and
    * the trailing ``window`` rows ``path[t - window:t]`` ("local").

    The paths are (a) one path per level, made of that level's four columns of
    ``X`` plus the last column of ``X``, and (b) the four extra arrays passed
    to ``transform``. Every segment goes through ``Augment.transform`` first.

    Parameters
    ----------
    depth : int
        Truncation depth handed to ``iisignature.sig``.
    levels : int
        Number of 4-column groups read from the left of ``X``.
    min_history : int, default 100
        First step for which a row is produced.
    window : int, default 87
        Length of the local segment; must satisfy ``0 < window <= min_history``.
    t0, t1, lags, normalize, add_time, lead_lag, basepoint
        Forwarded unchanged to ``Augment``.

    Notes
    -----
    The global signature is recomputed on the whole prefix at every step, so
    the total work grows quadratically with ``len(X)``.
    """

    def __init__(self,
                 depth=2,
                 t0=0.0,
                 t1=1.0,
                 lags=(1,),
                 levels=1,
                 normalize=False,
                 add_time=True,
                 lead_lag=False,
                 basepoint=True,
                 min_history=100,
                 window=87,
    ):
        super().__init__(normalize, add_time, lead_lag, basepoint, t0, t1, lags)
        self.depth = depth
        self.levels = levels
        self.min_history = min_history
        self.window = window

    def fit(self, X, y=None):
        """Nothing is learned; return ``self``."""
        return self

    def _signature(self, segment):
        """Augment one 2-D segment (time, channel) and return its signature, shape (1, siglength)."""
        batch = segment[np.newaxis, :, :]
        if self.normalize:
            # ``batch`` is a view of the caller's array; work on a copy so the
            # normalisation step can never write through it.
            batch = batch.copy()
        return iisignature.sig(super().transform(batch), self.depth)

    def _global_local(self, path, t):
        """Return ``(global, local)`` signatures of ``path`` at step ``t``."""
        return (
            self._signature(path[:t, :]),
            self._signature(path[t - self.window:t, :]),
        )

    def transform(self, X, X_path1, X_path2, X_path3, X_path4):
        """Compute the signature feature table.

        Parameters
        ----------
        X : 2-D array (time, column)
            The first ``4 * levels`` columns are read as consecutive groups of
            four, one group per level; the last column is appended to every
            level path (the notebook stores the midprice there).
        X_path1, X_path2, X_path3, X_path4 : 2-D arrays (time, column)
            Category paths, row-aligned with ``X``; only their first
            ``len(X)`` rows are read.

        Returns
        -------
        pandas.DataFrame
            One row per step. Column ``"time_step"`` holds ``t``; the
            remaining integer-labelled columns hold, in order: the local
            signatures of paths 1-4, the global signatures of paths 1-4, then
            for each level its global signature followed by its local one.

        Raises
        ------
        ValueError
            If ``window``/``min_history`` are inconsistent, ``X`` has too few
            rows or columns, or a category path is shorter than ``X``.
        """
        num_features_per_level = 4  # four columns of X per level
        total_columns = num_features_per_level * self.levels

        if not 0 < self.window <= self.min_history:
            raise ValueError(
                f"window must satisfy 0 < window <= min_history, "
                f"got window={self.window}, min_history={self.min_history}"
            )

        X = np.asarray(X)
        n_rows = len(X)
        if n_rows <= self.min_history:
            # Previously surfaced as an opaque error from np.vstack([]).
            raise ValueError(
                f"X needs more than {self.min_history} rows to produce any feature row, got {n_rows}"
            )
        if X.shape[1] < total_columns:
            # Slicing past the last column would silently yield empty level paths.
            raise ValueError(
                f"X has {X.shape[1]} columns but levels={self.levels} requires at least {total_columns}"
            )

        category_paths = [np.asarray(p) for p in (X_path1, X_path2, X_path3, X_path4)]
        for position, path in enumerate(category_paths, start=1):
            if len(path) < n_rows:
                # A shorter path would silently give truncated or empty segments.
                raise ValueError(
                    f"X_path{position} has {len(path)} rows but X has {n_rows}"
                )

        # The per-level paths do not depend on t: build them once, not once per step.
        X_midprice = X[:, -1:]
        level_paths = [
            np.concatenate(
                [X[:, first:first + num_features_per_level], X_midprice], axis=1
            )
            for first in range(0, total_columns, num_features_per_level)
        ]

        signature_features = []
        for t in range(self.min_history, n_rows):
            # Per level: global signature first, then local.
            level_parts = []
            for path in level_paths:
                level_parts.extend(self._global_local(path, t))

            # Per category: all local signatures first, then all global ones.
            local_parts = []
            global_parts = []
            for path in category_paths:
                sig_global, sig_local = self._global_local(path, t)
                global_parts.append(sig_global)
                local_parts.append(sig_local)

            time_step_col = np.full((1, 1), t)
            signature_features.append(
                np.concatenate(
                    [time_step_col] + local_parts + global_parts + level_parts,
                    axis=1,
                )
            )

        # One row per step; only the first column gets a name.
        df_signatures = pd.DataFrame(np.vstack(signature_features))
        return df_signatures.rename(columns={0: "time_step"})


In [3]:
## important variables

# Signature truncation depths to run; the last entry is reused in the output file name.
depths = [2]
max_depths = depths[-1]

# Number of order-book levels passed to FeatureExtraction.
levels = 10

# Number of columns kept per category (askRate / bidRate / askSize / bidSize) for the category paths.
threshold_per_category = 10

# Chunk size; not referenced by the cells shown here.
num_chunk = 10000

In [4]:
# Data Processing Pipeline
nrows = 500000

data = pd.read_csv(
    '/Users/arthur/Documents/STUDY/Imperial/rough paths /salvi notebook/data.csv.gz',
    compression='gzip',
    nrows=nrows,
)
M, N = data.shape

# The target is the last column of the raw file; take it before the columns are reselected.
y = data.iloc[:, -1].values

# Column-name groups for the 15 order-book levels.
askSize_columns = ['askSize' + str(i) for i in range(15)]
bidSize_columns = ['bidSize' + str(i) for i in range(15)]
askRate_columns = ['askRate' + str(i) for i in range(15)]
bidRate_columns = ['bidRate' + str(i) for i in range(15)]
size_columns = askSize_columns + bidSize_columns
rate_columns = askRate_columns + bidRate_columns

# Missing sizes become 0.0; missing rates are forward-filled, then back-filled.
data[size_columns] = data[size_columns].fillna(0.0)
data[rate_columns] = data[rate_columns].ffill().bfill()

# Midprice from the level-0 ask and bid rates.
data['midprice'] = (data['askRate0'] + data['bidRate0']) * 0.5

# Keep the columns level by level (askRate, bidRate, askSize, bidSize), midprice last.
selected_columns = [
    f'{field}{i}'
    for i in range(15)
    for field in ('askRate', 'bidRate', 'askSize', 'bidSize')
] + ['midprice']
data = data[selected_columns]
data.shape


(500000, 61)

In [5]:
## CATEGORY PATHS FOR CATEGORY SIGNATURES

# Training split: the first 400,000 rows of the preprocessed frame, as a NumPy array.
X_train = data.values[:400000]

# One path per feature category, built from the first `threshold_per_category` levels.
# Each is cut to the same rows as X_train so the "_train" arrays really contain
# training rows only and stay row-aligned with X_train.
n_train_rows = len(X_train)
X_ask_rate_train = data[[f'askRate{i}' for i in range(threshold_per_category)]].values[:n_train_rows]
X_bid_rate_train = data[[f'bidRate{i}' for i in range(threshold_per_category)]].values[:n_train_rows]
X_ask_size_train = data[[f'askSize{i}' for i in range(threshold_per_category)]].values[:n_train_rows]
X_bid_size_train = data[[f'bidSize{i}' for i in range(threshold_per_category)]].values[:n_train_rows]



In [30]:
## Main cell: compute the signature features for each depth and save them to CSV.
## (Plan noted by the author: divide the set into small subsets, train on each
## subset, then average the resulting models.)
final_df = []


sub_final_dataset = []
for depth in tqdm(depths):
    feature_extraction = FeatureExtraction(depth=depth, levels=levels)
    feature_extraction.fit(X_train)  # Fit separately
    X_transformed = feature_extraction.transform(X_train, X_ask_rate_train, X_bid_rate_train, X_ask_size_train, X_bid_size_train)
    print(X_transformed.shape)

    # Align the targets through the time steps the transformer actually emitted,
    # instead of repeating its start offset and the split size by hand.
    timesteps = X_transformed["time_step"].to_numpy().astype(int)
    df = X_transformed.assign(y=y[timesteps])
    sub_final_dataset.append(df)

# NOTE(review): frames are stacked row-wise. With more than one entry in `depths`
# the frames presumably differ in width (signature length depends on depth) and
# would be NaN-padded - confirm before adding depths.
sub_final_df = pd.concat(sub_final_dataset, axis=0)
final_df.append(sub_final_df)


filename=f"Sig_glo_loc_lev_by_lev_depth-{max_depths}_fixed_87_num_train.csv"
sub_final_df.to_csv(filename, index=False)
print("Sub final dataset saved .csv")


  0%|          | 0/1 [00:00<?, ?it/s]