# Recovering Time Structure

This notebook recovers the temporal ordering of the `time_id`s in the data so that it can be incorporated into model training, ensuring that our models do not inadvertently access future information. We apply the Isomap algorithm, which constructs a k-nearest-neighbors (KNN) graph using Euclidean distances, computes shortest-path distances on that graph (approximating geodesic distances), and then applies classical multidimensional scaling (MDS) to those distances. The `time_id`s are then ranked along the resulting one-dimensional embedding to obtain their order.


In [1]:
# libraries and settings
import pandas as pd
from sklearn.manifold import Isomap
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# after this point (including pandas / scikit-learn diagnostics) is hidden;
# consider narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# load the feature table and the raw training table (in that order)
df_train, train = (
    pd.read_csv('../data/df_train.csv'),
    pd.read_csv('../data/train.csv'),
)

# attach the `target` column (pandas aligns it on the row index) and keep only
# the rows that have no missing value in any column
df_train = df_train.assign(target=train['target']).dropna()

In [3]:
# recovering the time structure (to be used later for training the models)
#
# Steps:
#   1. parse stock_id / time_id out of row_id
#   2. keep only the stocks that are present at every time_id (balanced panel)
#   3. average the 'trade' features per time_id and standardise them
#   4. embed the time_ids in 1D with Isomap and rank them along that axis
#   5. write the (time_id, time_id_ordered) mapping to ../data/time_order.csv

# number of neighbours used by Isomap to build its KNN graph
N_NEIGHBORS = 10

# row_id is split on '-': the first part is the stock id, the second the time id.
# Split once (vectorised) instead of running a Python lambda per row, twice.
row_id_parts = df_train['row_id'].str.split('-', expand=True)

# stock_id is stored as a category, so the numeric-only mean further down
# leaves it out of the per-time feature averages
df_train['stock_id'] = row_id_parts[0].astype('int64').astype('category')
df_train['time_id'] = row_id_parts[1].astype('int64')


###### finding a balanced panel of stocks #########
all_stocks = df_train['stock_id'].unique()
all_times = df_train['time_id'].unique()

# create the full cartesian product (all combinations)
full_df = pd.DataFrame(
    index=pd.MultiIndex.from_product(
        [all_times, all_stocks], names=['time_id', 'stock_id']
    )
).reset_index()

# find the missing combinations of stock id and time id
df_existing = df_train[['time_id', 'stock_id']].drop_duplicates()

missing_pairs = (
    pd.merge(
        full_df,
        df_existing,
        on=['time_id', 'stock_id'],
        how='left',
        indicator=True,
    )
    .query('_merge == "left_only"')
    .drop(columns='_merge')
)

# a stock is dropped as soon as it is missing for at least one time_id
incomplete_stocks = missing_pairs['stock_id'].unique()

# .copy() so that the column added below is written to an independent frame
# rather than to a filtered view of the previous df_existing
df_existing = df_existing[~df_existing['stock_id'].isin(incomplete_stocks)].copy()

if df_existing.empty:
    # fail here with a clear message instead of deep inside StandardScaler / Isomap
    raise ValueError(
        'No stock is observed at every time_id; cannot build a balanced panel.'
    )

# check that now all the time id's have the data for all the remaining stocks
# (the result of a mid-cell .describe() is never displayed, so assert instead)
df_existing['n'] = df_existing.groupby('time_id')['time_id'].transform('count')
assert df_existing['n'].nunique() == 1, 'time_ids do not all cover the same number of stocks'
assert df_existing['time_id'].nunique() == len(all_times), 'some time_ids were lost while balancing the panel'

cols_to_keep = [col for col in df_train.columns if 'trade' in col]
cols_to_keep.extend(['time_id', 'stock_id'])

# selecting features to be used for recovering the time structure:
# one row per time_id, holding the mean of every numeric 'trade' column
# across the stocks of the balanced panel
balanced_rows = ~df_train['stock_id'].isin(incomplete_stocks)
time_features = (
    df_train.loc[balanced_rows, cols_to_keep]
    .groupby('time_id')
    .mean(numeric_only=True)
    .reset_index()
)


time_ids = time_features['time_id'].values
X = time_features.drop(columns='time_id').values
X = StandardScaler().fit_transform(X)

# fit Isomap to get 1D embedding
# NOTE(review): the sign of a 1D embedding is arbitrary, so the ranking below
# may run forwards or backwards in time; confirm the direction before relying
# on it for train/validation splits.
iso = Isomap(n_neighbors=N_NEIGHBORS, n_components=1)
Z = iso.fit_transform(X)


time_ordered_df = pd.DataFrame({
    'time_id': time_ids,
    'isomap_order_value': Z[:, 0]
})
# rank 1..N along the embedding axis; method='first' breaks ties by row order
time_ordered_df['time_id_ordered'] = (
    time_ordered_df['isomap_order_value'].rank(method='first').astype(int)
)

# only the time_id -> time_id_ordered mapping is persisted
time_ordered_df.drop(columns=['isomap_order_value']).to_csv('../data/time_order.csv', index=False)

