In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import subprocess

def install_packages(packages):
    """
    Install a list of packages using pip.

    Args:
        packages (list): A list of package names to install.
    """
    for package in packages:
        subprocess.run(["pip", "install", package], check=True)


In [3]:
packages_to_install = ["keras_hub", "polars"]

# Install the packages
install_packages(packages_to_install)

# Core data processing and numerical libraries
import os

os.environ["KERAS_BACKEND"] = "jax"
import keras
import numpy as np
import pandas as pd
import polars as pl

from typing import Dict, List, Tuple


# Visualization
import matplotlib.pyplot as plt

import tensorflow as tf

# Keras imports
from keras import layers
from keras import Model
from keras import ops
from keras_hub.layers import TransformerEncoder
from keras import regularizers

from sklearn.metrics import r2_score

from pathlib import Path
import gc

DATA_DIR = Path("/content/drive/MyDrive/Colab Notebooks/Jane_Street/data/train.parquet/")

In [4]:
feature_09_dict = {
    2: 0,
    4: 1,
    9: 2,
    11: 3,
    12: 4,
    14: 5,
    15: 6,
    25: 7,
    26: 8,
    30: 9,
    34: 10,
    42: 11,
    44: 12,
    46: 13,
    49: 14,
    50: 15,
    57: 16,
    64: 17,
    68: 18,
    70: 19,
    81: 20,
    82: 21
}
feature_10_dict = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 10: 7, 12: 8}
feature_11_dict = {
    9: 0,
    11: 1,
    13: 2,
    16: 3,
    24: 4,
    25: 5,
    34: 6,
    40: 7,
    48: 8,
    50: 9,
    59: 10,
    62: 11,
    63: 12,
    66: 13,
    76: 14,
    150: 15,
    158: 16,
    159: 17,
    171: 18,
    195: 19,
    214: 20,
    230: 21,
    261: 22,
    297: 23,
    336: 24,
    376: 25,
    388: 26,
    410: 27,
    522: 28,
    534: 29,
    539: 30
}

In [5]:
model = keras.Sequential()
model.add(keras.layers.Embedding(1000, 64))
# The model will take as input an integer matrix of size (batch,
# input_length), and the largest integer (i.e. word index) in the input
# should be no larger than 999 (vocabulary size).
# Now model.output_shape is (None, 10, 64), where `None` is the batch
# dimension.
input_array = np.random.randint(10, size=(5))
model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
print(output_array.shape)
(32, 10, 64)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
(5, 64)


(32, 10, 64)

In [6]:
output_array.shape

(5, 64)

In [7]:
input_array.shape

(5,)

In [8]:
def reduce_memory_usage() -> pl.Expr:
    expressions = [
        pl.col(pl.Float64).cast(pl.Float32),
        pl.col("date_id", "time_id").cast(pl.Int16),
        pl.col("symbol_id").cast(pl.Int8),
        ]
    return expressions

def get_lag_responders() -> pl.Expr:
    cols = [f"responder_{i}" for i in range(9)]
    expressions = [
        pl.col(col)
        .shift(i)
        .over('symbol_id', 'time_id')
        .alias(f"{col}_lag_{i}")
        for col in cols for i in [1] # lags
        ]
    return expressions

def map_category() -> pl.Expr:
    expressions = [
        pl.col('symbol_id').replace({i: i for i in range(39)}).fill_null(99),
        pl.col('feature_09').replace(feature_09_dict).fill_null(99),
        pl.col('feature_10').replace(feature_10_dict).fill_null(99),
        pl.col('feature_11').replace(feature_11_dict).fill_null(99),
    ]
    return expressions

def get_temporal_features() -> pl.Expr:
    expressions = [
        (pl.col('date_id') % 170).alias('day'),
        (pl.col('date_id') * 2 * np.pi / 170).sin().cast(pl.Float32).alias('date_sin'),
        (pl.col('date_id') * 2 * np.pi / 170).cos().cast(pl.Float32).alias('date_cos'),
        # (pl.col('time_id') * 2 * np.pi / 967).sin().cast(pl.Float32).alias('time_id_sin'),
        # (pl.col('time_id') * 2 * np.pi / 967).cos().cast(pl.Float32).alias('time_id_cos')
    ]
    return expressions

def get_lag_stats_per_day() -> pl.Expr:
    group = ["date_id", "symbol_id"]
    # cols = [f"responder_{i}_lag_1" for i in range(9)]
    cols = ['responder_6_lag_1']
    expressions = []
    for col in cols:
        exprs = [
            # pl.col(col).min().over(group).alias(f"{col}_min"),
            pl.col(col).max().over(group).alias(f"{col}_max"),
            # pl.col(col).std().over(group).alias(f"{col}_std")
            # pl.col(col).median().over(group).alias(f"{col}_median_per_day")
        ]
        expressions.extend(exprs)
    return expressions

def get_lag_features() -> pl.Expr:
    group = ['symbol_id', 'date_id']
    expressions = [
        pl.col('feature_07').shift(2).over(group).alias('feature_07_lag'),
        pl.col('feature_06').shift(1).over(group).alias('feature_06_lag'),
        pl.col('feature_60').shift(2).over(group).alias('feature_60_lag'),
        # pl.col('feature_04').shift(4).over(group).alias('feature_04_lag'),
        # pl.col('feature_05').shift(5).over(group).alias('feature_05_lag'),
        # pl.col('feature_36').shift(4).over(group).alias('feature_36_lag'),
        # pl.col('feature_58').shift(4).over(group).alias('feature_58_lag'),
        # pl.col('feature_59').shift(1).over(group).alias('feature_59_lag'),
        # pl.col('feature_38').shift(8).over(group).alias('feature_38_lag'),
        # pl.col('feature_52').shift(3).over(group).alias('feature_52_lag'),
    ]
    return expressions

def get_tag_means() -> pl.Expr:
    expressions = [
        pl.mean_horizontal(tag_11_cols).alias('tag_11_mean'),
    ]
    return expressions

def generate_features(df):
    exprs_1 = [
        map_category(),
        get_temporal_features(),
        get_lag_features(),
        get_tag_means(),
        get_lag_responders(),
    ]
    expressions = [e for sublist in exprs_1 for e in sublist]
    df = df.with_columns(reduce_memory_usage())
    df = df.with_columns(
        pl.col('feature_09').cast(pl.Int8),
        pl.col('feature_10').cast(pl.Int8),
        pl.col('feature_11').cast(pl.Int16),
    )
    df = df.with_columns(expressions)
    df = df.with_columns(get_lag_stats_per_day())

    df = df.with_columns(
        pl.col('time_id') / 968,
        pl.col('day') / 170
    )

    ign_cols = [f"responder_{i}_lag_1" for i in range(9)]
    return df.select(pl.all().exclude(ign_cols))

In [9]:
df_feat = pl.read_csv("/content/drive/MyDrive/Colab Notebooks/Jane_Street/data/features.csv")
tag_11_cols = df_feat.filter(pl.col('tag_11')==True)['feature'].to_list()

In [10]:
train = pl.scan_parquet(DATA_DIR).filter(pl.col('partition_id')>5)
train = generate_features(train)
# remove nulls resulting from day 1 lags
start_date = train.select(pl.first('date_id')).collect()
train = train.filter(pl.col('date_id') > start_date)
train = train.fill_null(0)

train.head(5).collect()

date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,…,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id,day,date_sin,date_cos,feature_07_lag,feature_06_lag,feature_60_lag,tag_11_mean,responder_6_lag_1_max
i16,f64,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64,f64,f32,f32,f32,f32,f32,f64,f32
1021,0.0,0,2.758212,-0.88871,-0.280876,-0.709766,-1.114013,2.130691,-0.675838,-0.217391,-0.15832,0.157831,3,6,14,-0.824303,1.321677,-0.211726,0.0,0.028558,0.0,-1.903073,-1.309708,0.56428,-0.113246,0.460829,0.874078,-0.634349,-0.989553,0.363487,1.548323,0.658012,-0.147794,-0.0248,-0.11404,0.0,…,-0.031517,0.822857,-0.120644,-0.096939,-0.165281,-1.85763,-1.983462,-0.761288,1.461293,-0.095967,-0.949327,0.945887,-0.22943,0.0,0.0,-0.086589,-0.138892,-0.339265,-0.208284,0.143218,0.036284,0.605471,-2.160379,-2.312699,-1.693053,-3.237157,-2.757349,-2.678967,6,0.005882,0.036952,0.999317,0.0,0.0,0.0,0.201346,3.252243
1021,0.0,1,5.128181,-0.667143,-0.156818,-0.551448,-1.022433,1.902604,-0.591912,-0.243567,-0.178642,0.154012,3,6,14,-1.341635,0.35188,-0.521197,0.0,-0.344777,0.0,-1.983181,-1.479607,0.43023,0.102232,1.243619,1.429466,-0.268186,-0.352289,0.443074,1.538062,1.902193,-0.358995,-0.486083,0.120412,0.0,…,-0.110262,0.822857,-0.224284,-0.379963,-0.184559,-1.470559,-1.480629,-0.691343,0.60478,-0.246037,-1.084746,-0.031681,-0.598442,0.0,0.0,-0.292409,-0.29329,-0.189932,-0.304927,0.080642,0.031724,0.098682,-0.183446,-0.492214,0.865066,-0.417081,-0.854174,1.2233,6,0.005882,0.036952,0.999317,0.0,0.0,0.0,-0.110867,4.152138
1021,0.0,2,2.990457,-0.592874,-0.843829,-1.172633,-0.977152,1.665111,-0.780339,-0.248836,-0.295433,0.212535,20,1,10,-0.68717,4.194336,0.333165,0.0,-0.048717,0.0,-2.749262,-1.98124,-0.741161,-0.110016,0.762164,0.384524,-1.003869,-0.887358,0.924972,1.867121,1.078736,-0.892071,-0.564501,-0.161899,0.0,…,3.020605,0.822857,-0.163303,-0.217823,-0.203042,-1.729299,-1.913308,-0.555369,4.493433,0.638722,-0.765278,2.414558,-0.003537,0.0,0.0,0.36877,0.558933,-0.099664,-0.018457,0.105283,0.214745,0.353996,-1.960483,-1.242982,-2.430258,-3.529213,-2.412281,-3.481411,6,0.005882,0.036952,0.999317,0.0,0.0,0.0,1.525596,5.0
1021,0.0,3,1.580349,-0.949817,-0.223031,-0.797525,-0.895173,2.020643,-0.434421,-0.165368,-0.207286,0.163731,1,2,1,-0.966018,1.740896,-0.26865,0.0,-0.274083,0.0,-0.462178,-1.663668,-0.468623,-0.010726,-0.718504,-0.659684,0.657763,0.178212,-0.276047,-1.019963,-0.891104,-0.443206,-0.595868,-0.0089,0.0,…,0.785277,0.822857,-0.211538,-0.221234,-0.141883,-1.854496,-1.293521,-0.906615,1.082458,-0.1356,-0.96185,1.193984,-0.453,0.0,0.0,1.113736,1.278999,0.051255,0.033749,0.078974,0.088442,0.710223,1.622804,2.530923,1.304712,2.790201,2.580255,1.266655,6,0.005882,0.036952,0.999317,0.0,0.0,0.0,0.013414,3.490906
1021,0.0,4,3.252043,-0.611066,-0.523858,-0.769166,-1.01305,1.424418,-0.265226,-0.093243,-0.09816,0.076199,6,0,0,-0.771626,4.367999,0.230222,0.0,-0.341415,0.0,-1.277562,-1.053409,-0.160347,0.884322,0.977945,-0.11547,2.352565,2.582544,-1.674891,-0.515939,0.555117,-0.733854,-1.190635,0.581692,0.0,…,1.005107,0.822857,-0.216605,-0.376726,-0.490268,-2.035804,-2.006988,-0.55399,3.02505,0.347953,-0.657172,2.611223,-0.002669,0.0,0.0,8.576644,8.451904,9.772758,6.689439,-1.08646,-0.860899,-2.133953,1.432439,-2.483309,-1.849198,3.356596,-2.83107,-3.252782,6,0.005882,0.036952,0.999317,0.0,0.0,0.0,0.939671,5.0


In [11]:
def min_max_scaler(df: pl.DataFrame, columns: List) -> pl.DataFrame:
    for col in columns:
        col_min = df[col].min()
        col_max = df[col].max()
        df = df.with_columns(
            ((pl.col(col) - col_min) / ((col_max - col_min)+1e-10)).alias(col)
            )
    return df

In [12]:
temporal_features = ['day', 'date_sin', 'date_cos', 'time_id']
features = [f"feature_{i:02}" for i in range(79)]
cat_features = ['feature_09', 'feature_10', 'feature_11']
lag_features = [f"responder_{i}_lag_1" for i in range(9)]
other_features = ['weight',
                  'feature_07_lag',
                  'feature_06_lag',
                  'feature_60_lag',
                  'tag_11_mean',
                  'responder_6_lag_1_max'
                  ]
trend_features = features + other_features
trend_features = [x for x in trend_features if x not in cat_features]
targets = ['responder_6']

df_train = train.filter(pl.col('date_id')<1576).select(
    temporal_features+trend_features+targets+['date_id', 'symbol_id']
    ).collect()
df_train = min_max_scaler(df_train, temporal_features + trend_features)

df_valid = train.filter(pl.col('date_id')>1576).select(
    temporal_features+trend_features+targets+['date_id', 'symbol_id']
    ).collect()
df_valid = min_max_scaler(df_valid, temporal_features + trend_features)

In [13]:
def get_sequences(df, seq_length=8, stride=8):
    data = df.to_numpy()
    num_sequences = (len(data) - seq_length) // stride + 1
    sequences = np.zeros((num_sequences, seq_length, df.shape[1]))
    for i in range(num_sequences):
        start_idx = i * stride
        end_idx = start_idx + seq_length
        sequences[i, :, :] = data[start_idx:end_idx]
    return sequences

In [14]:
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    def __init__(self, df: pl.DataFrame, sequence_length=8, stride=8):
        self.df = df
        self.date_ids = df['date_id'].unique().to_list()
        self.sequence_length = sequence_length
        self.stride = stride

    def __len__(self):
        return len(self.date_ids)

    def __getitem__(self, index):
        date_id = self.date_ids[index]
        df_ = self.df.filter(pl.col('date_id') == date_id)

        temporal_data = []
        trend_data = []
        target_data = []

        for _, data in df_.group_by('symbol_id'):
            temporal_data.append(
                get_sequences(
                    data.select(temporal_features)[:-self.sequence_length],
                    seq_length=self.sequence_length,
                    stride=self.stride
                    )
            )
            trend_data.append(
                get_sequences(
                    data.select(trend_features)[:-self.sequence_length],
                    seq_length=self.sequence_length,
                    stride=self.stride
                    )
            )
            target_data.append(
                get_sequences(
                    data.select(targets)[self.sequence_length:],
                    seq_length=self.sequence_length,
                    stride=self.stride
                    )
            )

        temporal_data = np.vstack(temporal_data)
        trend_data = np.vstack(trend_data)
        target_data = np.vstack(target_data)

        return [temporal_data, trend_data], target_data

    @property
    def num_batches(self):
        return len(self)

train_generator = DataGenerator(df_train)
valid_generator = DataGenerator(df_valid)

In [15]:
for [x1, x2], y in train_generator:
  break
x2.shape

(4080, 8, 82)

In [16]:
# Adapted from
# https://keras.io/examples/generative/customer_lifetime_value/
keras.config.set_dtype_policy("mixed_float16")

def build_model(
    input_sequence_length: int,
    output_sequence_length: int,
    # num_symbols: int,
    d_model: int = 8,
    num_heads: int = 4,
    ):

    keras.utils.set_random_seed(seed=42)

    # Inputs
    temporal_inputs = layers.Input(
        shape=(input_sequence_length, 4), name="temporal_inputs"
    )
    trend_inputs = layers.Input(shape=(input_sequence_length, 82), name="trend_inputs")

    symbol_inputs = layers.Input(
        shape=(input_sequence_length,), dtype="int32", name="symbol_inputs"
    )

    # Process symbol features
    symbol_embedding = layers.Embedding(
        input_dim=40,
        output_dim=d_model,
        mask_zero=False,
        name="symbl_embedding",
    )(symbol_inputs)  # Output shape: (batch_size, 1, d_model)

    # Flatten the embedding output
    symbol_embedding = layers.Flatten(name="flatten_symbol_embedding")(
        symbol_embedding
    )


    temporal_projection = layers.Dense(
        d_model, activation="tanh", name="temporal_projection"
    )(temporal_inputs)

    transformer_output = temporal_projection
    for _ in range(2):
        transformer_output = TransformerEncoder(
            intermediate_dim=16, num_heads=num_heads
        )(transformer_output)

    lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs)

    transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")(
        transformer_output
    )
    transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened)

    # Concatenate flattened Transformer output with LSTM output
    merged_features = layers.Concatenate(name="concatenate_transformer_lstm")(
        [transformer_flattened, lstm_output]
    )

    # Repeat the merged features to match the output sequence length
    decoder_initial = layers.RepeatVector(
        output_sequence_length, name="repeat_merged_features"
    )(merged_features)

    decoder_lstm = layers.LSTM(
        units=64,
        return_sequences=True,
        recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4),
    )(decoder_initial)

    # Output Dense layer
    output = layers.Dense(units=1, activation="linear", name="output_dense")(
        decoder_lstm
    )

    model = Model(
        inputs=[temporal_inputs, trend_inputs], outputs=output
    )

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="mse",
        metrics=["mae"],
    )

    return model

In [17]:
model = build_model(
    input_sequence_length=8,
    output_sequence_length=8,
    # num_symbols: int,
    d_model=8,
    num_heads=4,
    )
model.summary()

In [18]:
# keras.utils.plot_model(model, "my_first_model_with_shape_info.png", show_shapes=True)

In [19]:
%%time
callback = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=3, restore_best_weights=True
        )

history = model.fit(
        train_generator,
        epochs=10,
        validation_data=(
            valid_generator
            ),
        callbacks=[callback],
        )

  self._warn_if_super_not_called()


Epoch 1/10
[1m554/555[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 252ms/step - loss: 0.7707 - mae: 0.5599

  self._warn_if_super_not_called()


[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 283ms/step - loss: 0.7705 - mae: 0.5599 - val_loss: 0.6456 - val_mae: 0.5347
Epoch 2/10
[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 131ms/step - loss: 0.7145 - mae: 0.5444 - val_loss: 0.6445 - val_mae: 0.5351
Epoch 3/10
[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 126ms/step - loss: 0.7080 - mae: 0.5435 - val_loss: 0.6437 - val_mae: 0.5340
Epoch 4/10
[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 129ms/step - loss: 0.6988 - mae: 0.5401 - val_loss: 0.6448 - val_mae: 0.5366
Epoch 5/10
[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 133ms/step - loss: 0.7087 - mae: 0.5435 - val_loss: 0.6436 - val_mae: 0.5345
Epoch 6/10
[1m555/555[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 134ms/step - loss: 0.7071 - mae: 0.5422 - val_loss: 0.6436 - val_mae: 0.5342
Epoch 7/

In [20]:
model.save("model_nn5.keras")

In [21]:
preds = model.predict(valid_generator).reshape(-1)

[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 109ms/step


In [22]:
preds.max()

3.303

In [23]:
preds.min()

-1.355

In [24]:
scores = []
start = 0
for i in range(len(valid_generator)):
    y_true = valid_generator[i][1].reshape(-1)
    y_pred = preds[start: start+len(y_true)]
    start = start + len(y_true)
    scores.append(r2_score(y_true, y_pred))

np.array(scores).mean()

-0.0029407748505858224