In [2]:
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import joblib
import warnings

warnings.filterwarnings('ignore')



In [23]:
# Load the dataset
filename = "/Users/arthur/Documents/GITHUB/Rough_informer/Rough_Informer_for_High_freq_Order_Book/Sig_global_local_level_by_level_depth-2_row_10000.csv"
df = pd.read_csv(filename)

# Separate features and target
X = df.drop(columns=['y', 'time_step']).values  # Drop 'time_step' as it's metadata
y = df['y'].values

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Replace train_test_split with:
split_2 = int(len(df) * 0.8)
split_1 = int(len(df) * 0.6)
X_train, X_test, X_val = X_scaled[:split_1], X_scaled[split_1:split_2],X_scaled[split_2:]
y_train, y_test,y_val = y[:split_1], y[split_1:split_2], y[split_2:]

In [24]:
print("X_train", X_train.shape)
print("X_val", X_val.shape)
print("y_train", y_train.shape)
print("y_val", y_val.shape)

X_train (6000, 840)
X_val (2000, 840)
y_train (6000,)
y_val (2000,)


In [15]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
import warnings

warnings.filterwarnings('ignore')


In [25]:
# ===============================
# 2️⃣ Elastic Net Regression Model
# ===============================

# Initialize and train Elastic Net model
elastic_net = ElasticNet(alpha=0.3, l1_ratio=0.3, random_state=42)
elastic_net.fit(X_train, y_train)

# Make predictions
y_pred = elastic_net.predict(X_test)

# Calculate R² score
r2 = r2_score(y_test, y_pred)

# Print R² score
print(f"R² Score on Validation Set: {r2:.4f}")

R² Score on Validation Set: -0.0039


In [26]:

# ===============================
# 2️⃣ Hyperparameter Tuning with Hyperopt
# ===============================

param_space = {
    'boosting_type': hp.choice('boosting_type', ['dart']),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 0.6),
    'drop_rate': hp.uniform('drop_rate', 0.4, 0.9),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6]),
    'min_child_samples': hp.randint('min_child_samples', 20, 70),
    'min_child_weight': hp.uniform('min_child_weight', 0.01, 0.05),
    'n_estimators': hp.choice('n_estimators', [100, 200, 300,400,500,600]),
    'objective': 'regression',
    'skip_drop': hp.uniform('skip_drop', 0.3, 0.9),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
}

def train_lgb_model(params, X_train, y_train, X_test, y_test, seed=42):
    """Train LightGBM and return (model, R^2, RMSE) on validation."""
    model = lgb.LGBMRegressor(**params)
    
    
    # Correct way to pass early stopping
    model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='l2',  # Use MSE for early stopping
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
    
    y_test_pred = model.predict(X_test)
    r2_test = r2_score(y_test, y_test_pred)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    
    return model, r2_test, rmse_test


def objective_func(space):
    _, r2_test, rmse_test = train_lgb_model(space, X_train, y_train, X_test, y_test)
    return {'loss': 1.0 - r2_test, 'status': STATUS_OK}

# Hyperparameter Optimization
trials = Trials()
best_params = fmin(
    fn=objective_func,
    space=param_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42)
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

[1]	valid_0's l2: 0.571026                            
[2]	valid_0's l2: 0.570607                            
[3]	valid_0's l2: 0.57053                             
[4]	valid_0's l2: 0.570422                            
[5]	valid_0's l2: 0.570161                            
[6]	valid_0's l2: 0.569941                            
[7]	valid_0's l2: 0.570033                            
[8]	valid_0's l2: 0.569767                            
[9]	valid_0's l2: 0.569614                            
[10]	valid_0's l2: 0.569576                           
[11]	valid_0's l2: 0.569539                           
[12]	valid_0's l2: 0.569597                           
[13]	valid_0's l2: 0.56946                            
[14]	valid_0's l2: 0.569511                           
[15]	valid_0's l2: 0.569401                           
[16]	valid_0's l2: 0.56916                            
[17]	valid_0's l2: 0.568419                           
[18]	valid_0's l2: 0.568349                           
[19]	valid

In [27]:
print(best_params)

{'boosting_type': 0, 'colsample_bytree': 0.5311625437141356, 'drop_rate': 0.42600232892666556, 'learning_rate': 0.012219226125458085, 'max_depth': 0, 'min_child_samples': 66, 'min_child_weight': 0.02963271633472271, 'n_estimators': 3, 'reg_alpha': 0.7530962640046979, 'reg_lambda': 0.3804290123331761, 'skip_drop': 0.3781579432088977, 'subsample': 0.5348621215669586}


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import joblib
import warnings

warnings.filterwarnings('ignore')

# ===============================
# 1️⃣ Data Loading & Preprocessing
# ===============================

# Load the dataset
filename = "/Users/arthur/Documents/GITHUB/Rough_informer/Rough_Informer_for_High_freq_Order_Book/final_dataset_depth-2_row_10_000.csv"
df = pd.read_csv(filename)

# Separate features and target
X = df.drop(columns=['y', 'time_step']).values  # Drop 'time_step' as it's metadata
y = df['y'].values

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Replace train_test_split with:
split_idx = int(len(df) * 0.8)
X_train, X_val = X_scaled[:split_idx], X_scaled[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]
print(X.shape)


(10000, 1260)


In [18]:
print("X_train", X_train.shape)
print("X_val", X_val.shape)
print("y_train", y_train.shape)
print("y_val", y_val.shape)

X_train (8000, 1260)
X_val (2000, 1260)
y_train (8000,)
y_val (2000,)


In [8]:


# ===============================
# 3️⃣ Final Model Training
# ===============================



# Train the final model
final_model, final_r2, final_rmse = train_lgb_model(best_params, X_train, y_train, X_val, y_val)

# Save the model
joblib.dump(final_model, 'lgb_model_final.joblib')

# ===============================
# 4️⃣ Evaluation
# ===============================

print(f"Final R² on Validation Set: {final_r2:.4f}")
print(f"Final RMSE on Validation Set: {final_rmse:.4f}")


[1]	valid_0's l2: 0.477084
[2]	valid_0's l2: 0.477084
[3]	valid_0's l2: 0.477084
[4]	valid_0's l2: 0.477084
[5]	valid_0's l2: 0.477084
[6]	valid_0's l2: 0.477084
[7]	valid_0's l2: 0.477084
[8]	valid_0's l2: 0.477084
[9]	valid_0's l2: 0.477084
[10]	valid_0's l2: 0.477084
[11]	valid_0's l2: 0.477084
[12]	valid_0's l2: 0.477084
[13]	valid_0's l2: 0.477084
[14]	valid_0's l2: 0.477084
[15]	valid_0's l2: 0.477084
[16]	valid_0's l2: 0.477084
[17]	valid_0's l2: 0.477084
[18]	valid_0's l2: 0.477084
[19]	valid_0's l2: 0.477084
[20]	valid_0's l2: 0.477084
[21]	valid_0's l2: 0.477084
[22]	valid_0's l2: 0.477084
[23]	valid_0's l2: 0.477084
[24]	valid_0's l2: 0.477084
[25]	valid_0's l2: 0.477084
[26]	valid_0's l2: 0.477084
[27]	valid_0's l2: 0.477084
[28]	valid_0's l2: 0.477084
[29]	valid_0's l2: 0.477084
[30]	valid_0's l2: 0.477084
[31]	valid_0's l2: 0.477084
[32]	valid_0's l2: 0.477084
[33]	valid_0's l2: 0.477084
[34]	valid_0's l2: 0.477084
[35]	valid_0's l2: 0.477084
[36]	valid_0's l2: 0.477084
[