In [2]:
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import joblib
import warnings

warnings.filterwarnings('ignore')

# ===============================
# 1️⃣ Data Loading & Preprocessing
# ===============================

# Load the dataset
filename = "/Users/arthur/Documents/STUDY/Imperial/rough paths /salvi notebook/Sig_global_local_level_by_level_depth_{max_depths}.csv"
df = pd.read_csv(filename)

# Separate features and target
X = df.drop(columns=['y', 'time_step']).values  # Drop 'time_step' as it's metadata
y = df['y'].values

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [4]:
print("X_train", X_train.shape)
print("X_val", X_val.shape)
print("y_train", y_train.shape)
print("y_val", y_val.shape)

X_train (80, 1260)
X_val (20, 1260)
y_train (80,)
y_val (20,)


In [5]:

# ===============================
# 2️⃣ Hyperparameter Tuning with Hyperopt
# ===============================

param_space = {
    'boosting_type': hp.choice('boosting_type', ['dart']),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 0.6),
    'drop_rate': hp.uniform('drop_rate', 0.4, 0.9),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.001), np.log(0.1)),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6]),
    'min_child_samples': hp.randint('min_child_samples', 500, 3001),
    'min_child_weight': hp.uniform('min_child_weight', 0.01, 0.05),
    'n_estimators': hp.choice('n_estimators', [500, 1000, 1500]),
    'objective': 'regression',
    'skip_drop': hp.uniform('skip_drop', 0.5, 0.9),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
}

def train_lgb_model(params, X_train, y_train, X_val, y_val, seed=42):
    """Train LightGBM and return (model, R^2, RMSE) on validation."""
    model = lgb.LGBMRegressor(
        boosting_type=params['boosting_type'],
        colsample_bytree=params['colsample_bytree'],
        drop_rate=params['drop_rate'],
        learning_rate=params['learning_rate'],
        max_depth=params['max_depth'],
        min_child_samples=params['min_child_samples'],
        min_child_weight=params['min_child_weight'],
        n_estimators=params['n_estimators'],
        objective=params['objective'],
        skip_drop=params['skip_drop'],
        subsample=params['subsample'],
        random_state=seed,
        n_jobs=-2,
        verbosity=-1
    )
    
    # Correct way to pass early stopping
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    
    y_val_pred = model.predict(X_val)
    r2_val = r2_score(y_val, y_val_pred)
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
    
    return model, r2_val, rmse_val


def objective_func(space):
    _, r2_val, rmse_val = train_lgb_model(space, X_train, y_train, X_val, y_val)
    return {'loss': 1.0 - r2_val, 'status': STATUS_OK}

# Hyperparameter Optimization
trials = Trials()
best_params = fmin(
    fn=objective_func,
    space=param_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(42)
)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

[1]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[2]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[3]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[4]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[5]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[6]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[7]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[8]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[9]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289   
[10]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[11]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[12]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[13]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[14]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[15]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[16]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[17]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[18]	valid_0's rmse: 0.660522	valid_0's l2: 0.436289  
[19]	valid

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
import joblib
import warnings

warnings.filterwarnings('ignore')

# ===============================
# 1️⃣ Data Loading & Preprocessing
# ===============================

# Load the dataset
filename = "/Users/arthur/Documents/STUDY/Imperial/rough paths /salvi notebook/Sig_global_local_level_by_level_depth_{max_depths}.csv"
df = pd.read_csv(filename)

# Separate features and target
X = df.drop(columns=['y', 'time_step']).values  # Drop 'time_step' as it's metadata
y = df['y'].values

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [3]:


# ===============================
# 3️⃣ Final Model Training
# ===============================

# Map categorical choices
best_params['boosting_type'] = 'dart'
best_params['objective'] = 'regression'
best_params['max_depth'] = [3, 4, 5, 6][best_params['max_depth']]
best_params['n_estimators'] = [500, 1000, 1500][best_params['n_estimators']]

# Train the final model
final_model, final_r2, final_rmse = train_lgb_model(best_params, X_train, y_train, X_val, y_val)

# Save the model
joblib.dump(final_model, 'lgb_model_final.joblib')

# ===============================
# 4️⃣ Evaluation
# ===============================

print(f"Final R² on Validation Set: {final_r2:.4f}")
print(f"Final RMSE on Validation Set: {final_rmse:.4f}")


NameError: name 'best_params' is not defined