In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp310-cp310-win_amd64.whl (102.5 MB)
     ---------------------------------------- 0.0/102.5 MB ? eta -:--:--
     ---------------------------------------- 0.1/102.5 MB 2.0 MB/s eta 0:00:53
     ---------------------------------------- 0.6/102.5 MB 6.0 MB/s eta 0:00:18
      -------------------------------------- 1.5/102.5 MB 11.6 MB/s eta 0:00:09
      -------------------------------------- 2.3/102.5 MB 13.5 MB/s eta 0:00:08
     - ------------------------------------- 3.0/102.5 MB 14.6 MB/s eta 0:00:07
     - ------------------------------------- 3.2/102.5 MB 14.7 MB/s eta 0:00:07
     - ------------------------------------- 4.3/102.5 MB 16.0 MB/s eta 0:00:07
     - ------------------------------------- 4.3/102.5 MB 16.0 MB/s eta 0:00:07
     - ------------------------------------- 4.6/102.5 MB 12.8 MB/s eta 0:00:08
     -- ------------------------------------ 5.7/102.5 MB 14.1 MB/s eta 0:00:07
     -- ---------------------------------


[notice] A new release of pip is available: 23.0.1 -> 25.2
[notice] To update, run: C:\Users\Aryan\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
import optuna
import warnings

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [4]:
try:
    print ("\nLoading the data")
    train_df = pd.read_csv("C:/Users/Aryan/Downloads/Quant_Challenge_2025/train.csv")
    test_df = pd.read_csv("C:/Users/Aryan/Downloads/Quant_Challenge_2025/test.csv")
    print("data loaded")
except FileNotFoundError as e:
    print(f"Error: {e}")
    exit()


Loading the data
data loaded


In [5]:
combined_df = pd.concat([train_df.drop(['Y1', 'Y2'], axis=1), test_df], ignore_index=True)
features = [col for col in train_df.columns if col not in ['time', 'Y1', 'Y2']]

for feature in features:
    # Lags
    combined_df[f'{feature}_lag1'] = combined_df[feature].shift(1)
    # Multiple Rolling Windows
    for window in [10, 30]:
        combined_df[f'{feature}_rolling_mean{window}'] = combined_df[feature].rolling(window=window).mean()
        combined_df[f'{feature}_rolling_std{window}'] = combined_df[feature].rolling(window=window).std()
    # Exponentially Weighted Mean
    combined_df[f'{feature}_ewm10'] = combined_df[feature].ewm(span=10, adjust=False).mean()

# Interaction Features (based on previous feature importance analysis)
combined_df['C_minus_A'] = combined_df['C'] - combined_df['A']
combined_df['J_div_M'] = combined_df['J'] / (combined_df['M'] + 1e-6)

train_processed_df = combined_df.iloc[:len(train_df)].copy()
test_processed_df = combined_df.iloc[len(train_df):].copy()
train_processed_df['Y1'] = train_df['Y1']
train_processed_df['Y2'] = train_df['Y2']

# Fill NaNs created by time-series features
train_processed_df.bfill(inplace=True); train_processed_df.ffill(inplace=True)

In [6]:
features_to_use = [col for col in train_processed_df.columns if col not in ['id', 'time', 'Y1', 'Y2']]
X_train = train_processed_df[features_to_use]
y1_train = train_processed_df['Y1']
y2_train = train_processed_df['Y2']
X_test = test_processed_df[features_to_use]
X_test.bfill(inplace=True); X_test.ffill(inplace=True)

In [7]:
def objective(trial, X, y):
    params = {
        'objective': 'regression_l1',
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.04),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.1, 2.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.1, 2.0),
        'verbose': -1, 'n_jobs': -1
    }
    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for train_index, val_index in tscv.split(X):
        X_t, X_v, y_t, y_v = X.iloc[train_index], X.iloc[val_index], y.iloc[train_index], y.iloc[val_index]
        model = lgb.LGBMRegressor(**params)
        model.fit(X_t, y_t, eval_set=[(X_v, y_v)], eval_metric='r2', callbacks=[lgb.early_stopping(100, verbose=False)])
        scores.append(r2_score(y_v, model.predict(X_v)))
    return np.mean(scores)

# Run study for Y1
print("Optimizing for Y1...")
study_y1 = optuna.create_study(direction='maximize')
study_y1.optimize(lambda trial: objective(trial, X_train, y1_train), n_trials=75) # Increase trials for best results
lgb_params_y1 = study_y1.best_params
print(f"Best R² for Y1 (CV): {study_y1.best_value:.4f}")

# Run study for Y2
print("Optimizing for Y2...")
study_y2 = optuna.create_study(direction='maximize')
study_y2.optimize(lambda trial: objective(trial, X_train, y2_train), n_trials=75)
lgb_params_y2 = study_y2.best_params
print(f"Best R² for Y2 (CV): {study_y2.best_value:.4f}")

Optimizing for Y1...
Best R² for Y1 (CV): 0.7260
Optimizing for Y2...
Best R² for Y2 (CV): 0.6503


In [8]:
lgb_params_y1.update({'objective': 'regression_l1', 'n_estimators': 2000, 'verbose': -1, 'n_jobs': -1})
lgb_params_y2.update({'objective': 'regression_l1', 'n_estimators': 2000, 'verbose': -1, 'n_jobs': -1})
xgb_params = {'objective':'reg:squarederror','n_estimators':1500,'learning_rate':0.03,'max_depth':5, 'n_jobs': -1}
cat_params = {'iterations': 2000, 'learning_rate': 0.03, 'depth': 5, 'loss_function': 'RMSE', 'verbose': 0}

# Train all models for Y1 and Y2
models_y1 = {
    'lgbm': lgb.LGBMRegressor(**lgb_params_y1).fit(X_train, y1_train),
    'xgb': xgb.XGBRegressor(**xgb_params).fit(X_train, y1_train),
    'cat': CatBoostRegressor(**cat_params).fit(X_train, y1_train),
    'ridge': Ridge().fit(X_train, y1_train)
}
models_y2 = {
    'lgbm': lgb.LGBMRegressor(**lgb_params_y2).fit(X_train, y2_train),
    'xgb': xgb.XGBRegressor(**xgb_params).fit(X_train, y2_train),
    'cat': CatBoostRegressor(**cat_params).fit(X_train, y2_train),
    'ridge': Ridge().fit(X_train, y2_train)
}
print("All base models trained.")


All base models trained.


In [9]:
train_preds_y1 = {name: model.predict(X_train) for name, model in models_y1.items()}
test_preds_y1 = {name: model.predict(X_test) for name, model in models_y1.items()}
train_preds_y2 = {name: model.predict(X_train) for name, model in models_y2.items()}
test_preds_y2 = {name: model.predict(X_test) for name, model in models_y2.items()}

# Create meta-model features
X_train_meta_y1 = pd.DataFrame(train_preds_y1)
X_test_meta_y1 = pd.DataFrame(test_preds_y1)
X_train_meta_y2 = pd.DataFrame(train_preds_y2)
X_test_meta_y2 = pd.DataFrame(test_preds_y2)

# Train meta-models
meta_model_y1 = Ridge().fit(X_train_meta_y1, y1_train)
stacked_preds_y1 = meta_model_y1.predict(X_test_meta_y1)
meta_model_y2 = Ridge().fit(X_train_meta_y2, y2_train)
stacked_preds_y2 = meta_model_y2.predict(X_test_meta_y2)
print("Meta-models trained.")

Meta-models trained.


In [10]:
y1_min, y1_max = train_df['Y1'].min(), train_df['Y1'].max()
final_preds_y1 = np.clip(stacked_preds_y1, y1_min, y1_max)
y2_min, y2_max = train_df['Y2'].min(), train_df['Y2'].max()
final_preds_y2 = np.clip(stacked_preds_y2, y2_min, y2_max)

# Create submission file
submission = pd.DataFrame({'id': test_df['id'], 'Y1': final_preds_y1, 'Y2': final_preds_y2})
submission.to_csv('submission_championship.csv', index=False)
print("\nSuccess! 'submission_championship.csv' created.")


Success! 'submission_championship.csv' created.
