In [1]:
import os
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error, r2_score
from scipy.stats import pearsonr
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import wandb

In [2]:
load_dotenv()
os.environ["WANDB_API_KEY"] = os.getenv("WANDB_API_KEY")
os.environ["WANDB_DIR"] = "../data/wandb_logs"
wandb.login(key=os.environ["WANDB_API_KEY"])

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/brupesh/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbrupeshmit[0m ([33mbrupeshmit-massachusetts-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
train = pd.read_parquet("../data/train_clean_v2.parquet")
test = pd.read_parquet("../data/test_clean_v2.parquet")
target_col = "label"
y = train[target_col]
X = train.drop(columns=[target_col])

test_ids = test.iloc[:, 0]
test = test.drop(columns=[test.columns[0]])

feature_df = pd.read_csv("../data/resources/lgbm_feature_importance.csv")
top_features = feature_df.sort_values(by="importance", ascending=False)["feature"].iloc[:400].tolist()
X = X[top_features]
test = test[top_features]

In [4]:
params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "device": "gpu",
    "gpu_platform_id": 0,
    "gpu_device_id": 0,
    "verbosity": -1
}

params.update({
    "learning_rate": 0.01,
    "num_leaves": 63,
    "max_depth": 6,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l2": 1.0,
    "lambda_l1": 0.1,
})

wandb.init(
    project="kaggle-drw-crypto",
    name="lightgbm_default_with_regularization",
    config=params
)

tscv = TimeSeriesSplit(n_splits=5)
rmse_scores = []
overfit_scores = []

In [5]:
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dval],
        # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
        callbacks=[lgb.log_evaluation(100)]
    )

    preds_val = model.predict(X_val)
    preds_train = model.predict(X_train)

    try:
        pearson_val = pearsonr(y_val, preds_val)[0]
    except:
        pearson_val = 0.0
        
    rmse_val = root_mean_squared_error(y_val, preds_val)
    rmse_train = root_mean_squared_error(y_train, preds_train)
    r2_val = r2_score(y_val, preds_val)

    overfit_score = rmse_val - rmse_train

    rmse_scores.append(rmse_val)
    overfit_scores.append(overfit_score)

    wandb.log({
        f"fold_{fold}_rmse": rmse_val,
        f"fold_{fold}_overfit": overfit_score,
        f"fold_{fold}_pearson": pearson_val
    })

    print(f"Fold {fold} Pearson correlation: {pearson_val:.4f}")


[100]	valid_0's rmse: 0.999795
[200]	valid_0's rmse: 1.01578
[300]	valid_0's rmse: 1.03035
[400]	valid_0's rmse: 1.04147
[500]	valid_0's rmse: 1.04975
[600]	valid_0's rmse: 1.05612
[700]	valid_0's rmse: 1.06176
[800]	valid_0's rmse: 1.06596
[900]	valid_0's rmse: 1.06959
[1000]	valid_0's rmse: 1.07384
Fold 0 Pearson correlation: 0.0580
[100]	valid_0's rmse: 1.02459
[200]	valid_0's rmse: 1.0409
[300]	valid_0's rmse: 1.05579
[400]	valid_0's rmse: 1.06725
[500]	valid_0's rmse: 1.07702
[600]	valid_0's rmse: 1.08263
[700]	valid_0's rmse: 1.08815
[800]	valid_0's rmse: 1.09398
[900]	valid_0's rmse: 1.09839
[1000]	valid_0's rmse: 1.10166
Fold 1 Pearson correlation: 0.0496
[100]	valid_0's rmse: 1.02854
[200]	valid_0's rmse: 1.05699
[300]	valid_0's rmse: 1.08161
[400]	valid_0's rmse: 1.09273
[500]	valid_0's rmse: 1.10366
[600]	valid_0's rmse: 1.11343
[700]	valid_0's rmse: 1.12562
[800]	valid_0's rmse: 1.13426
[900]	valid_0's rmse: 1.13972
[1000]	valid_0's rmse: 1.14481
Fold 2 Pearson correlation:

In [6]:
avg_rmse = np.mean(rmse_scores)
avg_overfit = np.mean(overfit_scores)

wandb.log({
    "avg_rmse": avg_rmse,
    "avg_overfit": avg_overfit
})

print(f"Average RMSE: {avg_rmse:.6f}")
print(f"Average Overfit Score (val - train RMSE): {avg_overfit:.6f}")

Average RMSE: 1.125465
Average Overfit Score (val - train RMSE): 0.645703


In [7]:
dtrain_full = lgb.Dataset(X, label=y)
final_model = lgb.train(
    params,
    dtrain_full,
    num_boost_round=1000,
    valid_sets=[dtrain_full],
    callbacks=[lgb.log_evaluation(100)]
)

final_model.save_model("../data/models/lightgbm_default_ts_model.txt")
preds = final_model.predict(test)
pd.DataFrame({"id": test_ids, "prediction": preds}).to_csv("../data/preds/lightgbm_default_ts_preds.csv", index=False)

wandb.finish()

[100]	training's rmse: 0.940859
[200]	training's rmse: 0.886913
[300]	training's rmse: 0.841394
[400]	training's rmse: 0.800667
[500]	training's rmse: 0.766387
[600]	training's rmse: 0.738115
[700]	training's rmse: 0.71185
[800]	training's rmse: 0.687675
[900]	training's rmse: 0.667356
[1000]	training's rmse: 0.647941


0,1
avg_overfit,▁
avg_rmse,▁
fold_0_overfit,▁
fold_0_pearson,▁
fold_0_rmse,▁
fold_1_overfit,▁
fold_1_pearson,▁
fold_1_rmse,▁
fold_2_overfit,▁
fold_2_pearson,▁

0,1
avg_overfit,0.6457
avg_rmse,1.12547
fold_0_overfit,0.76184
fold_0_pearson,0.05797
fold_0_rmse,1.07384
fold_1_overfit,0.67498
fold_1_pearson,0.04962
fold_1_rmse,1.10166
fold_2_overfit,0.64814
fold_2_pearson,0.03958


In [8]:
# import pandas as pd
# import numpy as np
# import lightgbm as lgb

# def generate_submission(top_n_features: int, model_dir="../data", feature_importance_path="lgbm_feature_importance.csv", test_path="../data/test_clean_v2.parquet"):
    
#     model_path = f"{model_dir}/lightgbm_top{top_n_features}_features.txt"
#     model = lgb.Booster(model_file=model_path)

#     top_features = feat_importance_df.sort_values("importance", ascending=False).head(top_n_features)["feature"].tolist()
    
#     preds = model.predict(test_df[top_features], num_iteration=model.best_iteration)

#     submission = pd.DataFrame({
#         "ID": np.arange(1, len(test_df) + 1),  # 1-indexed IDs oops mine were 0-indexed
#         "prediction": preds
#     })

#     sub_path = f"{model_dir}/lightgbm_top{top_n_features}_submission.csv"
#     submission.to_csv(sub_path, index=False)
#     print(f"Saved submission to {sub_path}")

#     return submission

In [9]:
# import matplotlib.pyplot as plt
# n_features, pearsons = zip(*results)
# plt.figure()
# plt.plot(n_features, pearsons)
# plt.xlabel("Number of Features")
# plt.ylabel("Pearson Correlation")
# plt.title("Pearson vs. Number of Top Features")
# plt.grid(True)
# # plt.savefig("../data/resources/pearson_vs_n_features.png")
# plt.show()

In [10]:
# best_model.save_model(f"../data/lightgbm_top{best_n}_features.txt")

In [11]:
# generate_submission(400)

In [12]:
# test