In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
import lightgbm as lgb

# Carregar dataset
df = pd.read_csv("data/dataset.csv")
df = df.drop(columns=["id"])

X = df.drop(columns=["target_variable"])
y = df["target_variable"]

# One-hot encoding per variables categòriques
X = pd.get_dummies(X, drop_first=True)

# Train/test split estratificat
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Escalat (opcional per LightGBM, però mantinc per coherència)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model base LightGBM
lgbm = lgb.LGBMClassifier(
    objective="binary",
    random_state=42,
    n_jobs=-1
)

# Distribució de paràmetres
param_dist = {
    "n_estimators": [300, 500, 800],
    "max_depth": [-1, 6, 8],
    "learning_rate": [0.03, 0.05, 0.1],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
    "reg_lambda": [0.5, 1, 2, 5],
    "min_child_samples": [20, 50, 100]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    lgbm,
    param_distributions=param_dist,
    n_iter=25,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

search.fit(X_train_scaled, y_train)
best_model = search.best_estimator_

print("Best CV F1:", search.best_score_)
print("Best params:", search.best_params_)

# Threshold tuning
y_val_proba = best_model.predict_proba(X_test_scaled)[:, 1]
thresholds = np.linspace(0.1, 0.9, 81)

best_f1, best_t = -1, None
for t in thresholds:
    y_val_pred = (y_val_proba >= t).astype(int)
    f1 = f1_score(y_test, y_val_pred)
    if f1 > best_f1:
        best_f1, best_t = f1, t

print("Best threshold:", best_t)
print("Test F1 at best threshold:", best_f1)

y_test_pred = (y_val_proba >= best_t).astype(int)
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1151
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091
[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1160
[LightGBM] [Info] Number of data points in




[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1202
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266







[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470491 -> initscore=-0.118174
[LightGBM] [Info] Start training from score -0.118174







[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1191
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091





[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091
[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[L



[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1202
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266





[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470491 -> initscore=-0.118174
[LightGBM] [Info] Start training from score -0.118174
[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1180
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 13
[LightGBM] [Info] 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1191
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1180
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266




[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266
[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1179
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 13
[LightGBM] [Info] 







[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091
[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] 



[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015227 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1202
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266
[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 15
[LightGBM] [Info] 







[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1151
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091





[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1161
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091

[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1150
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info




[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1160
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266




[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1152
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470491 -> initscore=-0.118174
[LightGBM] [Info] Start training from score -0.118174




[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1151
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091
[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1161
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] 



[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1150
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266











[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1160
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266
[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1152
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 12
[LightGBM] [Info] 




[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091




[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266




[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1202
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266
[LightGBM] [Info] Number of positive: 10810, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 22976, number of used features: 15
[LightGBM] [Info] 



[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1180
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470511 -> initscore=-0.118091
[LightGBM] [Info] Start training from score -0.118091
[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010585 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1191
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 14
[LightGBM] [Info] 







[LightGBM] [Info] Number of positive: 10809, number of negative: 12166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1190
[LightGBM] [Info] Number of data points in the train set: 22975, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.470468 -> initscore=-0.118266
[LightGBM] [Info] Start training from score -0.118266
No further splits with positive gain, best gain: -inf
No further splits with positive gain, best gain: -inf
No further splits with positive gain, best gain: -inf
No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Number of positive: 10810, number of negative: 12165
No further splits with positive gain, best gain: -inf
No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Auto-choosing col-wise multi-

Exception ignored on calling ctypes callback function <function _log_callback at 0x7fd89efd7600>:
Traceback (most recent call last):
  File "/home/backo/Documents/datathon-2025/.venv/lib64/python3.13/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function <function _log_callback at 0x7fdad888f600>:
Traceback (most recent call last):
  File "/home/backo/Documents/datathon-2025/.venv/lib64/python3.13/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function <function _log_callback at 0x7f1edfd43600>:
Traceback (most recent call last):
  File "/home/backo/Documents/datathon-2025/.venv/lib64/python3.13/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 
Exception ignored on calling



KeyboardInterrupt: 