In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
import yaml
import importlib
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

import dataprocessing
import utils

In [31]:
importlib.reload(dataprocessing)

<module 'dataprocessing' from '/home/alexandre-tonon/SDD/Hackathons/Hackathon_Heart_Rate_Los_Tigros/dataprocessing.py'>

In [1]:
import pandas as pd

# 2024 data
xpt_file = "data/LLCP2024.XPT"

df = pd.read_sas(xpt_file, format="xport", encoding="utf-8")
print(df.head())

   _STATE  FMONTH     IDATE IMONTH IDAY IYEAR  DISPCODE       SEQNO  \
0     1.0     2.0  02282024     02   28  2024    1100.0  2024000001   
1     1.0     2.0  02212024     02   21  2024    1100.0  2024000002   
2     1.0     2.0  02212024     02   21  2024    1100.0  2024000003   
3     1.0     2.0  02282024     02   28  2024    1100.0  2024000004   
4     1.0     2.0  02212024     02   21  2024    1100.0  2024000005   

           _PSU  CTELENM1  ...  _LCSCTSN  _LCSPSTF  DRNKANY6      DROCDY4_  \
0  2.024000e+09       1.0  ...       NaN       9.0       2.0  5.397605e-79   
1  2.024000e+09       1.0  ...       4.0       9.0       2.0  5.397605e-79   
2  2.024000e+09       1.0  ...       4.0       2.0       1.0  1.000000e+02   
3  2.024000e+09       1.0  ...       NaN       9.0       2.0  5.397605e-79   
4  2.024000e+09       1.0  ...       3.0       9.0       2.0  5.397605e-79   

   _RFBING6      _DRNKWK3  _RFDRHV9  _FLSHOT7  _PNEUMO3  _AIDTST4  
0       1.0  5.397605e-79       1.0 

In [3]:
train = pd.read_csv("data/train.csv")

In [None]:
def compute_target(sample):
    if not sample["CVDINFR4"] or not sample["CVDCRHD4"]:
        return 0
    if sample["CVDINFR4"] == 1 or sample["CVDCRHD4"] == 1:
        return 1
    else:
        return 0
       

df["TARGET"] = df.apply(compute_target, axis=1)


In [5]:
df.to_csv("data/train2.csv", index=False)

In [25]:
import xgboost as xgb
import pandas as pd

df = pd.read_csv("data/train2.csv")

import yaml
with open("features.yaml", "r") as f:
    config = yaml.safe_load(f)

selected_columns = config["features"]
selected_columns = [col for col in selected_columns if col in df.columns]
selected_columns.append('TARGET')
df = df[selected_columns]

In [26]:
def data_processing(
    df: pd.DataFrame,
    test: bool = False,
    feature_info=None,            # laissé pour compatibilité (non utilisé ici)
    scaler: StandardScaler | None = None,
    config_path: str = "features.yaml",
    codebook_html: str = "data/USCODE22_LLCP_102523.HTML"
):
    """
    Mode entraînement (test=False) -> retourne (X, y, features_99, scaler)
    Mode test (test=True)         -> retourne X (transformé avec scaler fourni)
    """
    # 1) Config + codebook
    config, codebook = dataprocessing._load_config_and_codebook(config_path, codebook_html)
    features = config['features']

    # 2) Sélection colonnes
    X = df.copy()

    # 3) Typage features via codebook
    features_classification = dataprocessing._classify_features(codebook, features)

    # 4) Nettoyage / encodage
    X, continuous_columns = dataprocessing._clean_and_engineer(X, features_classification)

    # 5) Scaling
    if test:
        if scaler is not None:
            X[continuous_columns] = scaler.transform(X[continuous_columns])

        if feature_info is not None:
            feature_info.remove('ID' if 'ID' in feature_info else None)
            X = X[feature_info]
            
        return X
    else:
        # y et fit scaler
        if 'TARGET' not in df.columns:
            raise ValueError("Colonne TARGET absente du DataFrame en mode entraînement.")
        y = df['TARGET'].astype(int)

        if scaler is not None:
            X[continuous_columns] = scaler.fit_transform(X[continuous_columns])

        return X, y, scaler

In [27]:
df = df[selected_columns]
Xtrain, ytrain, scaler = data_processing(df, test=False, scaler=Normalizer())

In [31]:
import xgboost as xgb

params ={'learning_rate': 0.12072635856604813, 
         'max_depth': 7,
        'min_child_weight': 52,
        'threshold': 0.18900926428900747,
        'num_boost_round': 109}


models_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': params['learning_rate'],
    'max_depth': params['max_depth'],
    'min_child_weight': params['min_child_weight'],
    'seed': 42,
}

Xtrain.drop('TARGET', axis=1, inplace=True)

Xtrain_train, Xtrain_val, ytrain_train, ytrain_val = train_test_split(Xtrain, ytrain, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(Xtrain_train, label=ytrain_train)
dval = xgb.DMatrix(Xtrain_val, label=ytrain_val)
dtest = xgb.DMatrix(Xtrain, label=ytrain)

model_xgb = xgb.train(models_params, dtrain, num_boost_round=params['num_boost_round'])
predictions_val = model_xgb.predict(dval)
predictions_val_binary = (predictions_val > params['threshold']).astype(int)

f1 = f1_score(ytrain_val, predictions_val_binary)
print(f"F1-score on validation set: {f1}")

F1-score on validation set: 0.4205701791270709


In [39]:
import optuna

def objective_mano(trial):
    threshold = trial.suggest_float('threshold', 0.1, 0.3)

    params_mano ={
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'seed': 42,
    }

    model = xgb.train(params_mano, dtrain, num_boost_round=params['num_boost_round'])
    predictions = model.predict(dval)
    predictions_binary = (predictions > threshold).astype(int)

    f1 = f1_score(ytrain_val, predictions_binary)
    return f1

study_mano = optuna.create_study(direction='maximize')
study_mano.optimize(objective_mano, n_trials=50)

[I 2025-11-05 14:02:25,799] A new study created in memory with name: no-name-0d6bd350-2bc4-4ad3-8f41-463e3282c55a
[I 2025-11-05 14:02:28,931] Trial 0 finished with value: 0.41622209049594944 and parameters: {'threshold': 0.2320215415678009, 'learning_rate': 0.27115112292925025, 'max_depth': 7, 'min_child_weight': 91}. Best is trial 0 with value: 0.41622209049594944.
[I 2025-11-05 14:02:31,415] Trial 1 finished with value: 0.4250605435465064 and parameters: {'threshold': 0.20511387956298652, 'learning_rate': 0.10218879740479005, 'max_depth': 6, 'min_child_weight': 73}. Best is trial 1 with value: 0.4250605435465064.
[I 2025-11-05 14:02:34,035] Trial 2 finished with value: 0.3892431033394356 and parameters: {'threshold': 0.29642849684126094, 'learning_rate': 0.08204271259654036, 'max_depth': 6, 'min_child_weight': 2}. Best is trial 1 with value: 0.4250605435465064.
[I 2025-11-05 14:02:35,536] Trial 3 finished with value: 0.4237271945744579 and parameters: {'threshold': 0.2192058479303605

In [41]:
extra = pd.read_csv("data/extra.csv")
target = extra['TARGET']
best_params_mano = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': study_mano.best_params['learning_rate'],
    'max_depth': study_mano.best_params['max_depth'],
    'min_child_weight': study_mano.best_params['min_child_weight'],
    'num_boost_round': params['num_boost_round'],
    'threshold': study_mano.best_params['threshold']
}

columns = [c for c in Xtrain.columns]
columns.append("ID")

extra = extra[columns]
Xextra = data_processing(extra, test=True, scaler=scaler, feature_info=columns)
dextra = xgb.DMatrix(Xextra)
model = xgb.train(best_params_mano, dtrain, num_boost_round=best_params_mano['num_boost_round'])

predictions_extra = model.predict(dextra)
predictions_extra_binary = (predictions_extra > best_params_mano['threshold']).astype(int)
f1_extra = f1_score(target, predictions_extra_binary)
print(f"F1-score on extra set: {f1_extra}")

Parameters: { "num_boost_round", "threshold" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


F1-score on extra set: 0.19070794745611655


In [None]:
test = pd.read_csv("data/test.csv")
columns = [c for c in Xtrain.columns]

columns.append("ID")
test = test[columns]
Xtest = data_processing(test, test=True, scaler=scaler, feature_info=columns)

dtest_final = xgb.DMatrix(Xtest)
predictions_test = model_xgb.predict(dtest_final)
predictions_test_binary = (predictions_test > params['threshold']).astype(int)

submission = pd.DataFrame({
    "ID": test["ID"],
    "TARGET": predictions_test_binary
})

submission.to_csv("submission_2024.csv", index=False)