### Importation des données

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import seaborn as sns
import yaml
import importlib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import xgboost as xgb

import dataprocessing
import utils

In [2]:
importlib.reload(utils)
importlib.reload(dataprocessing)

<module 'dataprocessing' from '/home/alexandre-tonon/SDD/Hackathons/Hackathon_Heart_Rate_Los_Tigros/dataprocessing.py'>

In [3]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

### Data Processing

In [4]:
def data_processing(
    df: pd.DataFrame,
    test: bool = False,
    feature_info=None,            # laissé pour compatibilité (non utilisé ici)
    scaler: StandardScaler | None = None,
    config_path: str = "features.yaml",
    codebook_html: str = "data/USCODE22_LLCP_102523.HTML"
):
    """
    Mode entraînement (test=False) -> retourne (X, y, features_99, scaler)
    Mode test (test=True)         -> retourne X (transformé avec scaler fourni)
    """
    # 1) Config + codebook
    config, codebook = dataprocessing._load_config_and_codebook(config_path, codebook_html)
    features = config['features']

    # 2) Sélection colonnes
    X = df[features].copy()

    # 3) Typage features via codebook
    features_classification = dataprocessing._classify_features(codebook, features)

    # 4) Nettoyage / encodage
    X, continuous_columns = dataprocessing._clean_and_engineer(X, features_classification)

    # 5) Scaling
    if test:
        if scaler is None:
            raise ValueError("En mode test, vous devez fournir un scaler entraîné.")
        X[continuous_columns] = scaler.transform(X[continuous_columns])

        if feature_info is not None:
            X = X[feature_info]
            
        return X
    else:
        # y et fit scaler
        if 'TARGET' not in df.columns:
            raise ValueError("Colonne TARGET absente du DataFrame en mode entraînement.")
        y = df['TARGET'].astype(int)

        fitted_scaler = StandardScaler()
        X[continuous_columns] = fitted_scaler.fit_transform(X[continuous_columns])

        return X, y, fitted_scaler

In [5]:
df_train_processed, y_train, scaler = data_processing(df=df_train, 
                                                      scaler=None,
                                                      test=False)

In [6]:
df_train_final = df_train_processed.copy()

### Scoring

In [7]:
X_train_train, X_val_train, y_train_split, y_val_train = train_test_split(
    df_train_final, y_train, test_size=0.2, random_state=42)

dtrain_scoring = xgb.DMatrix(data=X_train_train, label=y_train_split)
dtest_scoring = xgb.DMatrix(data=X_val_train, label=y_val_train)

In [12]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

threshold = 0.15

model_xgb = xgb.train(params, dtrain_scoring, num_boost_round=100)

In [13]:
predictions_val = model_xgb.predict(dtest_scoring)
predictions_val_binary = (predictions_val > threshold).astype(int)

f1_score_train = f1_score(y_val_train, predictions_val_binary, average=None)
print("F1 Score (Validation):", f1_score_train)

F1 Score (Validation): [0.89151135 0.39323127]
