Installation des packages

In [None]:
!pip install -q numpy pandas scikit-learn lightgbm xgboost catboost optuna

Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")
import csv
from typing import Tuple
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Single seed shared by the CV splitter and every model defined below.
SEED = 42
# Seed NumPy's global random generator.
np.random.seed(SEED)
print("Imports completed")

 Fonctions de nettoyage


In [None]:
_STR_NAN_TOKENS = {"nan", "NaN", "None", "none", ""}

def _strip_norm(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize string tokens to NaN"""
    df = df.copy()
    for c in df.columns:
        if df[c].dtype == object:
            s = df[c].astype(str).str.strip()
            df[c] = s.replace(list(_STR_NAN_TOKENS), np.nan)
    return df

def _add_missing_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add missing indicator flags"""
    df = df.copy()
    df["is_missing_indicator"] = df["indicator"].isna().astype(int) if "indicator" in df else 0
    df["is_missing_indicator_description"] = (
        df["indicator_description"].isna().astype(int) if "indicator_description" in df else 0
    )
    df["is_missing_time_observed"] = df["time_observed"].isna().astype(int) if "time_observed" in df else 0
    return df

def _impute_text_unknown(df: pd.DataFrame) -> pd.DataFrame:
    """Impute 'Unknown' for missing text columns"""
    df = df.copy()
    for col in ["indicator", "indicator_description", "time_observed"]:
        if col in df.columns:
            df[col] = df[col].fillna("Unknown")
    return df

def _canonical_columns_with_flags(original_csv_path: str) -> list:
    """Get canonical column order"""
    base_cols = list(pd.read_csv(original_csv_path, nrows=0).columns)
    return base_cols + [
        "is_missing_indicator",
        "is_missing_indicator_description",
        "is_missing_time_observed",
    ]

def _save_canonical(df: pd.DataFrame, out_path: str) -> None:
    """Save DataFrame with canonical format"""
    df.to_csv(
        out_path,
        index=False,
        encoding="utf-8",
        lineterminator="\n",
        quoting=csv.QUOTE_MINIMAL,
    )

 Fonction principale de nettoyage

In [None]:
def rebuild_clean_files(
    raw_train_path: str,
    raw_test_path: str,
    out_train_clean_path: str,
    out_test_clean_path: str,
) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Clean the raw train/test CSVs and write the cleaned versions to disk.

    Both files go through the same steps: string normalisation, missing-value
    flags, ``"Unknown"`` imputation of the text columns and trimming of ``ID``.
    The train ``Target`` column is additionally upper-cased and stripped when it
    is an object column. Columns are then reindexed to the raw header order
    followed by the flag columns before saving.

    Returns:
        ``(train_shape, test_shape)`` of the frames that were written.
    """

    def _prepare(frame: pd.DataFrame) -> pd.DataFrame:
        # Shared cleaning steps applied identically to train and test.
        frame = _impute_text_unknown(_add_missing_flags(_strip_norm(frame)))
        if "ID" in frame.columns:
            frame["ID"] = frame["ID"].astype(str).str.strip()
        return frame

    raw_train = pd.read_csv(raw_train_path)
    raw_test = pd.read_csv(raw_test_path)

    train = _prepare(raw_train)
    test = _prepare(raw_test)

    target_is_text = "Target" in train.columns and train["Target"].dtype == object
    if target_is_text:
        train["Target"] = train["Target"].str.upper().str.strip()

    # Column order: raw header first, then the three is_missing_* flags.
    train_cols = _canonical_columns_with_flags(raw_train_path)
    test_cols = _canonical_columns_with_flags(raw_test_path)
    train = train.reindex(columns=train_cols)
    test = test.reindex(columns=test_cols)

    # Both outputs are written only after both frames were cleaned successfully.
    for frame, destination in ((train, out_train_clean_path), (test, out_test_clean_path)):
        _save_canonical(frame, destination)

    return train.shape, test.shape

Exécution du nettoyage

In [None]:
print("Starting data cleaning...")
# Raw inputs are read from /content; the cleaned copies are written to the
# current working directory and reloaded by the next step.
train_shape, test_shape = rebuild_clean_files(
    raw_train_path="/content/train.csv",
    raw_test_path="/content/test.csv",
    out_train_clean_path="train_clean.csv",
    out_test_clean_path="test_clean.csv",
)
print(f"Clean files generated - Train: {train_shape}, Test: {test_shape}")

 Chargement et exploration

In [None]:
# Load the cleaned CSVs produced by rebuild_clean_files.
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nTarget distribution:")
# Relative frequency of each Target label in the training set.
print(train['Target'].value_counts(normalize=True))

Fonction de feature engineering

In [None]:
def advanced_features(df, is_train=True, text_vectorizer=None, fit_text=False):
    """Build time, interaction, missingness and (optionally) TF-IDF features.

    Args:
        df: Frame containing at least ``prediction_time``, ``confidence``,
            ``predicted_intensity``, ``forecast_length``, ``indicator``,
            ``indicator_description`` and ``time_observed``.
        is_train: Currently unused; kept so existing calls keep working.
        text_vectorizer: An already fitted ``TfidfVectorizer`` to apply to
            ``indicator_description``. Ignored when ``fit_text`` is True.
        fit_text: When True, a new ``TfidfVectorizer`` is fitted on this frame.

    Returns:
        ``(features, text_vectorizer)``: a new frame with the engineered
        columns appended, and the vectorizer that was fitted or passed in
        (``None`` when no text features were requested). The input frame is
        not modified. When TF-IDF columns are added the index is reset.
    """
    df = df.copy()

    # Time features
    df['prediction_time'] = pd.to_datetime(df['prediction_time'])
    df['hour'] = df['prediction_time'].dt.hour
    df['day'] = df['prediction_time'].dt.day
    df['month'] = df['prediction_time'].dt.month
    df['dayofweek'] = df['prediction_time'].dt.dayofweek
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    # isocalendar().week is a pandas nullable UInt32 column; cast it to float64 so
    # every engineered feature reaches the models as a plain NumPy dtype.
    df['week_of_year'] = df['prediction_time'].dt.isocalendar().week.astype('float64')
    df['day_of_year'] = df['prediction_time'].dt.dayofyear

    # Cyclical features
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)

    # Time of day indicators
    df['is_morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype(int)
    df['is_afternoon'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype(int)
    df['is_evening'] = ((df['hour'] >= 18) & (df['hour'] < 22)).astype(int)
    df['is_night'] = ((df['hour'] >= 22) | (df['hour'] < 6)).astype(int)

    # Seasonal patterns
    df['rainy_main'] = ((df['month'] >= 4) & (df['month'] <= 6)).astype(int)
    df['rainy_secondary'] = ((df['month'] >= 9) & (df['month'] <= 11)).astype(int)
    df['dry_season'] = ((df['rainy_main'] == 0) & (df['rainy_secondary'] == 0)).astype(int)

    # Feature interactions
    df['conf_x_intensity'] = df['confidence'] * df['predicted_intensity']
    df['conf_squared'] = df['confidence'] ** 2
    df['conf_cubed'] = df['confidence'] ** 3
    df['intensity_x_forecast'] = df['predicted_intensity'] * df['forecast_length']
    df['conf_x_forecast'] = df['confidence'] * df['forecast_length']
    df['conf_x_hour'] = df['confidence'] * df['hour']
    df['conf_x_rainy'] = df['confidence'] * (df['rainy_main'] + df['rainy_secondary'])

    # NOTE(review): with an integer `bins`, pd.cut derives the edges from the values
    # of the frame passed in, so train and test are binned with different edges
    # unless their confidence ranges coincide.
    df['conf_bins'] = pd.cut(df['confidence'], bins=10, labels=False)

    # Missing indicators
    # The cleaning step replaces missing text with the "Unknown" placeholder, so a
    # plain notna() check would always be true; treat the placeholder as missing too.
    missing_placeholder = 'Unknown'
    presence_flags = {
        'indicator': 'has_indicator',
        'indicator_description': 'has_indicator_desc',
        'time_observed': 'has_time_observed',
    }
    for source_col, flag_col in presence_flags.items():
        is_missing = df[source_col].isna() | df[source_col].eq(missing_placeholder)
        df[flag_col] = (~is_missing).astype(int)
    df['missing_count'] = (3 - df['has_indicator'] - df['has_indicator_desc'] - df['has_time_observed'])

    # Text features
    wants_text = fit_text or text_vectorizer is not None
    if 'indicator_description' in df.columns and wants_text:
        text_data = df['indicator_description'].fillna('missing').astype(str)
        if fit_text:
            # Fit a fresh vocabulary on this frame (meant for the training set).
            text_vectorizer = TfidfVectorizer(
                max_features=50,
                ngram_range=(1, 2),
                min_df=2,
                stop_words='english'
            )
            text_matrix = text_vectorizer.fit_transform(text_data)
        else:
            # Reuse the vocabulary learned earlier so columns line up across frames.
            text_matrix = text_vectorizer.transform(text_data)
        text_df = pd.DataFrame(
            text_matrix.toarray(),
            columns=[f'tfidf_{i}' for i in range(text_matrix.shape[1])]
        )
        # text_df has a fresh RangeIndex, so align df positionally before concatenating.
        df = pd.concat([df.reset_index(drop=True), text_df], axis=1)

    return df, text_vectorizer

Application du feature engineering

In [None]:
print("Applying feature engineering...")
# The TF-IDF vocabulary is fitted on train only and reused to transform test.
# NOTE: train/test are rebound here, so re-running this cell without reloading
# the data would append a second set of tfidf_* columns.
train, text_vec = advanced_features(train, is_train=True, fit_text=True)
test, _ = advanced_features(test, is_train=False, text_vectorizer=text_vec)
print("Feature engineering completed")

Agrégations statistiques - User

In [None]:
print("Creating user statistics...")
# Snapshots taken before any aggregate columns are merged in. train_original is
# the source for the per-fold user statistics in the CV loop; test_original is
# not referenced again in the visible part of the notebook.
train_original = train.copy()
test_original = test.copy()

# Per-user aggregates computed over the whole training set.
user_stats_global = train.groupby('user_id').agg({
    'confidence': ['mean', 'std', 'min', 'max', 'count'],
    'predicted_intensity': ['mean', 'sum'],
    'forecast_length': 'mean'
}).reset_index()
# Flatten the (column, statistic) MultiIndex into names like user_confidence_mean.
user_stats_global.columns = ['user_id'] + [f'user_{col[0]}_{col[1]}' for col in user_stats_global.columns[1:]]
# std is NaN for users with a single row; treat that as zero spread.
user_stats_global['user_confidence_std'] = user_stats_global['user_confidence_std'].fillna(0)

print(f"User stats shape: {user_stats_global.shape}")

 Agrégations statistiques - Community & District

In [None]:
print("Creating community and district statistics...")

# Per-community aggregates over the whole training set, including the most
# frequent Target label ('NORAIN' is the fallback when no mode exists).
# NOTE(review): the Target mode is computed from all training rows and is later
# used as a feature inside cross-validation, so each validation row has
# contributed its own label to this value — confirm whether that leakage is
# acceptable.
comm_stats_global = train.groupby('community').agg({
    'confidence': ['mean', 'std'],
    'predicted_intensity': 'mean',
    'Target': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'NORAIN'
}).reset_index()
comm_stats_global.columns = ['community', 'comm_conf_mean', 'comm_conf_std', 'comm_intensity_mean', 'comm_target_mode']
# std is NaN for communities with a single row.
comm_stats_global['comm_conf_std'] = comm_stats_global['comm_conf_std'].fillna(0)

# Same idea at district level (same leakage caveat for dist_target_mode).
dist_stats_global = train.groupby('district').agg({
    'confidence': 'mean',
    'predicted_intensity': 'mean',
    'Target': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'NORAIN'
}).reset_index()
dist_stats_global.columns = ['district', 'dist_conf_mean', 'dist_intensity_mean', 'dist_target_mode']

print(f"Community stats: {comm_stats_global.shape}, District stats: {dist_stats_global.shape}")

Agrégations temporelles

In [None]:
print("Creating time-based statistics...")

# Bucket the hour of day into 4-hour blocks (integer division by 4).
train['hour_block'] = (train['hour'] // 4)
test['hour_block'] = (test['hour'] // 4)

# Mean confidence and most frequent Target label per hour block, computed on the
# whole training set.
# NOTE(review): hourblock_target_mode is derived from the labels of all training
# rows, including rows that later serve as validation folds.
time_stats_global = train.groupby('hour_block').agg({
    'confidence': 'mean',
    'Target': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'NORAIN'
}).reset_index()
time_stats_global.columns = ['hour_block', 'hourblock_conf_mean', 'hourblock_target_mode']

print(f"Time stats shape: {time_stats_global.shape}")

Fusion des agrégations

In [None]:
print("Merging aggregations...")

# Left joins keep every row of train/test; keys unseen in the training set
# produce NaN aggregates that are filled below.
# NOTE: train/test are rebound in place, so running this cell twice would merge
# the same aggregate columns a second time.
train = train.merge(user_stats_global, on='user_id', how='left')
train = train.merge(comm_stats_global, on='community', how='left')
train = train.merge(dist_stats_global, on='district', how='left')
train = train.merge(time_stats_global, on='hour_block', how='left')

test = test.merge(user_stats_global, on='user_id', how='left')
test = test.merge(comm_stats_global, on='community', how='left')
test = test.merge(dist_stats_global, on='district', how='left')
test = test.merge(time_stats_global, on='hour_block', how='left')

# Fill missing values
# Only columns that are float64/int64 in train are filled with 0; the same column
# is filled in test, so it must exist there as well.
for col in train.columns:
    if train[col].dtype in ['float64', 'int64']:
        train[col] = train[col].fillna(0)
        test[col] = test[col].fillna(0)

# Target-mode columns hold labels, so they fall back to the 'NORAIN' label.
for col in ['comm_target_mode', 'dist_target_mode', 'hourblock_target_mode']:
    if col in train.columns:
        train[col] = train[col].fillna('NORAIN')
        test[col] = test[col].fillna('NORAIN')

print(f"Final train shape: {train.shape}, test shape: {test.shape}")

Encoding des variables catégorielles

In [None]:
print("Encoding categorical variables...")

# Raw categorical columns: fill gaps with 'missing', then fit one encoder on the
# union of train and test values so transform() never meets an unseen label.
for col in ['community', 'district', 'indicator']:
    le = LabelEncoder()
    train[col] = train[col].fillna('missing').astype(str)
    test[col] = test[col].fillna('missing').astype(str)
    all_vals = pd.concat([train[col], test[col]]).unique()
    le.fit(all_vals)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Target-mode aggregate columns contain labels; they were already filled in the
# merge step, so they are encoded as they are.
for col in ['comm_target_mode', 'dist_target_mode', 'hourblock_target_mode']:
    if col in train.columns:
        le = LabelEncoder()
        all_vals = pd.concat([train[col], test[col]]).unique()
        le.fit(all_vals)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

print("Encoding completed")

Fonction d'agrégation leak-safe pour CV

In [None]:
def add_aggregations_cv(train_df, val_df, test_df):
    """Recompute the per-user statistics from one training fold and attach them.

    Args:
        train_df: Rows of the current training fold; needs ``user_id``,
            ``confidence``, ``predicted_intensity`` and ``forecast_length``.
        val_df: Validation rows; any existing ``user_*`` statistic columns are
            replaced.
        test_df: Test rows; handled the same way as ``val_df``.

    Returns:
        ``(val_df, test_df)`` as new frames carrying statistics computed from
        ``train_df`` only. Users absent from ``train_df`` get 0 for every
        statistic. The frames passed in are left unmodified, and the returned
        frames have a fresh index (result of the merge).
    """
    user_stats = train_df.groupby('user_id').agg({
        'confidence': ['mean', 'std', 'min', 'max', 'count'],
        'predicted_intensity': ['mean', 'sum'],
        'forecast_length': 'mean'
    }).reset_index()
    # Flatten the (column, statistic) MultiIndex into names like user_confidence_mean.
    user_stats.columns = ['user_id'] + [f'user_{col[0]}_{col[1]}' for col in user_stats.columns[1:]]
    # std is NaN for users with a single row in the fold.
    user_stats['user_confidence_std'] = user_stats['user_confidence_std'].fillna(0)
    stat_cols = list(user_stats.columns[1:])

    def _attach(frame):
        # Drop previously merged user_* columns on a new frame (never in place on
        # the caller's object), then join the fold-specific statistics.
        stale_cols = [c for c in frame.columns if c.startswith('user_') and c != 'user_id']
        merged = frame.drop(columns=stale_cols).merge(user_stats, on='user_id', how='left')
        merged[stat_cols] = merged[stat_cols].fillna(0)
        return merged

    return _attach(val_df), _attach(test_df)

Préparation du target et class weights

In [None]:
# Encode the Target labels as consecutive integers; the fitted encoder is used
# later to map predictions back to label strings.
target_encoder = LabelEncoder()
y_train = target_encoder.fit_transform(train['Target'])
n_classes = len(target_encoder.classes_)

print(f"Target classes: {target_encoder.classes_}")

# 'balanced' weights are inversely proportional to class frequency. The array is
# indexed by encoded label when per-sample weights are built in the CV loop.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Dict view (encoded label -> weight), used here for display.
class_weight_dict = dict(enumerate(class_weights))
print(f"Class weights: {class_weight_dict}")

Sélection des features

In [None]:
# Identifier, raw timestamp, raw text and the label are excluded; every other
# column of train (including the merged aggregates) is used as a model feature.
drop_cols = ['ID', 'prediction_time', 'indicator_description', 'time_observed', 'Target']
feature_cols = [col for col in train.columns if col not in drop_cols]

print(f"Number of features: {len(feature_cols)}")
print(f"Sample features: {feature_cols[:10]}")

Configuration du Cross-Validation

In [None]:
n_splits = 10
# Shuffled, seeded stratified splitter so folds are reproducible.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Out-of-fold class probabilities, one (n_train, n_classes) array per model;
# each row is written by the fold in which it is a validation row.
oof_preds_lgb = np.zeros((len(train), n_classes))
oof_preds_xgb = np.zeros((len(train), n_classes))
oof_preds_cat = np.zeros((len(train), n_classes))

# Test class probabilities, accumulated as the average over the folds.
test_preds_lgb = np.zeros((len(test), n_classes))
test_preds_xgb = np.zeros((len(test), n_classes))
test_preds_cat = np.zeros((len(test), n_classes))

# Macro F1 of the weighted blend, one entry per fold.
f1_scores = []

print(f"Stratified {n_splits}-Fold Cross-Validation configured")

 Entraînement des modèles (Boucle principale)


In [None]:
# Cross-validated training of LightGBM, XGBoost and CatBoost. Each fold fills the
# out-of-fold probability arrays for its validation rows and adds 1/n_splits of
# its test probabilities to the running test averages.
for fold, (train_idx, val_idx) in enumerate(skf.split(train, y_train)):
    print(f"\n{'='*60}")
    print(f"Fold {fold + 1}/{n_splits}")
    print(f"{'='*60}")

    # Validation and test rows get user statistics recomputed from the training
    # rows of this fold only (taken from the pre-merge snapshot).
    train_fold_raw = train_original.iloc[train_idx].copy()
    val_fold = train.iloc[val_idx].copy()
    test_fold = test.copy()

    val_fold, test_fold = add_aggregations_cv(train_fold_raw, val_fold, test_fold)

    # NOTE(review): X_tr is sliced from `train`, so the training rows keep the
    # user statistics computed on the full training set, while validation/test
    # rows use fold-only statistics.
    X_tr = train.iloc[train_idx][feature_cols].fillna(-999)
    X_val = val_fold[feature_cols].fillna(-999)
    X_te = test_fold[feature_cols].fillna(-999)

    y_tr = y_train[train_idx]
    y_val = y_train[val_idx]

    # Per-sample weights from the balanced class weights (indexed by encoded label).
    sample_weights_tr = class_weights[y_tr]
    sample_weights_val = class_weights[y_val]

    # LightGBM
    print("\nTraining LightGBM...")
    # `is_unbalance` is intentionally not set: LightGBM only uses it for binary and
    # multiclassova objectives, and class imbalance is already handled through the
    # sample weights passed to the Dataset.
    lgb_params = {
        'objective': 'multiclass',
        'num_class': n_classes,
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'num_leaves': 127,
        'max_depth': 10,
        'min_child_samples': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': SEED,
        'verbose': -1
    }

    lgb_train = lgb.Dataset(X_tr, label=y_tr, weight=sample_weights_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, weight=sample_weights_val, reference=lgb_train)

    lgb_model = lgb.train(
        lgb_params,
        lgb_train,
        num_boost_round=3000,
        valid_sets=[lgb_train, lgb_val],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]
    )

    oof_preds_lgb[val_idx] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
    test_preds_lgb += lgb_model.predict(X_te, num_iteration=lgb_model.best_iteration) / n_splits

    # XGBoost
    print("\nTraining XGBoost...")
    xgb_params = {
        'objective': 'multi:softprob',
        'num_class': n_classes,
        'eval_metric': 'mlogloss',
        'learning_rate': 0.01,
        'max_depth': 10,
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
        'random_state': SEED,
        'tree_method': 'hist'
    }

    dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=sample_weights_tr)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_te)

    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=3000,
        evals=[(dtrain, 'train'), (dval, 'val')],
        early_stopping_rounds=200,
        verbose_eval=500
    )

    # xgb.train returns the model of the last round, and the native
    # Booster.predict uses every tree by default. Restrict prediction to the best
    # early-stopping iteration, mirroring what is done for LightGBM above.
    xgb_best_range = (0, xgb_model.best_iteration + 1)
    oof_preds_xgb[val_idx] = xgb_model.predict(dval, iteration_range=xgb_best_range)
    test_preds_xgb += xgb_model.predict(dtest, iteration_range=xgb_best_range) / n_splits

    # CatBoost
    print("\nTraining CatBoost...")
    cat_model = CatBoostClassifier(
        iterations=3000,
        learning_rate=0.01,
        depth=10,
        l2_leaf_reg=10,
        auto_class_weights='Balanced',
        random_state=SEED,
        loss_function='MultiClass',
        eval_metric='TotalF1:average=Macro',
        early_stopping_rounds=200,
        verbose=500,
        task_type='CPU'
    )

    cat_model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        use_best_model=True
    )

    oof_preds_cat[val_idx] = cat_model.predict_proba(X_val)
    test_preds_cat += cat_model.predict_proba(X_te) / n_splits

    # Ensemble evaluation
    # Fixed-weight blend of the three models' probabilities on this fold's
    # validation rows.
    oof_ensemble = (oof_preds_lgb[val_idx] * 0.35 +
                    oof_preds_xgb[val_idx] * 0.35 +
                    oof_preds_cat[val_idx] * 0.30)

    fold_preds = np.argmax(oof_ensemble, axis=1)
    fold_f1 = f1_score(y_val, fold_preds, average='macro')
    f1_scores.append(fold_f1)

    print(f"\nFold {fold + 1} Macro F1: {fold_f1:.6f}")

print(f"\nMean CV F1 Score: {np.mean(f1_scores):.6f} (+/- {np.std(f1_scores):.6f})")

Stacking Meta-Learner

In [None]:
print("Training stacking meta-learner...")

# Meta-features: the out-of-fold class probabilities of the three base models,
# stacked side by side.
oof_stack = np.hstack([oof_preds_lgb, oof_preds_xgb, oof_preds_cat])
# `multi_class` is not passed: the argument is deprecated in recent scikit-learn
# releases, and the lbfgs solver already fits a multinomial model for targets
# with more than two classes.
meta_model = LogisticRegression(
    solver='lbfgs',
    max_iter=2000,
    C=0.1,
    class_weight='balanced',
    random_state=SEED
)
meta_model.fit(oof_stack, y_train)

# NOTE: this score is computed on the same rows the meta-learner was fitted on,
# so it is an in-sample figure and not directly comparable to the per-fold
# ensemble F1 printed below.
oof_meta_preds = meta_model.predict(oof_stack)
oof_meta_f1 = f1_score(y_train, oof_meta_preds, average='macro')

print(f"Meta-learner F1 Score: {oof_meta_f1:.6f}")
print(f"Base ensemble F1 Score: {np.mean(f1_scores):.6f}")
# The '+' format flag prints the real sign (a hard-coded '+' gave "+-0.x" for a
# negative difference).
print(f"Improvement: {oof_meta_f1 - np.mean(f1_scores):+.6f}")

Prédictions finales

In [None]:
# Same column layout as oof_stack: fold-averaged test probabilities of the three
# base models, side by side.
test_stack = np.hstack([test_preds_lgb, test_preds_xgb, test_preds_cat])
final_predictions = meta_model.predict(test_stack)
# Map encoded class ids back to the original Target labels.
final_predictions_labels = target_encoder.inverse_transform(final_predictions)

print("Final predictions completed")
# NOTE(review): the predictions are only printed here; no submission file is
# written in the visible part of the notebook.
print(f"Prediction distribution:\n{pd.Series(final_predictions_labels).value_counts()}")

Résumé des résultats

In [None]:
# Plain-text recap of the run; all values come from variables set in earlier cells.
print("="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)
print(f"Cross-validation folds: {n_splits}")
print(f"Number of features: {len(feature_cols)}")
print(f"Training samples: {len(train)}")
print(f"Test samples: {len(test)}")
print(f"\nPerformance:")
print(f"  Mean CV F1: {np.mean(f1_scores):.6f}")
print(f"  Std CV F1: {np.std(f1_scores):.6f}")
print(f"  Meta F1: {oof_meta_f1:.6f}")
print(f"\nOptimizations applied:")
print("  - Cyclical time features")
print("  - TF-IDF text vectorization")
# NOTE(review): only the user_* statistics are recomputed per fold
# (add_aggregations_cv); the community, district and hour-block aggregates are
# computed once on the full training set.
print("  - Leak-safe aggregations")
print("  - Stratified K-fold CV")
print("  - Ensemble of 3 models")
print("  - Stacking meta-learner")
print("="*60)