In [1]:
# ===== 0) Imports =====
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# ===== 1) Load =====
train = pd.read_csv("train.csv")

# y / X de base (on exclut Id et Cover_Type)
y = train["Cover_Type"].astype(int)
X = train.drop(columns=["Cover_Type", "Id"])

# ===== 2) Feature engineering minimal mais utile =====
# Aspect est angulaire → sin/cos
X["Aspect_sin"] = np.sin(np.deg2rad(train["Aspect"]))
X["Aspect_cos"] = np.cos(np.deg2rad(train["Aspect"]))

# Distance combinée à l’hydrologie
X["Hydro_Dist"] = np.hypot(train["Horizontal_Distance_To_Hydrology"],
                           train["Vertical_Distance_To_Hydrology"])

# Le plus proche entre route et feu
X["Near_RoadOrFire"] = np.minimum(train["Horizontal_Distance_To_Roadways"],
                                  train["Horizontal_Distance_To_Fire_Points"])

# Différence d’altitude vs hydrologie (souvent très discriminant)
X["Elev_minus_VertHydro"] = train["Elevation"] - train["Vertical_Distance_To_Hydrology"]

# Quelques sommes de distances classiques sur ce dataset
X["Road_Fire"] = train["Horizontal_Distance_To_Roadways"] + train["Horizontal_Distance_To_Fire_Points"]
X["Hydro_Road"] = train["Horizontal_Distance_To_Hydrology"] + train["Horizontal_Distance_To_Roadways"]
X["Hydro_Fire"] = train["Horizontal_Distance_To_Hydrology"] + train["Horizontal_Distance_To_Fire_Points"]

# NB: Pas besoin de reconstruire Soil_Type/Wilderness en une colonne.
# Les 40+4 dummies 0/1 peuvent rester telles quelles — CatBoost s’en sort très bien.

# ===== 3) Split =====
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ===== 4) Modèle (simple et efficace) =====
model = CatBoostClassifier(
    iterations=2000,
    depth=10,
    learning_rate=0.05,
    l2_leaf_reg=3,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    random_seed=42,
    # task_type="GPU",  # <- décommente si tu as un GPU, c'est (beaucoup) plus rapide
    verbose=200
)

model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)

# ===== 5) Évaluation =====
y_pred = model.predict(X_val).ravel().astype(int)
print("Accuracy:", accuracy_score(y_val, y_pred))

# (Option) Top features pour comprendre ce qui aide
imp = pd.Series(model.get_feature_importance(), index=X.columns).sort_values(ascending=False)
print("\nTop 10 features:\n", imp.head(10))


0:	learn: 0.6681548	test: 0.6431878	best: 0.6431878 (0)	total: 324ms	remaining: 10m 46s
200:	learn: 0.8843419	test: 0.8230820	best: 0.8230820 (200)	total: 23.6s	remaining: 3m 31s
400:	learn: 0.9383267	test: 0.8475529	best: 0.8475529 (392)	total: 45.8s	remaining: 3m 2s
600:	learn: 0.9638724	test: 0.8594577	best: 0.8594577 (598)	total: 1m 7s	remaining: 2m 38s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8601190476
bestIteration = 614

Shrink model to first 615 iterations.
Accuracy: 0.8601190476190477

Top 10 features:
 Elev_minus_VertHydro                  17.399888
Elevation                             15.365929
Aspect_cos                             5.930192
Hydro_Dist                             5.555803
Wilderness_Area3                       4.311700
Vertical_Distance_To_Hydrology         4.285341
Horizontal_Distance_To_Fire_Points     3.863735
Hillshade_9am                          3.564890
Hydro_Road                             3.560580
Horizontal_Distance_To

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

# ===== 1) Load =====
df = pd.read_csv("train.csv")

# ===== 2) Rebuild categorical Soil_Type & Wilderness_Area =====
soil_cols = [c for c in df.columns if c.startswith("Soil_Type")]
wild_cols = [c for c in df.columns if c.startswith("Wilderness_Area")]

df["Soil_Type"] = df[soil_cols].idxmax(axis=1).str.extract(r"(\d+)").astype(int).astype("category")
df["Wilderness_Area"] = df[wild_cols].idxmax(axis=1).str.extract(r"(\d+)").astype(int).astype("category")

# Option: drop dummies (on garde le dataset compact)
df = df.drop(columns=soil_cols + wild_cols)

# ===== 3) Features simples mais payantes =====
df["Aspect_sin"] = np.sin(np.deg2rad(df["Aspect"]))
df["Aspect_cos"] = np.cos(np.deg2rad(df["Aspect"]))
df["Hydro_Dist"] = np.hypot(df["Horizontal_Distance_To_Hydrology"], df["Vertical_Distance_To_Hydrology"])
df["Near_RoadOrFire"] = np.minimum(df["Horizontal_Distance_To_Roadways"], df["Horizontal_Distance_To_Fire_Points"])
df["Elev_minus_VertHydro"] = df["Elevation"] - df["Vertical_Distance_To_Hydrology"]

# 3 nouvelles
df["Abs_VertHydro"] = df["Vertical_Distance_To_Hydrology"].abs()
df["Hillshade_mean"] = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].mean(axis=1)
df["Hillshade_range"] = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].max(axis=1) - \
                        df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].min(axis=1)

# ===== 4) Split =====
y = df["Cover_Type"].astype(int)
X = df.drop(columns=["Cover_Type", "Id"])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# indices des colonnes catégorielles pour CatBoost
cat_features = [X.columns.get_loc("Soil_Type"), X.columns.get_loc("Wilderness_Area")]

# ===== 5) CatBoost (réglages simples & efficaces) =====
model = CatBoostClassifier(
    iterations=3000,          # assez haut + early stopping
    learning_rate=0.07,       # convergence plus rapide
    depth=8,                  # souvent optimal ici
    l2_leaf_reg=4,            # row subsample
    rsm=0.8,                  # feature subsample
    loss_function="MultiClass",
    eval_metric="Accuracy",
    random_seed=42,
    # task_type="GPU",        # <--- décommente si GPU dispo
    verbose=200
)

model.fit(X_train, y_train,
          eval_set=(X_val, y_val),
          cat_features=cat_features,
          early_stopping_rounds=100)

# ===== 6) Eval =====
y_pred = model.predict(X_val).ravel().astype(int)
print("Accuracy:", accuracy_score(y_val, y_pred))

# pour comprendre ce qui aide
imp = pd.Series(model.get_feature_importance(), index=X.columns).sort_values(ascending=False)
print("\nTop 12 features:\n", imp.head(12))


0:	learn: 0.6292989	test: 0.6283069	best: 0.6283069 (0)	total: 138ms	remaining: 6m 53s
200:	learn: 0.8655754	test: 0.8078704	best: 0.8085317 (194)	total: 22.7s	remaining: 5m 15s
400:	learn: 0.9162533	test: 0.8376323	best: 0.8376323 (374)	total: 45s	remaining: 4m 51s
600:	learn: 0.9429563	test: 0.8465608	best: 0.8472222 (592)	total: 1m 7s	remaining: 4m 28s
800:	learn: 0.9617229	test: 0.8544974	best: 0.8544974 (797)	total: 1m 29s	remaining: 4m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8551587302
bestIteration = 837

Shrink model to first 838 iterations.
Accuracy: 0.8551587301587301

Top 12 features:
 Soil_Type                             16.057298
Elev_minus_VertHydro                  13.194269
Elevation                             11.005252
Wilderness_Area                       10.520790
Horizontal_Distance_To_Roadways        7.306233
Horizontal_Distance_To_Fire_Points     6.625909
Aspect_cos                             4.604151
Near_RoadOrFire             

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# --- load ---
df = pd.read_csv("train.csv")

y = df["Cover_Type"]
X = df.drop(columns=["Cover_Type", "Id"])

# split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- model ---
model = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# --- fit avec callbacks ---
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(200)   # affiche log tous les 200 itérations
    ]
)

# --- eval ---
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2155
[LightGBM] [Info] Number of data points in the train set: 12096, number of used features: 44
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
Training until validation scores don't improve for 100 rounds
[200]	valid_0's multi_logloss: 0.368
Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 0.363318
Accuracy: 0.8644179894179894


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# --- load ---
df = pd.read_csv("train.csv")

y = df["Cover_Type"].astype(int)
X = df.drop(columns=["Cover_Type", "Id"]).copy()

# --- features qui payent ---
X["Aspect_sin"] = np.sin(np.deg2rad(df["Aspect"]))
X["Aspect_cos"] = np.cos(np.deg2rad(df["Aspect"]))
X["Hydro_Dist"] = np.hypot(df["Horizontal_Distance_To_Hydrology"],
                           df["Vertical_Distance_To_Hydrology"])
X["Near_RoadOrFire"] = np.minimum(df["Horizontal_Distance_To_Roadways"],
                                  df["Horizontal_Distance_To_Fire_Points"])
X["Elev_minus_VertHydro"] = df["Elevation"] - df["Vertical_Distance_To_Hydrology"]
X["Road_Fire"] = df["Horizontal_Distance_To_Roadways"] + df["Horizontal_Distance_To_Fire_Points"]
X["Hydro_Road"] = df["Horizontal_Distance_To_Hydrology"] + df["Horizontal_Distance_To_Roadways"]
X["Hydro_Fire"] = df["Horizontal_Distance_To_Hydrology"] + df["Horizontal_Distance_To_Fire_Points"]
X["Hillshade_mean"] = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].mean(axis=1)
X["Hillshade_range"] = df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].max(axis=1) - \
                       df[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].min(axis=1)


# --- split ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- modèle lightGBM bien réglé mais simple ---
model = LGBMClassifier(
    n_estimators=8000,      # + d’arbres, early stopping décidera
    learning_rate=0.02,     # un peu plus fin
    num_leaves=384,         # plus expressif
    min_child_samples=20,   # limite l’overfit
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=1.0,
    reg_lambda=3.0,
    objective="multiclass",
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)


fit_kwargs = dict(
    X=X_train, y=y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(stopping_rounds=200), log_evaluation(200)]
)

# si tu as des colonnes catégorielles uniques (pas en one-hot), passe-les ici
if categorical_feature:
    fit_kwargs["categorical_feature"] = categorical_feature

model.fit(**fit_kwargs)

y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))


Training until validation scores don't improve for 200 rounds
[200]	valid_0's multi_logloss: 0.382076
[400]	valid_0's multi_logloss: 0.340878
[600]	valid_0's multi_logloss: 0.339318
Early stopping, best iteration is:
[525]	valid_0's multi_logloss: 0.338593
Accuracy: 0.875


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# --- load ---
df = pd.read_csv("train.csv")

y = df["Cover_Type"].astype(int)
X = df.drop(columns=["Cover_Type", "Id"]).copy()

# --- features utiles ---
X["Aspect_sin"] = np.sin(np.deg2rad(X["Aspect"]))
X["Aspect_cos"] = np.cos(np.deg2rad(X["Aspect"]))
X["Hydro_Dist"] = np.hypot(X["Horizontal_Distance_To_Hydrology"],
                           X["Vertical_Distance_To_Hydrology"])
X["Near_RoadOrFire"] = np.minimum(X["Horizontal_Distance_To_Roadways"],
                                  X["Horizontal_Distance_To_Fire_Points"])
X["Elev_minus_VertHydro"] = X["Elevation"] - X["Vertical_Distance_To_Hydrology"]
X["Road_Fire"] = X["Horizontal_Distance_To_Roadways"] + X["Horizontal_Distance_To_Fire_Points"]
X["Hydro_Road"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Roadways"]
X["Hydro_Fire"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Fire_Points"]
X["Hillshade_mean"] = X[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].mean(axis=1)
X["Hillshade_range"] = X[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].max(axis=1) - \
                       X[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].min(axis=1)

# --- split ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- modèle ---
model = LGBMClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=256,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multiclass",
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)

# --- éval ---
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[152]	valid_0's multi_logloss: 0.324772
Accuracy: 0.8852513227513228


In [10]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier

# --- load ---
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

# y / X
y_train = train["Cover_Type"].astype(int)
X_train = train.drop(columns=["Cover_Type", "Id"])
X_test  = test.drop(columns=["Id"])

# --- model (simple & efficace) ---
model = LGBMClassifier(
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

# --- fit sur tout le train ---
model.fit(X_train, y_train)

# --- predict sur test-full ---
test_pred = model.predict(X_test).astype(int)

# --- create submission ---
submission = pd.DataFrame({
    "Id": test["Id"],
    "Cover_Type": test_pred
})

submission.to_csv("submission.csv", index=False)
print("submission.csv saved. Shape:", submission.shape)
print(submission.head())


submission.csv saved. Shape: (581012, 2)
   Id  Cover_Type
0   1           5
1   2           5
2   3           2
3   4           2
4   5           5


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# --- load ---
df = pd.read_csv("train.csv")

y = df["Cover_Type"]
X = df.drop(columns=["Cover_Type", "Id"])

# --- split (train/val pour avoir une accuracy locale) ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- model ---
model = LGBMClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=128,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multiclass",
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

# --- train avec early stopping ---
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)

# --- eval ---
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[196]	valid_0's multi_logloss: 0.352831
Validation Accuracy: 0.8759920634920635


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier, early_stopping

# -------- 1) Helpers: feature engineering --------
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.drop(columns=["Cover_Type", "Id"], errors="ignore").copy()

    # a) Aspect (circulaire)
    ang = np.deg2rad(X["Aspect"])
    X["Aspect_sin"] = np.sin(ang)
    X["Aspect_cos"] = np.cos(ang)

    # b) Distances hydrologie + combinaisons
    X["Hydro_Dist"] = np.hypot(X["Horizontal_Distance_To_Hydrology"],
                               X["Vertical_Distance_To_Hydrology"])
    X["Abs_VertHydro"] = X["Vertical_Distance_To_Hydrology"].abs()
    X["Elev_minus_VertHydro"] = X["Elevation"] - X["Vertical_Distance_To_Hydrology"]

    # c) Routes / Feux
    X["Near_RoadOrFire"] = np.minimum(X["Horizontal_Distance_To_Roadways"],
                                      X["Horizontal_Distance_To_Fire_Points"])
    X["Road_Fire"] = X["Horizontal_Distance_To_Roadways"] + X["Horizontal_Distance_To_Fire_Points"]
    X["Hydro_Road"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Roadways"]
    X["Hydro_Fire"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Fire_Points"]

    # d) Hillshade
    hs_cols = ["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]
    X["Hillshade_mean"] = X[hs_cols].mean(axis=1)
    X["Hillshade_range"] = X[hs_cols].max(axis=1) - X[hs_cols].min(axis=1)
    X["Hillshade_sum"] = X[hs_cols].sum(axis=1)

    # e) Pente & interactions
    X["Slope_times_HydroDist"] = X["Slope"] * X["Hydro_Dist"]
    X["Slope_times_Elev"] = X["Slope"] * X["Elevation"]
    X["Elev_over_Slope"] = X["Elevation"] / (X["Slope"] + 1e-6)

    # f) Différences horizontales
    X["Road_minus_Hydro"] = X["Horizontal_Distance_To_Roadways"] - X["Horizontal_Distance_To_Hydrology"]
    X["Fire_minus_Road"]  = X["Horizontal_Distance_To_Fire_Points"] - X["Horizontal_Distance_To_Roadways"]
    X["Fire_minus_Hydro"] = X["Horizontal_Distance_To_Fire_Points"] - X["Horizontal_Distance_To_Hydrology"]

    # g) Non-linéaires
    for col in ["Elevation", "Slope", "Hydro_Dist"]:
        X[f"{col}_sq"] = X[col] * X[col]

    return X.replace([np.inf, -np.inf], np.nan).fillna(0)

# -------- 2) Load & FE --------
df = pd.read_csv("train.csv")

y = df["Cover_Type"].astype(int)
X = build_features(df)

# -------- 3) Split --------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -------- 4) Feature selection via RandomForest --------
rf = RandomForestClassifier(
    n_estimators=500, random_state=42, n_jobs=-1, max_features="sqrt"
)
rf.fit(X_train, y_train)

importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
TOP_K = min(40, len(importances))
top_features = importances.index[:TOP_K].tolist()

# -------- 5) Train LightGBM sur les features sélectionnées --------
model = LGBMClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=256,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multiclass",
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)

model.fit(
    X_train[top_features], y_train,
    eval_set=[(X_val[top_features], y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)

# -------- 6) Eval --------
y_pred = model.predict(X_val[top_features])
print("\nAccuracy:", accuracy_score(y_val, y_pred))

# -------- 7) Print top features à la fin --------
print("\nTop features (RF):")
print(importances.head(20))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[147]	valid_0's multi_logloss: 0.333633

Accuracy: 0.8806216931216931

Top features (RF):
Elev_minus_VertHydro                  0.113906
Elevation_sq                          0.105276
Elevation                             0.100360
Road_Fire                             0.036131
Horizontal_Distance_To_Roadways       0.030202
Hydro_Road                            0.030099
Road_minus_Hydro                      0.029760
Fire_minus_Road                       0.026608
Hydro_Fire                            0.025150
Aspect_cos                            0.022789
Wilderness_Area4                      0.022560
Horizontal_Distance_To_Fire_Points    0.022316
Near_RoadOrFire                       0.022177
Hydro_Dist                            0.021572
Fire_minus_Hydro                      0.021489
Hillshade_9am                         0.020806
Hydro_Dist_sq                         0.020225
Slope_times_Hy

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier, early_stopping

# ---------- Feature engineering minimal ----------
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.drop(columns=["Cover_Type", "Id"], errors="ignore").copy()

    # Aspect (circulaire)
    ang = np.deg2rad(X["Aspect"])
    X["Aspect_sin"] = np.sin(ang)
    X["Aspect_cos"] = np.cos(ang)

    # Distances & combinaisons
    X["Hydro_Dist"] = np.hypot(X["Horizontal_Distance_To_Hydrology"],
                               X["Vertical_Distance_To_Hydrology"])
    X["Abs_VertHydro"] = X["Vertical_Distance_To_Hydrology"].abs()
    X["Elev_minus_VertHydro"] = X["Elevation"] - X["Vertical_Distance_To_Hydrology"]
    X["Near_RoadOrFire"] = np.minimum(X["Horizontal_Distance_To_Roadways"],
                                      X["Horizontal_Distance_To_Fire_Points"])
    X["Road_Fire"]  = X["Horizontal_Distance_To_Roadways"] + X["Horizontal_Distance_To_Fire_Points"]
    X["Hydro_Road"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Roadways"]
    X["Hydro_Fire"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Fire_Points"]

    # Hillshade
    hs = ["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]
    X["Hillshade_mean"]  = X[hs].mean(axis=1)
    X["Hillshade_range"] = X[hs].max(axis=1) - X[hs].min(axis=1)
    X["Hillshade_sum"]   = X[hs].sum(axis=1)

    # Interactions simples + non-linéaires
    X["Slope_times_HydroDist"] = X["Slope"] * X["Hydro_Dist"]
    X["Slope_times_Elev"]      = X["Slope"] * X["Elevation"]
    X["Elev_over_Slope"]       = X["Elevation"] / (X["Slope"] + 1e-6)
    X["Road_minus_Hydro"]      = X["Horizontal_Distance_To_Roadways"] - X["Horizontal_Distance_To_Hydrology"]
    X["Fire_minus_Road"]       = X["Horizontal_Distance_To_Fire_Points"] - X["Horizontal_Distance_To_Roadways"]
    X["Fire_minus_Hydro"]      = X["Horizontal_Distance_To_Fire_Points"] - X["Horizontal_Distance_To_Hydrology"]

    for col in ["Elevation", "Slope", "Hydro_Dist"]:
        X[f"{col}_sq"] = X[col] * X[col]

    return X.replace([np.inf, -np.inf], np.nan).fillna(0)

# ---------- Load + FE ----------
df = pd.read_csv("train.csv")
y = df["Cover_Type"].astype(int)
X = build_features(df)

# ---------- Split ----------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ---------- Baseline LGBM sur TOUTES les features ----------
model_all = LGBMClassifier(
    n_estimators=4000, learning_rate=0.03, num_leaves=256,
    subsample=0.8, colsample_bytree=0.8,
    objective="multiclass", random_state=42, n_jobs=-1, verbosity=-1
)
model_all.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)
pred_all = model_all.predict(X_val)
acc_all = accuracy_score(y_val, pred_all)

# ---------- Sélection top-20 via RandomForest ----------
rf = RandomForestClassifier(
    n_estimators=500, random_state=42, n_jobs=-1, max_features="sqrt"
)
rf.fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
TOP_K = min(20, len(importances))
top_features = importances.index[:TOP_K].tolist()

# ---------- LGBM sur Top-20 ----------
model_top = LGBMClassifier(
    n_estimators=4000, learning_rate=0.03, num_leaves=256,
    subsample=0.8, colsample_bytree=0.8,
    objective="multiclass", random_state=7, n_jobs=-1, verbosity=-1
)
model_top.fit(
    X_train[top_features], y_train,
    eval_set=[(X_val[top_features], y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)
pred_top = model_top.predict(X_val[top_features])
acc_top = accuracy_score(y_val, pred_top)

# ---------- Résultats ----------
print(f"Accuracy (all features):  {acc_all:.6f}")
print(f"Accuracy (top {TOP_K}):    {acc_top:.6f}")
print("\nTop features (RF):")
print(importances.head(TOP_K))


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[148]	valid_0's multi_logloss: 0.314672
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[161]	valid_0's multi_logloss: 0.338869
Accuracy (all features):  0.887566
Accuracy (top 20):    0.877976

Top features (RF):
Elev_minus_VertHydro                  0.113906
Elevation_sq                          0.105276
Elevation                             0.100360
Road_Fire                             0.036131
Horizontal_Distance_To_Roadways       0.030202
Hydro_Road                            0.030099
Road_minus_Hydro                      0.029760
Fire_minus_Road                       0.026608
Hydro_Fire                            0.025150
Aspect_cos                            0.022789
Wilderness_Area4                      0.022560
Horizontal_Distance_To_Fire_Points    0.022316
Near_RoadOrFire                       0.022177
Hydro_Dist                   

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# --- load ---
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X = train.drop(columns=["Cover_Type", "Id"])
X_test = test.drop(columns=["Id"])

# sécurité: même colonnes entre train et test
assert list(X.columns) == list(X_test.columns), "Mismatch colonnes train/test"

# --- split pour une accuracy locale ---
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- modèle ---
base_params = dict(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=128,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multiclass",
    random_state=42,
    n_jobs=-1,
    verbosity=-1
)
model = LGBMClassifier(**base_params)

# --- fit avec early stopping (sur le split) ---
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)

# --- accuracy locale ---
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

# --- re-fit sur TOUT le train avec le meilleur nombre d'itérations ---
best_iters = getattr(model, "best_iteration_", None)
final_model = LGBMClassifier(**base_params)
if best_iters is not None:
    final_model.set_params(n_estimators=best_iters)

final_model.fit(X, y)  # pas d'eval_set ici: on utilise tout le train

# --- prédiction sur test-full + sauvegarde submission ---
test_pred = final_model.predict(X_test).astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": test_pred})
submission.to_csv("submission.csv", index=False)

print("submission.csv saved:", submission.shape)
print(submission.head())


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# load
df = pd.read_csv("train.csv")
y = df["Cover_Type"].astype(int)
X = df.drop(columns=["Cover_Type", "Id"])

# split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# model
model = LGBMClassifier(
    n_estimators=4000, learning_rate=0.03, num_leaves=128,
    subsample=0.8, colsample_bytree=0.8,
    objective="multiclass", random_state=42, n_jobs=-1, verbosity=-1
)

# train + early stopping
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(200)]
)

# accuracy locale
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

# garder le meilleur nb d'itérations pour la phase finale
best_iters = getattr(model, "best_iteration_", None)
print("Best iterations:", best_iters)


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[196]	valid_0's multi_logloss: 0.352831
Validation Accuracy: 0.8759920634920635
Best iterations: 196


In [22]:
import pandas as pd
from lightgbm import LGBMClassifier

# reload data (propre)
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X = train.drop(columns=["Cover_Type", "Id"])
X_test = test.drop(columns=["Id"])

# même colonnes train/test
assert list(X.columns) == list(X_test.columns)

model = LGBMClassifier(
    n_estimators=(196),  # utilise la meilleure itération si dispo
    learning_rate=0.03, num_leaves=128,
    subsample=0.8, colsample_bytree=0.8,
    objective="multiclass", random_state=42, n_jobs=-1, verbosity=-1
)

# fit sur 100% du train (pas d'early stopping ici)
model.fit(X, y)

# prédire test + sauver submission
pred = model.predict(X_test).astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": pred})
submission.to_csv("submission.csv", index=False)
print("submission.csv saved:", submission.shape)
print(submission.head())
from IPython.display import FileLink
FileLink("submission.csv")


submission.csv saved: (581012, 2)
   Id  Cover_Type
0   1           5
1   2           5
2   3           2
3   4           2
4   5           5


In [19]:
import pandas as pd, numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# ---------- 1) Feature engineering unique (train & test) ----------
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.drop(columns=["Cover_Type", "Id"], errors="ignore").copy()
    ang = np.deg2rad(X["Aspect"])
    X["Aspect_sin"] = np.sin(ang); X["Aspect_cos"] = np.cos(ang)
    X["Hydro_Dist"] = np.hypot(X["Horizontal_Distance_To_Hydrology"],
                               X["Vertical_Distance_To_Hydrology"])
    X["Near_RoadOrFire"] = np.minimum(X["Horizontal_Distance_To_Roadways"],
                                      X["Horizontal_Distance_To_Fire_Points"])
    X["Elev_minus_VertHydro"] = X["Elevation"] - X["Vertical_Distance_To_Hydrology"]
    X["Road_Fire"]  = X["Horizontal_Distance_To_Roadways"] + X["Horizontal_Distance_To_Fire_Points"]
    X["Hydro_Road"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Roadways"]
    X["Hydro_Fire"] = X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Fire_Points"]
    hs = ["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]
    X["Hillshade_mean"]  = X[hs].mean(axis=1)
    X["Hillshade_range"] = X[hs].max(axis=1) - X[hs].min(axis=1)
    # petits bonus souvent payants
    X["Abs_VertHydro"] = X["Vertical_Distance_To_Hydrology"].abs()
    X["Slope_times_HydroDist"] = X["Slope"] * X["Hydro_Dist"]
    X["Elev_over_Slope"] = X["Elevation"] / (X["Slope"] + 1e-6)
    for c in ["Elevation","Slope","Hydro_Dist"]:
        X[f"{c}_sq"] = X[c]*X[c]
    return X.replace([np.inf, -np.inf], np.nan).fillna(0)

# ---------- 2) Data ----------
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int).values
X = build_features(train)
X_test = build_features(test)
assert list(X.columns) == list(X_test.columns), "Colonnes train/test différentes"

classes_ = np.sort(np.unique(y))
n_classes = len(classes_)

# ---------- 3) 5-fold CV + early stopping + moyenne ----------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros((len(X), n_classes))
test_pred = np.zeros((len(X_test), n_classes))
best_iters = []

params = dict(
    n_estimators=8000, learning_rate=0.02, num_leaves=256,
    min_child_samples=20, subsample=0.9, colsample_bytree=0.9,
    reg_alpha=1.0, reg_lambda=2.0,
    objective="multiclass", random_state=42, n_jobs=-1, verbosity=-1
)

for fold, (tr, va) in enumerate(skf.split(X, y), 1):
    model = LGBMClassifier(**{**params, "random_state": 42+fold})
    model.fit(
        X.iloc[tr], y[tr],
        eval_set=[(X.iloc[va], y[va])],
        eval_metric="multi_logloss",
        callbacks=[early_stopping(300)]
    )
    oof[va] = model.predict_proba(X.iloc[va])
    test_pred += model.predict_proba(X_test) / skf.n_splits
    best_iters.append(getattr(model, "best_iteration_", params["n_estimators"]))

# CV accuracy (plus fiable qu’un seul split)
oof_labels = classes_[oof.argmax(axis=1)]
cv_acc = accuracy_score(y, oof_labels)
print(f"CV OOF Accuracy: {cv_acc:.6f} | mean best_iter: {np.mean(best_iters):.0f}")

# ---------- 4) Réentraînement sur 100% du train au bon nb d’itérations ----------
final_n_estimators = int(np.mean(best_iters))
final = LGBMClassifier(**{**params, "n_estimators": final_n_estimators, "random_state": 42})
final.fit(X, y)

# ---------- 5) Submission ----------
test_labels = classes_[test_pred.argmax(axis=1)]  # moyenne des folds
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": test_labels})
submission.to_csv("submission.csv", index=False)
print("submission.csv saved:", submission.shape)


Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[612]	valid_0's multi_logloss: 0.323917
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[559]	valid_0's multi_logloss: 0.319499
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[675]	valid_0's multi_logloss: 0.315029
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[574]	valid_0's multi_logloss: 0.311972
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[481]	valid_0's multi_logloss: 0.376001
CV OOF Accuracy: 0.880291 | mean best_iter: 580
submission.csv saved: (581012, 2)


In [23]:
import numpy as np
import pandas as pd

SEED = 42

train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X = train.drop(columns=["Cover_Type", "Id"])
X_test = test.drop(columns=["Id"])

assert list(X.columns) == list(X_test.columns)

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Aspect wraps around (0≈360): trig works much better than raw degrees
    asp = np.deg2rad(out["Aspect"].astype(np.float32))
    out["Aspect_sin"] = np.sin(asp)
    out["Aspect_cos"] = np.cos(asp)

    # Hydro geometry
    h_hyd = out["Horizontal_Distance_To_Hydrology"].astype(np.float32)
    v_hyd = out["Vertical_Distance_To_Hydrology"].astype(np.float32)
    out["Hydro_Euclid"] = np.sqrt(h_hyd**2 + v_hyd**2)

    # Pairwise distance combos (very predictive)
    h_road = out["Horizontal_Distance_To_Roadways"].astype(np.float32)
    h_fire = out["Horizontal_Distance_To_Fire_Points"].astype(np.float32)
    out["Road_Fire_Diff"]  = (h_road - h_fire).abs()
    out["Road_Hydro_Diff"] = (h_road - h_hyd).abs()
    out["Fire_Hydro_Diff"] = (h_fire - h_hyd).abs()
    out["Road_Fire_Sum"]   = h_road + h_fire

    # Elevation interactions with hydrology
    elev = out["Elevation"].astype(np.float32)
    out["Elev_minus_VertHydro"] = elev - v_hyd
    out["Elev_plus_VertHydro"]  = elev + v_hyd
    out["VertHydro_over_Elev"]  = v_hyd / (elev + 1e-3)

    # Hillshade summaries/differences
    hs9  = out["Hillshade_9am"].astype(np.float32)
    hsn  = out["Hillshade_Noon"].astype(np.float32)
    hs3  = out["Hillshade_3pm"].astype(np.float32)
    out["Hillshade_Mean"]  = (hs9 + hsn + hs3) / 3.0
    out["Hillshade_Range"] = pd.concat([hs9, hsn, hs3], axis=1).max(axis=1) - \
                             pd.concat([hs9, hsn, hs3], axis=1).min(axis=1)
    out["Hillshade_Noon_minus_9am"] = hsn - hs9
    out["Hillshade_3pm_minus_Noon"] = hs3 - hsn

    # Simple slope flags
    out["Is_Flat"]  = (out["Slope"] == 0).astype(np.int8)
    out["Is_Steep"] = (out["Slope"] >= 25).astype(np.int8)

    # Collapse one-hot Wilderness/Soil into single integer categories
    soil_cols = [c for c in out.columns if c.startswith("Soil_Type")]
    wild_cols = [c for c in out.columns if c.startswith("Wilderness_Area")]

    out["Soil"] = (out[soil_cols].values.argmax(axis=1) + 1).astype(np.int16)
    out["Wilderness"] = (out[wild_cols].values.argmax(axis=1) + 1).astype(np.int8)

    # Drop original one-hots (we’ll treat Soil/Wilderness as categorical)
    out = out.drop(columns=soil_cols + wild_cols)

    return out

X_fe      = make_features(X)
X_test_fe = make_features(X_test)
X_fe.shape, X_test_fe.shape


((15120, 28), (581012, 28))

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
import numpy as np

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

lgb_params = dict(
    objective="multiclass",
    learning_rate=0.03,
    n_estimators=10000,           # big cap; early stopping will pick ~200
    num_leaves=255,               # a bit larger than 128 usually helps here
    max_depth=-1,
    min_child_samples=60,         # regularization
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.6,
    extra_trees=True,             # stabilizes and helps generalization
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

oof_proba  = np.zeros((len(X_fe), 7), dtype=np.float32)
test_proba = np.zeros((len(X_test_fe), 7), dtype=np.float32)
fold_scores = []
cat_feats = ["Soil", "Wilderness"]  # categorical columns in X_fe

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_fe, y), 1):
    X_tr, X_val = X_fe.iloc[trn_idx], X_fe.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="multi_logloss",
        categorical_feature=cat_feats,
        early_stopping_rounds=200,
        verbose=200
    )

    val_proba = model.predict_proba(X_val, num_iteration=model.best_iteration_)
    oof_proba[val_idx] = val_proba
    val_pred = model.classes_[val_proba.argmax(axis=1)]
    acc = accuracy_score(y_val, val_pred)
    fold_scores.append(acc)
    print(f"Fold {fold} | acc={acc:.5f} | best_iter={model.best_iteration_}")

    test_proba += model.predict_proba(X_test_fe, num_iteration=model.best_iteration_) / N_FOLDS

oof_pred = model.classes_[oof_proba.argmax(axis=1)]
cv_acc = accuracy_score(y, oof_pred)
print(f"\nCV mean accuracy: {np.mean(fold_scores):.5f} ± {np.std(fold_scores):.5f}")
print(f"OOF accuracy    : {cv_acc:.5f}")


TypeError: LGBMClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# ======================
# 1) Chargement des données
# ======================
SEED = 42
np.random.seed(SEED)

train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X = train.drop(columns=["Cover_Type", "Id"])
X_test = test.drop(columns=["Id"])

assert list(X.columns) == list(X_test.columns)

# ======================
# 2) Feature engineering
# ======================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Aspect trigonométrique
    asp = np.deg2rad(out["Aspect"].astype(np.float32))
    out["Aspect_sin"] = np.sin(asp)
    out["Aspect_cos"] = np.cos(asp)

    # Distances hydrologie
    h_hyd = out["Horizontal_Distance_To_Hydrology"].astype(np.float32)
    v_hyd = out["Vertical_Distance_To_Hydrology"].astype(np.float32)
    out["Hydro_Euclid"] = np.sqrt(h_hyd**2 + v_hyd**2)

    # Interactions distances
    h_road = out["Horizontal_Distance_To_Roadways"].astype(np.float32)
    h_fire = out["Horizontal_Distance_To_Fire_Points"].astype(np.float32)
    out["Road_Fire_Diff"]  = (h_road - h_fire).abs()
    out["Road_Hydro_Diff"] = (h_road - h_hyd).abs()
    out["Fire_Hydro_Diff"] = (h_fire - h_hyd).abs()
    out["Road_Fire_Sum"]   = h_road + h_fire

    # Liens avec l’élévation
    elev = out["Elevation"].astype(np.float32)
    out["Elev_minus_VertHydro"] = elev - v_hyd
    out["Elev_plus_VertHydro"]  = elev + v_hyd
    out["VertHydro_over_Elev"]  = v_hyd / (elev + 1e-3)

    # Hillshade
    hs9, hsn, hs3 = out["Hillshade_9am"], out["Hillshade_Noon"], out["Hillshade_3pm"]
    out["Hillshade_Mean"]  = (hs9 + hsn + hs3) / 3.0
    out["Hillshade_Range"] = pd.concat([hs9, hsn, hs3], axis=1).max(axis=1) - \
                             pd.concat([hs9, hsn, hs3], axis=1).min(axis=1)
    out["Hillshade_Noon_minus_9am"] = hsn - hs9
    out["Hillshade_3pm_minus_Noon"] = hs3 - hsn

    # Flags pente
    out["Is_Flat"]  = (out["Slope"] == 0).astype(np.int8)
    out["Is_Steep"] = (out["Slope"] >= 25).astype(np.int8)

    # Catégories soil & wilderness
    soil_cols = [c for c in out.columns if c.startswith("Soil_Type")]
    wild_cols = [c for c in out.columns if c.startswith("Wilderness_Area")]

    out["Soil"] = (out[soil_cols].values.argmax(axis=1) + 1).astype(np.int16)
    out["Wilderness"] = (out[wild_cols].values.argmax(axis=1) + 1).astype(np.int8)

    # Supprimer one-hot
    out = out.drop(columns=soil_cols + wild_cols)
    return out

X_fe      = make_features(X)
X_test_fe = make_features(X_test)

# ======================
# 3) Modèle LightGBM avec CV
# ======================
params = dict(
    objective="multiclass",
    learning_rate=0.03,
    n_estimators=10000,
    num_leaves=255,
    max_depth=-1,
    min_child_samples=60,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.6,
    extra_trees=True,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_proba  = np.zeros((len(X_fe), 7), dtype=np.float32)
test_proba = np.zeros((len(X_test_fe), 7), dtype=np.float32)
scores = []

for fold, (trn_idx, val_idx) in enumerate(skf.split(X_fe, y), 1):
    X_tr, X_val = X_fe.iloc[trn_idx], X_fe.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    model = LGBMClassifier(**params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="multi_logloss",
        categorical_feature=["Soil","Wilderness"],
        callbacks=[early_stopping(200, verbose=True)]
    )

    # Validation
    val_proba = model.predict_proba(X_val, num_iteration=model.best_iteration_)
    oof_proba[val_idx] = val_proba
    val_pred = model.classes_[val_proba.argmax(axis=1)]
    acc = accuracy_score(y_val, val_pred)
    scores.append(acc)
    print(f"Fold {fold} | acc={acc:.5f} | best_iter={model.best_iteration_}")

    # Moyenne des prédictions test
    test_proba += model.predict_proba(X_test_fe, num_iteration=model.best_iteration_) / N_FOLDS

# ======================
# 4) Accuracy finale + Submission
# ======================
oof_pred = model.classes_[oof_proba.argmax(axis=1)]
cv_acc = accuracy_score(y, oof_pred)
print("\nMoyenne CV accuracy:", np.mean(scores))
print("OOF accuracy        :", cv_acc)

final_pred = model.classes_[test_proba.argmax(axis=1)]
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": final_pred.astype(int)})
submission.to_csv("submission.csv", index=False)
print("\n✅ submission.csv créé avec shape", submission.shape)
print(submission.head())


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2491]	valid_0's multi_logloss: 0.337494
Fold 1 | acc=0.87269 | best_iter=2491
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3022]	valid_0's multi_logloss: 0.315401
Fold 2 | acc=0.89153 | best_iter=3022
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2024]	valid_0's multi_logloss: 0.331558
Fold 3 | acc=0.87235 | best_iter=2024
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3131]	valid_0's multi_logloss: 0.324175
Fold 4 | acc=0.88029 | best_iter=3131
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2405]	valid_0's multi_logloss: 0.373379
Fold 5 | acc=0.86640 | best_iter=2405

Moyenne CV accuracy: 0.8766534391534391
OOF accuracy        : 0.8766534391534392

✅ submission.csv créé avec shape (581

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping
import zipfile
import os

# ============== Config ==============
SEED = 42
VAL_SIZE = 0.20
EARLY_STOP_ROUNDS = 200

# ============== Helpers ==============
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # --- features utiles (ta version "best") ---
    X["Aspect_sin"] = np.sin(np.deg2rad(X["Aspect"]))
    X["Aspect_cos"] = np.cos(np.deg2rad(X["Aspect"]))

    X["Hydro_Dist"] = np.hypot(
        X["Horizontal_Distance_To_Hydrology"],
        X["Vertical_Distance_To_Hydrology"]
    )

    X["Near_RoadOrFire"] = np.minimum(
        X["Horizontal_Distance_To_Roadways"],
        X["Horizontal_Distance_To_Fire_Points"]
    )

    X["Elev_minus_VertHydro"] = X["Elevation"] - X["Vertical_Distance_To_Hydrology"]

    X["Road_Fire"] = (
        X["Horizontal_Distance_To_Roadways"] + X["Horizontal_Distance_To_Fire_Points"]
    )
    X["Hydro_Road"] = (
        X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Roadways"]
    )
    X["Hydro_Fire"] = (
        X["Horizontal_Distance_To_Hydrology"] + X["Horizontal_Distance_To_Fire_Points"]
    )

    X["Hillshade_mean"] = X[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].mean(axis=1)
    X["Hillshade_range"] = (
        X[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].max(axis=1) -
        X[["Hillshade_9am","Hillshade_Noon","Hillshade_3pm"]].min(axis=1)
    )

    return X

# ============== 1) Load ==============
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X_base = train.drop(columns=["Cover_Type", "Id"]).copy()
X_test_base = test.drop(columns=["Id"]).copy()

# mêmes colonnes brutes
assert list(X_base.columns) == list(X_test_base.columns), "Colonnes train/test différentes !"

# FE identique sur train & test
X = make_features(X_base)
X_test = make_features(X_test_base)

# sécurité : même ordre de colonnes après FE
X = X.reindex(sorted(X.columns), axis=1)
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
assert list(X.columns) == list(X_test.columns)

# ============== 2) Split & First training (with ES) ==============
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, stratify=y, random_state=SEED
)

model = LGBMClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=256,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multiclass",
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(EARLY_STOP_ROUNDS, verbose=True)]
)

# Accuracy de validation
y_pred = model.predict(X_val, num_iteration=model.best_iteration_)
val_acc = accuracy_score(y_val, y_pred)
print(f"\nValidation Accuracy: {val_acc:.5f}")

best_iter = getattr(model, "best_iteration_", None)
if best_iter is None:
    best_iter = model.get_params().get("n_estimators", 4000)
print(f"Best iteration retenue: {best_iter}")

# ============== 3) Retrain on FULL train with best_iter ==============
final_model = LGBMClassifier(
    n_estimators=best_iter,
    learning_rate=0.03,
    num_leaves=256,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multiclass",
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

final_model.fit(X, y)  # entraînement sur 100% des données sans early stopping

# ============== 4) Prédictions test + fichiers ==============
test_pred = final_model.predict(X_test).astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": test_pred})

csv_path = "submission.csv"
zip_path = "submission.csv.zip"

submission.to_csv(csv_path, index=False)
print(f"\n✅ Fichier CSV créé: {csv_path}  | shape={submission.shape}")

# zip
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(csv_path, arcname=os.path.basename(csv_path))
print(f"✅ Archive ZIP créée: {zip_path}")

# aperçu
print(submission.head())


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[150]	valid_0's multi_logloss: 0.329207

Validation Accuracy: 0.88492
Best iteration retenue: 150

✅ Fichier CSV créé: submission.csv  | shape=(581012, 2)
✅ Archive ZIP créée: submission.csv.zip
   Id  Cover_Type
0   1           5
1   2           5
2   3           2
3   4           2
4   5           5


In [9]:
# ============================
# Forest Cover Type — Strong LGBM + ELU + TE + (opt) seed bagging
# ============================
import os, zipfile, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping

# ----------------------------
# Config
# ----------------------------
SEED = 42
VAL_SIZE = 0.20
EARLY_STOP_ROUNDS = 200
USE_BAGGING = True
BAGGING_SEEDS = [42, 123, 2024]  # used only for final test-time averaging

# ----------------------------
# Load
# ----------------------------
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X_base = train.drop(columns=["Cover_Type", "Id"]).copy()
X_test_base = test.drop(columns=["Id"]).copy()

assert list(X_base.columns) == list(X_test_base.columns), "Train/Test columns mismatch!"

# ----------------------------
# Feature Engineering
# ----------------------------
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # Aspect (wrap-around) -> trig
    X["Aspect_sin"] = np.sin(np.deg2rad(X["Aspect"].astype(np.float32)))
    X["Aspect_cos"] = np.cos(np.deg2rad(X["Aspect"].astype(np.float32)))

    # Hydrology geometry
    h_hyd = X["Horizontal_Distance_To_Hydrology"].astype(np.float32)
    v_hyd = X["Vertical_Distance_To_Hydrology"].astype(np.float32)
    X["Hydro_Dist"] = np.hypot(h_hyd, v_hyd)
    X["Abs_VertHydro"] = np.abs(v_hyd)

    # Distances combos
    h_road = X["Horizontal_Distance_To_Roadways"].astype(np.float32)
    h_fire = X["Horizontal_Distance_To_Fire_Points"].astype(np.float32)
    X["Near_RoadOrFire"] = np.minimum(h_road, h_fire)
    X["Road_Fire"] = h_road + h_fire
    X["Hydro_Road"] = h_hyd + h_road
    X["Hydro_Fire"] = h_hyd + h_fire
    X["Road_Fire_Diff"]  = np.abs(h_road - h_fire)
    X["Road_Hydro_Diff"] = np.abs(h_road - h_hyd)
    X["Fire_Hydro_Diff"] = np.abs(h_fire - h_hyd)

    # Elevation interactions
    elev = X["Elevation"].astype(np.float32)
    X["Elev_minus_VertHydro"] = elev - v_hyd
    X["Elev_plus_VertHydro"]  = elev + v_hyd
    X["VertHydro_over_Elev"]  = v_hyd / (elev + 1e-3)

    # Hillshade summaries
    hs9 = X["Hillshade_9am"].astype(np.float32)
    hsn = X["Hillshade_Noon"].astype(np.float32)
    hs3 = X["Hillshade_3pm"].astype(np.float32)
    X["Hillshade_mean"]  = (hs9 + hsn + hs3) / 3.0
    X["Hillshade_range"] = pd.concat([hs9, hsn, hs3], axis=1).max(axis=1) - \
                           pd.concat([hs9, hsn, hs3], axis=1).min(axis=1)
    X["Hillshade_Noon_minus_9am"] = hsn - hs9
    X["Hillshade_3pm_minus_Noon"] = hs3 - hsn

    # Slope flags
    X["Is_Flat"]  = (X["Slope"] == 0).astype(np.int8)
    X["Is_Steep"] = (X["Slope"] >= 25).astype(np.int8)

    # Collapse Soil/Wilderness one-hots -> single categories
    soil_cols = [c for c in X.columns if c.startswith("Soil_Type")]
    wild_cols = [c for c in X.columns if c.startswith("Wilderness_Area")]

    X["Soil"] = (X[soil_cols].values.argmax(axis=1) + 1).astype(np.int16)  # 1..40
    X["Wilderness"] = (X[wild_cols].values.argmax(axis=1) + 1).astype(np.int8)  # 1..4

    # Drop original one-hots
    X.drop(columns=soil_cols + wild_cols, inplace=True)

    # Simple interactions
    X["Elev_x_Soil"]  = elev * X["Soil"].astype(np.float32)
    X["Slope_x_Soil"] = X["Slope"].astype(np.float32) * X["Soil"].astype(np.float32)

    return X

X = make_features(X_base)
X_test = make_features(X_test_base)

# Soil -> ELU (climate & geology)
soil_to_elu = {
 1:2702, 2:2703, 3:2704, 4:2705, 5:2706, 6:2717, 7:3501, 8:3502, 9:4201, 10:4703,
 11:4704, 12:4744, 13:4758, 14:5101, 15:5151, 16:6101, 17:6102, 18:6731, 19:7101, 20:7102,
 21:7103, 22:7201, 23:7202, 24:7700, 25:7701, 26:7702, 27:7709, 28:7710, 29:7745, 30:7746,
 31:7755, 32:7756, 33:7757, 34:7790, 35:8703, 36:8707, 37:8708, 38:8771, 39:8772, 40:8776
}

def add_elu_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["ELU_code"]   = out["Soil"].map(soil_to_elu).astype(int)
    out["ELU_climate"] = (out["ELU_code"] // 1000).astype(np.int8)        # 1..8 (climate)
    out["ELU_geo"]     = ((out["ELU_code"] // 100) % 10).astype(np.int8)   # 1..8 (geology)
    out.drop(columns=["ELU_code"], inplace=True)
    return out

X = add_elu_features(X)
X_test = add_elu_features(X_test)

# Tiny helpers for class 1 vs 2 separation
X["Is_Climate_7_8"] = X["ELU_climate"].isin([7, 8]).astype(np.int8)
X_test["Is_Climate_7_8"] = X_test["ELU_climate"].isin([7, 8]).astype(np.int8)
X["Elev_x_Climate"] = X["Elevation"].astype(np.float32) * X["Is_Climate_7_8"]
X_test["Elev_x_Climate"] = X_test["Elevation"].astype(np.float32) * X_test["Is_Climate_7_8"]

# ----------------------------
# Target Encoding (Multiclass) on Soil & Soil_Wild
# ----------------------------
def _fit_te_maps_multiclass(X, y, col, smoothing=8):
    K = len(np.unique(y))
    priors = y.value_counts(normalize=True).sort_index().values  # class priors 1..K
    df = pd.DataFrame({col: X[col].values, "y": y.values})
    tab = df.groupby(col)["y"].value_counts().unstack(fill_value=0)
    tab = tab.reindex(columns=range(1, K+1), fill_value=0)
    cnts = tab.sum(axis=1).values[:, None]
    probs = (tab.values + smoothing * priors) / (cnts + smoothing)
    mapping = {k: probs[i] for i, k in enumerate(tab.index)}
    return mapping, priors

def cv_target_encode_multiclass_apply_to_test(X, y, X_test, cols, n_splits=5, seed=42, smoothing=8):
    X = X.copy()
    X_test = X_test.copy()
    K = len(np.unique(y))
    priors = y.value_counts(normalize=True).sort_index().values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    for col in cols:
        # OOF encoding for X
        oof = np.zeros((len(X), K), dtype=np.float32)
        for tr, va in skf.split(X, y):
            df_tr = pd.DataFrame({col: X.iloc[tr][col].values, "y": y.iloc[tr].values})
            tab = df_tr.groupby(col)["y"].value_counts().unstack(fill_value=0)
            tab = tab.reindex(columns=range(1, K+1), fill_value=0)
            cnts = tab.sum(axis=1).values[:, None]
            probs = (tab.values + smoothing * priors) / (cnts + smoothing)
            mapping = {k: probs[i] for i, k in enumerate(tab.index)}

            enc = np.tile(priors, (len(va), 1))
            keys = X.iloc[va][col].values
            for j, key in enumerate(keys):
                if key in mapping:
                    enc[j] = mapping[key]
            oof[va] = enc

        for k in range(K):
            X[f"{col}_te_{k+1}"] = oof[:, k]

        # Full-train mapping for X_test
        mapping_full, pri = _fit_te_maps_multiclass(X, y, col, smoothing=smoothing)
        enc_test = np.tile(pri, (len(X_test), 1))
        keys_t = X_test[col].values
        for j, key in enumerate(keys_t):
            if key in mapping_full:
                enc_test[j] = mapping_full[key]
        for k in range(K):
            X_test[f"{col}_te_{k+1}"] = enc_test[:, k]

    return X, X_test

# Build Soil_Wild key (string ONLY for TE step)
X["Soil_Wild"] = X["Soil"].astype(str) + "_" + X["Wilderness"].astype(str)
X_test["Soil_Wild"] = X_test["Soil"].astype(str) + "_" + X_test["Wilderness"].astype(str)

# Apply TE on Soil & Soil_Wild (lighter smoothing=8)
X, X_test = cv_target_encode_multiclass_apply_to_test(
    X, y, X_test,
    cols=["Soil", "Soil_Wild"],
    n_splits=5, seed=SEED, smoothing=8
)

# Drop raw string key
X.drop(columns=["Soil_Wild"], inplace=True)
X_test.drop(columns=["Soil_Wild"], inplace=True)
# Ensure no object columns remain
obj_cols = list(X.select_dtypes(include=["object"]).columns)
assert len(obj_cols) == 0, f"Object dtypes remain: {obj_cols}"

# Keep identical column order
X = X.reindex(sorted(X.columns), axis=1)
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
assert list(X.columns) == list(X_test.columns)

# Categorical columns for LGBM
cat_cols = [c for c in ["Soil", "Wilderness", "ELU_climate", "ELU_geo"] if c in X.columns]

# ----------------------------
# Train/Val split + Early Stopping
# ----------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, stratify=y, random_state=SEED
)

base_params = dict(
    objective="multiclass",
    learning_rate=0.03,
    n_estimators=20000,   # large cap; ES will pick ~2–3k
    num_leaves=256,
    max_depth=-1,
    min_child_samples=80,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.8,
    extra_trees=True,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

model = LGBMClassifier(**base_params)
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    categorical_feature=cat_cols,
    callbacks=[early_stopping(EARLY_STOP_ROUNDS, verbose=True)]
)

# Validation accuracy
y_pred = model.predict(X_val, num_iteration=model.best_iteration_)
val_acc = accuracy_score(y_val, y_pred)
print(f"\nValidation Accuracy: {val_acc:.5f}")

best_iter = int(getattr(model, "best_iteration_", model.get_params().get("n_estimators", 4000)))
print(f"Best iteration selected: {best_iter}")

# ----------------------------
# Final training on 100% + (optional) seed bagging for test
# ----------------------------
def fit_full_and_predict(seed):
    params = dict(base_params)
    params["random_state"] = seed
    params["n_estimators"] = best_iter
    clf = LGBMClassifier(**params)
    clf.fit(X, y, categorical_feature=cat_cols)
    proba = clf.predict_proba(X_test)  # (n,7)
    return proba

if USE_BAGGING:
    proba_sum = np.zeros((len(X_test), 7), dtype=np.float32)
    for s in BAGGING_SEEDS:
        proba_sum += fit_full_and_predict(s)
    proba = proba_sum / float(len(BAGGING_SEEDS))
    test_pred = proba.argmax(axis=1) + 1
else:
    params = dict(base_params)
    params["n_estimators"] = best_iter
    final_model = LGBMClassifier(**params)
    final_model.fit(X, y, categorical_feature=cat_cols)
    test_pred = final_model.predict(X_test)

test_pred = test_pred.astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": test_pred})

# ----------------------------
# Save CSV and ZIP
# ----------------------------
csv_path = "submission.csv"
zip_path = "submission.csv.zip"
submission.to_csv(csv_path, index=False)
print(f"\n✅ Created: {csv_path} | shape={submission.shape}")

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(csv_path, arcname=os.path.basename(csv_path))
print(f"✅ Created: {zip_path}")

print(submission.head())


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2310]	valid_0's multi_logloss: 0.347883

Validation Accuracy: 0.87368
Best iteration selected: 2310

✅ Created: submission.csv | shape=(581012, 2)
✅ Created: submission.csv.zip
   Id  Cover_Type
0   1           5
1   2           5
2   3           2
3   4           2
4   5           5


In [5]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np, pandas as pd

# 1) Matrice de confusion + rapport par classe
cm = confusion_matrix(y_val, y_pred, labels=np.arange(1,8))
print(pd.DataFrame(cm, index=[f"true_{i}" for i in range(1,8)],
                      columns=[f"pred_{i}" for i in range(1,8)]))
print("\nReport:\n", classification_report(y_val, y_pred, digits=3))

# 2) Accuracy par Wilderness et par Soil
def group_acc(dfX, y_true, y_hat, col):
    s = pd.DataFrame({col: dfX[col].values, "ok": (y_true.values==y_hat)}).groupby(col)["ok"].mean()
    return s.sort_values()

print("\nAcc par Wilderness:\n", group_acc(X_val, y_val, y_pred, "Wilderness").round(4))
print("\nAcc par Soil (top 10 difficiles):\n",
      group_acc(X_val, y_val, y_pred, "Soil").sort_values().head(10).round(4))

# 3) Drift train/test
def freq(col, X_train_full, X_test_full, k=10):
    a = X_train_full[col].value_counts(normalize=True)
    b = X_test_full[col].value_counts(normalize=True)
    df = pd.concat([a.rename("train%"), b.rename("test%")], axis=1).fillna(0).sort_values("test%", ascending=False)
    return df.head(k)

print("\nRépartition Wilderness train/test:\n", freq("Wilderness", X, X_test))
print("\nRépartition Soil train/test (top 12):\n", freq("Soil", X, X_test, 12))

        pred_1  pred_2  pred_3  pred_4  pred_5  pred_6  pred_7
true_1     337      63       0       0       5       0      27
true_2      70     311      10       0      25      14       2
true_3       0       0     378      13       4      37       0
true_4       0       0      15     414       0       3       0
true_5       5       9       5       0     411       2       0
true_6       0       0      29       7       3     393       0
true_7      24       1       0       0       0       0     407

Report:
               precision    recall  f1-score   support

           1      0.773     0.780     0.776       432
           2      0.810     0.720     0.762       432
           3      0.865     0.875     0.870       432
           4      0.954     0.958     0.956       432
           5      0.917     0.951     0.934       432
           6      0.875     0.910     0.892       432
           7      0.933     0.942     0.938       432

    accuracy                          0.877      302

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping
import zipfile, os, warnings
warnings.filterwarnings("ignore")

# ===================== Config =====================
SEED = 42
VAL_SIZE = 0.20
EARLY_STOP_ROUNDS = 200

# Small, stable test-time boost (bagging)
USE_BAGGING = True
BAGGING_SEEDS = [42, 123, 2024]   # keep small; averaging probabilities

# ===================== Features (your best set) =====================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # Aspect trig
    X["Aspect_sin"] = np.sin(np.deg2rad(X["Aspect"].astype(np.float32)))
    X["Aspect_cos"] = np.cos(np.deg2rad(X["Aspect"].astype(np.float32)))

    # Hydrology geometry
    h_hyd = X["Horizontal_Distance_To_Hydrology"].astype(np.float32)
    v_hyd = X["Vertical_Distance_To_Hydrology"].astype(np.float32)
    X["Hydro_Dist"] = np.hypot(h_hyd, v_hyd)
    X["Abs_VertHydro"] = np.abs(v_hyd)

    # Road / Fire combos
    h_road = X["Horizontal_Distance_To_Roadways"].astype(np.float32)
    h_fire = X["Horizontal_Distance_To_Fire_Points"].astype(np.float32)
    X["Near_RoadOrFire"] = np.minimum(h_road, h_fire)
    X["Road_Fire"] = (h_road + h_fire)
    X["Hydro_Road"] = (h_hyd + h_road)
    X["Hydro_Fire"] = (h_hyd + h_fire)

    # Useful diffs
    X["Road_Fire_Diff"]  = np.abs(h_road - h_fire)
    X["Road_Hydro_Diff"] = np.abs(h_road - h_hyd)
    X["Fire_Hydro_Diff"] = np.abs(h_fire - h_hyd)

    # Elevation interactions
    elev = X["Elevation"].astype(np.float32)
    X["Elev_minus_VertHydro"] = elev - v_hyd
    X["Elev_plus_VertHydro"]  = elev + v_hyd
    X["VertHydro_over_Elev"]  = v_hyd / (elev + 1e-3)

    # Hillshade summaries + deltas
    hs9  = X["Hillshade_9am"].astype(np.float32)
    hsn  = X["Hillshade_Noon"].astype(np.float32)
    hs3  = X["Hillshade_3pm"].astype(np.float32)
    X["Hillshade_mean"]  = (hs9 + hsn + hs3) / 3.0
    X["Hillshade_range"] = pd.concat([hs9, hsn, hs3], axis=1).max(axis=1) - \
                           pd.concat([hs9, hsn, hs3], axis=1).min(axis=1)
    X["Hillshade_Noon_minus_9am"] = hsn - hs9
    X["Hillshade_3pm_minus_Noon"] = hs3 - hsn

    # Slope flags
    X["Is_Flat"]  = (X["Slope"] == 0).astype(np.int8)
    X["Is_Steep"] = (X["Slope"] >= 25).astype(np.int8)

    # Collapse Soil/Wilderness one-hots
    soil_cols = [c for c in X.columns if c.startswith("Soil_Type")]
    wild_cols = [c for c in X.columns if c.startswith("Wilderness_Area")]
    X["Soil"] = (X[soil_cols].values.argmax(axis=1) + 1).astype(np.int16)  # 1..40
    X["Wilderness"] = (X[wild_cols].values.argmax(axis=1) + 1).astype(np.int8)  # 1..4
    X.drop(columns=soil_cols + wild_cols, inplace=True)

    # Light interactions
    X["Elev_x_Soil"]  = elev * X["Soil"].astype(np.float32)
    X["Slope_x_Soil"] = X["Slope"].astype(np.float32) * X["Soil"].astype(np.float32)
    return X

# ===================== Load =====================
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X_base = train.drop(columns=["Cover_Type", "Id"]).copy()
X_test_base = test.drop(columns=["Id"]).copy()
assert list(X_base.columns) == list(X_test_base.columns), "Colonnes train/test différentes !"

# FE identique
X = make_features(X_base)
X_test = make_features(X_test_base)

# sécurité : même ordre de colonnes après FE
X = X.reindex(sorted(X.columns), axis=1)
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
assert list(X.columns) == list(X_test.columns)

# colonnes catégorielles (noms) pour LGBM
cat_cols = [c for c in ["Soil", "Wilderness"] if c in X.columns]

# ===================== Split & First training (get best_iter) =====================
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, stratify=y, random_state=SEED
)

base_params = dict(
    objective="multiclass",
    learning_rate=0.03,
    n_estimators=20000,          # early stopping chooses
    num_leaves=256,
    max_depth=-1,
    min_child_samples=80,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.8,
    extra_trees=True,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

model = LGBMClassifier(**base_params)
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    categorical_feature=cat_cols,   # LGBM >= 4 accepts names
    callbacks=[early_stopping(EARLY_STOP_ROUNDS, verbose=True)]
)

# Accuracy de validation
y_pred = model.predict(X_val, num_iteration=model.best_iteration_)
val_acc = accuracy_score(y_val, y_pred)
print(f"\nValidation Accuracy: {val_acc:.5f}")

best_iter = int(getattr(model, "best_iteration_", model.get_params().get("n_estimators", 4000)))
print(f"Best iteration retenue: {best_iter}")

# ===================== Final training on 100% + (opt) seed bagging =====================
def fit_full_and_proba(random_state):
    params = dict(base_params)
    params["n_estimators"] = best_iter
    params["random_state"] = random_state
    clf = LGBMClassifier(**params)
    clf.fit(X, y, categorical_feature=cat_cols)
    return clf.predict_proba(X_test)  # (n_test, 7)

if USE_BAGGING:
    proba = np.zeros((len(X_test), 7), dtype=np.float32)
    for rs in BAGGING_SEEDS:
        proba += fit_full_and_proba(rs)
    proba /= float(len(BAGGING_SEEDS))
    test_pred = proba.argmax(axis=1) + 1
else:
    params = dict(base_params)
    params["n_estimators"] = best_iter
    final_model = LGBMClassifier(**params)
    final_model.fit(X, y, categorical_feature=cat_cols)
    test_pred = final_model.predict(X_test)

test_pred = test_pred.astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": test_pred})

# ===================== Save CSV & ZIP =====================
csv_path = "submission.csv"
zip_path = "submission.csv.zip"

submission.to_csv(csv_path, index=False)
print(f"\n✅ Fichier CSV créé: {csv_path} | shape={submission.shape}")

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(csv_path, arcname=os.path.basename(csv_path))
print(f"✅ Archive ZIP créée: {zip_path}")

print(submission.head())


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2636]	valid_0's multi_logloss: 0.342656

Validation Accuracy: 0.87665
Best iteration retenue: 2636


KeyboardInterrupt: 

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping
import zipfile, os, warnings
warnings.filterwarnings("ignore")

# ===================== Config =====================
SEED = 42
VAL_SIZE = 0.20
EARLY_STOP_ROUNDS = 200

# Toggle simple seed bagging or a slightly stronger variant ensemble
USE_VARIANT_ENSEMBLE = True   # set False to use simple bagging with same params
BAGGING_SEEDS = [42, 123, 2024]

# ===================== Features (your best set) =====================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # Aspect trig
    X["Aspect_sin"] = np.sin(np.deg2rad(X["Aspect"].astype(np.float32)))
    X["Aspect_cos"] = np.cos(np.deg2rad(X["Aspect"].astype(np.float32)))

    # Hydrology geometry
    h_hyd = X["Horizontal_Distance_To_Hydrology"].astype(np.float32)
    v_hyd = X["Vertical_Distance_To_Hydrology"].astype(np.float32)
    X["Hydro_Dist"] = np.hypot(h_hyd, v_hyd)
    X["Abs_VertHydro"] = np.abs(v_hyd)

    # Road / Fire combos
    h_road = X["Horizontal_Distance_To_Roadways"].astype(np.float32)
    h_fire = X["Horizontal_Distance_To_Fire_Points"].astype(np.float32)
    X["Near_RoadOrFire"] = np.minimum(h_road, h_fire)
    X["Road_Fire"] = (h_road + h_fire)
    X["Hydro_Road"] = (h_hyd + h_road)
    X["Hydro_Fire"] = (h_hyd + h_fire)

    # Useful diffs
    X["Road_Fire_Diff"]  = np.abs(h_road - h_fire)
    X["Road_Hydro_Diff"] = np.abs(h_road - h_hyd)
    X["Fire_Hydro_Diff"] = np.abs(h_fire - h_hyd)

    # Elevation interactions
    elev = X["Elevation"].astype(np.float32)
    X["Elev_minus_VertHydro"] = elev - v_hyd
    X["Elev_plus_VertHydro"]  = elev + v_hyd
    X["VertHydro_over_Elev"]  = v_hyd / (elev + 1e-3)

    # Hillshade summaries + deltas
    hs9  = X["Hillshade_9am"].astype(np.float32)
    hsn  = X["Hillshade_Noon"].astype(np.float32)
    hs3  = X["Hillshade_3pm"].astype(np.float32)
    X["Hillshade_mean"]  = (hs9 + hsn + hs3) / 3.0
    X["Hillshade_range"] = pd.concat([hs9, hsn, hs3], axis=1).max(axis=1) - \
                           pd.concat([hs9, hsn, hs3], axis=1).min(axis=1)
    X["Hillshade_Noon_minus_9am"] = hsn - hs9
    X["Hillshade_3pm_minus_Noon"] = hs3 - hsn

    # Slope flags
    X["Is_Flat"]  = (X["Slope"] == 0).astype(np.int8)
    X["Is_Steep"] = (X["Slope"] >= 25).astype(np.int8)

    # Collapse Soil/Wilderness one-hots
    soil_cols = [c for c in X.columns if c.startswith("Soil_Type")]
    wild_cols = [c for c in X.columns if c.startswith("Wilderness_Area")]
    X["Soil"] = (X[soil_cols].values.argmax(axis=1) + 1).astype(np.int16)  # 1..40
    X["Wilderness"] = (X[wild_cols].values.argmax(axis=1) + 1).astype(np.int8)  # 1..4
    X.drop(columns=soil_cols + wild_cols, inplace=True)

    # Light interactions
    X["Elev_x_Soil"]  = elev * X["Soil"].astype(np.float32)
    X["Slope_x_Soil"] = X["Slope"].astype(np.float32) * X["Soil"].astype(np.float32)
    return X

# ===================== Load =====================
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X_base = train.drop(columns=["Cover_Type", "Id"]).copy()
X_test_base = test.drop(columns=["Id"]).copy()
assert list(X_base.columns) == list(X_test_base.columns), "Colonnes train/test différentes !"

# FE identique
X = make_features(X_base)
X_test = make_features(X_test_base)

# sécurité : même ordre de colonnes après FE
X = X.reindex(sorted(X.columns), axis=1)
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
assert list(X.columns) == list(X_test.columns)

# colonnes catégorielles (noms) pour LGBM
cat_cols = [c for c in ["Soil", "Wilderness"] if c in X.columns]

# ===================== Split & First training (get best_iter) =====================
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, stratify=y, random_state=SEED
)

base_params = dict(
    objective="multiclass",
    learning_rate=0.03,
    n_estimators=20000,          # early stopping chooses
    num_leaves=256,
    max_depth=-1,
    min_child_samples=80,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.8,
    extra_trees=True,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

model = LGBMClassifier(**base_params)
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    categorical_feature=cat_cols,   # LGBM >= 4 accepts names
    callbacks=[early_stopping(EARLY_STOP_ROUNDS, verbose=True)]
)

# Accuracy de validation
y_pred = model.predict(X_val, num_iteration=model.best_iteration_)
val_acc = accuracy_score(y_val, y_pred)
print(f"\nValidation Accuracy: {val_acc:.5f}")

best_iter = int(getattr(model, "best_iteration_", model.get_params().get("n_estimators", 4000)))
print(f"Best iteration retenue: {best_iter}")

# ===================== Final training on 100% with a small robust ensemble =====================
def fit_variant_and_proba(rs, leaves, mcs, subs, cols, lam):
    clf = LGBMClassifier(
        objective="multiclass",
        learning_rate=0.03,
        n_estimators=best_iter,     # reuse best_iter from ES
        num_leaves=leaves,
        max_depth=-1,
        min_child_samples=mcs,
        subsample=subs,
        subsample_freq=1,
        colsample_bytree=cols,
        reg_alpha=0.2,
        reg_lambda=lam,
        extra_trees=True,
        random_state=rs,
        n_jobs=-1,
        verbosity=-1
    )
    clf.fit(X, y, categorical_feature=cat_cols)
    return clf.predict_proba(X_test)  # (n_test, 7)

if USE_VARIANT_ENSEMBLE:
    ENSEMBLE_SPECS = [
        # (random_state, num_leaves, min_child_samples, subsample, colsample_bytree, reg_lambda)
        (42,   256, 80,  0.80, 0.80, 0.8),
        (123,  224, 100, 0.75, 0.75, 1.0),
        (2024, 288, 80,  0.80, 0.75, 0.8),
        (7,    256, 100, 0.75, 0.80, 1.0),
        (314,  224, 120, 0.75, 0.75, 1.2),
    ]
    proba_sum = np.zeros((len(X_test), 7), dtype=np.float32)
    for (rs, leaves, mcs, subs, cols, lam) in ENSEMBLE_SPECS:
        proba_sum += fit_variant_and_proba(rs, leaves, mcs, subs, cols, lam)
    proba = proba_sum / float(len(ENSEMBLE_SPECS))
    test_pred = proba.argmax(axis=1) + 1
else:
    # simple seed bagging with identical params
    proba_sum = np.zeros((len(X_test), 7), dtype=np.float32)
    for rs in BAGGING_SEEDS:
        proba_sum += fit_variant_and_proba(rs, 256, 80, 0.80, 0.80, 0.8)
    proba = proba_sum / float(len(BAGGING_SEEDS))
    test_pred = proba.argmax(axis=1) + 1

test_pred = test_pred.astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": test_pred})

# ===================== Save CSV & ZIP =====================
csv_path = "submission.csv"
zip_path = "submission.csv.zip"

submission.to_csv(csv_path, index=False)
print(f"\n✅ Fichier CSV créé: {csv_path} | shape={submission.shape}")

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(csv_path, arcname=os.path.basename(csv_path))
print(f"✅ Archive ZIP créée: {zip_path}")

print(submission.head())


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2636]	valid_0's multi_logloss: 0.342656

Validation Accuracy: 0.87665
Best iteration retenue: 2636


KeyboardInterrupt: 

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, early_stopping
import zipfile, os, warnings
warnings.filterwarnings("ignore")

# ===================== Config =====================
SEED = 42
N_FOLDS = 5
EARLY_STOP_ROUNDS = 200

# ===================== Features (your best set, unchanged) =====================
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    X = df.copy()

    # Aspect trig
    X["Aspect_sin"] = np.sin(np.deg2rad(X["Aspect"].astype(np.float32)))
    X["Aspect_cos"] = np.cos(np.deg2rad(X["Aspect"].astype(np.float32)))

    # Hydrology geometry
    h_hyd = X["Horizontal_Distance_To_Hydrology"].astype(np.float32)
    v_hyd = X["Vertical_Distance_To_Hydrology"].astype(np.float32)
    X["Hydro_Dist"] = np.hypot(h_hyd, v_hyd)
    X["Abs_VertHydro"] = np.abs(v_hyd)

    # Road / Fire combos
    h_road = X["Horizontal_Distance_To_Roadways"].astype(np.float32)
    h_fire = X["Horizontal_Distance_To_Fire_Points"].astype(np.float32)
    X["Near_RoadOrFire"] = np.minimum(h_road, h_fire)
    X["Road_Fire"] = (h_road + h_fire)
    X["Hydro_Road"] = (h_hyd + h_road)
    X["Hydro_Fire"] = (h_hyd + h_fire)

    # Useful diffs
    X["Road_Fire_Diff"]  = np.abs(h_road - h_fire)
    X["Road_Hydro_Diff"] = np.abs(h_road - h_hyd)
    X["Fire_Hydro_Diff"] = np.abs(h_fire - h_hyd)

    # Elevation interactions
    elev = X["Elevation"].astype(np.float32)
    X["Elev_minus_VertHydro"] = elev - v_hyd
    X["Elev_plus_VertHydro"]  = elev + v_hyd
    X["VertHydro_over_Elev"]  = v_hyd / (elev + 1e-3)

    # Hillshade summaries + deltas
    hs9  = X["Hillshade_9am"].astype(np.float32)
    hsn  = X["Hillshade_Noon"].astype(np.float32)
    hs3  = X["Hillshade_3pm"].astype(np.float32)
    X["Hillshade_mean"]  = (hs9 + hsn + hs3) / 3.0
    X["Hillshade_range"] = pd.concat([hs9, hsn, hs3], axis=1).max(axis=1) - \
                           pd.concat([hs9, hsn, hs3], axis=1).min(axis=1)
    X["Hillshade_Noon_minus_9am"] = hsn - hs9
    X["Hillshade_3pm_minus_Noon"] = hs3 - hsn

    # Slope flags
    X["Is_Flat"]  = (X["Slope"] == 0).astype(np.int8)
    X["Is_Steep"] = (X["Slope"] >= 25).astype(np.int8)

    # Collapse Soil/Wilderness one-hots
    soil_cols = [c for c in X.columns if c.startswith("Soil_Type")]
    wild_cols = [c for c in X.columns if c.startswith("Wilderness_Area")]
    X["Soil"] = (X[soil_cols].values.argmax(axis=1) + 1).astype(np.int16)  # 1..40
    X["Wilderness"] = (X[wild_cols].values.argmax(axis=1) + 1).astype(np.int8)  # 1..4
    X.drop(columns=soil_cols + wild_cols, inplace=True)

    # Light interactions
    X["Elev_x_Soil"]  = elev * X["Soil"].astype(np.float32)
    X["Slope_x_Soil"] = X["Slope"].astype(np.float32) * X["Soil"].astype(np.float32)
    return X

# ===================== Load =====================
train = pd.read_csv("train.csv")
test  = pd.read_csv("test-full.csv")

y = train["Cover_Type"].astype(int)
X_base = train.drop(columns=["Cover_Type", "Id"]).copy()
X_test_base = test.drop(columns=["Id"]).copy()
assert list(X_base.columns) == list(X_test_base.columns), "Colonnes train/test différentes !"

# FE identique
X = make_features(X_base)
X_test = make_features(X_test_base)

# sécurité : même ordre de colonnes après FE
X = X.reindex(sorted(X.columns), axis=1)
X_test = X_test.reindex(sorted(X_test.columns), axis=1)
assert list(X.columns) == list(X_test.columns)

# Categoricals for LGBM
cat_cols = [c for c in ["Soil", "Wilderness"] if c in X.columns]

# ===================== 5-fold CV with early stopping (same params) =====================
base_params = dict(
    objective="multiclass",
    learning_rate=0.03,
    n_estimators=20000,          # ES will pick ~2–3k
    num_leaves=256,
    max_depth=-1,
    min_child_samples=80,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.8,
    extra_trees=True,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1
)

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_proba  = np.zeros((len(X), 7), dtype=np.float32)
test_proba = np.zeros((len(X_test), 7), dtype=np.float32)
fold_accs, best_iters = [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = LGBMClassifier(**base_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="multi_logloss",
        categorical_feature=cat_cols,
        callbacks=[early_stopping(EARLY_STOP_ROUNDS, verbose=True)]
    )

    va_proba = model.predict_proba(X_va, num_iteration=model.best_iteration_)
    oof_proba[va_idx] = va_proba
    va_pred = model.classes_[va_proba.argmax(axis=1)]
    acc = accuracy_score(y_va, va_pred)
    fold_accs.append(acc)
    best_iters.append(model.best_iteration_)
    print(f"Fold {fold}: acc={acc:.5f} | best_iter={model.best_iteration_}")

    test_proba += model.predict_proba(X_test, num_iteration=model.best_iteration_) / N_FOLDS

# OOF accuracy (proxy of train-time performance)
oof_pred = model.classes_[oof_proba.argmax(axis=1)]
oof_acc = accuracy_score(y, oof_pred)
print(f"\nCV mean acc: {np.mean(fold_accs):.5f} ± {np.std(fold_accs):.5f}")
print(f"OOF acc    : {oof_acc:.5f}")
print(f"Best iters : {best_iters} | mean={int(np.mean(best_iters))}")

# ===================== Build submission from averaged folds =====================
final_pred = model.classes_[test_proba.argmax(axis=1)].astype(int)
submission = pd.DataFrame({"Id": test["Id"], "Cover_Type": final_pred})

csv_path = "submission.csv"
zip_path = "submission.csv.zip"
submission.to_csv(csv_path, index=False)
print(f"\n✅ Fichier CSV créé: {csv_path} | shape={submission.shape}")

with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(csv_path, arcname=os.path.basename(csv_path))
print(f"✅ Archive ZIP créée: {zip_path}")

print(submission.head())


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2648]	valid_0's multi_logloss: 0.332692
Fold 1: acc=0.87698 | best_iter=2648
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2918]	valid_0's multi_logloss: 0.314935
Fold 2: acc=0.88988 | best_iter=2918
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2457]	valid_0's multi_logloss: 0.326914
Fold 3: acc=0.87500 | best_iter=2457
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2918]	valid_0's multi_logloss: 0.320054
Fold 4: acc=0.88128 | best_iter=2918
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2388]	valid_0's multi_logloss: 0.371053
Fold 5: acc=0.86872 | best_iter=2388

CV mean acc: 0.87837 ± 0.00703
OOF acc    : 0.87837
Best iters : [2648, 2918, 2457, 2918, 2388] | mean=2665

✅ Fichier CSV c