In [None]:
import pandas as pd

# Load the training set from the notebook's working directory.
train = pd.read_csv("train.csv")

# Report the frame's dimensions, then preview the first rows as the cell's
# last expression so the notebook renders a rich table instead of the
# truncated plain-text dump that print() produces for a DataFrame.
print(train.shape)
train.head()

In [None]:
# Number of rows per Cover_Type value.
cover_type_counts = train["Cover_Type"].value_counts()
cover_type_counts

In [None]:
# Summary statistics of the training frame, as produced by DataFrame.describe().
train_summary = train.describe()
train_summary

In [None]:
import matplotlib.pyplot as plt

# One semi-transparent Elevation histogram per Cover_Type group, with a legend.
elevation_by_cover = train.groupby("Cover_Type")["Elevation"]
elevation_by_cover.plot.hist(alpha=0.5, legend=True)

plt.xlabel("Elevation")
plt.show()


In [None]:
import plotly.express as px

# Elevation histogram coloured by Cover_Type, with bars drawn in "group" mode.
elevation_hist = px.histogram(
    train,
    x="Elevation",
    color="Cover_Type",
    barmode="group",
)
elevation_hist.show()

In [None]:
import plotly.express as px

# Columns that should not get their own histogram: the target (it is already
# the colouring variable) and the row identifier.
NON_FEATURE_COLS = {"Cover_Type", "Id"}

# `columns` is an attribute, not a method, so it is used without parentheses.
for col in train.columns:
    if col in NON_FEATURE_COLS:
        continue
    # One grouped histogram per remaining column, coloured by Cover_Type.
    fig = px.histogram(train, x=col, color="Cover_Type", barmode="group")
    fig.show()


In [None]:
# Gather every column whose name contains "Soil_Type".
soil_cols = []
for column_name in train.columns:
    if "Soil_Type" in column_name:
        soil_cols.append(column_name)

# Per-column sums of those columns, largest first.
soil_totals = train[soil_cols].sum()
soil_totals.sort_values(ascending=False)


In [1]:
import pandas as pd

# Read the training CSV into `train`.
train = pd.read_csv(filepath_or_buffer="train.csv")


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Target is Cover_Type; features are every other column except Id.
y = train["Cover_Type"]
X = train.drop(columns=["Cover_Type", "Id"])

# Stratified hold-out split: 20% of the rows go to validation, fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Random forest hyper-parameters, kept together so they are easy to tweak.
rf_params = dict(
    n_estimators=1200,
    max_depth=30,
    min_samples_leaf=2,
    max_features="sqrt",
    class_weight=None,  # alternative: "balanced"
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

# Accuracy on the held-out validation rows.
y_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print("Accuracy:", val_accuracy)


Accuracy: 0.8531746031746031


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Pair each importance value of the fitted model with its feature column name.
feat_names, importances = X.columns, model.feature_importances_

# Rank the features from highest to lowest importance.
feat_imp = (
    pd.Series(importances, index=feat_names)
    .sort_values(ascending=False)
)

# Horizontal bar chart of the 15 highest-ranked features.
top_features = feat_imp.head(15)
top_features.plot(kind="barh", figsize=(8, 6))
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with light engineered features appended.

    The features are the ones the original cell described as coming from
    common Kaggle solutions. Each group is only added when all of its source
    columns are present, so the function can be applied to any frame (for
    example a test set) without raising on missing columns.

    Added columns, in this order:
      * HF_diff, HR_diff, RF_diff, Dist_Sum -- pairwise differences and the sum
        of the three horizontal distances (hydrology, roadways, fire points).
      * Hillshade_Mean, Hillshade_Range -- row-wise mean and max-min of the
        three hillshade columns.
      * Elev_minus_VertHydro, Abs_VertHydro -- elevation minus the vertical
        distance to hydrology, and the absolute value of that distance.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame containing the original columns followed by the new ones.
    """
    out = df.copy()

    distance_cols = [
        "Horizontal_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Horizontal_Distance_To_Fire_Points",
    ]
    if set(distance_cols).issubset(out.columns):
        hydro, road, fire = (out[name] for name in distance_cols)
        out["HF_diff"] = hydro - fire
        out["HR_diff"] = hydro - road
        out["RF_diff"] = road - fire
        out["Dist_Sum"] = hydro + road + fire

    hillshade_cols = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
    if set(hillshade_cols).issubset(out.columns):
        hs = out[hillshade_cols]
        out["Hillshade_Mean"] = hs.mean(axis=1)
        out["Hillshade_Range"] = hs.max(axis=1) - hs.min(axis=1)

    if {"Elevation", "Vertical_Distance_To_Hydrology"}.issubset(out.columns):
        vert_hydro = out["Vertical_Distance_To_Hydrology"]
        out["Elev_minus_VertHydro"] = out["Elevation"] - vert_hydro
        out["Abs_VertHydro"] = vert_hydro.abs()

    return out


# --- 1) Load the raw data and append the engineered features ---
train = add_engineered_features(pd.read_csv("train.csv"))

# --- 2) Split & random forest (a somewhat stronger configuration) ---
X = train.drop(columns=["Cover_Type", "Id"])
y = train["Cover_Type"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(
    n_estimators=600,           # more trees
    max_features="sqrt",       # good default
    min_samples_leaf=2,        # reduces overfitting
    oob_score=True,            # convenient out-of-bag score
    class_weight=None,         # use "balanced_subsample" if classes are strongly imbalanced
    n_jobs=-1,
    random_state=42
)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print("Val Accuracy:", accuracy_score(y_val, y_pred))
# oob_score=True above means the fitted model exposes oob_score_.
print("OOB Accuracy:", model.oob_score_)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X = train.drop(columns=["Cover_Type", "Id"])
y = train["Cover_Type"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# One-hot encode Soil_Type only when it exists as a single column; everything
# else is passed through unchanged. If the frame has no column named exactly
# "Soil_Type" (e.g. the soil information is already spread over several
# Soil_Type* columns), the selection is empty and all columns are passed
# through, instead of ColumnTransformer failing on a missing column name.
soil_categorical_cols = [c for c in ["Soil_Type"] if c in X.columns]

preprocess = ColumnTransformer(
    transformers=[
        ("soil", OneHotEncoder(handle_unknown="ignore"), soil_categorical_cols),
    ],
    remainder="passthrough",
)

model = RandomForestClassifier(
    n_estimators=800,
    max_features="sqrt",
    min_samples_leaf=2,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# Preprocessing and classifier chained so both are fitted on the training split only.
pipe = make_pipeline(preprocess, model)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
# Optional: print("OOB:", pipe.named_steps["randomforestclassifier"].oob_score_)
