In [None]:
# =====================================================
# 0) Imports & Reproducibility
# =====================================================
import numpy as np
import pandas as pd
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# Single seed reused by the stochastic steps of this script.
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so
# setting it here presumably only reaches child processes — confirm whether
# it should instead be exported before launching Python.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_STATE}"

# Seed the stdlib and the NumPy global random generators.
for seed_fn in (np.random.seed, random.seed):
    seed_fn(RANDOM_STATE)


# =====================================================
# 1) Load Data
# =====================================================
# Input file and the name of the label column.
DATA_PATH = "historical_2003-2014.csv"
TARGET_COL = "cdm_historical"

# Identifier and auxiliary column groups (not part of the feature lists
# defined in the next section).
ID_COLS = ["point_id", "lat_dd", "long_dd"]
AUX_COLS = ["gdp", "pop", "cdm_model", "cdm_p_value"]

# Keep labelled rows only: rows without a target value are dropped.
df = pd.read_csv(DATA_PATH).dropna(subset=[TARGET_COL]).copy()

print("Dataset shape:", df.shape)
print("Class distribution (%):")
# Relative class frequencies expressed as percentages with two decimals.
class_share_pct = (df[TARGET_COL].value_counts(normalize=True) * 100).round(2)
print(class_share_pct)


# =====================================================
# 2) Feature Definition
# =====================================================
# Categorical predictors (turned into integer codes further down).
CATEGORICAL_COLS = ["lulc", "lithology", "soil_texture", "soil_order"]

# Numerical predictors; the order given here fixes the column order of X.
NUMERICAL_COLS = [
    "aspect",
    "cdd",
    "dem",
    "dist_snowice",
    "dist_water",
    "evspsbl",
    "hurs",
    "lai",
    "mrro",
    "mrsos",
    "prcptot",
    "r10mm",
    "rlds",
    "rsds",
    "sfcWind",
    "slope",
    "tg_mean",
    "tx_max",
    "txgt_30",
    "dist_crop",
    "dist_roads",
    "dist_urban",
]

# Numerical columns first, categorical columns last.
ALL_FEATURES = [*NUMERICAL_COLS, *CATEGORICAL_COLS]

# Feature matrix and integer-typed target vector.
X = df.loc[:, ALL_FEATURES]
y = df[TARGET_COL].astype(int)


# =====================================================
# 3) Train / Validation / Test Split (70 / 15 / 15)
# =====================================================
# Stage 1: hold out 30% of the rows, leaving 70% for training.
# The hold-out fraction must be 0.30 to obtain the intended 70 / 15 / 15
# partition; a larger value (e.g. 0.80) would shrink the training set to 20%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.30,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Stage 2: halve the hold-out into validation and test (15% each overall).
# Both stages stratify on the label so class proportions are preserved.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

print("\nSplit sizes:")
print("Train:", X_train.shape)
print("Val  :", X_val.shape)
print("Test :", X_test.shape)


# =====================================================
# 4) Encoding categorical + Imputation (fitted on TRAIN only)
# =====================================================
from sklearn.impute import SimpleImputer

# Work on copies so the raw splits stay untouched.
X_train_enc = X_train.copy()
X_val_enc   = X_val.copy()
X_test_enc  = X_test.copy()

# --- categorical → integer codes (vocabulary learned on TRAIN only) ---
# Encoding every split on its own lets the same level receive different
# integers in train / val / test whenever a level is absent from one split.
# Reusing the training categories keeps the mapping identical everywhere;
# missing values and levels unseen in training are coded as -1.
for col in CATEGORICAL_COLS:
    train_levels = X_train_enc[col].astype("category").cat.categories
    for split in (X_train_enc, X_val_enc, X_test_enc):
        split[col] = pd.Categorical(split[col], categories=train_levels).codes

# --- numerical imputation (median, TRAIN only) ---
# Medians are estimated on the training split and reused for val / test.
num_imputer = SimpleImputer(strategy="median")

X_train_enc[NUMERICAL_COLS] = num_imputer.fit_transform(
    X_train_enc[NUMERICAL_COLS]
)
X_val_enc[NUMERICAL_COLS] = num_imputer.transform(
    X_val_enc[NUMERICAL_COLS]
)
X_test_enc[NUMERICAL_COLS] = num_imputer.transform(
    X_test_enc[NUMERICAL_COLS]
)

# sanity
print("Any NaN in X_train_enc?", X_train_enc.isna().any().any())

# =====================================================
# 5) Mutual Information — Top-K features (TRAIN ONLY)
# =====================================================
from sklearn.feature_selection import mutual_info_classif

# Number of highest-scoring features to keep. The printed heading below is
# derived from this value so the label and the selection cannot drift apart.
TOP_K = 10

# Flag the integer-coded categorical columns so they are treated as discrete.
discrete_mask = [c in CATEGORICAL_COLS for c in X_train_enc.columns]

# Scores are computed on the training split only.
mi_scores = mutual_info_classif(
    X_train_enc,
    y_train,
    discrete_features=discrete_mask,
    random_state=RANDOM_STATE,
)

# One row per feature, highest score first.
mi_df = pd.DataFrame({
    "feature": X_train_enc.columns,
    "mi_score": mi_scores
}).sort_values("mi_score", ascending=False)

TOP_FEATURES = mi_df.head(TOP_K)["feature"].tolist()

print(f"\nTop {TOP_K} Features (Mutual Information):")
display(mi_df.head(TOP_K))


# =====================================================
# 6) Reduce Feature Space
# =====================================================
# Restrict every encoded split to the selected feature columns.
X_train_fs, X_val_fs, X_test_fs = (
    split[TOP_FEATURES] for split in (X_train_enc, X_val_enc, X_test_enc)
)


# =====================================================
# 7) Logistic Regression Baseline
# =====================================================
# Baseline: standardise the selected features, then fit a class-weighted
# logistic regression.
#
# `multi_class` is intentionally not passed: "auto" is the default and the
# parameter is deprecated in recent scikit-learn releases, so spelling it out
# only produces a warning. `n_jobs` is dropped as well because it only
# parallelises one-vs-rest fits, which the default solver does not use for a
# multiclass target — NOTE(review): confirm against the installed version.
lr_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        max_iter=300,
        # Weight classes inversely to their frequency in the training labels.
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )),
])

lr_pipeline.fit(X_train_fs, y_train)


# =====================================================
# 8) Evaluation
# =====================================================
def evaluate(model, X, y, name):
    """Print classification metrics for ``model`` on one data split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels compared against the predictions.
        name: Label for the split, used in the printed heading.

    Prints accuracy, balanced accuracy, macro-averaged F1 and the confusion
    matrix; nothing is returned.
    """
    y_hat = model.predict(X)
    print(f"\n{name} Results")

    # (label, scorer) pairs, reported in this order.
    scorers = (
        ("Accuracy:", accuracy_score),
        ("Balanced Accuracy:", balanced_accuracy_score),
        ("F1 (macro):", lambda true, est: f1_score(true, est, average="macro")),
    )
    for label, scorer in scorers:
        print(label, scorer(y, y_hat))

    print(confusion_matrix(y, y_hat))


# Report the same metrics on every split: train, then validation, then test.
for split_name, split_X, split_y in (
    ("Train", X_train_fs, y_train),
    ("Validation", X_val_fs, y_val),
    ("Test", X_test_fs, y_test),
):
    evaluate(lr_pipeline, split_X, split_y, split_name)


Dataset shape: (98482, 34)
Class distribution (%):
cdm_historical
0.0    36.22
2.0    31.13
1.0    26.05
3.0     6.60
Name: proportion, dtype: float64

Split sizes:
Train: (68937, 26)
Val  : (14772, 26)
Test : (14773, 26)
Any NaN in X_train_enc? False

Top 10 Features (Mutual Information):


Unnamed: 0,feature,mi_score
3,dist_snowice,0.311836
20,dist_roads,0.253942
19,dist_crop,0.247116
9,mrsos,0.236514
7,lai,0.222481
5,evspsbl,0.211614
6,hurs,0.188001
8,mrro,0.175908
13,rsds,0.172452
16,tg_mean,0.165153





Train Results
Accuracy: 0.6164178887970175
Balanced Accuracy: 0.650905305462292
F1 (macro): 0.5863052879302222
[[19733  3408  1472   357]
 [ 3242  8715  4303  1698]
 [ 1435  5444 10156  4423]
 [    0   136   525  3890]]

Validation Results
Accuracy: 0.6157595450852965
Balanced Accuracy: 0.6516078921997771
F1 (macro): 0.5856016922220224
[[4244  719  291   96]
 [ 666 1829  972  381]
 [ 323 1170 2180  925]
 [   0   27  106  843]]

Test Results
Accuracy: 0.6213362214851418
Balanced Accuracy: 0.6577099807235322
F1 (macro): 0.5932207119284232
[[4236  747  269   99]
 [ 686 1882  887  393]
 [ 318 1230 2214  837]
 [   0   21  107  847]]
