**1. Importing Necessary Libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    ExtraTreesClassifier,
    StackingClassifier
)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")


**2.Load the Dataset**

In [2]:
train_df = pd.read_csv("train.csv")
test_df  = pd.read_csv("test.csv")

print(train_df.shape, test_df.shape)


(12842, 26) (1692, 25)


**3. Exploratory Data Analysis (EDA)**

In [3]:
train_df.head()

Unnamed: 0,record_id,flood_indicator,country_code,week,temperature_celsius,pm25_ugm3,region,gdp_per_capita_usd,month,precipitation_mm,...,drought_indicator,income_level,extreme_weather_events,heat_wave_days,date,country_name,population_millions,air_quality_index,temp_anomaly_celsius,health_risk_class
0,1,0.0,IND,39.0,6.8,93.3,South Asia,2700.0,9.0,49.5,...,0.0,Lower Middle,0.0,0.0,2019/09/29,India,,138.0,0.52,1
1,2,0.0,EGY,11.0\t,63.032,,Africa,4174.0,3.0,153.7,...,0.0,Lower-Middle,0.0,0.0,15/03/2020,Egypt,102.0,175.0,-0.31,2
2,3,0.0,IND\t,40.0,3.51,111.6,South Asia,2606.0,10.0,51.1,...,0.0,LOWER-MIDDLE,0.0,0.0,08/10/2017,India,1380.0,176.0,-0.97,1
3,4,0.0,CHN,33.0,,81.1,East Asia,,8.0,18.2,...,0.0,Upper-Middle,0.0,0.0,21.08.2016,China,1411.0,123.0,0.76,1
4,5,,IND,1.0,,114.2,South Asia,2963.0,1.0,134.5,...,0.0,Lower-Middle,0.0,0.0,05/01/2025,India,1380.0,159.0,0.12,1


In [4]:
TARGET = "health_risk_class"

X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]

# Convert common missing strings to NaN
missing_values = ["#VALUE!", "-", "NA", "N/A", "null", "None"]
X.replace(missing_values, np.nan, inplace=True)
test_df.replace(missing_values, np.nan, inplace=True)


**4. Data Cleaning and Feature Engineering**

4.1 Date Processing

In [5]:
def parse_date_column(df):
    df["date"] = pd.to_datetime(df["date"], errors="coerce", dayfirst=True)
    df["date_year"]  = df["date"].dt.year
    df["date_month"] = df["date"].dt.month
    df["date_week"]  = df["date"].dt.isocalendar().week.astype(float)
    df.drop(columns=["date"], inplace=True)
    return df

X = parse_date_column(X)
test_df = parse_date_column(test_df)


4.2 Binary Column Cleaning

In [6]:
binary_cols = ["drought_indicator", "flood_indicator"]

for col in binary_cols:
    for df in [X, test_df]:
        df[col] = df[col].replace({
            "Yes": 1, "No": 0,
            True: 1, False: 0
        })

# Fix impossible temperatures
for df in [X, test_df]:
    # Convert 'temperature_celsius' to numeric, coercing errors to NaN
    df["temperature_celsius"] = pd.to_numeric(df["temperature_celsius"], errors="coerce")
    df.loc[df["temperature_celsius"] > 50, "temperature_celsius"] = np.nan

4.3 Categorical and Numerical Separation

In [7]:
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)

    X[col] = le.transform(X[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))

    label_encoders[col] = le


**5. Data Preprocessing Pipeline**

In [8]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", SimpleImputer(strategy="most_frequent"), categorical_cols)
])


**6. Model Building**

In [9]:
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=18,
    min_samples_split=4,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

et = ExtraTreesClassifier(
    n_estimators=400,
    max_depth=20,
    min_samples_split=4,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)


**7. Ensemble Model (Stacking)**

In [10]:
stack_model = StackingClassifier(
    estimators=[
        ("rf", rf),
        ("et", et),
        ("gb", gb)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1
)


**8. Final Pipeline (Preprocessing + Model)**

In [11]:
model = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", stack_model)
])


**9. Train Validation Split**

In [12]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

model.fit(X_train, y_train)

val_preds = model.predict(X_val)

print("Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))


Accuracy: 0.5834955235500194
              precision    recall  f1-score   support

           0       0.59      0.48      0.53       518
           1       0.52      0.57      0.54       892
           2       0.57      0.63      0.60       763
           3       0.80      0.67      0.73       396

    accuracy                           0.58      2569
   macro avg       0.62      0.59      0.60      2569
weighted avg       0.59      0.58      0.58      2569



**10. Model Training**

In [13]:
model.fit(X, y)


**11. Prediction & Submission File**

In [14]:
test_preds = model.predict_proba(test_df)

submission = pd.DataFrame({
    "record_id": test_df["record_id"],
    "Low_Risk": test_preds[:, 0],
    "Moderate_Risk": test_preds[:, 1],
    "High_Risk": test_preds[:, 2],
    "Critical_Risk": test_preds[:, 3]
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,record_id,Low_Risk,Moderate_Risk,High_Risk,Critical_Risk
0,1,0.011303,0.039029,0.383727,0.56594
1,2,0.396162,0.488663,0.102511,0.012664
2,3,0.344482,0.549146,0.100703,0.005668
3,4,0.426545,0.481312,0.082694,0.009449
4,5,0.24783,0.566299,0.169101,0.01677
