# In [None]:
# ============================================
# 0. IMPORTS
# ============================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ============================================
# 1. LOAD DATA  (DIRECT PATH STYLE)
# ============================================
# Read the competition's train and test CSVs from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/ai-201-b-mse-2-aiml-a/train.csv")
test = pd.read_csv("/kaggle/input/ai-201-b-mse-2-aiml-a/test.csv")

# Report (rows, columns) of each frame as a quick sanity check.
print(f"Train shape: {train.shape}")
print(f"Test shape : {test.shape}")

# ============================================
# 2. BASIC SETTINGS
# ============================================
# Name of the label column the model is trained to predict.
TARGET_COL = "NObeyesdad"

# ============================================
# 3. OPTIONAL: SAFE OUTLIER CAPPING (IQR)
# ============================================
def cap_outliers(df, cols):
    """Return a copy of *df* with the given columns clipped to their IQR fences.

    For each column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``,
    computed from that column's own values in *df*. Values outside the
    fences are replaced by the nearest fence. *df* itself is not modified.

    A column is left untouched when it is absent from *df* or when its IQR
    is zero or NaN.

    Args:
        df: DataFrame to cap.
        cols: Iterable of column names to consider.

    Returns:
        A new DataFrame with the same columns as *df*.
    """
    capped = df.copy()

    for col in cols:
        # The same column list may be reused for a frame that lacks some of
        # the columns; skip those instead of raising KeyError.
        if col not in capped.columns:
            continue

        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1

        # A constant (or all-missing) column has no usable spread: clipping
        # to a zero-width band would flatten every value, so leave it as is.
        if pd.isna(iqr) or iqr == 0:
            continue

        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        capped[col] = capped[col].clip(lower, upper)

    return capped

# Cap feature columns only. The target holds class labels, which must never
# be clipped, and it is dropped from the features later on, so exclude it
# here even if it happens to be stored as a number.
numeric_cols_all = [
    col
    for col in train.select_dtypes(include=[np.number]).columns
    if col != TARGET_COL
]

# NOTE(review): each frame is capped with fences computed from its own
# quartiles, so train and test may be clipped at different values.
train = cap_outliers(train, numeric_cols_all)
# Only request columns that test actually has, so a train-only column cannot
# raise KeyError.
test = cap_outliers(
    test, [col for col in numeric_cols_all if col in test.columns]
)

# ============================================
# 4. FEATURES & TARGET
# ============================================
# Separate the label from the predictors (train has no id column to drop).
y = train[TARGET_COL]
x = train.drop(TARGET_COL, axis="columns")

# The test set has no id column either, so every column is used as a feature.
x_test = test.copy(deep=True)

# ============================================
# 5. COLUMN TYPES
# ============================================
# Numeric and boolean columns go to the imputer + scaler branch; every other
# column is treated as categorical. The categorical list is the complement of
# the numeric one rather than "dtype is object", so non-object, non-numeric
# dtypes such as `category` or `string` are one-hot encoded instead of being
# sent to median imputation and scaling.
num_cols = x.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [col for col in x.columns if col not in num_cols]

print("Categorical:", cat_cols)
print("Numeric    :", num_cols)

# ============================================
# 6. PREPROCESSING PIPELINES
# ============================================
# Numeric features: fill gaps with the column median, then standardise.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its matching pipeline.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# ============================================
# 7. MODEL
# ============================================
# Full model: the preprocessing step followed by a 200-tree random forest.
# random_state fixes the forest's seed; n_jobs=-1 requests all available cores.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "clf",
            RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42),
        ),
    ]
)

# ============================================
# 8. TRAIN / VALIDATION SPLIT + TRAIN
# ============================================
# Hold out 20% of the labelled rows for validation, stratified on the target.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessing and the forest on the training part only.
model.fit(x_train, y_train)

# ============================================
# 9. EVALUATION (LABEL PROBLEM)
# ============================================
# Predict labels for the held-out rows and report plain accuracy.
y_valid_pred = model.predict(x_valid)
print(
    "Validation Accuracy:",
    accuracy_score(y_true=y_valid, y_pred=y_valid_pred),
)

# ============================================
# 10. TRAIN ON FULL DATA
# ============================================
# Refit the same pipeline on every labelled row before predicting on test.
model.fit(x, y)

# ============================================
# 11. PREDICT ON TEST (LABELS)
# ============================================
# Class labels for the test rows, in test row order.
test_pred = model.predict(x_test)

# ============================================
# 12. CREATE SUBMISSION (LABEL TYPE, CUSTOM id)
# ============================================
# Build the submission frame: ids are generated as 1..len(test_pred) in
# prediction order, and the label column is named after the target.
submission = pd.DataFrame({
    "id": np.arange(1, len(test_pred) + 1),
    TARGET_COL: test_pred,
})

print(submission.head())

# index=False keeps the DataFrame index out of the CSV, leaving two columns.
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv ✔")