In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss

In [3]:

# ==================== Load Data ====================

In [4]:

# Auto-detect the dataset folder: the first directory under /kaggle/input that
# contains train.csv. Direct subfolders are tried first (in sorted order, so the
# choice is reproducible); nested folders are searched only as a fallback.
base_path = "/kaggle/input"

# os.listdir would raise on a missing base path before the explicit, more
# descriptive error below could be reached, so guard it.
folders = (
    sorted(os.path.join(base_path, f) for f in os.listdir(base_path))
    if os.path.isdir(base_path)
    else []
)
print("Detected input folders:", folders)

dataset_path = next(
    (folder for folder in folders if os.path.isfile(os.path.join(folder, "train.csv"))),
    None,
)

if dataset_path is None:
    # Fallback: some datasets keep their CSVs more than one level deep.
    for root, dirs, files in os.walk(base_path):
        dirs.sort()  # sorting in place makes the traversal order deterministic
        if "train.csv" in files:
            dataset_path = root
            break

if dataset_path is None:
    raise FileNotFoundError("train.csv not found in any /kaggle/input subfolder!")

# Joining with an empty component keeps the trailing path separator.
DATA_PATH = os.path.join(dataset_path, "")
print("Using dataset path:", DATA_PATH)

# Read train and test
df = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
df_test = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

TARGET_COL = "NObeyesdad"

# Name of the identifier column, set when either file has an "id" column.
ID_COL = "id" if ("id" in df.columns or "id" in df_test.columns) else None

print("Train shape:", df.shape)
print("Test shape :", df_test.shape)

Detected input folders: []


FileNotFoundError: train.csv not found in any /kaggle/input subfolder!

In [5]:

# ==================== Encode target ====================

# Turn the class labels in TARGET_COL into integer codes. The fitted encoder
# is kept in `le` so predictions can be mapped back to labels later.
le = LabelEncoder()
df["target_enc"] = le.fit_transform(df[TARGET_COL])
y = df["target_enc"]

# Feature matrix: everything except the raw target, its encoded copy and,
# when train actually has one, the id column.
drop_cols = [
    TARGET_COL,
    "target_enc",
    *([ID_COL] if ID_COL and ID_COL in df.columns else []),
]

X = df.drop(columns=drop_cols)

NameError: name 'df' is not defined

In [6]:

# ==================== Encode categorical columns ====================

# Object-dtype columns are the ones treated as categorical.
cat_cols = X.select_dtypes(include=["object"]).columns
print("Categorical columns:", cat_cols.tolist())

# handle_unknown / unknown_value: categories not seen during fit are encoded
# as -1 by a later transform() instead of raising.
oe = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
if not cat_cols.empty:
    # Values are cast to str first, so missing values become ordinary strings
    # and get their own code.
    X[cat_cols] = oe.fit_transform(
        X[cat_cols].astype(str)
    )

NameError: name 'X' is not defined

In [7]:
# ==================== Fill missing values ====================

# Replace each remaining NaN with the median of its own column; medians are
# computed over numeric columns only.
X = X.fillna(
    value=X.median(numeric_only=True)
)

NameError: name 'X' is not defined

In [8]:

# ==================== Cap outliers ====================

def cap_outliers(df_in, cols, lower=1, upper=99, ref=None):
    """Clip the given columns to a percentile range (winsorizing).

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame whose columns are clipped. It is not modified; a copy is returned.
    cols : iterable of column labels
        Columns to clip. Each must exist in ``df_in`` (and in ``ref`` if given).
    lower, upper : float, default 1 and 99
        Lower and upper percentiles, on a 0-100 scale.
    ref : pandas.DataFrame, optional
        Frame the percentile bounds are computed from. Defaults to ``df_in``.
        Pass the training frame here when capping validation or test data so
        that both are clipped to the same thresholds.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_in`` with every column in ``cols`` clipped to the
        ``[lower, upper]`` percentile values.

    Raises
    ------
    ValueError
        If the percentiles are not ordered as ``0 <= lower <= upper <= 100``.
    """
    if not 0 <= lower <= upper <= 100:
        raise ValueError(
            f"percentiles must satisfy 0 <= lower <= upper <= 100, got lower={lower}, upper={upper}"
        )

    source = df_in if ref is None else ref
    df_out = df_in.copy()
    for col in cols:
        # Bounds come from `source`, which may differ from the frame being clipped.
        q_low = source[col].quantile(lower / 100)
        q_high = source[col].quantile(upper / 100)
        df_out[col] = df_out[col].clip(q_low, q_high)
    return df_out

# Only float64 / int64 columns are passed to the capping step.
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
print("Numeric columns:", num_cols.tolist())

# Clip those columns using cap_outliers' default percentile bounds.
X = cap_outliers(df_in=X, cols=num_cols)

NameError: name 'X' is not defined

In [9]:

# ==================== Train-test split ====================

# Hold out 20% of the rows for validation, stratified on y, with a fixed seed
# so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


NameError: name 'X' is not defined

In [10]:

# ==================== Random Forest Classifier ====================

# Random forest configuration, grouped by purpose.
rf = RandomForestClassifier(
    # ensemble size and reproducibility
    n_estimators=500,
    random_state=42,
    n_jobs=-1,  # use all available cores
    # tree shape
    max_depth=None,  # no explicit depth limit
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=5,
)

# Fit on the training part of the split only.
rf.fit(X_train, y_train)

NameError: name 'X_train' is not defined

In [11]:

# ==================== Evaluate on validation ====================

# Class probabilities feed the log loss; hard predictions feed the accuracy.
y_val_proba = rf.predict_proba(X_val)
y_val_pred = rf.predict(X_val)

# labels=rf.classes_ tells log_loss which class each probability column refers to.
ll = log_loss(y_true=y_val, y_pred=y_val_proba, labels=rf.classes_)
acc = accuracy_score(y_true=y_val, y_pred=y_val_pred)

print("Validation Accuracy:", acc)
print("Validation Log Loss:", ll)


NameError: name 'X_val' is not defined

In [12]:
# ==================== Prepare test data ====================

# <<< CHANGED: keep original ids if present, but DO NOT use them as features
# NOTE: this cell assumes X has already been encoded, median-filled and capped
# by the earlier cells. Every statistic below is taken from X (train), so the
# test rows go through the same transformation the model was trained on.
test_ids = df_test["id"] if ID_COL and "id" in df_test.columns else None

# Select exactly the training feature columns, in training order. This drops
# the id column (and any other non-feature column) and raises a KeyError right
# here if the test file lacks a training feature.
X_test = df_test[list(X.columns)].copy()

# Encode categoricals with the encoder fitted on train. The encoder has to be
# given the same columns it was fitted on, so all of cat_cols are passed.
if len(cat_cols) > 0:
    X_test[cat_cols] = oe.transform(X_test[cat_cols].astype(str))

# Fill missing values with the training medians, not the test set's own.
X_test = X_test.fillna(X.median(numeric_only=True))

# Cap outliers with the training bounds. X was clipped per column earlier, so
# its column-wise min / max are the thresholds that were applied to train.
# NOTE(review): as on the training side, this also clips ordinal-encoded
# categorical columns; confirm that is intended.
num_cols_test = X.select_dtypes(include=['float64', 'int64']).columns
X_test[num_cols_test] = X_test[num_cols_test].clip(
    lower=X[num_cols_test].min(),
    upper=X[num_cols_test].max(),
    axis=1,  # align the bound Series with the columns
)

NameError: name 'ID_COL' is not defined

In [13]:
# ==================== Predict on test ====================

# Predict encoded class ids for the test rows, then map them back to the
# original label values with the LabelEncoder fitted on the target.
y_test_enc = rf.predict(X=X_test)
y_test_labels = le.inverse_transform(y=y_test_enc)

NameError: name 'X_test' is not defined

In [14]:
# ==================== Create submission ====================

# Two-column submission: a running id starting at 1 and the predicted label.
# NOTE(review): ids are generated as 1..N rather than taken from the test
# file's own id column; confirm this matches the expected submission format.
submission = pd.DataFrame(
    {
        "id": range(1, len(df_test) + 1),
        TARGET_COL: y_test_labels,
    }
)

# index=False keeps the DataFrame index out of the CSV.
submission.to_csv("obesity_rf_simple.csv", index=False)
print("Submission file saved as obesity_rf_simple.csv")
print(submission.head())

NameError: name 'df_test' is not defined