<a href="https://colab.research.google.com/github/aditiSharma55555/ml_mse2/blob/main/universal4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# FINAL UNIVERSAL MACHINE LEARNING TEMPLATE (TABULAR DATA)
#
# Handles:
# 1) Categorical targets  -> Status, Class, NObeyesdad
# 2) Numerical targets    -> price, MEDV
# 3) Missing values in FEATURES
# 4) Missing values in TARGET
# 5) Test WITHOUT target (normal Kaggle)
# 6) Test WITH target (rare exam case)
# 7) Single-column output (DEFAULT)
# 8) Multiclass PROBABILITY output (OPTIONAL - COMMENTED)
#
# IMPORTANT:
# - The original code is for a SINGLE TARGET
# - BELOW we ADD support for SPLIT TARGET COLUMNS
# =========================================================


#for multiple target cols

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier   # üî¥ ADDED


# -------------------- STEP 1: Load Data --------------------
# Both CSV files are read from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


# -------------------- STEP 2: DEFINE TARGET ----------------
# TARGET = "Status"   # ORIGINAL (used for the SINGLE-target variant only)

# NEW (for the SPLIT-target case): the target is spread over several columns.
# Use this ONLY if train has these columns.
SPLIT_TARGET_COLS = ["temp_low", "temp_mid", "temp_high"]


# -------------------- STEP 3: CHECK IF TEST CONTAINS TARGET
# TARGET is disabled in this multi-output variant, so referencing it here
# raised NameError. The test set counts as "containing the target" only when
# every split target column is present.
test_has_target = set(SPLIT_TARGET_COLS).issubset(test.columns)
print("Test contains target:", test_has_target)


# =========================================================
# ORIGINAL SINGLE TARGET HANDLING (kept for reference, disabled)
# =========================================================
# y = train[TARGET]
# train_features = train.drop(columns=[TARGET])
#
# mask = y.notna()
# y = y[mask]
# train_features = train_features.loc[mask]

# =========================================================
# NEW MULTI-OUTPUT TARGET HANDLING (ADDED)
# =========================================================
# Use this when target is already split like:
# temp_low, temp_mid, temp_high

# A row is usable for training only if EVERY target column has a value;
# this mirrors the notna() mask of the single-target variant above.
target_mask = train[SPLIT_TARGET_COLS].notna().all(axis=1)
labelled_rows = train.loc[target_mask]

y = labelled_rows[SPLIT_TARGET_COLS]            # MULTI-OUTPUT y (one column per target)
train_features = labelled_rows.drop(columns=SPLIT_TARGET_COLS)


# -------------------- STEP 4: Prepare test features --------
# drop() returns a new frame, so `test` itself is left untouched.
# errors="ignore" makes this a plain copy when test has no target columns and
# strips them when it does, so they are never treated as features.
test_features = test.drop(columns=SPLIT_TARGET_COLS, errors="ignore")


# -------------------- STEP 5: SAVE ORIGINAL TEST IDs -------
if "id" in test.columns:
    test_ids = test["id"]
else:
    # No id column supplied: synthesise 1-based ids in row order.
    test_ids = pd.Series(range(1, len(test) + 1), name="id")


# -------------------- STEP 6: DROP ID FROM FEATURES --------
# The id column is removed from both feature frames when present;
# errors="ignore" covers the case where a frame has no id column.
train_features = train_features.drop(columns="id", errors="ignore")
test_features = test_features.drop(columns="id", errors="ignore")


# -------------------- STEP 7: HANDLE MISSING VALUES --------
# Imputation statistics are learned from TRAIN and reused for TEST so both
# frames are filled with the same values. Test statistics are only a fallback
# for columns that have no train statistic.
num_cols_train = train_features.select_dtypes(include=np.number).columns
num_cols_test = test_features.select_dtypes(include=np.number).columns

# A column with no observed values has a NaN median; fall back to 0 so the
# fill actually removes every NaN.
train_medians = train_features[num_cols_train].median().fillna(0)
train_features[num_cols_train] = train_features[num_cols_train].fillna(
    train_medians
)

test_fill_values = (
    train_medians.reindex(num_cols_test)
    .fillna(test_features[num_cols_test].median())
    .fillna(0)
)
test_features[num_cols_test] = test_features[num_cols_test].fillna(
    test_fill_values
)

cat_cols_train = train_features.select_dtypes(include="object").columns
cat_cols_test = test_features.select_dtypes(include="object").columns

# mode() is empty for a column with no observed values, so indexing it with
# [0] would raise; use a placeholder label in that case.
train_modes = {}
for col in cat_cols_train:
    observed_modes = train_features[col].mode()
    train_modes[col] = "missing" if observed_modes.empty else observed_modes.iloc[0]
    train_features[col] = train_features[col].fillna(train_modes[col])

for col in cat_cols_test:
    if col in train_modes:
        fill_label = train_modes[col]
    else:
        observed_modes = test_features[col].mode()
        fill_label = "missing" if observed_modes.empty else observed_modes.iloc[0]
    test_features[col] = test_features[col].fillna(fill_label)


# -------------------- STEP 8: ENCODE FEATURES --------------
# Each categorical column gets its own encoder fitted on TRAIN. Test values
# are translated through a label -> code dictionary built once per column,
# instead of calling le.transform() separately for every single test row.
# Labels that never occurred in train are encoded as -1.
for col in cat_cols_train:
    le = LabelEncoder()
    train_features[col] = le.fit_transform(train_features[col])

    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_features[col] = (
        test_features[col]
        .map(code_by_label)   # unseen labels become NaN here ...
        .fillna(-1)           # ... and are then marked with -1
        .astype("int64")
    )


# =========================================================
# ORIGINAL TARGET ENCODING (kept for reference, disabled)
# =========================================================
# target_encoder = None
# is_categorical_target = y.dtype == "object"
#
# if is_categorical_target:
#     target_encoder = LabelEncoder()
#     y = target_encoder.fit_transform(y)

# =========================================================
# NEW: MULTI-OUTPUT TARGET -> NO ENCODING REQUIRED
# Because temp_low/mid/high are already numeric (0/1)
# =========================================================


# -------------------- STEP 9: FEATURE SCALING --------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_features)

# The scaler was fitted on train_features, so test must present exactly the
# same columns in the same order. Selecting by the train columns reorders
# them, discards any extra test-only columns, and raises KeyError right here
# if a train feature is absent from test.
test_scaled = scaler.transform(test_features[train_features.columns])


# -------------------- STEP 10: TRAIN-TEST SPLIT ------------
# Hold out 20% of the training rows; random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)


# =========================================================
# ORIGINAL MODEL SELECTION (kept for reference, disabled)
# =========================================================
# model = RandomForestClassifier(random_state=42)
# model.fit(X_train, y_train)

# =========================================================
# NEW MULTI-OUTPUT MODEL (ADDED)
# =========================================================
# The random forest is wrapped so it can be fitted on the multi-column y.
base_model = RandomForestClassifier(random_state=42)
model = MultiOutputClassifier(base_model)
model.fit(X_train, y_train)

# Score the held-out split, which was previously created but never used.
# Accuracy is reported per target column, in SPLIT_TARGET_COLS order
# (the column order of y).
val_preds = model.predict(X_val)
for position, target_col in enumerate(SPLIT_TARGET_COLS):
    target_accuracy = accuracy_score(y_val[target_col], val_preds[:, position])
    print(f"Validation accuracy [{target_col}]: {target_accuracy:.4f}")


# -------------------- STEP 11: PREDICT TEST ----------------
preds = model.predict(test_scaled)

# Convert predictions to DataFrame, one column per target.
pred_df = pd.DataFrame(preds, columns=SPLIT_TARGET_COLS)


# =========================================================
# FINAL SUBMISSION (MATCHES QUESTION)
# id, temp_low, temp_mid, temp_high
# =========================================================
# The output columns follow SPLIT_TARGET_COLS instead of repeating the names
# by hand, so changing the target list automatically changes the submission.
submission = pred_df[SPLIT_TARGET_COLS].copy()

# Attach ids by position: predictions are in test row order, and going
# through a plain array avoids any index alignment between the two objects.
submission.insert(0, "id", test_ids.to_numpy())

submission.to_csv("submission.csv", index=False)

print("submission.csv created successfully")