<a href="https://colab.research.google.com/github/aditiSharma55555/ml_mse2/blob/main/multiplecombined.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# FINAL UNIVERSAL MACHINE LEARNING TEMPLATE (TABULAR DATA)
#
# Handles:
# 1) Categorical targets  → Status, Class, NObeyesdad
# 2) Numerical targets    → price, MEDV
# 3) Missing values in FEATURES
# 4) Missing values in TARGET
# 5) Test WITHOUT target (normal Kaggle)
# 6) Test WITH target (rare exam case)
# 7) Single-column output (DEFAULT)
# 8) Multiclass PROBABILITY output (OPTIONAL – COMMENTED)
#
# ❗ IMPORTANT:
# - This code WORKS AS-IS for normal questions
# - BELOW we ADD support when TARGET is split into columns
# =========================================================



# Variant: multiple one-hot target columns combined into one target column

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# -------------------- STEP 1: Load Data --------------------
# Both CSV files are read from the current working directory, train first.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))


# -------------------- STEP 2: DEFINE TARGET ----------------
# Name of the single target column (i.e. the name it has AFTER any one-hot
# target columns have been combined).
TARGET = "Status"


# =========================================================
# NEW STEP: COMBINE MULTIPLE TARGET COLUMNS
# Use ONLY if the CSV has one-hot encoded target columns.
# Example: Status_A, Status_B, Status_C  ->  Status in {"A", "B", "C"}
# =========================================================
MULTI_TARGET_COLS = ["Status_A", "Status_B", "Status_C"]


def _combine_one_hot_target(df, cols, target):
    """Collapse the one-hot columns `cols` of `df` into one label column.

    Parameters:
        df: DataFrame containing every column listed in `cols`.
        cols: non-empty list of one-hot indicator column names.
        target: name of the label column to create; a leading
            "<target>_" is stripped from each column name to form the label.

    Returns:
        A new DataFrame without `cols` and with `target` appended. Rows in
        which no indicator is positive (all zeros / all missing) get a
        missing label instead of silently being assigned the first class.
    """
    prefix = f"{target}_"
    label_names = np.asarray(
        [str(c)[len(prefix):] if str(c).startswith(prefix) else c for c in cols],
        dtype=object,
    )

    # Missing / non-numeric indicators count as "not set".
    indicators = (
        df[cols].apply(pd.to_numeric, errors="coerce").fillna(0).astype(float)
    )
    winner_positions = indicators.to_numpy().argmax(axis=1)

    labels = pd.Series(label_names[winner_positions], index=df.index, dtype=object)
    # argmax returns position 0 for an all-zero row; mask those rows out so
    # they are treated as "target missing".
    labels = labels.where(indicators.max(axis=1) > 0)

    combined = df.drop(columns=cols)
    combined[target] = labels
    return combined


if MULTI_TARGET_COLS and all(col in train.columns for col in MULTI_TARGET_COLS):
    train = _combine_one_hot_target(train, MULTI_TARGET_COLS, TARGET)

# A test file that ships the target in one-hot form must be combined too;
# otherwise those columns would be left in the test frame as features.
if MULTI_TARGET_COLS and all(col in test.columns for col in MULTI_TARGET_COLS):
    test = _combine_one_hot_target(test, MULTI_TARGET_COLS, TARGET)


# -------------------- STEP 3: CHECK IF TEST CONTAINS TARGET
test_has_target = TARGET in test.columns
print("Test contains target:", test_has_target)


# =========================================================
# STEP 4: TARGET & FEATURE SEPARATION
# =========================================================
# Rows whose target is missing cannot be used for supervised training,
# so they are removed from both the target and the feature frame.
mask = train[TARGET].notna()
y = train.loc[mask, TARGET]
# drop() returns a new DataFrame, so the later column assignments on
# train_features do not operate on a slice of `train`.
train_features = train.loc[mask].drop(columns=[TARGET])

# Always defined, so later code can reference it regardless of the branch.
y_test_true = None
if test_has_target:
    y_test_true = test[TARGET]      # ONLY for evaluation, never a feature
    test_features = test.drop(columns=[TARGET])
else:
    test_features = test.copy()


# -------------------- STEP 5: SAVE ORIGINAL TEST IDs -------
# Keep the ids that came with the test file; if there is no "id" column,
# number the rows 1..len(test) instead.
test_ids = (
    test["id"]
    if "id" in test.columns
    else pd.Series(range(1, len(test) + 1), name="id")
)


# -------------------- STEP 6: DROP ID FROM FEATURES --------
# errors="ignore" makes the drop a no-op when the column is absent.
train_features = train_features.drop(columns="id", errors="ignore")
test_features = test_features.drop(columns="id", errors="ignore")


# -------------------- STEP 7: HANDLE MISSING VALUES --------
# Fill values are learned from the TRAINING data and reused for the test
# data, so both frames are preprocessed the same way. Test-only columns
# fall back to their own statistics.
num_cols_train = train_features.select_dtypes(include=np.number).columns
num_cols_test = test_features.select_dtypes(include=np.number).columns
cat_cols_train = train_features.select_dtypes(include="object").columns
cat_cols_test = test_features.select_dtypes(include="object").columns


def _median_or_zero(column):
    """Return the median of `column`, or 0 when it has no non-null value."""
    median = column.median()
    return 0 if pd.isna(median) else median


def _most_frequent(column, default="missing"):
    """Return the most frequent non-null value of `column`, else `default`."""
    modes = column.mode()
    return default if modes.empty else modes.iloc[0]


# Per-column fill value learned from the training data.
fill_values = {}
for col in num_cols_train:
    fill_values[col] = _median_or_zero(train_features[col])
for col in cat_cols_train:
    fill_values[col] = _most_frequent(train_features[col])

for col, fill_value in fill_values.items():
    train_features[col] = train_features[col].fillna(fill_value)

for col in test_features.columns:
    column = test_features[col]
    if not column.isna().any():
        continue
    if col in fill_values:
        fill_value = fill_values[col]
    elif col in num_cols_test:
        fill_value = _median_or_zero(column)
    elif col in cat_cols_test:
        fill_value = _most_frequent(column)
    else:
        continue
    # A column that is categorical in train but entirely missing in test is
    # read as float; make it object before inserting the string fill value.
    if isinstance(fill_value, str) and column.dtype != object:
        column = column.astype(object)
    test_features[col] = column.fillna(fill_value)


# -------------------- STEP 8: ENCODE FEATURES --------------
# Each categorical column gets its own encoder fitted on the training data.
# Test values never seen during training are encoded as -1.
for col in cat_cols_train:
    le = LabelEncoder()
    train_features[col] = le.fit_transform(train_features[col])

    # LabelEncoder assigns code i to le.classes_[i]; a dict gives the same
    # codes with one hash lookup per row instead of one transform() call
    # plus one array scan per row.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_features[col] = (
        test_features[col].map(code_by_label).fillna(-1).astype("int64")
    )


# -------------------- STEP 9: TARGET HANDLING --------------
# Class labels -> integer codes; the encoder is kept to map predictions back.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)


# -------------------- STEP 10: FEATURE SCALING -------------
# The scaler is fitted on the training columns, so the test frame must have
# exactly those columns in exactly that order. Fail early with a readable
# message if a feature is absent; extra test-only columns are dropped.
missing_in_test = [
    col for col in train_features.columns if col not in test_features.columns
]
if missing_in_test:
    raise ValueError(f"test data is missing feature columns: {missing_in_test}")
test_features = test_features[train_features.columns]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_features)
test_scaled = scaler.transform(test_features)


# -------------------- STEP 11: TRAIN-TEST SPLIT ------------
# Stratify so every class keeps its share in both parts; otherwise a rare
# class can be absent from the validation set. Stratification raises
# ValueError when a class has fewer than 2 samples or the split is too small
# for the number of classes, so fall back to a plain random split then.
try:
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError:
    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )


# -------------------- STEP 12: MODEL SELECTION -------------
# Baseline model with default hyperparameters; fixed seed for repeatability.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


# -------------------- STEP 13: EVALUATION ------------------
# Scores are computed on the held-out 20% validation part.
val_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred))


# -------------------- STEP 14: HYPERPARAMETER TUNING -------
# Small grid searched with 3-fold cross-validation on the training part.
params = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10]
}

grid = GridSearchCV(model, params, cv=3)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_


# -------------------- STEP 15: PREDICT TEST ----------------
# The model predicts integer codes; map them back to the original labels.
test_pred = best_model.predict(test_scaled)
test_pred = target_encoder.inverse_transform(test_pred)

# When the test file ships the true target, report the score on it.
# Rows with a missing true label are skipped.
if test_has_target:
    labelled = y_test_true.notna().to_numpy()
    if labelled.any():
        print(
            "Test Accuracy:",
            accuracy_score(
                y_test_true[labelled].astype(str),
                test_pred[labelled].astype(str),
            ),
        )


# =========================================================
# STEP 16A: SINGLE-COLUMN SUBMISSION
# =========================================================
# Two columns: the test ids and the predicted label, written without the index.
submission = pd.DataFrame({"id": test_ids})
submission[TARGET] = test_pred
submission.to_csv("submission.csv", index=False)


# =========================================================
# STEP 16B: MULTICLASS PROBABILITY SUBMISSION (OPTIONAL)
# Uncomment to write one probability column per class instead of a single
# label column.
#
# best_model was trained on the integer codes produced by target_encoder,
# so best_model.classes_ holds integers, not "A"/"B"/"C". Translate the
# codes back to the original labels before naming the columns.
# =========================================================
# probs = best_model.predict_proba(test_scaled)
# class_names = target_encoder.inverse_transform(best_model.classes_)
#
# submission = pd.DataFrame(
#     probs, columns=[f"{TARGET}_{name}" for name in class_names]
# )
# submission.insert(0, "id", test_ids.to_numpy())
#
# submission.to_csv("submission.csv", index=False)
# =========================================================

print("submission.csv created successfully")