<a href="https://colab.research.google.com/github/aditiSharma55555/ml_mse2/blob/main/universal2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# FINAL UNIVERSAL MACHINE LEARNING TEMPLATE (TABULAR DATA)
#
# Handles:
# 1) Categorical targets  → Status, Class, NObeyesdad
# 2) Numerical targets    → price, MEDV
# 3) Missing values in FEATURES
# 4) Missing values in TARGET
# 5) Test WITHOUT target (normal Kaggle)
# 6) Test WITH target (rare exam case)
# 7) Single-column output (DEFAULT)
# 8) Multiclass PROBABILITY output (OPTIONAL – COMMENTED)
#
# ❗ IMPORTANT:
# - This code WORKS AS-IS for normal questions
# - Multiclass probability block is COMMENTED
# - Uncomment ONLY if question explicitly asks
#
# ❗ EXTRA NOTE (FOR EXAMS):
# - If test.csv has NO id column
# - AND question explicitly says "create id starting from 1"
# - Then generating id using range(1, n+1) is CORRECT
# =========================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# -------------------- STEP 1: Load Data --------------------
# Both CSV files are read from the current working directory.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)


# -------------------- STEP 2: DEFINE TARGET ----------------
# 🔴 CHANGE ONLY THIS LINE BASED ON QUESTION
TARGET = "Status"    # e.g. "Status", "Class", "NObeyesdad", "price"


# -------------------- STEP 3: CHECK IF TEST CONTAINS TARGET
# (DO NOT COMMENT) -- the test-feature preparation branches on this flag.
test_has_target = any(column == TARGET for column in test.columns)
print("Test contains target:", test_has_target)


# -------------------- STEP 4: Separate Target & Features ---
# Rows whose target is missing cannot be trained on, so the same boolean
# mask is applied to both the target and the feature frame (MANDATORY).
mask = train[TARGET].notna()
y = train.loc[mask, TARGET]
train_features = train.drop(columns=[TARGET]).loc[mask]

# Prepare test features
if not test_has_target:
    # Normal case: test.csv carries features only.
    test_features = test.copy()
else:
    # Rare case: keep the true labels aside, ONLY for evaluation.
    y_test_true = test[TARGET]
    test_features = test.drop(columns=[TARGET])


# -------------------- STEP 5: SAVE ORIGINAL TEST IDs -------
# ❗ The submission id MUST match the id in test.csv.
#
# KAGGLE NOTE:
#   When test.csv HAS an id column, re-use it -- NEVER generate new ids.
#
# EXAM NOTE:
#   When test.csv has NO id column AND the question explicitly asks to
#   "create id starting from 1", range(1, len(test) + 1) is CORRECT.
#
# Alternative (kept for reference) when ids must start at another offset:
#     test_ids = pd.Series(range(15000, 15000 + len(test)), name="id")

test_ids = (
    test["id"]
    if "id" in test.columns
    else pd.Series(range(1, len(test) + 1), name="id")
)


# -------------------- STEP 6: DROP ID FROM FEATURES --------
# The id is an identifier, NEVER a feature; errors="ignore" makes the drop a
# no-op for a frame that has no such column.
train_features = train_features.drop(columns=["id"], errors="ignore")
test_features = test_features.drop(columns=["id"], errors="ignore")


# -------------------- STEP 7: HANDLE MISSING VALUES --------
# Fill values are learned from the TRAINING data and re-used for the test
# set, so both frames are imputed with the same statistics and nothing about
# the test distribution leaks into preprocessing.
num_cols_train = train_features.select_dtypes(include=np.number).columns
num_cols_test = test_features.select_dtypes(include=np.number).columns
cat_cols_train = train_features.select_dtypes(include="object").columns
cat_cols_test = test_features.select_dtypes(include="object").columns

# --- Numeric columns: median imputation ---
train_medians = train_features[num_cols_train].median()
train_features[num_cols_train] = train_features[num_cols_train].fillna(
    train_medians
)

# Use the train median for every numeric test column; fall back to the test
# column's own median only where train offers none (column is not numeric in
# train, or is entirely NaN there).
test_medians = test_features[num_cols_test].median()
test_fill_values = train_medians.reindex(num_cols_test).fillna(test_medians)
test_features[num_cols_test] = test_features[num_cols_test].fillna(
    test_fill_values
)

# --- Categorical columns: most-frequent-value imputation ---
for col in cat_cols_train:
    modes = train_features[col].mode()
    if modes.empty:
        # Entirely-NaN column: there is no mode to impute with.
        continue
    fill_value = modes.iloc[0]
    train_features[col] = train_features[col].fillna(fill_value)
    if col in test_features.columns:
        test_features[col] = test_features[col].fillna(fill_value)

# Object columns that are categorical only in test have no train mode, so
# they keep the original behaviour of using their own most frequent value.
for col in cat_cols_test.difference(cat_cols_train):
    modes = test_features[col].mode()
    if not modes.empty:
        test_features[col] = test_features[col].fillna(modes.iloc[0])


# -------------------- STEP 8: ENCODE FEATURES --------------
# Each categorical column gets its own LabelEncoder fitted on train only.
# Test values that were never seen in train are encoded as -1.
for col in cat_cols_train:
    le = LabelEncoder()
    train_features[col] = le.fit_transform(train_features[col])

    # transform() returns a label's position in classes_, so a dict built
    # from classes_ gives the same codes with one hash lookup per row
    # instead of one encoder call plus a linear scan per row.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    test_features[col] = (
        test_features[col]
        .map(code_by_label)   # unseen labels become NaN ...
        .fillna(-1)           # ... and are then marked as -1
        .astype("int64")
    )


# -------------------- STEP 9: TARGET HANDLING --------------
# An object-dtype target is treated as class labels and converted to integer
# codes; a numerical target → nothing to do.
is_categorical_target = y.dtype == "object"
target_encoder = LabelEncoder() if is_categorical_target else None

if target_encoder is not None:
    y = target_encoder.fit_transform(y)


# -------------------- STEP 10: FEATURE SCALING -------------
# The scaler is fitted on the training features and then applied unchanged
# to the test features.
scaler = StandardScaler().fit(train_features)
X_scaled = scaler.transform(train_features)
test_scaled = scaler.transform(test_features)


# -------------------- STEP 11: TRAIN-TEST SPLIT ------------
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, **split_options
)


# -------------------- STEP 12: MODEL SELECTION -------------
# RandomForestClassifier → binary + multiclass
# RandomForestRegressor  → numerical target
model_class = (
    RandomForestClassifier if is_categorical_target else RandomForestRegressor
)
model = model_class(random_state=42)
model.fit(X_train, y_train)


# -------------------- STEP 13: EVALUATION ------------------
# Score the baseline model on the held-out validation rows.
val_pred = model.predict(X_val)

if not is_categorical_target:
    validation_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print("Validation RMSE:", validation_rmse)
else:
    validation_accuracy = accuracy_score(y_val, val_pred)
    print("Validation Accuracy:", validation_accuracy)
    print(classification_report(y_val, val_pred))


# -------------------- STEP 14: HYPERPARAMETER TUNING -------
# Small 2 x 2 grid searched with 3-fold cross-validation on the training
# split; the best estimator found is kept for the final predictions.
params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10],
)

grid = GridSearchCV(estimator=model, param_grid=params, cv=3)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_


# -------------------- STEP 15: PREDICT TEST ----------------
test_pred = best_model.predict(test_scaled)

# Classifier output is integer codes; map it back to the original labels.
needs_label_decoding = is_categorical_target and target_encoder is not None
if needs_label_decoding:
    test_pred = target_encoder.inverse_transform(test_pred)


# =========================================================
# 🔴 STEP 15B: OPTIONAL TEST EVALUATION
# (ONLY when test already had target AND question asks)
#
# The code below is wrapped in a triple-quoted string, so it is a no-op
# expression and never runs. To enable it, delete the two lines that contain
# only the triple quotes.
# It compares the true test labels with the (already decoded) predictions:
# accuracy for a categorical target, RMSE for a numerical one.
# NOTE: y_test_true is only defined when test_has_target is True, which is
# why the whole block sits under that guard.
# =========================================================
"""
if test_has_target:
    print("\n--- Test Set Evaluation ---")
    if is_categorical_target:
        print("Test Accuracy:", accuracy_score(y_test_true, test_pred))
    else:
        print("Test RMSE:", np.sqrt(mean_squared_error(y_test_true, test_pred)))
"""
# =========================================================


# =========================================================
# 🔴 STEP 16A: SINGLE-COLUMN SUBMISSION (DEFAULT – USE THIS)
# Two columns are written: the saved test ids and one prediction per row,
# stored under the target's own name.
# =========================================================
submission = pd.DataFrame({"id": test_ids})
submission[TARGET] = test_pred
submission.to_csv("submission.csv", index=False)


# =========================================================
# 🔴 STEP 16B: MULTICLASS PROBABILITY SUBMISSION (OPTIONAL)
# ❌ KEEP COMMENTED
# ❌ UNCOMMENT ONLY IF QUESTION ASKS FOR PROBABILITIES
#
# The code below is wrapped in a triple-quoted string, so it never runs.
# When enabled it overwrites submission.csv with one probability column per
# class instead of a single predicted label.
# NOTE(review): the column names are hard-coded for TARGET = "Status" with
# exactly three classes; rename them (and add/remove columns) for any other
# target. Presumably probs[:, 0..2] follow the order C, CL, D -- confirm
# against target_encoder.classes_ before submitting.
# =========================================================
"""
probs = best_model.predict_proba(test_scaled)

submission = pd.DataFrame({
    "id": test_ids,
    "Status_C":  probs[:, 0],
    "Status_CL": probs[:, 1],
    "Status_D":  probs[:, 2]
})

submission.to_csv("submission.csv", index=False)
"""
# =========================================================

# Final confirmation that the script ran to the end.
print("submission.csv created successfully")