<a href="https://colab.research.google.com/github/aditiSharma55555/ml_mse2/blob/main/universal3ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# FINAL UNIVERSAL MACHINE LEARNING TEMPLATE (TABULAR DATA)
#
# Handles:
# 1) CATEGORICAL targets  → Status, Class, NObeyesdad
# 2) NUMERICAL targets    → price, MEDV
# 3) Missing values in FEATURES
# 4) Missing values in TARGET
# 5) Test WITHOUT target (normal Kaggle)
# 6) Test WITH target (rare exam case)
# 7) Single-column output (DEFAULT)
# 8) Multiclass PROBABILITY output (OPTIONAL – COMMENTED)
#
# 🔹 MODEL NOTE:
# - Random Forest (USED BELOW) is BEST for TABULAR data
# - For HIGH-DIMENSIONAL SPARSE DATA (TF-IDF / TEXT):
#     ➜ Logistic Regression or Linear SVM is preferred
#     ➜ See COMMENTED sections below for how to switch
#
# ❗ IMPORTANT:
# - This code WORKS AS-IS for normal tabular ML questions
# - Logistic / SVM part is ONLY COMMENTED for reference
# =========================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# 🔹 OPTIONAL MODELS (DO NOT IMPORT UNLESS NEEDED)
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC


# -------------------- STEP 1: Load Data --------------------
# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))


# -------------------- STEP 2: DEFINE TARGET ----------------
# 🔴 The ONLY line to edit per question: the name of the column to predict.
TARGET = "Status"    # e.g. "Status", "Class", "NObeyesdad", "price"


# -------------------- STEP 3: CHECK IF TEST CONTAINS TARGET
# `in` on a DataFrame tests membership among its column labels.
test_has_target = TARGET in test
print("Test contains target:", test_has_target)


# -------------------- STEP 4: Separate Target & Features ---
# Rows whose target is missing cannot be used for supervised training,
# so they are excluded from both the labels and the feature table.
mask = train[TARGET].notna()
y = train.loc[mask, TARGET]
train_features = train.drop(columns=[TARGET])[mask]

# Test features never include the target column. When test.csv ships one,
# it is set aside in y_test_true (intended for evaluation only).
test_features = test.drop(columns=[TARGET]) if test_has_target else test.copy()
if test_has_target:
    y_test_true = test[TARGET]


# -------------------- STEP 5: SAVE ORIGINAL TEST IDs -------
# ❗ The submission must carry the ids exactly as they appear in test.csv.
# When test.csv has no "id" column, ids 1..len(test) are generated instead.
test_ids = (
    test["id"]
    if "id" in test.columns
    else pd.Series(range(1, len(test) + 1), name="id")
)


# -------------------- STEP 6: DROP ID FROM FEATURES --------
# The identifier is removed from both feature tables, in place.
# errors="ignore" makes the call a no-op when there is no "id" column.
for frame in (train_features, test_features):
    frame.drop(columns="id", errors="ignore", inplace=True)


# -------------------- STEP 7: HANDLE MISSING VALUES --------
# Imputation statistics (median for numeric columns, mode for object columns)
# are learned from the TRAINING data and reused for the test data, so both
# tables are preprocessed identically. The test set's own statistics are used
# only as a fallback for columns that have no usable training statistic.
MISSING_CATEGORY = "missing"  # placeholder for an object column with no observed value

num_cols_train = train_features.select_dtypes(include=np.number).columns
num_cols_test = test_features.select_dtypes(include=np.number).columns

train_medians = train_features[num_cols_train].median()
train_features[num_cols_train] = train_features[num_cols_train].fillna(train_medians)

# Train median where available (and not NaN), otherwise the test median.
test_medians = test_features[num_cols_test].median()
test_fill = train_medians.reindex(num_cols_test).fillna(test_medians)
test_features[num_cols_test] = test_features[num_cols_test].fillna(test_fill)

cat_cols_train = train_features.select_dtypes(include="object").columns
cat_cols_test = test_features.select_dtypes(include="object").columns

train_modes = {}
for col in cat_cols_train:
    observed = train_features[col].mode()
    # mode() is empty when every value in the column is missing; indexing it
    # with [0] would raise IndexError, so fall back to the placeholder.
    train_modes[col] = observed.iloc[0] if not observed.empty else MISSING_CATEGORY
    train_features[col] = train_features[col].fillna(train_modes[col])

for col in cat_cols_test:
    if col in train_modes:
        fill_value = train_modes[col]
    else:
        # Column is not an object column in train: use the test column itself.
        observed = test_features[col].mode()
        fill_value = observed.iloc[0] if not observed.empty else MISSING_CATEGORY
    test_features[col] = test_features[col].fillna(fill_value)


# -------------------- STEP 8: ENCODE FEATURES --------------
# Each object column gets its own LabelEncoder, fitted on the training values.
# Test values are translated through a label -> code dictionary built once per
# column, instead of calling le.transform() and scanning le.classes_ for every
# single row. Labels that never occurred in training are encoded as -1.
for col in cat_cols_train:
    le = LabelEncoder()
    train_features[col] = le.fit_transform(train_features[col])

    code_of = {label: code for code, label in enumerate(le.classes_)}
    # map() yields NaN for labels missing from the dictionary; those become -1.
    test_features[col] = (
        test_features[col].map(code_of).fillna(-1).astype("int64")
    )


# -------------------- STEP 9: TARGET HANDLING --------------
# A target is treated as categorical (classification) when its dtype is
# boolean or not numeric at all: object, pandas string, category, ...
# Comparing the dtype with "object" alone misses those other dtypes and would
# send such a target to the regressor.
# Numeric targets are still treated as regression targets.
target_encoder = None
is_categorical_target = bool(
    pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y)
)

if is_categorical_target:
    # Class labels become integer codes; the fitted encoder is kept so that
    # predictions can be converted back to the original labels later.
    target_encoder = LabelEncoder()
    y = target_encoder.fit_transform(y)


# -------------------- STEP 10: FEATURE SCALING -------------
# NOTE:
# - Logistic Regression / SVM REQUIRE scaled inputs
# - Random Forest does NOT strictly need scaling
# The scaler is fitted on the training features only and then applied to
# both the training and the test features.
scaler = StandardScaler().fit(train_features)
X_scaled = scaler.transform(train_features)
test_scaled = scaler.transform(test_features)


# -------------------- STEP 11: TRAIN-TEST SPLIT ------------
# 80% of the rows are used for fitting and 20% are held out for validation;
# the fixed random_state makes the split reproducible.
split_parts = train_test_split(X_scaled, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts


# -------------------- STEP 12: MODEL SELECTION -------------
# =========================================================
# DEFAULT MODEL (USED):
# Random Forest → BEST for TABULAR data
# =========================================================
# Classifier for categorical targets, regressor otherwise; fit() returns the
# estimator itself, so construction and training are chained.
forest_cls = RandomForestClassifier if is_categorical_target else RandomForestRegressor
model = forest_cls(random_state=42).fit(X_train, y_train)


# =========================================================
# 🔴 OPTIONAL MODEL SWITCH (COMMENTED — DO NOT TOUCH)
# Use ONLY for HIGH-DIMENSIONAL SPARSE DATA (TF-IDF / TEXT)
#
# if is_categorical_target:
#     model = LogisticRegression(max_iter=1000)
#     model.fit(X_train, y_train)
#
# OR
#
# if is_categorical_target:
#     model = LinearSVC()
#     model.fit(X_train, y_train)
# =========================================================


# -------------------- STEP 13: EVALUATION ------------------
# Score the fitted model on the held-out validation rows: RMSE for
# regression, accuracy plus a per-class report for classification.
val_pred = model.predict(X_val)

if not is_categorical_target:
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print("Validation RMSE:", val_rmse)
else:
    val_accuracy = accuracy_score(y_val, val_pred)
    print("Validation Accuracy:", val_accuracy)
    print(classification_report(y_val, val_pred))


# -------------------- STEP 14: HYPERPARAMETER TUNING -------
# Exhaustive search over a small Random Forest grid with 3-fold
# cross-validation on the training split; the best estimator found is kept.
params = dict(n_estimators=[100, 200], max_depth=[None, 10])

grid = GridSearchCV(estimator=model, param_grid=params, cv=3)
best_model = grid.fit(X_train, y_train).best_estimator_


# =========================================================
# 🔴 IF USING LOGISTIC REGRESSION (COMMENTED)
# 👉 Comment the RandomForest GridSearch above
#
# The snippet below is wrapped in a bare triple-quoted string: Python
# evaluates the string and discards it, so none of this code runs.
# To use it, remove the surrounding quotes.
# =========================================================
"""
params = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
    "max_iter": [500, 1000]
}

grid = GridSearchCV(model, params, cv=3)
grid.fit(X_train, y_train)
best_model = grid.best_estimator_
"""
# =========================================================


# =========================================================
# 🔴 IF USING LinearSVC (COMMENTED)
# 👉 Skip GridSearch completely
#
# Also a bare string, not executed: it shows that the already-fitted
# model would be used directly as best_model.
# =========================================================
"""
best_model = model
"""
# =========================================================


# -------------------- STEP 15: PREDICT TEST ----------------
test_pred = best_model.predict(test_scaled)

# Classification predictions are integer codes; convert them back to the
# original class labels.
if is_categorical_target and target_encoder is not None:
    test_pred = target_encoder.inverse_transform(test_pred)

# When test.csv ships the target, y_test_true was set aside in STEP 4 for
# evaluation. Score the predictions against it, skipping rows whose true
# value is missing.
if test_has_target:
    labelled = y_test_true.notna().to_numpy()
    if labelled.any():
        if is_categorical_target:
            print(
                "Test Accuracy:",
                accuracy_score(y_test_true[labelled], test_pred[labelled]),
            )
        else:
            print(
                "Test RMSE:",
                np.sqrt(mean_squared_error(y_test_true[labelled], test_pred[labelled])),
            )


# -------------------- STEP 16: SUBMISSION -----------------
# Two-column file: the saved test ids and the prediction, stored under the
# target's own column name. The index is not written.
submission = pd.DataFrame({"id": test_ids})
submission[TARGET] = test_pred

submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully")
