# In [None]:
# =============================================================
# 🔥 UNIVERSAL ML PIPELINE (AUTO CLASS/REG)
# RandomForest + SMOTE + Scaling + Outliers + Tuning + EDA
# =============================================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, classification_report,
    mean_squared_error, r2_score, mean_absolute_error
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# ===============================
# 🔹 Load the dataset
# ===============================
# Both CSVs are read from fixed locations; replace the paths as needed.
train, test = pd.read_csv("/mnt/data/train.csv"), pd.read_csv("/mnt/data/test.csv")

# Report the (rows, columns) of each split as a quick sanity check.
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# ===============================
# 🔹 Detect Target Column
# ===============================
# The label is taken to be the right-most column of the training frame.
target = train.columns[-1]
print("Detected target:", target)

# ===============================
# 🔹 EDA
# ===============================

# Pairplot (first few numeric columns). `num_cols` is reused further down by
# the outlier-removal step, so the name must stay as is.
num_cols = train.select_dtypes(include=[np.number]).columns[:5]
if len(num_cols) >= 2:
    sns.pairplot(train[num_cols], diag_kind="kde")
    plt.show()

# Correlation heatmap. Restrict to numeric columns explicitly: DataFrame.corr()
# raises on object columns in pandas >= 2.0, and the heatmap cannot be drawn
# from an empty matrix.
numeric_frame = train.select_dtypes(include=[np.number])
if numeric_frame.shape[1] > 0:
    plt.figure(figsize=(10, 6))
    sns.heatmap(numeric_frame.corr(), cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.show()

# Boxplot for visualizing outliers (skipped when there is nothing numeric to
# plot, which would otherwise raise).
if len(num_cols) > 0:
    train[num_cols].plot(kind="box", figsize=(10, 6))
    plt.title("Boxplot (Outlier Visualization)")
    plt.show()

# ===============================
# 🔹 Outlier Removal (IQR)
# ===============================
def remove_outliers(df, cols, factor=1.5):
    """Drop rows whose values fall outside the IQR fences of the given columns.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the filtering of the previous
    columns. Rows holding NaN in any of ``cols`` are dropped as well, because
    NaN fails both fence comparisons.

    Args:
        df: Input DataFrame. It is copied, never modified in place.
        cols: Iterable of numeric column labels to filter on. An empty
            iterable returns an unfiltered copy.
        factor: Multiple of the inter-quartile range used to place the fences
            below Q1 and above Q3. Defaults to 1.5.

    Returns:
        A DataFrame containing only the rows inside every column's fences
        (fence values themselves are kept).
    """
    kept = df.copy()
    for col in cols:
        q1 = kept[col].quantile(0.25)
        q3 = kept[col].quantile(0.75)
        margin = factor * (q3 - q1)
        inside = (kept[col] >= q1 - margin) & (kept[col] <= q3 + margin)
        kept = kept[inside]
    return kept

# Never filter on the target itself: with a numeric class label the IQR fences
# can wipe out entire minority classes, and for regression they would silently
# discard the hardest-to-predict rows.
outlier_cols = [c for c in num_cols if c != target]
train = remove_outliers(train, outlier_cols)
print("Shape after outlier removal:", train.shape)

# ===============================
# 🔹 Label Encoding
# ===============================
# Maps every object column of `train` to integer codes and applies the same
# mapping to `test`. The fitted encoders are kept in `label_encoders` so the
# target predictions can be decoded later.
label_encoders = {}
cat_cols = train.select_dtypes(include=['object']).columns

for col in cat_cols:
    le = LabelEncoder()
    # Cast to str so that missing values (NaN) become an ordinary category
    # instead of breaking the encoder's sort on mixed str/float values.
    train_values = train[col].astype(str)

    if col in test.columns:
        test_values = test[col].astype(str)
        # Fit on the union of both splits: a category that only occurs in
        # test would otherwise make transform() raise.
        le.fit(pd.concat([train_values, test_values]))
        test[col] = le.transform(test_values)
    else:
        # Typically the target column, which the test file does not contain.
        le.fit(train_values)

    train[col] = le.transform(train_values)
    label_encoders[col] = le

# ===============================
# 🔹 Detect Classification or Regression
# ===============================
# By this point object columns have already been label-encoded to integers,
# so a dtype check alone can no longer recognise a string target. A target
# that has an entry in `label_encoders` was categorical and is therefore a
# classification target no matter how many classes it has.
target_was_encoded = target in label_encoders
if (
    target_was_encoded
    or train[target].dtype == 'object'
    or train[target].nunique() < 20
):
    PROBLEM_TYPE = "classification"
else:
    PROBLEM_TYPE = "regression"

print("Problem Type:", PROBLEM_TYPE.upper())

# ===============================
# 🔹 Train / Validation split
# ===============================
X = train.drop(target, axis=1)
y = train[target]

# Stratify classification splits so that rare classes show up in both parts.
# Stratification is only possible when every class has at least two rows and
# the validation part (20% of the rows) can hold one row per class; otherwise
# fall back to a plain random split.
can_stratify = (
    PROBLEM_TYPE == "classification"
    and y.value_counts().min() >= 2
    and int(np.ceil(0.2 * len(y))) >= y.nunique()
)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42,
    stratify=y if can_stratify else None,
)

# ===============================
# 🔹 Preprocessing (scaling)
# ===============================
# Split the feature columns by dtype: numeric ones get standardised, anything
# else is handed to the model unchanged.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", "passthrough", categorical_features),
    ]
)

# ===============================
# 🔹 Build Pipeline (SMOTE only for classification)
# ===============================
# Every stochastic component is seeded with the same value as the
# train/validation split (42) so that repeated runs give the same model.
if PROBLEM_TYPE == "classification":
    model = RandomForestClassifier(random_state=42)
    # The imblearn pipeline is required here: it runs the SMOTE step while
    # fitting only, so validation and test rows are never resampled.
    pipe = ImbPipeline([
        ("preprocess", preprocess),
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ])
else:
    model = RandomForestRegressor(random_state=42)
    pipe = Pipeline([
        ("preprocess", preprocess),
        ("model", model),
    ])

# ===============================
# 🔹 Hyperparameter Search
# ===============================
# Candidate values for the random forest; the "model__" prefix routes each
# parameter to the pipeline step named "model".
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [5, 10, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

# Five random candidates, each scored with 3-fold cross-validation.
# random_state pins which candidates are drawn, so the search is repeatable
# and consistent with the seeded train/validation split.
search = RandomizedSearchCV(
    pipe, param_grid, cv=3, n_iter=5,
    scoring="accuracy" if PROBLEM_TYPE == "classification" else "neg_mean_squared_error",
    n_jobs=-1, verbose=1, random_state=42
)

search.fit(X_train, y_train)
# Pipeline refitted on all of X_train with the best parameter combination.
best_model = search.best_estimator_

print("Best Model:", best_model)

# ===============================
# 🔹 Evaluation
# ===============================
# Score the tuned pipeline on the held-out validation rows.
y_pred = best_model.predict(X_val)

if PROBLEM_TYPE == "classification":
    print("\nAccuracy:", accuracy_score(y_val, y_pred))
    print("F1 Score:", f1_score(y_val, y_pred, average="weighted"))

    # AUC is reported for binary targets only, and roc_auc_score raises when
    # the validation labels contain a single class, so require both classes.
    if len(np.unique(y)) == 2 and len(np.unique(y_val)) == 2:
        y_proba = best_model.predict_proba(X_val)[:, 1]
        print("AUC:", roc_auc_score(y_val, y_proba))

else:
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print("\nRMSE:", rmse)
    print("MAE:", mae)
    # The label previously printed as mojibake ("RÂ²"); restore the intended text.
    print("R² Score:", r2)

# ===============================
# 🔹 Train Full Model & Final Predictions
# ===============================
# Refit the tuned pipeline on every labelled row (train + validation), then
# predict the test set with it.
preds = best_model.fit(X, y).predict(test)

# A label-encoded classification target is mapped back to its original labels.
needs_decoding = PROBLEM_TYPE == "classification" and target in label_encoders
if needs_decoding:
    preds = label_encoders[target].inverse_transform(preds)

# ===============================
# 🔹 Submission File
# ===============================
# Use the dataset's own "id" column when the test file has one; the positional
# row index is only a fallback, since it rarely matches the real identifiers.
if "id" in test.columns:
    ids = test["id"].to_numpy()
    if "id" in label_encoders:
        # The id column was label-encoded along with the other object columns;
        # restore the original identifiers for the output file.
        ids = label_encoders["id"].inverse_transform(ids)
else:
    ids = test.index

submission = pd.DataFrame({"id": ids, target: preds})
submission.to_csv("submission.csv", index=False)

print("\nSubmission saved as: submission.csv")
