In [None]:
# =============================================================
# 🔥 BLOCK 1 — Import Required Libraries
# =============================================================

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    mean_squared_error, r2_score, mean_absolute_error
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import roc_curve, auc

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [None]:
# =============================================================
# 🔥 BLOCK 2 — Load Dataset
# =============================================================

# Read both CSV splits (adjust the paths for your environment).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("/mnt/data/train.csv", "/mnt/data/test.csv")
)

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# The label is taken to be the right-most column of the training frame.
target = train.columns[-1]
print("Detected Target Column:", target)


In [None]:
# =============================================================
# 🔥 BLOCK 3 — Exploratory Data Analysis (EDA)
# =============================================================

# Pairplot of (at most) the first five numeric columns.
# NOTE: `num_cols` is reused by the outlier-removal cell further down.
num_cols = train.select_dtypes(include=[np.number]).columns[:5]
if len(num_cols) >= 2:
    sns.pairplot(train[num_cols], diag_kind="kde")
    plt.show()

# Correlation heatmap.
# Restrict to numeric columns: at this point the frame may still hold raw
# string columns, and DataFrame.corr() raises on those in pandas >= 2.0.
numeric_train = train.select_dtypes(include=[np.number])
if numeric_train.shape[1] > 0:
    plt.figure(figsize=(10, 6))
    sns.heatmap(numeric_train.corr(), cmap="coolwarm")
    plt.title("Correlation Heatmap")
    plt.show()

# Boxplot for outliers (skipped when there is nothing numeric to draw).
if len(num_cols) > 0:
    train[num_cols].plot(kind="box", figsize=(10, 6))
    plt.title("Boxplot (Outlier Visualization)")
    plt.show()


In [None]:
# =============================================================
# 🔥 BLOCK 4 — Outlier Removal (IQR)
# =============================================================

def remove_outliers(df, cols, factor=1.5):
    """Drop rows whose value in any of ``cols`` lies outside the IQR fences.

    The columns are processed one after another; each column's quartiles are
    computed on the rows that survived the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified.
    cols : iterable of column labels
        Numeric columns to check.
    factor : float, default 1.5
        Fence width as a multiple of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that fall inside ``[Q1 - factor*IQR, Q3 + factor*IQR]``
        for every checked column. Rows holding NaN in a checked column are
        dropped as well, because NaN never compares as "inside" the fences.
    """
    df_clean = df.copy()
    for c in cols:
        q1, q3 = df_clean[c].quantile([0.25, 0.75])
        iqr = q3 - q1
        # A column without spread (e.g. a heavily skewed 0/1 flag) would get
        # the fences [Q1, Q1] and lose every row that differs from the
        # majority value. Skip it; this also covers a NaN IQR (empty column).
        if not iqr > 0:
            continue
        low, high = q1 - factor * iqr, q3 + factor * iqr
        df_clean = df_clean[df_clean[c].between(low, high)]
    return df_clean

# Never filter on the label itself: with the IQR rule a rare class (or a
# legitimately extreme regression value) would be discarded as an "outlier".
outlier_cols = [c for c in num_cols if c != target]
train = remove_outliers(train, outlier_cols)
print("Shape after Outlier Removal:", train.shape)


In [None]:
# =============================================================
# 🔥 BLOCK 5 — Label Encoding
# =============================================================

# One fitted encoder per textual column, keyed by column name, so predictions
# can be mapped back to the original labels later on.
label_encoders = {}
cat_cols = train.select_dtypes(include=['object']).columns

for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

    # A textual target (or any other train-only column) does not exist in the
    # test frame; there is nothing to encode there.
    if col not in test.columns:
        continue

    # Map through the fitted vocabulary. Categories that never occurred in
    # the training data get the sentinel code -1 instead of making
    # LabelEncoder.transform raise ValueError.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test[col] = test[col].map(code_of).fillna(-1).astype(int)


In [None]:
# =============================================================
# 🔥 BLOCK 6 — Auto-Detect Classification or Regression
# =============================================================

# Numeric targets with fewer distinct values than this are treated as labels.
MAX_CLASSES_FOR_NUMERIC_TARGET = 20

# A target that went through label encoding was textual, i.e. categorical, no
# matter how many classes it has. The dtype check alone cannot detect that,
# because the encoding step has already replaced the strings with integers.
target_is_categorical = (
    target in label_encoders
    or train[target].dtype == "object"
)

if target_is_categorical or train[target].nunique() < MAX_CLASSES_FOR_NUMERIC_TARGET:
    PROBLEM_TYPE = "classification"
else:
    PROBLEM_TYPE = "regression"

print("Problem Type:", PROBLEM_TYPE.upper())


In [None]:
# =============================================================
# 🔥 BLOCK 7 — Train / Validation Split
# =============================================================

X = train.drop(target, axis=1)
y = train[target]

# For classification, stratify so that every class keeps its share in both the
# training and the validation part. Stratification needs at least two rows per
# class, so fall back to a plain random split when a class has a single row.
stratify_on = None
if PROBLEM_TYPE == "classification" and y.value_counts().min() >= 2:
    stratify_on = y

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)


In [None]:
# =============================================================
# 🔥 BLOCK 8 — Scaling (Numeric Only)
# =============================================================

# Partition the feature columns by dtype.
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(exclude=[np.number]).columns

# Standardise the numeric columns; hand every other column through untouched.
column_steps = [
    ("num", StandardScaler(), numeric_features),
    ("cat", "passthrough", categorical_features),
]
preprocess = ColumnTransformer(transformers=column_steps)


In [None]:
# =============================================================
# 🔥 BLOCK 9 — Build Model Pipeline (SMOTE + RF)
# =============================================================

# Every stochastic component is seeded with the same value as the
# train/validation split (42) so a fresh "Run All" reproduces the results.
if PROBLEM_TYPE == "classification":
    # Classification: preprocessing -> SMOTE oversampling -> random forest.
    model = RandomForestClassifier(random_state=42)
    pipe = ImbPipeline([
        ("preprocess", preprocess),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])
else:
    # Regression: preprocessing -> random forest, no resampling step.
    model = RandomForestRegressor(random_state=42)
    pipe = Pipeline([
        ("preprocess", preprocess),
        ("model", model)
    ])


In [None]:
# =============================================================
# 🔥 BLOCK 10 — Hyperparameter Tuning (RandomizedSearch)
# =============================================================

# Candidate values for the random forest; the "model__" prefix routes each
# parameter to the pipeline step named "model".
param_grid = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [5, 10, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
}

# Sample 5 parameter combinations and score each with 3-fold cross-validation.
# random_state fixes which combinations are drawn, so the search is repeatable
# (same seed as the train/validation split).
search = RandomizedSearchCV(
    pipe, param_grid, cv=3, n_iter=5,
    scoring="accuracy" if PROBLEM_TYPE=="classification" else "neg_mean_squared_error",
    n_jobs=-1, verbose=1,
    random_state=42,
)

search.fit(X_train, y_train)
best_model = search.best_estimator_

print("Best Model:", best_model)


In [None]:
# =============================================================
# 🔥 BLOCK 11 — Model Evaluation (with AUC Curve)
# =============================================================

# Score the tuned pipeline on the held-out validation rows.
y_pred = best_model.predict(X_val)

if PROBLEM_TYPE == "classification":
    print("Accuracy:", accuracy_score(y_val, y_pred))
    print("F1 Score:", f1_score(y_val, y_pred, average="weighted"))

    # Only for binary classification
    if len(np.unique(y)) == 2:
        # Column 1 of predict_proba belongs to classes_[1]. Naming that class
        # as pos_label keeps roc_curve working for label pairs other than
        # {0, 1} / {-1, 1} (e.g. {1, 2}).
        positive_class = best_model.named_steps["model"].classes_[1]
        y_proba = best_model.predict_proba(X_val)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, y_proba, pos_label=positive_class)
        roc_auc = auc(fpr, tpr)

        print("AUC:", roc_auc)

        plt.figure(figsize=(7,5))
        plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.4f}")
        # Diagonal = performance of a random classifier.
        plt.plot([0,1],[0,1],linestyle="--",color="gray")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve")
        plt.legend()
        plt.grid(True)
        plt.show()

else:
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae  = mean_absolute_error(y_val, y_pred)
    r2   = r2_score(y_val, y_pred)

    print("\nRMSE:", rmse)
    print("MAE:", mae)
    print("R² Score:", r2)


In [None]:
# =============================================================
# 🔥 BLOCK 12 — Train on Full Dataset
# =============================================================

# Refit the tuned pipeline on every labelled row (X, y), i.e. the training and
# the validation part together, before predicting on the test file.
# NOTE(review): this refits `best_model` in place, so re-running the evaluation
# cell afterwards would score a model that has already seen X_val.
best_model.fit(X, y)


In [None]:
# =============================================================
# 🔥 BLOCK 13 — Final Predictions + Inverse Transform
# =============================================================

# Predict on the test frame with the refitted pipeline.
preds = best_model.predict(test)

# When the target itself was label-encoded, translate the predicted codes back
# into the original class labels.
target_encoder = label_encoders.get(target)
if PROBLEM_TYPE == "classification" and target_encoder is not None:
    preds = target_encoder.inverse_transform(preds)


In [None]:
# =============================================================
# 🔥 BLOCK 14 — Save Submission CSV
# =============================================================

# Prefer the identifier column shipped with the test file. Fall back to the
# positional row index when there is no such column, or when the column was
# textual and has been replaced by label-encoder codes (no longer real ids).
if "id" in test.columns and "id" not in label_encoders:
    row_ids = test["id"].to_numpy()
else:
    row_ids = test.index

submission = pd.DataFrame({
    "id": row_ids,
    target: preds
})

submission.to_csv("submission.csv", index=False)

print("Submission saved as submission.csv")


In [None]:
# === Predict on test and save submission_regression.csv ===
# Uses the tuned pipeline (`best_model`) and the loaded test frame (`test`).
# Only written for regression problems: for classification the raw predictions
# are class codes, which do not belong in a regression submission.
if PROBLEM_TYPE == "regression":
    preds = best_model.predict(test)
    sub = pd.DataFrame({"id": test.index, "target": preds})
    sub.to_csv("submission_regression.csv", index=False)
    print("Saved submission_regression.csv (id, target).")
else:
    print("Skipped submission_regression.csv: problem type is", PROBLEM_TYPE)