# In [None]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Read the cleaned, model-ready housing dataset
df = pd.read_csv("data/clean/clean_housing_tunisia_model_ready.csv")

# Derived numeric features (optional, can toggle per run)
df["rooms_per_surface"] = df["rooms"].div(df["surface"])
df["surface_squared"] = df["surface"].pow(2)

# Columns fed to the models: numeric ones pass through unchanged,
# categorical ones are one-hot encoded (first level dropped).
numeric_features = ["surface", "surface_squared", "rooms", "rooms_per_surface"]
categorical_features = ["governorate", "property_type"]

X = pd.get_dummies(
    df[numeric_features + categorical_features],
    columns=categorical_features,
    drop_first=True,
)

# Regression target
y = df["log_price"]

# Split function (toggle between random or grouped)
def get_split(X, y, method="random", groups=None):
    """Split features and target into train and test subsets (80/20).

    Args:
        X: Feature DataFrame.
        y: Target Series, aligned row-for-row with ``X``.
        method: ``"grouped"`` holds out whole groups via ``GroupShuffleSplit``;
            any other value performs a plain random ``train_test_split``.
        groups: Group label for each row of ``X``; only used when
            ``method == "grouped"``. When omitted, the module-level
            ``df["governorate"]`` column is used, which is only valid if
            ``X`` has exactly the same rows as ``df``. Pass explicit labels
            when splitting a filtered or re-ordered subset.

    Returns:
        ``X_train, X_test, y_train, y_test`` in that order.
    """
    if method != "grouped":
        return train_test_split(X, y, test_size=0.2, random_state=42)

    if groups is None:
        # Backward-compatible default: group by governorate of the full frame.
        groups = df["governorate"]

    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    train_idx, test_idx = next(gss.split(X, y, groups=groups))
    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]

# Candidate regressors, keyed by the label used in the MLflow run names.
# Hyperparameters are kept in named dicts so they are easy to tweak per run.
rf_settings = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_leaf": 1,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_settings = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}

models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(**rf_settings),
    "XGBoost": XGBRegressor(**xgb_settings),
}

mlflow.set_experiment("house-price-tunisia")

# Evaluate every model under each splitting strategy.
# Each (model, split) pair is recorded as its own MLflow run.
for split_method in ("random", "grouped"):
    X_train, X_test, y_train, y_test = get_split(X, y, method=split_method)

    for name, model in models.items():
        run_name = f"{name}_{split_method}_split"
        with mlflow.start_run(run_name=run_name):
            # Fit on the training portion, then predict the held-out portion
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Scores are computed on the same scale as the target column
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)

            # Record the scores and the run configuration
            mlflow.log_metrics({"rmse": rmse, "r2": r2})
            mlflow.log_params(
                {
                    "model": name,
                    "split": split_method,
                    "features": list(X.columns),
                }
            )

            # Persist the fitted estimator with the run
            mlflow.sklearn.log_model(model, name="model")

        

            
