In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import mlflow
import sys
sys.path.append("..")
from preprocessing.custom_transformers import CustomPreprocessor , CorrelationFeatureDropper, MissingValueHandler

In [2]:
# Notebook-wide pandas display options: no cap on the number of displayed
# columns, no fixed display width, and no wrapping of wide frames over several lines.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the training data from the CSV one directory above the notebook.
df = pd.read_csv(filepath_or_buffer='../train.csv')

# Train/Test Split



In [4]:
# Features are every column except the row identifier and the target.
X = df.drop(columns=["Id", "SalePrice"])
Y = df["SalePrice"]

# 80/20 hold-out split with a fixed seed; each of the four pieces gets a fresh
# 0..n-1 index so the shuffled row labels from `df` are discarded.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True)
    for part in train_test_split(X, Y, test_size=0.2, random_state=42)
]

In [5]:
# Columns stored with dtype 'object' are treated as categorical.
cat_cols = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']

# Number of distinct values per categorical column, counted on the training split.
s = X_train[cat_cols].nunique()

# More than 3 distinct values -> WOE list; 3 or fewer -> one-hot list.
woe_columns = s.index[s > 3].tolist()
one_hot_columns = s.index[s <= 3].tolist()

# Build Pipeline

In [7]:
# End-to-end model: project-specific preprocessing steps, then scaling, then Ridge.
# The step names are referenced by the grid-search parameter keys
# ('correlation_filter__...', 'ridge__...'), so they must not change.
pipeline = Pipeline(
    steps=[
        (
            'missing_handler',
            MissingValueHandler(num_strategy='median', cat_strategy='mode'),
        ),
        (
            'preprocessing',
            CustomPreprocessor(
                woe_columns=woe_columns,
                one_hot_columns=one_hot_columns,
                nan_drop_threshold=0.9,
            ),
        ),
        ('correlation_filter', CorrelationFeatureDropper()),
        ('scaler', StandardScaler()),
        ('ridge', Ridge()),
    ]
)

# GridSearch

In [8]:
# Hyper-parameter grid: 3 correlation thresholds x 5 Ridge penalties = 15 candidates.
# Keys follow the '<pipeline step>__<parameter>' convention.
param_grid = dict(
    correlation_filter__threshold=[0.75, 0.8, 0.9],
    ridge__alpha=[0.1, 1, 10, 25, 100],
)

In [9]:
# Shuffled 5-fold cross-validation with a fixed seed so fold assignment is repeatable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over `param_grid`, scored by negated RMSE, using all CPU cores.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [10]:
# Predict on the hold-out split and on the training split with the fitted search object.
y_pred_test, y_pred_train = (
    search.predict(features) for features in (X_test, X_train)
)

def log_regression_metrics(y_true, y_pred, prefix="test"):
    """Compute RMSE, MAE and R² and record them in the active MLflow run, if any.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    prefix : str, default "test"
        Prepended to the metric names (``"<prefix>_rmse"``, ``"<prefix>_mae"``,
        ``"<prefix>_r2"``).

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``.

    Notes
    -----
    Metrics are sent to MLflow only when a run is already active. Calling
    ``mlflow.log_metric`` with no active run would implicitly start a new,
    unnamed run and leave it open, which is how orphan runs were created when
    this helper was called outside a ``with mlflow.start_run(...)`` block.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Only log into a run the caller opened explicitly; never auto-create one.
    if mlflow.active_run() is not None:
        for metric_name, metric_value in (("rmse", rmse), ("mae", mae), ("r2", r2)):
            mlflow.log_metric(f"{prefix}_{metric_name}", metric_value)

    return rmse, mae, r2

# Metrics for each split, computed through log_regression_metrics.
train_rmse, train_mae, train_r2 = log_regression_metrics(
    y_true=y_train, y_pred=y_pred_train, prefix="train"
)
test_rmse, test_mae, test_r2 = log_regression_metrics(
    y_true=y_test, y_pred=y_pred_test, prefix="test"
)

# ==== Summary ====
# One print call, one line per argument, so the output matches three separate prints.
print(
    "\n=== Ridge Regression Summary ===",
    f"Train RMSE: {train_rmse:.2f} | MAE: {train_mae:.2f} | R²: {train_r2:.4f}",
    f"Test  RMSE: {test_rmse:.2f} | MAE: {test_mae:.2f} | R²: {test_r2:.4f}",
    sep="\n",
)




=== Ridge Regression Summary ===
Train RMSE: 30319.78 | MAE: 18949.14 | R²: 0.8459
Test  RMSE: 34302.63 | MAE: 20357.75 | R²: 0.8466


In [11]:
# Hyper-parameter combination selected by the grid search (shown as the cell output).
search.best_params_

{'correlation_filter__threshold': 0.9, 'ridge__alpha': 100}

In [12]:
def plot_actual_vs_predicted(y_true, y_pred, title, filename):
    """Save a scatter plot of predicted vs. actual values to ``filename``.

    A dashed red diagonal spanning ``[y_true.min(), y_true.max()]`` marks where
    perfect predictions would lie. The figure is written to disk and then
    closed; nothing is displayed or returned.

    Parameters
    ----------
    y_true : object with ``.min()`` / ``.max()`` (e.g. pandas Series, numpy array)
        Ground-truth values (x axis).
    y_pred : array-like
        Predicted values (y axis), aligned with ``y_true``.
    title : str
        Figure title.
    filename : str, path-like or file-like
        Destination passed to ``savefig``. When it is a path, missing parent
        directories are created first.
    """
    # Local import so the function works no matter which notebook cell imported `os`.
    import os

    # savefig does not create directories; make sure the target folder exists.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Explicit figure/axes instead of implicit pyplot state, so exactly this
    # figure is saved and closed even if other figures are open.
    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.scatter(y_true, y_pred, alpha=0.5)
        lower, upper = y_true.min(), y_true.max()
        ax.plot([lower, upper], [lower, upper], color='red', linestyle='--')
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predicted")
        ax.set_title(title)
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Always release the figure, including when plotting or saving fails.
        plt.close(fig)

In [13]:
def eval_regression(y_true, y_pred):
    """Return regression quality metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``"RMSE"``, ``"MAE"`` and ``"R2"`` (in that order).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    metrics = {"RMSE": np.sqrt(squared_error)}
    metrics["MAE"] = mean_absolute_error(y_true, y_pred)
    metrics["R2"] = r2_score(y_true, y_pred)
    return metrics

In [16]:
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance

# Run the grid search inside a named MLflow run and log parameters, metrics,
# the best model, diagnostic plots and the full CV results table.

# Set experiment name
experiment_name = "Ridge Linear Regression"
run_name = "Ridge with custom feature Removal"

# Artifact locations ('/'-separated so the logged paths are the same on every OS).
plots_dir = "plots/Ridge"
train_plot_path = f"{plots_dir}/train_actual_vs_pred.png"
test_plot_path = f"{plots_dir}/test_actual_vs_pred.png"
cv_results_path = "ridge_cv_results.csv"

# Set up MLflow experiment
mlflow.set_experiment(experiment_name)

# Close any run left open by an earlier cell so this cell logs into a fresh run.
if mlflow.active_run():
    mlflow.end_run()

with mlflow.start_run(run_name=run_name) as mlflow_run:
    run_id = mlflow_run.info.run_id
    print(f"MLflow Run ID: {run_id}")

    # Log experiment-level params that are known before fitting.
    # The correlation threshold is tuned by the grid search, so it is logged
    # after the fit from the selected parameters instead of being hard-coded.
    mlflow.log_params({
        "model_type": "Ridge",
        "cv_folds": cv.n_splits,
        "scoring": "neg_root_mean_squared_error",
        "preprocessing": "WOE + OneHot + Scaling + CorrelationDropper + custom Feature removing"
    })

    search.fit(X_train, y_train)

    y_train_pred = search.predict(X_train)
    y_test_pred = search.predict(X_test)

    train_metrics = eval_regression(y_train, y_train_pred)
    test_metrics = eval_regression(y_test, y_test_pred)

    mlflow.log_param("best_params", search.best_params_)

    # Record the threshold the search actually selected (absent if it was not tuned).
    best_threshold = search.best_params_.get("correlation_filter__threshold")
    if best_threshold is not None:
        mlflow.log_param("correlation_threshold", best_threshold)

    for split_name, split_metrics in (("train", train_metrics), ("test", test_metrics)):
        for k, v in split_metrics.items():
            mlflow.log_metric(f"{split_name}_{k.lower()}", v)

    mlflow.sklearn.log_model(search.best_estimator_, artifact_path="ridge_best_model")

    # The plot directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(plots_dir, exist_ok=True)

    plot_actual_vs_predicted(y_train, y_train_pred, "Train: Actual vs Predicted", train_plot_path)
    plot_actual_vs_predicted(y_test, y_test_pred, "Test: Actual vs Predicted", test_plot_path)

    mlflow.log_artifact(train_plot_path)
    mlflow.log_artifact(test_plot_path)

    # Full per-candidate cross-validation table, for later inspection.
    cv_results_df = pd.DataFrame(search.cv_results_)
    cv_results_df.to_csv(cv_results_path, index=False)
    mlflow.log_artifact(cv_results_path)

MLflow Run ID: d88eafdfa0344b10bd2bc38666925937
Fitting 5 folds for each of 15 candidates, totalling 75 fits




🏃 View run Ridge with custom feature Removal at: https://dagshub.com/losaberidzebadri/House-Prices-Regression.mlflow/#/experiments/2/runs/d88eafdfa0344b10bd2bc38666925937
🧪 View experiment at: https://dagshub.com/losaberidzebadri/House-Prices-Regression.mlflow/#/experiments/2


In [15]:
# NOTE(review): this cell's execution count (15) is lower than that of the MLflow
# logging cell placed before it (16), i.e. it was executed first but sits later in
# the notebook. On a top-to-bottom "Run All" it would only run after the logging
# cell. Presumably it must run first so that MLflow tracks to the DagsHub-hosted
# server (the run URLs printed by the logging cell are on dagshub.com) — TODO
# confirm and move this cell above the MLflow cell.
import dagshub
dagshub.init(repo_owner='losaberidzebadri', repo_name='House-Prices-Regression', mlflow=True)