In [16]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Custom Imputer for Missing Values
class MissingValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self, enable_mlflow_logging=False):
        self.num_cols = None
        self.cat_cols = None
        self.num_means = None
        self.enable_mlflow_logging = enable_mlflow_logging

    def fit(self, X, y=None):
        self.cat_cols = X.select_dtypes(include=['object']).columns.tolist()
        self.num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
        self.num_means = X[self.num_cols].mean()

        if self.enable_mlflow_logging:
            mlflow.log_dict(self.num_means.to_dict(), "imputer/num_means.json")
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy[self.cat_cols] = X_copy[self.cat_cols].fillna('Unknown')
        X_copy[self.num_cols] = X_copy[self.num_cols].fillna(self.num_means)

        if self.enable_mlflow_logging:
            nan_counts = X_copy.isna().sum()
            mlflow.log_dict(nan_counts[nan_counts > 0].to_dict(), 
                           "imputer/remaining_nans.json")
        return X_copy

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, drop_threshold=3, threshold=2):
        self.drop_threshold = drop_threshold
        self.threshold = threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_copy = X.copy()
        y_copy = y.copy() if y is not None else None

        # Only consider numeric columns
        numeric_cols = X_copy.select_dtypes(include=[np.number]).columns

        outlier_mask = np.zeros(X_copy.shape[0], dtype=int)

        for col in numeric_cols:
            Q1 = X_copy[col].quantile(0.25)
            Q3 = X_copy[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            is_outlier = (X_copy[col] < lower_bound) | (X_copy[col] > upper_bound)
            outlier_mask += is_outlier.fillna(False).astype(int)

        rows_to_keep = outlier_mask < self.drop_threshold
        return X_copy[rows_to_keep], (y_copy[rows_to_keep] if y_copy is not None else None)

# Corrected Custom Preprocessor for Encoding
# class CustomPreprocessor(BaseEstimator, TransformerMixin):
#     def __init__(self, kfold_cols, one_hot_cols, n_splits=5, smoothing=4):
#         self.kfold_cols = kfold_cols  # Corrected attribute name
#         self.one_hot_cols = one_hot_cols  # Corrected attribute name
#         self.n_splits = n_splits
#         self.smoothing = smoothing
#         self.kfold_mappings = {}
#         self.global_means = {}
# 
#     def fit(self, X, y):
#         kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
#         X_temp = X.copy()
#         X_temp['target'] = y
# 
#         for col in self.kfold_cols:  # Use corrected attribute name
#             global_mean = X_temp['target'].mean()
#             self.global_means[col] = global_mean
#             self.kfold_mappings[col] = {}
# 
#             for train_idx, val_idx in kf.split(X_temp):
#                 train_fold, val_fold = X_temp.iloc[train_idx], X_temp.iloc[val_idx]
#                 category_means = train_fold.groupby(col)['target'].mean()
#                 category_counts = train_fold.groupby(col)['target'].count()
#                 smoothed_means = (category_means * category_counts + global_mean * self.smoothing) / (category_counts + self.smoothing)
#                 self.kfold_mappings[col].update(smoothed_means.to_dict())
# 
#         return self
# 
#     def transform(self, X):
#         X_transformed = X.copy()
#         for col in self.kfold_cols:  # Use corrected attribute name
#             X_transformed[f'{col}_encoded'] = X_transformed[col].map(self.kfold_mappings[col]).fillna(self.global_means[col])
#             X_transformed.drop(columns=[col], inplace=True)
# 
#         X_transformed = pd.get_dummies(X_transformed, columns=self.one_hot_cols, drop_first=True, dummy_na=True, dtype=int)  # Use corrected attribute name
#         return X_transformed
# 
# class CustomPreprocessor(BaseEstimator, TransformerMixin):
#     def __init__(self, kfold_cols, one_hot_cols, n_splits=5, smoothing=4):
#         self.kfold_cols = kfold_cols
#         self.one_hot_cols = one_hot_cols
#         self.n_splits = n_splits
#         self.smoothing = smoothing
#         self.kfold_mappings = {}
#         self.global_means = {}
#         self.dummy_columns = []
# 
#     def fit(self, X, y):
#         kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
#         X_temp = X.copy()
#         X_temp['target'] = y
# 
#         for col in self.kfold_cols:
#             global_mean = X_temp['target'].mean()
#             self.global_means[col] = global_mean
#             self.kfold_mappings[col] = {}
# 
#             for train_idx, val_idx in kf.split(X_temp):
#                 train_fold = X_temp.iloc[train_idx]
#                 category_means = train_fold.groupby(col)['target'].mean()
#                 category_counts = train_fold.groupby(col)['target'].count()
#                 smoothed_means = (
#                     category_means * category_counts + global_mean * self.smoothing
#                 ) / (category_counts + self.smoothing)
#                 self.kfold_mappings[col].update(smoothed_means.to_dict())
# 
#         X_transformed = X.copy()
#         # for col in self.kfold_cols:
#         #     X_transformed[f'{col}_encoded'] = X_transformed[col].map(self.kfold_mappings[col]).fillna(self.global_means[col])
#         #     X_transformed.drop(columns=[col], inplace=True)
#         # Inside transform() of CustomPreprocessor
#         for col in self.kfold_cols:
#             if col in X_transformed.columns:
#                 X_transformed[f'{col}_encoded'] = X_transformed[col].map(self.kfold_mappings[col]).fillna(self.global_means[col])
#                 X_transformed.drop(columns=[col], inplace=True)
# 
#         X_transformed = pd.get_dummies(X_transformed, columns=self.one_hot_cols, drop_first=True, dummy_na=True, dtype=int)
#         self.dummy_columns = X_transformed.columns.tolist()
#         mlflow.log_dict({
#             "kfold_encoded": [f"{col}_encoded" for col in self.kfold_cols],
#             "one_hot_encoded": self.one_hot_cols,
#             "final_features": self.dummy_columns
#         }, "encoding/features.json")
#         
#         return self
# 
#     def transform(self, X):
#         X_transformed = X.copy()
#         for col in self.kfold_cols:
#             X_transformed[f'{col}_encoded'] = X_transformed[col].map(self.kfold_mappings[col]).fillna(self.global_means[col])
#             X_transformed.drop(columns=[col], inplace=True)
# 
#         X_transformed = pd.get_dummies(X_transformed, columns=self.one_hot_cols, drop_first=True, dummy_na=True, dtype=int)
# 
#         # Ensure same columns as during fit
#         for col in self.dummy_columns:
#             if col not in X_transformed.columns:
#                 X_transformed[col] = 0
# 
#         # Drop unexpected columns
#         extra_cols = [col for col in X_transformed.columns if col not in self.dummy_columns]
#         if extra_cols:
#             X_transformed.drop(columns=extra_cols, inplace=True)
# 
#         # Reorder columns to match training
#         X_transformed = X_transformed[self.dummy_columns]
#         return X_transformed
# 
#     def get_feature_names_out(self, input_features=None):
#         return np.array(self.dummy_columns)
# 


class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_splits=5, smoothing=4, encoding_threshold=5, 
                 enable_mlflow_logging=False):
        self.n_splits = n_splits
        self.smoothing = smoothing
        self.encoding_threshold = encoding_threshold
        self.enable_mlflow_logging = enable_mlflow_logging
        self.kfold_mappings = {}
        self.global_means = {}
        self.dummy_columns = []
        self.cols_for_kfold = []
        self.cols_for_onehot = []

    def fit(self, X, y):
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        X_temp = X.copy()
        X_temp['target'] = y

        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
        
        # Split columns based on unique value threshold
        self.cols_for_kfold = []
        self.cols_for_onehot = []
        for col in categorical_cols:
            if X[col].nunique() <= self.encoding_threshold:
                self.cols_for_kfold.append(col)
            else:
                self.cols_for_onehot.append(col)

        # Create k-fold encodings
        for col in self.cols_for_kfold:
            global_mean = X_temp['target'].mean()
            self.global_means[col] = global_mean
            self.kfold_mappings[col] = {}

            for train_idx, val_idx in kf.split(X_temp):
                train_fold = X_temp.iloc[train_idx]
                category_means = train_fold.groupby(col)['target'].mean()
                category_counts = train_fold.groupby(col)['target'].count()
                smoothed_means = (
                    category_means * category_counts + global_mean * self.smoothing
                ) / (category_counts + self.smoothing)
                self.kfold_mappings[col].update(smoothed_means.to_dict())

        if self.enable_mlflow_logging:
            log_data = {
                "kfold_encoded": self.cols_for_kfold,
                "one_hot_encoded": self.cols_for_onehot,
                "final_features": self.dummy_columns
            }
            mlflow.log_dict(log_data, "encoding/features.json")
        # Prepare transformed dataframe structure
        X_transformed = X.copy()
        for col in self.cols_for_kfold:
            X_transformed[f'{col}_encoded'] = X_transformed[col].map(
                self.kfold_mappings[col]
            ).fillna(self.global_means[col])
            X_transformed.drop(columns=[col], inplace=True)

        X_transformed = pd.get_dummies(
            X_transformed,
            columns=self.cols_for_onehot,
            drop_first=True,
            dummy_na=True,
            dtype=int
        )

        self.dummy_columns = X_transformed.columns.tolist()
        return self

    def transform(self, X):
        X_transformed = X.copy()

        # Apply k-fold encoding
        for col in self.cols_for_kfold:
            if col in X_transformed.columns:
                X_transformed[f'{col}_encoded'] = X_transformed[col].map(
                    self.kfold_mappings[col]
                ).fillna(self.global_means.get(col, 0))
                X_transformed.drop(columns=[col], inplace=True)

        # Apply one-hot encoding
        X_transformed = pd.get_dummies(
            X_transformed,
            columns=self.cols_for_onehot,
            drop_first=True,
            dummy_na=True,
            dtype=int
        )

        # Ensure consistent columns with training data
        missing_cols = set(self.dummy_columns) - set(X_transformed.columns)
        for col in missing_cols:
            X_transformed[col] = 0

        extra_cols = set(X_transformed.columns) - set(self.dummy_columns)
        X_transformed = X_transformed[self.dummy_columns]

        return X_transformed

    def get_feature_names_out(self, input_features=None):
        return self.dummy_columns





class CorrelationFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8, enable_mlflow_logging=False):
        self.threshold = threshold
        self.enable_mlflow_logging = enable_mlflow_logging
        self.features_to_drop_ = []

    def fit(self, X, y):
        numeric_X = X.select_dtypes(include=[np.number]).copy()
        numeric_X['target'] = y
        corr_matrix = numeric_X.corr().abs()

        upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper_matrix = pd.DataFrame(upper, index=corr_matrix.index, 
                                  columns=corr_matrix.columns)

        to_drop = []
        for col in upper_matrix.columns:
            high_corr = corr_matrix[col][upper_matrix[col]]
            for row, value in high_corr.items():
                if value > self.threshold:
                    if corr_matrix[col]['target'] < corr_matrix[row]['target']:
                        to_drop.append(col)
                    else:
                        to_drop.append(row)

        self.features_to_drop_ = list(set(to_drop))
        
        if self.enable_mlflow_logging:
            mlflow.log_dict({
                "features_dropped": self.features_to_drop_,
                "correlation_threshold": self.threshold
            }, "feature_selection/corr_filter.json")
        return self

    def transform(self, X):
        return X.drop(columns=self.features_to_drop_, errors='ignore')
from sklearn.base import clone

class RFEFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, estimator, n_features_to_select=10, step=1, 
                 enable_mlflow_logging=False):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        self.enable_mlflow_logging = enable_mlflow_logging
        self.support_ = None
        self.ranking_ = None
        self.selected_features_ = []

    def fit(self, X, y):
        from sklearn.feature_selection import RFE

        X_num = X.select_dtypes(include=[np.number])
        rfe = RFE(estimator=clone(self.estimator),
                  n_features_to_select=self.n_features_to_select,
                  step=self.step)
        rfe.fit(X_num, y)

        self.support_ = rfe.support_
        self.ranking_ = rfe.ranking_
        self.selected_features_ = X_num.columns[self.support_].tolist()

        if self.enable_mlflow_logging:
            mlflow.log_dict({
                "selected_features": self.selected_features_,
                "feature_ranking": pd.Series(self.ranking_, index=X_num.columns).to_dict(),
                "n_features_to_select": self.n_features_to_select,
                "step": self.step
            }, "feature_selection/rfe.json")
        return self

    def transform(self, X):
        return X[self.selected_features_]


In [21]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)

In [17]:
from sklearn import set_config
set_config(display='diagram')
# Load Data
df = pd.read_csv('kaggle/input/train.csv')
df_test = pd.read_csv('kaggle/input/test.csv')

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']
X_test = df_test
train_ids = X.pop('Id')
test_ids = X_test.pop('Id')

print(X.shape, y.shape)

# Define categorical columns
# cat_cols = [col for col in X.columns if X[col].dtype == 'object']
# s = X[cat_cols].nunique()
# woe_columns = list(s[s > 3].index)
# one_hot_columns = list(s[s <= 3].index)


(1460, 79) (1460,)


In [122]:
import dagshub
import mlflow
import mlflow.sklearn
# Initialize DagsHub logging
dagshub.init(repo_owner='arazm21', repo_name='ML-homework_1', mlflow=True)

# Setup MLflow experiment
experiment_name = "house_guessing_experiment"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/679e3907f91545ad98d45d252f29b107', creation_time=1744102093771, experiment_id='1', last_update_time=1744102093771, lifecycle_stage='active', name='house_guessing_experiment', tags={}>

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
run_name = "linear_regression_rfe_with_standard_scaling"

n_splits = 5
correlation_threshold = 0.8
rfe_n_features_to_select = 15
rfe_step_size=2
encoding_threshold = 4
model = LinearRegression()

# Create the pipeline
pipeline = Pipeline([
    ('imputer', MissingValueImputer()),
    # ('outlier_remover', OutlierRemover(drop_threshold=3, threshold=2)),
    ('preprocessor', CustomPreprocessor(encoding_threshold=encoding_threshold)),
    ("scaler", StandardScaler()),
    ('feature_selector', CorrelationFeatureSelector(threshold=correlation_threshold)),
    ("rfe_selector", RFEFeatureSelector(
        estimator=model,
        n_features_to_select=rfe_n_features_to_select,
        step=rfe_step_size
    )),
    ('model', model)
])


pipeline

In [125]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import KFold 
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

import mlflow

# Init plot storage
rmse_per_fold = []
rmsle_per_fold = []
mae_per_fold = []
r2_per_fold = []
mape_per_fold = []

# Create a temporary folder for storing plots
os.makedirs("mlflow_artifacts", exist_ok=True)
kf_instance = KFold(n_splits=n_splits, shuffle=True, random_state=42)

with mlflow.start_run(run_name=run_name) as run:
    pipeline_steps = {name: type(step).__name__ for name, step in pipeline.named_steps.items()}
    mlflow.log_param("pipeline_steps",  str(pipeline_steps))
    mlflow.log_param("n_splits", n_splits)

    for fold, (train_idx, val_idx) in enumerate(kf_instance.split(X)):
        print(f"\nFold {fold + 1}")
    
        X_tr, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
        pipeline_fold = clone(pipeline)

        # Fit pipeline stages
        pipeline_fold.named_steps["imputer"].fit(X_tr)
        X_tr = pipeline_fold.named_steps["imputer"].transform(X_tr)

        pipeline_fold.named_steps["preprocessor"].fit(X_tr, y_tr)
        X_tr = pipeline_fold.named_steps["preprocessor"].transform(X_tr)

        pipeline_fold.named_steps["scaler"].fit(X_tr)
        X_tr = pipeline_fold.named_steps["scaler"].transform(X_tr)
        X_tr = pd.DataFrame(X_tr, columns=pipeline_fold.named_steps["preprocessor"].get_feature_names_out())

        pipeline_fold.named_steps["feature_selector"].fit(X_tr, y_tr)
        X_tr = pipeline_fold.named_steps["feature_selector"].transform(X_tr)

        model = pipeline_fold.named_steps["model"]
        model.fit(X_tr, y_tr)

        # Validation
        X_val = pipeline_fold.named_steps["imputer"].transform(X_val)
        X_val = pipeline_fold.named_steps["preprocessor"].transform(X_val)
        X_val = pipeline_fold.named_steps["scaler"].transform(X_val)
        X_val = pd.DataFrame(X_val, columns=pipeline_fold.named_steps["preprocessor"].get_feature_names_out())
        X_val = pipeline_fold.named_steps["feature_selector"].transform(X_val)

        preds = model.predict(X_val)

        # Metrics
        rmse = mean_squared_error(y_val, preds)
        rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds)))
        mae = mean_absolute_error(y_val, preds)
        r2 = r2_score(y_val, preds)
        mape = np.mean(np.abs((y_val - preds) / y_val)) * 100

        # Append scores
        rmse_per_fold.append(rmse)
        rmsle_per_fold.append(rmsle)
        mae_per_fold.append(mae)
        r2_per_fold.append(r2)
        mape_per_fold.append(mape)

        # Error analysis
        errors = preds - y_val
        results = pd.DataFrame({
            "Prediction": preds,
            "Actual": y_val,
            "Error": errors
        })

        # Visualizations
        fig, axs = plt.subplots(1, 3, figsize=(18, 5))

        # Actual vs Predicted
        sns.scatterplot(x="Actual", y="Prediction", data=results, ax=axs[0])
        axs[0].set_title(f"Fold {fold+1} - Actual vs Prediction")
        axs[0].plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], color='red', linestyle='--')

        # Residuals
        sns.scatterplot(x="Prediction", y="Error", data=results, ax=axs[1])
        axs[1].axhline(0, color='red', linestyle='--')
        axs[1].set_title(f"Fold {fold+1} - Residuals vs Prediction")

        # Error distribution
        sns.histplot(results["Error"], bins=30, kde=True, ax=axs[2])
        axs[2].set_title(f"Fold {fold+1} - Error Distribution")

        plot_path = f"mlflow_artifacts/errors_fold_{fold+1}.png"
        fig.tight_layout()
        fig.savefig(plot_path)
        mlflow.log_artifact(plot_path)
        plt.close(fig)

        print(results.sort_values(by="Error", key=np.abs, ascending=False).head(10).to_string(index=False))
        print(f"Fold {fold + 1} RMSE: {rmse:.4f} | RMSLE: {rmsle:.4f} | MAE: {mae:.4f} | R2: {r2:.4f}")

    # Log averages
    mlflow.log_metric("avg_rmse", np.mean(rmse_per_fold))
    mlflow.log_metric("avg_rmsle", np.mean(rmsle_per_fold))
    mlflow.log_metric("avg_mae", np.mean(mae_per_fold))
    mlflow.log_metric("avg_r2", np.mean(r2_per_fold))
    mlflow.log_metric("avg_mape", np.mean(mape_per_fold))

    # Log metric plots
    for metric_name, values in {
        "RMSE": rmse_per_fold,
        "RMSLE": rmsle_per_fold,
        "MAE": mae_per_fold,
        "R2": r2_per_fold,
        "MAPE": mape_per_fold
    }.items():
        plt.figure(figsize=(6, 4))
        sns.barplot(x=list(range(1, n_splits+1)), y=values)
        plt.title(f"{metric_name} per Fold")
        plt.xlabel("Fold")
        plt.ylabel(metric_name)
        plot_path = f"mlflow_artifacts/{metric_name.lower()}_per_fold.png"
        plt.tight_layout()
        plt.savefig(plot_path)
        mlflow.log_artifact(plot_path)
        plt.close()



Fold 1
   Prediction  Actual          Error
545367.575546  755000 -209632.424454
361032.262166  171000  190032.262166
432035.006878  611657 -179621.993122
383127.594223  556581 -173453.405777
249045.475482  143000  106045.475482
304201.936234  395000  -90798.063766
320169.674028  403000  -82830.325972
335249.974410  253293   81956.974410
236672.533453  311500  -74827.466547
377745.839459  451950  -74204.160541
Fold 1 RMSE: 1045627489.5494 | RMSLE: 0.1534 | MAE: 19835.6151 | R2: 0.8637

Fold 2
   Prediction  Actual          Error
390264.592967  745000 -354735.407033
393021.332783  501837 -108815.667217
235172.588528  340000 -104827.411472
329139.657667  423000  -93860.342333
349544.624911  260000   89544.624911
249058.819902  328900  -79841.180098
247902.778250  168500   79402.778250
352052.514764  424870  -72817.485236
225390.074217  154000   71390.074217
210845.360985  140000   70845.360985
Fold 2 RMSE: 1124765244.2125 | RMSLE: 0.1522 | MAE: 20448.9343 | R2: 0.8346

Fold 3
   Predict

In [72]:
X_train = X
y_train=y
pipeline.named_steps["imputer"].fit(X=X_train)
X_train = pipeline.named_steps["imputer"].transform(X=X_train)



# print(X_train.shape, y_train.shape)
# X_train = pipeline.named_steps["outlier_remover"].fit(X_train)
# pipeline.named_steps["outlier_remover"].transform(X_train)
# print(X_train.shape, y_train.shape)


pipeline.named_steps["preprocessor"].fit(X=X_train,y=y_train)
X_train = pipeline.named_steps["preprocessor"].transform(X_train)


pipeline.named_steps["feature_selector"].fit(X_train,y_train)
X_train = pipeline.named_steps["feature_selector"].transform(X_train)

pipeline.named_steps["model"].fit(X_train, y_train)

In [29]:

# Fit the pipeline
# X_train_transformed, y_train_transformed = pipeline[:-1].fit_transform(X_train, y_train)  # Apply transformations
# pipeline.named_steps['model'].fit(X_train_transformed, y_train_transformed)
pipeline.fit()
# Predict on test data
X_test_transformed = pipeline[:-1].transform(X_test)
y_pred = pipeline.named_steps['model'].predict(X_test_transformed)

imputer succesfull
----------------------------
Series([], dtype: int64)
----------------------------


ValueError: too many values to unpack (expected 2)

# finished chain!!!

In [18]:
from sklearn import set_config
set_config(display='diagram')
# Load Data
df = pd.read_csv('kaggle/input/train.csv')
df_test = pd.read_csv('kaggle/input/test.csv')

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']
X_test = df_test
train_ids = X.pop('Id')
test_ids = X_test.pop('Id')

print(X.shape, y.shape)

(1460, 79) (1460,)


In [19]:
import dagshub
import mlflow
import mlflow.sklearn
# Initialize DagsHub logging
dagshub.init(repo_owner='arazm21', repo_name='ML-homework_1', mlflow=True)

# Setup MLflow experiment
experiment_name = "final_HW1_experiment"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/b5e077ef5f6a4d7f923700fcc178da54', creation_time=1744220532503, experiment_id='3', last_update_time=1744220532503, lifecycle_stage='active', name='final_HW1_experiment', tags={}>

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
run_name = "everything_with_gridsearch"

# Define base model and pipeline
model = RandomForestRegressor()
RFEmodel = RandomForestRegressor()
pipeline = Pipeline([
    ('imputer', MissingValueImputer()),
    ('preprocessor', CustomPreprocessor()),
    ('scaler', StandardScaler()),
    ('feature_selector', CorrelationFeatureSelector()),
    ('rfe_selector', RFEFeatureSelector(estimator=model)),
    ('model', model)
])

# Define hyperparameter grid
param_grid = {
    "model__n_estimators": [100, 300, 500],
    "model__max_depth": [10, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ['sqrt', 'log2', None],
    "model__bootstrap": [True, False]
}

n_splits = 5
pipeline

In [23]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

# Disable autologging to avoid logging every GridSearchCV trial
mlflow.sklearn.autolog(disable=True)

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

with mlflow.start_run(run_name=run_name):
    # Log experiment config
    mlflow.log_params({
        "n_splits": n_splits,
        "model_type": "LinearRegression",
        "preprocessor": "CustomPreprocessor"
    })

    # --- GRID SEARCH (only once) ---
    print("Running GridSearchCV on the full dataset...")

    grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                               scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    # mlflow.log_params(best_params)  # Log best parameters once
    print(f"Best params: {best_params}")
    

Running GridSearchCV on the full dataset...
🏃 View run everything_with_gridsearch at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/3/runs/40cadc775c654d42935ff59e7e1a11e6
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/3


ValueError: 
All the 2430 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2430 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ~~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
        params=step_params,
        ^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "C:\Users\alex\Documents\university_work\design_patterns\DP_final_project\venv\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ~~~~~~~~^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\alex\AppData\Local\Temp\ipykernel_13884\3404294274.py", line 294, in fit
AttributeError: 'numpy.ndarray' object has no attribute 'select_dtypes'


In [15]:
    # --- Rebuild pipeline with best params ---
    pipeline.set_params(**best_params)

    # Initialize metric storage
    metrics = {
        'rmse': [],
        'rmsle': [],
        'mae': [],
        'r2': [],
        'mape': []
    }

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"\nFold {fold+1}")
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit pipeline with best params on training split
        pipeline.fit(X_tr, y_tr)
        preds = pipeline.predict(X_val)

        # Avoid divide-by-zero in MAPE
        non_zero_mask = y_val != 0
        mape = np.mean(np.abs((y_val[non_zero_mask] - preds[non_zero_mask]) / y_val[non_zero_mask])) * 100

        # Calculate metrics
        fold_metrics = {
            'rmse': np.sqrt(mean_squared_error(y_val, preds)),
            'rmsle': np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds))),
            'mae': mean_absolute_error(y_val, preds),
            'r2': r2_score(y_val, preds),
            'mape': mape
        }

        # Store metrics
        for k, v in fold_metrics.items():
            metrics[k].append(v)

        # Only log plots for first fold to save space

        plot_path = f"mlflow_artifacts/{metric_name.lower()}_per_fold.png"

        mlflow.log_artifact(plot_path, "fold_diagnostics")

        print(f"Fold {fold+1} RMSE: {fold_metrics['rmse']:.4f} | RMSLE: {fold_metrics['rmsle']:.4f}")

    # Log aggregated metrics
    mlflow.log_metrics({
        'mean_rmse': np.mean(metrics['rmse']),
        'mean_rmsle': np.mean(metrics['rmsle']),
        'mean_mae': np.mean(metrics['mae']),
        'mean_r2': np.mean(metrics['r2']),
        'mean_mape': np.mean(metrics['mape'])
    })

    # Log final experiment config
    mlflow.log_dict({
        "best_params": best_params,
        "feature_columns": list(X.columns),
        "target_column": y.name
    }, "experiment_config.json")


NameError: name 'best_params' is not defined

In [None]:
# Log averages
mlflow.log_metric("avg_rmse", np.mean(rmse_per_fold))
mlflow.log_metric("avg_rmsle", np.mean(rmsle_per_fold))
mlflow.log_metric("avg_mae", np.mean(mae_per_fold))
mlflow.log_metric("avg_r2", np.mean(r2_per_fold))
mlflow.log_metric("avg_mape", np.mean(mape_per_fold))

# Metric plots
for metric_name, values in {
    "RMSE": rmse_per_fold,
    "RMSLE": rmsle_per_fold,
    "MAE": mae_per_fold,
    "R2": r2_per_fold,
    "MAPE": mape_per_fold
}.items():
    plt.figure(figsize=(6, 4))
    sns.barplot(x=list(range(1, n_splits+1)), y=values)
    plt.title(f"{metric_name} per Fold")
    plt.xlabel("Fold")
    plt.ylabel(metric_name)
    plot_path = f"mlflow_artifacts/{metric_name.lower()}_per_fold.png"
    plt.tight_layout()
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)
    plt.close()
