In [87]:
import dagshub
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

class MissingValueImputer(BaseEstimator, TransformerMixin):
    """Fill missing values: ``'Unknown'`` for object columns, ``0`` for the rest.

    ``fit`` records which columns are object-typed ("categorical") and which
    are not ("numeric"), and computes the per-column mean of the numeric ones.
    The means are only stored (and optionally logged); ``transform`` fills
    numeric gaps with the constant ``0``.

    Parameters
    ----------
    enable_mlflow_logging : bool, default=False
        When true (and ``mlflow.log_dict`` is available), the fitted means and
        the count of NaNs remaining after ``transform`` are logged as JSON.
    """

    def __init__(self, enable_mlflow_logging=False):
        self.enable_mlflow_logging = enable_mlflow_logging
        self.feature_names_in_ = None
        self.num_cols = None
        self.cat_cols = None
        self.num_means = None

    def _should_log(self):
        """Return True when MLflow dictionary logging is requested and available."""
        return self.enable_mlflow_logging and hasattr(mlflow, 'log_dict')

    def fit(self, X, y=None):
        """Learn the column split and numeric means from ``X``; returns ``self``."""
        if not isinstance(X, pd.DataFrame):
            # Array input: synthesise column names and treat every column as numeric.
            generated_names = [f'feature_{i}' for i in range(X.shape[1])]
            self.feature_names_in_ = generated_names
            self.cat_cols = []
            self.num_cols = generated_names
            self.num_means = pd.Series(np.nanmean(X, axis=0), index=generated_names)
        else:
            self.feature_names_in_ = X.columns.tolist()
            self.cat_cols = X.select_dtypes(include=['object']).columns.tolist()
            self.num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
            self.num_means = X[self.num_cols].mean()

        if self._should_log():
            mlflow.log_dict(self.num_means.to_dict(), "imputer/num_means.json")
        return self

    def transform(self, X):
        """Return a copy of ``X`` (as a DataFrame) with missing values filled."""
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(
            X, columns=self.feature_names_in_
        )
        filled = frame.copy()

        if self.cat_cols:
            filled[self.cat_cols] = filled[self.cat_cols].fillna('Unknown')

        # Numeric columns seen during fit but absent from this frame are skipped.
        numeric_present = [name for name in self.num_cols if name in filled.columns]
        for name in numeric_present:
            filled[name] = filled[name].fillna(0)

        if self._should_log():
            leftover = filled.isna().sum()
            mlflow.log_dict(leftover[leftover > 0].to_dict(),
                            "imputer/remaining_nans.json")

        return filled

    def get_feature_names_out(self, input_features=None):
        """Return the column names recorded by ``fit`` as a NumPy array."""
        return np.array(self.feature_names_in_)
    
    
class RFEFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the features chosen by scikit-learn's recursive feature elimination.

    Parameters
    ----------
    estimator : estimator object
        Model handed (as a clone) to ``RFE`` to rank the features.
    n_features_to_select : int, default=10
        Forwarded to ``RFE``.
    step : int or float, default=1
        Forwarded to ``RFE``.
    enable_mlflow_logging : bool, default=False
        When true (and ``mlflow.log_dict`` is available), the selection and
        the feature ranking are logged to ``feature_selection/rfe.json``.

    Attributes set by ``fit``
    -------------------------
    support_, ranking_ : copied from the fitted ``RFE``.
    selected_features_ : list of the selected column names.
    selected_indices_ : positions of the selected columns within the fitted
        input, used when ``transform`` receives an array.
    """

    def __init__(self, estimator, n_features_to_select=10, step=1,
                 enable_mlflow_logging=False):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        self.enable_mlflow_logging = enable_mlflow_logging
        self.support_ = None
        self.ranking_ = None
        self.selected_features_ = []
        self.feature_names_in_ = None

    def fit(self, X, y):
        """Run RFE on the numeric columns of ``X`` and record the selection."""
        from sklearn.feature_selection import RFE

        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
            # Only numeric columns are offered to RFE.
            X_num = X.select_dtypes(include=[np.number])
            feature_names = X_num.columns
        else:
            # Array input: synthesise names and assume every column is numeric.
            self.feature_names_in_ = [f'feature_{i}' for i in range(X.shape[1])]
            X_num = X
            feature_names = np.array(self.feature_names_in_)

        rfe = RFE(estimator=clone(self.estimator),
                  n_features_to_select=self.n_features_to_select,
                  step=self.step)
        rfe.fit(X_num, y)

        self.support_ = rfe.support_
        self.ranking_ = rfe.ranking_
        self.selected_features_ = feature_names[self.support_].tolist()

        # Always record positional indices (relative to the full fitted input)
        # so that array inputs are filtered correctly even when fit saw a
        # DataFrame, and so that a refit never leaves stale indices behind.
        position_of = {name: pos for pos, name in enumerate(self.feature_names_in_)}
        self.selected_indices_ = np.array(
            [position_of[name] for name in self.selected_features_], dtype=int
        )

        if self.enable_mlflow_logging and hasattr(mlflow, 'log_dict'):
            mlflow.log_dict({
                "selected_features": self.selected_features_,
                "feature_ranking": {str(name): int(rank) for name, rank in zip(feature_names, self.ranking_)},
                "n_features_to_select": self.n_features_to_select,
                "step": self.step
            }, "feature_selection/rfe.json")
        return self

    def transform(self, X):
        """Return only the selected columns of ``X``.

        DataFrames are filtered by column name (selected names missing from
        ``X`` are skipped); arrays are filtered by position.
        """
        if isinstance(X, pd.DataFrame):
            available_features = [f for f in self.selected_features_ if f in X.columns]
            return X[available_features]

        if hasattr(self, 'selected_indices_'):
            return X[:, self.selected_indices_]
        # Not fitted yet: pass the array through unchanged (original behaviour).
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the selected feature names as a NumPy array."""
        return np.array(self.selected_features_)
    
    
class CorrelationFeatureSelector(BaseEstimator, TransformerMixin):
    """Drop one feature out of every highly correlated feature pair.

    For each pair of numeric features whose absolute Pearson correlation is
    above ``threshold``, the feature with the weaker absolute correlation to
    the target is dropped (ties drop the earlier column of the pair).

    Parameters
    ----------
    threshold : float, default=0.8
        Absolute correlation above which two features count as redundant.
    enable_mlflow_logging : bool, default=False
        When true, the dropped features and the threshold are logged to
        ``feature_selection/corr_filter.json``.
    """

    def __init__(self, threshold=0.8, enable_mlflow_logging=False):
        self.threshold = threshold
        self.enable_mlflow_logging = enable_mlflow_logging
        self.features_to_drop_ = []
        self.feature_names_in_ = None

    def fit(self, X, y):
        """Decide which features to drop; returns ``self``."""
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
            numeric_X = X.select_dtypes(include=[np.number]).copy()
        else:
            self.feature_names_in_ = [f'feature_{i}' for i in range(X.shape[1])]
            numeric_X = pd.DataFrame(X, columns=self.feature_names_in_)

        feature_names = numeric_X.columns.tolist()
        # Attach the target positionally. Assigning a Series directly would
        # align on index labels, which misaligns (or NaN-fills) the target
        # whenever X was rebuilt with a fresh index, e.g. after scaling.
        numeric_X['target'] = np.asarray(y).ravel()

        corr_matrix = numeric_X.corr().abs()
        n_features = len(feature_names)
        pair_corr = corr_matrix.iloc[:n_features, :n_features].to_numpy()
        target_corr = corr_matrix['target'].to_numpy()[:n_features]

        # Compare feature/feature pairs only (strict upper triangle). The
        # target column is deliberately left out of the pairing: including it
        # would drop exactly those features that correlate best with the target.
        first_idx, second_idx = np.nonzero(np.triu(pair_corr > self.threshold, k=1))
        to_drop = []
        for first, second in zip(first_idx, second_idx):
            weaker = second if target_corr[second] < target_corr[first] else first
            to_drop.append(feature_names[weaker])

        # De-duplicate while keeping a deterministic order.
        self.features_to_drop_ = list(
            dict.fromkeys(name for name in to_drop if name != 'target')
        )

        if self.enable_mlflow_logging:
            mlflow.log_dict({
                "features_dropped": self.features_to_drop_,
                "correlation_threshold": self.threshold
            }, "feature_selection/corr_filter.json")
        return self

    def transform(self, X):
        """Return ``X`` without the features marked for dropping."""
        if isinstance(X, pd.DataFrame):
            return X.drop(columns=self.features_to_drop_, errors='ignore')

        # Arrays: map the dropped names back to column positions.
        dropped = set(self.features_to_drop_)
        keep_indices = [i for i, name in enumerate(self.feature_names_in_)
                        if name not in dropped]
        return X[:, keep_indices]

    def get_feature_names_out(self, input_features=None):
        """Return the names of the features that survive ``transform``."""
        if input_features is None:
            input_features = self.feature_names_in_
        return np.array([feat for feat in input_features if feat not in self.features_to_drop_])
    
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    """Encode categorical columns: one-hot for few levels, target means otherwise.

    Object/category columns with at most ``encoding_threshold`` distinct
    values are one-hot encoded (``drop_first=True``, ``dummy_na=True``).
    The remaining categorical columns are replaced by a ``<col>_encoded``
    column holding a smoothed target mean per category; categories without a
    mapping receive the global target mean.

    Parameters
    ----------
    n_splits : int, default=5
        Number of KFold splits used while building the target-mean mappings.
    smoothing : float, default=4
        Weight of the global mean in the smoothed category mean.
    encoding_threshold : int, default=3
        Maximum number of distinct values for a column to be one-hot encoded.
    enable_mlflow_logging : bool, default=False
        When true, the column assignment and final feature list are logged to
        ``encoding/features.json``.
    """

    def __init__(self, n_splits=5, smoothing=4, encoding_threshold=3,
                 enable_mlflow_logging=False):
        self.n_splits = n_splits
        self.smoothing = smoothing
        self.encoding_threshold = encoding_threshold
        self.enable_mlflow_logging = enable_mlflow_logging
        self.kfold_mappings = {}
        self.global_means = {}
        self.dummy_columns = []
        self.cols_for_kfold = []
        self.cols_for_onehot = []
        self.feature_names_in_ = None

    def _fit_target_encoding(self, X, y):
        """Build the smoothed target-mean mapping for every k-fold column."""
        X_temp = X[self.cols_for_kfold].copy()
        # Positional assignment: a Series ``y`` must not be re-aligned on index
        # labels, which may differ from those of ``X``.
        X_temp['target'] = np.asarray(y).ravel()
        global_mean = X_temp['target'].mean()

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        for col in self.cols_for_kfold:
            self.global_means[col] = global_mean
            mapping = {}
            for train_idx, _ in kf.split(X_temp):
                grouped = X_temp.iloc[train_idx].groupby(col)['target']
                category_means = grouped.mean()
                category_counts = grouped.count()
                smoothed_means = (
                    category_means * category_counts + global_mean * self.smoothing
                ) / (category_counts + self.smoothing)
                # Later folds overwrite earlier ones for a shared category.
                mapping.update(smoothed_means.to_dict())
            self.kfold_mappings[col] = mapping

    def _encode(self, X):
        """Apply the fitted target-mean and one-hot encodings to a copy of ``X``."""
        encoded = X.copy()
        for col in self.cols_for_kfold:
            if col in encoded.columns:
                encoded[f'{col}_encoded'] = encoded[col].map(
                    self.kfold_mappings[col]
                ).fillna(self.global_means.get(col, 0))
                encoded.drop(columns=[col], inplace=True)

        return pd.get_dummies(
            encoded,
            columns=self.cols_for_onehot,
            drop_first=True,
            dummy_na=True,
            dtype=int
        )

    def fit(self, X, y):
        """Learn the encodings and the output column layout; returns ``self``."""
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])

        self.feature_names_in_ = X.columns.tolist()

        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

        # Split the categorical columns by their number of distinct values.
        self.cols_for_kfold = []
        self.cols_for_onehot = []
        for col in categorical_cols:
            if X[col].nunique() <= self.encoding_threshold:
                self.cols_for_onehot.append(col)
            else:
                self.cols_for_kfold.append(col)

        # Start from a clean slate so a refit never keeps stale mappings.
        self.kfold_mappings = {}
        self.global_means = {}
        if self.cols_for_kfold:
            self._fit_target_encoding(X, y)

        # The encoded training frame defines the column layout for transform.
        self.dummy_columns = self._encode(X).columns.tolist()

        if self.enable_mlflow_logging:
            log_data = {
                "kfold_encoded": self.cols_for_kfold,
                "one_hot_encoded": self.cols_for_onehot,
                "final_features": self.dummy_columns
            }
            mlflow.log_dict(log_data, "encoding/features.json")

        return self

    def transform(self, X):
        """Encode ``X`` and align its columns with those seen during ``fit``.

        Columns missing after encoding are added as all-zero columns; columns
        not produced during ``fit`` are discarded.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names_in_)

        X_transformed = self._encode(X)

        for col in self.dummy_columns:
            if col not in X_transformed.columns:
                X_transformed[col] = 0

        # Selecting by the fitted layout both drops extras and fixes the order.
        return X_transformed[self.dummy_columns]

    def get_feature_names_out(self, input_features=None):
        """Return the output column names fixed during ``fit``."""
        return np.array(self.dummy_columns)
    


In [88]:
# Show every column and let pandas pick the display width, without wrapping
# wide frames across several blocks.
for _option in ('display.max_columns', 'display.width'):
    pd.set_option(_option, None)
pd.set_option('display.expand_frame_repr', False)

In [89]:
from sklearn import set_config

# Render estimators as diagrams in the notebook.
set_config(display='diagram')

# Load Data
df = pd.read_csv('kaggle/input/train.csv')
df_test = pd.read_csv('kaggle/input/test.csv')

# Separate the label from the training features.
y = df['SalePrice']
X = df.drop(columns=['SalePrice'])

# NOTE: X_test is the same object as df_test (no copy), so popping 'Id'
# below removes the column from df_test as well.
X_test = df_test

# Keep the identifiers aside; they are not model features.
train_ids, test_ids = X.pop('Id'), X_test.pop('Id')

print(X.shape, y.shape)

(1460, 79) (1460,)


In [90]:
import dagshub
import mlflow
import mlflow.sklearn
# Initialise DagsHub for this repository with its MLflow integration enabled.
dagshub.init(
    repo_name='ML-homework_1',
    repo_owner='arazm21',
    mlflow=True,
)

# Select the MLflow experiment for the runs below. The call stays the last
# expression of the cell so the notebook echoes its return value.
experiment_name = "cleaned_HW_1"
mlflow.set_experiment(experiment_name)

2025/04/10 03:59:37 INFO mlflow.tracking.fluent: Experiment with name 'cleaned_HW_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1b0c2837c9604da3adc31b1036442c56', creation_time=1744243178194, experiment_id='5', last_update_time=1744243178194, lifecycle_stage='active', name='cleaned_HW_1', tags={}>

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
run_name = "everything_with_gridsearch"

# Define base model and pipeline. The same LinearRegression instance is used
# both as the RFE ranking estimator and as the final model step.
model = LinearRegression()
pipeline = Pipeline([
    ('imputer', MissingValueImputer()),
    ('preprocessor', CustomPreprocessor()),
    ('scaler', StandardScaler()),
    ('feature_selector', CorrelationFeatureSelector()),
    ('rfe_selector', RFEFeatureSelector(estimator=model)),
    ('model', model)
])

# Define hyperparameter grid.
# The grid must only name parameters that exist on the pipeline steps:
# LinearRegression has no n_estimators / max_depth / bootstrap etc., so a
# random-forest grid makes GridSearchCV fail with "Invalid parameter".
# The keys below are the preprocessing/selection parameters reported in the
# best-parameter output recorded for this search; the candidate values
# bracket those recorded optima.
param_grid = {
    "feature_selector__threshold": [0.8, 0.85, 0.9],
    "preprocessor__encoding_threshold": [3, 4],
    "preprocessor__smoothing": [3, 4],
    "rfe_selector__n_features_to_select": [10, 20],
    "rfe_selector__step": [1, 2],
}
n_splits = 5
pipeline

In [7]:

# Running GridSearchCV on the full dataset...
# Best params: {'feature_selector__threshold': 0.85, 'preprocessor__encoding_threshold': 4, 'preprocessor__smoothing': 3, 'rfe_selector__n_features_to_select': 10, 'rfe_selector__step': 2}
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn

# Disable autologging to avoid logging every GridSearchCV trial
mlflow.sklearn.autolog(disable=True)

# One shuffled splitter, built from the logged n_splits, used for the search.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

with mlflow.start_run(run_name=run_name):
    # Log experiment config
    mlflow.log_params({
        "n_splits": n_splits,
        "model_type": "LinearRegression",
        "preprocessor": "CustomPreprocessor"
    })

    # --- GRID SEARCH (only once) ---
    print("Running GridSearchCV on the full dataset...")

    # cv=kf (rather than a hard-coded 5) keeps the search consistent with the
    # n_splits value logged above and with the shuffled splitter defined here.
    grid_search = GridSearchCV(pipeline, param_grid, cv=kf,
                               scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    mlflow.log_params(best_params)  # Log best parameters once
    print(f"Best params: {best_params}")
    

Running GridSearchCV on the full dataset...
Best params: {'feature_selector__threshold': 0.85, 'preprocessor__encoding_threshold': 4, 'preprocessor__smoothing': 3, 'rfe_selector__n_features_to_select': 10, 'rfe_selector__step': 2}
🏃 View run everything_with_gridsearch at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/3/runs/6519972c8cf34d56be5fa0d8419f52f6
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/3


In [91]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LogisticRegression
# Pipeline with fixed preprocessing/selection settings and MLflow logging
# switched on for every custom step. The same regressor instance serves as
# the RFE ranking estimator and as the final model step.
model = RandomForestRegressor(n_estimators=150, max_depth=6, random_state=42)

_imputer_step = MissingValueImputer(enable_mlflow_logging=True)
_preprocessor_step = CustomPreprocessor(
    enable_mlflow_logging=True,
    encoding_threshold=4,
    smoothing=3,
)
_corr_step = CorrelationFeatureSelector(enable_mlflow_logging=True, threshold=0.85)
_rfe_step = RFEFeatureSelector(
    estimator=model,
    enable_mlflow_logging=True,
    n_features_to_select=10,
    step=2,
)

pipeline = Pipeline([
    ('imputer', _imputer_step),
    ('preprocessor', _preprocessor_step),
    ('scaler', StandardScaler()),
    ('feature_selector', _corr_step),
    ('rfe_selector', _rfe_step),
    ('model', model),
])

# Candidate random-forest settings for the 'model' step.
param_grid = {
    "model__n_estimators": [100, 300, 500],
    "model__max_depth": [10, 20, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ['sqrt', 'log2', None],
    "model__bootstrap": [True, False],
}

In [92]:
import joblib

run_name = "final_one"

# Per-fold metric dictionaries, appended in fold order
fold_metrics = []

# Only needed for the regression line of the prediction-error plot; imported
# once here instead of on every fold.
from sklearn.linear_model import LinearRegression


def _log_current_figure(plot_path):
    """Save the active matplotlib figure, log it to MLflow and delete the file."""
    plt.tight_layout()
    plt.savefig(plot_path)
    mlflow.log_artifact(plot_path)
    plt.close()
    os.remove(plot_path)


def _log_joblib_artifact(obj, artifact_path):
    """Dump ``obj`` with joblib, log the file to MLflow and delete the local copy."""
    joblib.dump(obj, artifact_path)
    mlflow.log_artifact(artifact_path)
    os.remove(artifact_path)


def _apply_fitted_steps(steps, features):
    """Push ``features`` through the fitted preprocessing steps in training order."""
    features = steps["imputer"].transform(features)
    features = steps["preprocessor"].transform(features)
    features = steps["scaler"].transform(features)
    # The scaler returns an array; restore the preprocessor's column names.
    features = pd.DataFrame(features, columns=steps["preprocessor"].get_feature_names_out())
    features = steps["feature_selector"].transform(features)
    return steps["rfe_selector"].transform(features)


# Main cross-validation loop: one MLflow run per fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    with mlflow.start_run(run_name=f"{run_name}_fold_{fold}", nested=True):
        X_tr, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        pipeline_fold = clone(pipeline)
        steps = pipeline_fold.named_steps

        # Fit every stage on the training part only, step by step, so the
        # scaler output can be re-wrapped in a DataFrame for the selectors.
        X_tr = steps["imputer"].fit(X_tr).transform(X_tr)

        # Number of missing values left after imputation
        print(X_tr.isna().sum().sum())

        X_tr = steps["preprocessor"].fit(X_tr, y_tr).transform(X_tr)

        X_tr = steps["scaler"].fit(X_tr).transform(X_tr)
        X_tr = pd.DataFrame(X_tr, columns=steps["preprocessor"].get_feature_names_out())

        X_tr = steps["feature_selector"].fit(X_tr, y_tr).transform(X_tr)
        X_tr = steps["rfe_selector"].fit(X_tr, y_tr).transform(X_tr)

        model = steps["model"]
        model.fit(X_tr, y_tr)

        # Validation: apply the fitted stages in the SAME order as training
        # (correlation filter first, then RFE).
        X_val = _apply_fitted_steps(steps, X_val)

        preds = model.predict(X_val)
        # Predictions below 0.5 are replaced by 1 before the log-based metric
        # is computed (presumably to keep log1p well-behaved).
        preds[preds < 0.5] = 1

        # Compute metrics
        mse = mean_squared_error(y_val, preds)
        rmse = np.sqrt(mse)
        rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(preds)))
        mae = mean_absolute_error(y_val, preds)
        r2 = r2_score(y_val, preds)

        # Log metrics
        mlflow.log_metrics({
            "mse": mse,
            "rmse": rmse,
            "rmsle": rmsle,
            "mae": mae,
            "r2": r2
        })

        # Store fold metrics
        fold_metrics.append({"fold": fold, "mse": mse, "rmse": rmse, "rmsle": rmsle, "mae": mae, "r2": r2})

        # Plot: Predicted vs Actual
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=y_val, y=preds)
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.title(f"Fold {fold}: Actual vs Predicted")
        _log_current_figure(f"fold_{fold}_actual_vs_predicted.png")

        # Plot: Residuals
        residuals = y_val - preds
        plt.figure(figsize=(6, 4))
        sns.histplot(residuals, kde=True)
        plt.title(f"Fold {fold}: Residuals Distribution")
        _log_current_figure(f"fold_{fold}_residuals.png")

        # --- Prediction Error Plot ---
        # Least-squares line of predictions against actual values, drawn next
        # to the ideal y = x diagonal.
        actual_column = y_val.values.reshape(-1, 1)
        reg_line = LinearRegression()
        reg_line.fit(actual_column, preds)

        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=y_val, y=preds, label="Predictions")
        sns.lineplot(x=y_val, y=reg_line.predict(actual_column), color="red", label="Regression line")
        plt.plot(y_val, y_val, color="green", linestyle="--", label="Ideal prediction")
        plt.xlabel("Actual")
        plt.ylabel("Predicted")
        plt.title(f"Fold {fold}: Prediction Error")
        plt.legend()
        _log_current_figure(f"fold_{fold}_prediction_error.png")

        # --- Residuals vs Predicted ---
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=preds, y=residuals)
        plt.axhline(0, color='red', linestyle='--')
        plt.xlabel("Predicted")
        plt.ylabel("Residual")
        plt.title(f"Fold {fold}: Residuals vs Predicted")
        _log_current_figure(f"fold_{fold}_residuals_vs_predicted.png")

        # --- Absolute Error by Actual Quartile ---
        val_df = pd.DataFrame({"actual": y_val, "predicted": preds})
        val_df["abs_error"] = np.abs(val_df["actual"] - val_df["predicted"])
        val_df["quartile"] = pd.qcut(val_df["actual"], 4, labels=["Q1", "Q2", "Q3", "Q4"])

        plt.figure(figsize=(6, 4))
        sns.boxplot(data=val_df, x="quartile", y="abs_error")
        plt.title(f"Fold {fold}: Abs Error by Actual Value Quartile")
        plt.xlabel("Actual Value Quartile")
        plt.ylabel("Absolute Error")
        _log_current_figure(f"fold_{fold}_quartile_error.png")

        # --- True vs Predicted Histogram Overlay ---
        plt.figure(figsize=(6, 4))
        sns.histplot(y_val, label="Actual", kde=True, stat="density", color="blue")
        sns.histplot(preds, label="Predicted", kde=True, stat="density", color="orange")
        plt.legend()
        plt.title(f"Fold {fold}: True vs Predicted Distribution")
        _log_current_figure(f"fold_{fold}_hist_overlay.png")

        # Persist the fitted pipeline object and, separately, just the model
        _log_joblib_artifact(pipeline_fold, f"pipeline_fold_{fold}.joblib")
        _log_joblib_artifact(model, f"model_fold_{fold}.joblib")
        
        
# Summary run: aggregates the per-fold metrics collected in ``fold_metrics``.
# The ``with`` block ends the run on exit, so no explicit mlflow.end_run()
# is issued inside it (an extra call would act on whatever run is active at
# that point instead of this one).
with mlflow.start_run(run_name=f"{run_name}_summary", nested=True):
    # Log pipeline steps in readable format. The step names and classes are
    # read from the pipeline template itself rather than from a variable
    # left over from the fold loop.
    pipeline_summary = "\n".join(
        f"{name}: {step.__class__.__name__}"
        for name, step in pipeline.named_steps.items()
    )
    with open("pipeline_summary.txt", "w") as f:
        f.write(pipeline_summary)
    mlflow.log_artifact("pipeline_summary.txt")
    os.remove("pipeline_summary.txt")

    # === Final summary: mean of every metric across folds ===
    mse_avg = np.mean([m["mse"] for m in fold_metrics])
    rmse_avg = np.mean([m["rmse"] for m in fold_metrics])
    rmsle_avg = np.mean([m["rmsle"] for m in fold_metrics])
    mae_avg = np.mean([m["mae"] for m in fold_metrics])
    r2_avg = np.mean([m["r2"] for m in fold_metrics])

    mlflow.log_metrics({
        "avg_mse": mse_avg,
        "avg_rmse": rmse_avg,
        "avg_rmsle": rmsle_avg,
        "avg_mae": mae_avg,
        "avg_r2": r2_avg
    })

    # Save all fold results to a CSV and log it
    fold_df = pd.DataFrame(fold_metrics)
    fold_df.to_csv("fold_metrics.csv", index=False)
    mlflow.log_artifact("fold_metrics.csv")
    os.remove("fold_metrics.csv")

    # Plot summary boxplots (one box per metric)
    plt.figure(figsize=(8, 5))
    fold_df.drop(columns="fold").boxplot()
    plt.title("Distribution of Metrics Across Folds")
    plt.tight_layout()
    plt.savefig("fold_metrics_boxplot.png")
    mlflow.log_artifact("fold_metrics_boxplot.png")
    plt.close()
    os.remove("fold_metrics_boxplot.png")

    # --- Fold Metric Trends: one line per metric over the fold index ---
    plt.figure(figsize=(8, 5))
    for metric in ["mse", "rmse", "rmsle", "mae", "r2"]:
        sns.lineplot(x=fold_df["fold"], y=fold_df[metric], label=metric, marker="o")
    plt.xlabel("Fold")
    plt.ylabel("Metric Value")
    plt.title("Fold Metric Trends")
    plt.legend()
    plt.tight_layout()
    plt.savefig("fold_metric_trends.png")
    mlflow.log_artifact("fold_metric_trends.png")
    plt.close()
    os.remove("fold_metric_trends.png")

0
🏃 View run final_one_fold_0 at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5/runs/b10718924d424a57bb9d84a26b4a7030
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5
0
🏃 View run final_one_fold_1 at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5/runs/3a1496fc05c34986846197f8a7467cc0
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5
0
🏃 View run final_one_fold_2 at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5/runs/ea5c46b2107d48a28c31c787f5abebf9
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5
0
🏃 View run final_one_fold_3 at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5/runs/3205f82c11b24ee68c75e938163a1d84
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5
0
🏃 View run final_one_fold_4 at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/5/runs/

In [86]:
# Final run: refit every stage on the full training data, predict the test
# set and log the submission file.
with mlflow.start_run(run_name=f"{run_name}_final_model", nested=True):
    # Clone and refit pipeline on full training data. The stages are fitted
    # one by one so the scaler output can be re-wrapped in a DataFrame with
    # the preprocessor's column names before the selectors see it.
    final_pipeline = clone(pipeline)
    steps = final_pipeline.named_steps

    steps["imputer"].fit(X)
    X_full = steps["imputer"].transform(X)

    steps["preprocessor"].fit(X_full, y)
    X_full = steps["preprocessor"].transform(X_full)

    steps["scaler"].fit(X_full)
    X_full = steps["scaler"].transform(X_full)
    X_full = pd.DataFrame(X_full, columns=steps["preprocessor"].get_feature_names_out())

    steps["feature_selector"].fit(X_full, y)
    X_full = steps["feature_selector"].transform(X_full)

    steps["rfe_selector"].fit(X_full, y)
    X_full = steps["rfe_selector"].transform(X_full)

    # Train model
    final_model = steps["model"]
    final_model.fit(X_full, y)

    # Prepare test data with the fitted stages, in the same order as above
    X_test_trans = steps["imputer"].transform(X_test)
    X_test_trans = steps["preprocessor"].transform(X_test_trans)
    X_test_trans = steps["scaler"].transform(X_test_trans)
    X_test_trans = pd.DataFrame(X_test_trans, columns=steps["preprocessor"].get_feature_names_out())
    X_test_trans = steps["feature_selector"].transform(X_test_trans)
    X_test_trans = steps["rfe_selector"].transform(X_test_trans)

    # Predict
    test_preds = final_model.predict(X_test_trans)

    # Save submission. The prediction column carries the name of the label
    # column of the training data ('SalePrice') instead of the placeholder
    # 'target' that was left in from a template.
    submission = pd.DataFrame({
        "Id": test_ids,
        "SalePrice": test_preds
    })
    submission.to_csv("submission.csv", index=False)
    mlflow.log_artifact("submission.csv")



🏃 View run final_everything_linear_0fill_final_model at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/4/runs/eda4eb68951d40fcadc51cd32cd6fdab
🧪 View experiment at: https://dagshub.com/arazm21/ML-homework_1.mlflow/#/experiments/4
