In [2]:
import dagshub
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
class MissingValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self, enable_mlflow_logging=False):
        self.num_cols = None
        self.cat_cols = None
        self.num_means = None
        self.enable_mlflow_logging = enable_mlflow_logging
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        # Check if X is a DataFrame
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = X.columns.tolist()
            self.cat_cols = X.select_dtypes(include=['object']).columns.tolist()
            self.num_cols = X.select_dtypes(exclude=['object']).columns.tolist()
            self.num_means = X[self.num_cols].mean()
        else:
            print("not dataframe MVI")
            # For numpy arrays, assume all columns are numeric
            self.feature_names_in_ = [f'feature_{i}' for i in range(X.shape[1])]
            self.cat_cols = []
            self.num_cols = self.feature_names_in_
            # Create a pandas Series with feature names as index
            self.num_means = pd.Series(np.nanmean(X, axis=0), index=self.num_cols)

        if self.enable_mlflow_logging and hasattr(mlflow, 'log_dict'):
            mlflow.log_dict(self.num_means.to_dict(), "imputer/num_means.json")
        return self

    def transform(self, X):
        # Convert to DataFrame if it's a numpy array
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.feature_names_in_)
        
        X_copy = X.copy()
        
        # Apply imputation
        if self.cat_cols:
            X_copy[self.cat_cols] = X_copy[self.cat_cols].fillna('Unknown')
        
        for col in self.num_cols:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].fillna(self.num_means.get(col, 0))

        if self.enable_mlflow_logging and hasattr(mlflow, 'log_dict'):
            nan_counts = X_copy.isna().sum()
            mlflow.log_dict(nan_counts[nan_counts > 0].to_dict(), 
                           "imputer/remaining_nans.json")
        print("missing:")
        print(X_copy.head)

        return X_copy
    
    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_in_)
    
    
    
    
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_splits=5, smoothing=4, encoding_threshold=5, 
                 enable_mlflow_logging=False):
        self.n_splits = n_splits
        self.smoothing = smoothing
        self.encoding_threshold = encoding_threshold
        self.enable_mlflow_logging = enable_mlflow_logging
        self.kfold_mappings = {}
        self.global_means = {}
        self.dummy_columns = []
        self.cols_for_kfold = []
        self.cols_for_onehot = []
        self.feature_names_in_ = None

    def fit(self, X, y):
        # Convert X to DataFrame if it's a numpy array
        if not isinstance(X, pd.DataFrame):
            print("not dataframe")
            X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        
        self.feature_names_in_ = X.columns.tolist()
        
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        X_temp = X.copy()
        X_temp['target'] = y

        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
        
        # Split columns based on unique value threshold
        self.cols_for_kfold = []
        self.cols_for_onehot = []
        for col in categorical_cols:
            if X[col].nunique() <= self.encoding_threshold:
                self.cols_for_kfold.append(col)
            else:
                self.cols_for_onehot.append(col)

        # Create k-fold encodings
        for col in self.cols_for_kfold:
            global_mean = X_temp['target'].mean()
            self.global_means[col] = global_mean
            self.kfold_mappings[col] = {}

            for train_idx, val_idx in kf.split(X_temp):
                train_fold = X_temp.iloc[train_idx]
                category_means = train_fold.groupby(col)['target'].mean()
                category_counts = train_fold.groupby(col)['target'].count()
                smoothed_means = (
                    category_means * category_counts + global_mean * self.smoothing
                ) / (category_counts + self.smoothing)
                self.kfold_mappings[col].update(smoothed_means.to_dict())

        # Prepare transformed dataframe structure
        X_transformed = X.copy()
        for col in self.cols_for_kfold:
            X_transformed[f'{col}_encoded'] = X_transformed[col].map(
                self.kfold_mappings[col]
            ).fillna(self.global_means[col])
            X_transformed.drop(columns=[col], inplace=True)

        X_transformed = pd.get_dummies(
            X_transformed,
            columns=self.cols_for_onehot,
            drop_first=True,
            dummy_na=True,
            dtype=int
        )

        self.dummy_columns = X_transformed.columns.tolist()
        
        if self.enable_mlflow_logging:
            log_data = {
                "kfold_encoded": self.cols_for_kfold,
                "one_hot_encoded": self.cols_for_onehot,
                "final_features": self.dummy_columns
            }
            mlflow.log_dict(log_data, "encoding/features.json")
        for k in self.cols_for_kfold:
            print(k)
        for o in self.cols_for_onehot:
            print(o)
        
        return self

    def transform(self, X):
        # Convert X to DataFrame if it's a numpy array
        if not isinstance(X, pd.DataFrame):
            print("not dataframe")
            X = pd.DataFrame(X, columns=self.feature_names_in_)
            
        X_transformed = X.copy()

        # Apply k-fold encoding
        for col in self.cols_for_kfold:
            if col in X_transformed.columns:
                X_transformed[f'{col}_encoded'] = X_transformed[col].map(
                    self.kfold_mappings[col]
                ).fillna(self.global_means.get(col, 0))
                X_transformed.drop(columns=[col], inplace=True)

        # Apply one-hot encoding
        X_transformed = pd.get_dummies(
            X_transformed,
            columns=self.cols_for_onehot,
            drop_first=True,
            dummy_na=True,
            dtype=int
        )

        # Ensure consistent columns with training data
        missing_cols = set(self.dummy_columns) - set(X_transformed.columns)
        for col in missing_cols:
            X_transformed[col] = 0

        extra_cols = set(X_transformed.columns) - set(self.dummy_columns)
        if extra_cols:
            X_transformed = X_transformed.drop(columns=list(extra_cols))
            
        # Ensure columns are in the same order
        X_transformed = X_transformed[self.dummy_columns]
        print("prepro:")
        print(X_transformed.head())
        return X_transformed

    def get_feature_names_out(self, input_features=None):
        return np.array(self.dummy_columns)

In [3]:
from sklearn import set_config
set_config(display='diagram')
# Load Data
df = pd.read_csv('kaggle/input/train.csv')
df_test = pd.read_csv('kaggle/input/test.csv')

X = df.drop(columns=['SalePrice'])
y = df['SalePrice']
X_test = df_test
train_ids = X.pop('Id')
test_ids = X_test.pop('Id')

print(X.shape, y.shape)
mvi = MissingValueImputer()
mvi.fit(X)
X_new = mvi.transform(X)
X_new


(1460, 79) (1460,)
missing:
<bound method NDFrame.head of       MSSubClass MSZoning  LotFrontage  LotArea Street    Alley LotShape  \
0             60       RL         65.0     8450   Pave  Unknown      Reg   
1             20       RL         80.0     9600   Pave  Unknown      Reg   
2             60       RL         68.0    11250   Pave  Unknown      IR1   
3             70       RL         60.0     9550   Pave  Unknown      IR1   
4             60       RL         84.0    14260   Pave  Unknown      IR1   
...          ...      ...          ...      ...    ...      ...      ...   
1455          60       RL         62.0     7917   Pave  Unknown      Reg   
1456          20       RL         85.0    13175   Pave  Unknown      Reg   
1457          70       RL         66.0     9042   Pave  Unknown      Reg   
1458          20       RL         68.0     9717   Pave  Unknown      Reg   
1459          20       RL         75.0     9937   Pave  Unknown      Reg   

     LandContour Utilities Lo

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Unknown,Reg,Lvl,AllPub,Inside,...,0,0,Unknown,Unknown,Unknown,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Unknown,Reg,Lvl,AllPub,FR2,...,0,0,Unknown,Unknown,Unknown,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,Unknown,IR1,Lvl,AllPub,Inside,...,0,0,Unknown,Unknown,Unknown,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,Unknown,IR1,Lvl,AllPub,Corner,...,0,0,Unknown,Unknown,Unknown,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,Unknown,IR1,Lvl,AllPub,FR2,...,0,0,Unknown,Unknown,Unknown,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Unknown,Reg,Lvl,AllPub,Inside,...,0,0,Unknown,Unknown,Unknown,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Unknown,Reg,Lvl,AllPub,Inside,...,0,0,Unknown,MnPrv,Unknown,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Unknown,Reg,Lvl,AllPub,Inside,...,0,0,Unknown,GdPrv,Shed,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Unknown,Reg,Lvl,AllPub,Inside,...,0,0,Unknown,Unknown,Unknown,0,4,2010,WD,Normal


In [6]:
X_new.isna().sum().sum()


np.int64(0)