In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.expand_frame_repr', False)


In [3]:
df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [4]:
from sklearn.model_selection import train_test_split
x = df.drop(columns=['SalePrice'])
y = df['SalePrice']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [5]:
train_ids = x_train.pop('Id')
test_ids = x_test.pop('Id')

In [6]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1168 entries, 254 to 1126
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1168 non-null   int64  
 1   MSZoning       1168 non-null   object 
 2   LotFrontage    951 non-null    float64
 3   LotArea        1168 non-null   int64  
 4   Street         1168 non-null   object 
 5   Alley          74 non-null     object 
 6   LotShape       1168 non-null   object 
 7   LandContour    1168 non-null   object 
 8   Utilities      1168 non-null   object 
 9   LotConfig      1168 non-null   object 
 10  LandSlope      1168 non-null   object 
 11  Neighborhood   1168 non-null   object 
 12  Condition1     1168 non-null   object 
 13  Condition2     1168 non-null   object 
 14  BldgType       1168 non-null   object 
 15  HouseStyle     1168 non-null   object 
 16  OverallQual    1168 non-null   int64  
 17  OverallCond    1168 non-null   int64  
 18  YearBuilt  

In [7]:
cat_cols = [col for col in x_train.columns if x_train[col].dtype == 'object']
num_cols = [col for col in x_train.columns if x_train[col].dtype != 'object']

In [8]:
threshold = 3
s = x_train[cat_cols].nunique()
ordinal_columns = list(s[s > 3].index)
one_hot_columns = list(s[s <= 3].index)

**CLEANING**

In [10]:
fill_strategies = {
    'Alley': 'none',
    'BsmtQual': 'none',
    'BsmtCond': 'none',
    'BsmtExposure': 'none',
    'BsmtFinType1': 'none',
    'BsmtFinType2': 'none',
    'FireplaceQu': 'none',
    'GarageType': 'none',
    'GarageFinish': 'none',
    'GarageQual': 'none',
    'GarageCond': 'none',
    'PoolQC': 'none',
    'Fence': 'none',
    'MiscFeature': 'none',
    'MasVnrType': 'none'
}

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Optional, Dict, Union
class NullCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 fill_strategies: Optional[Dict[str, Union[str, float, int]]] = None,
                 default_numerical_strategy: str = 'mean',  
                 default_categorical_strategy: str = 'mode',
                 drop_threshold: float = 0.8):
        
        self.fill_strategies = fill_strategies or {}
        self.default_numerical_strategy = default_numerical_strategy
        self.default_categorical_strategy = default_categorical_strategy
        self.drop_threshold = drop_threshold
        self._fill_values = {}
        self._dropped_columns = []
    
    def fit(self, X: pd.DataFrame, y=None):
        total_rows = len(X)
        self._dropped_columns = [
            col for col in X.columns 
            if X[col].isna().sum() / total_rows >= self.drop_threshold
        ]
        
        remaining_cols = [col for col in X.columns if col not in self._dropped_columns]
        df_remaining = X[remaining_cols]
        
        self._fill_values = {}
        
        for col in df_remaining.columns:
            if col in self.fill_strategies:
                strategy = self.fill_strategies[col]
            else:
                if pd.api.types.is_numeric_dtype(df_remaining[col]):
                    strategy = self.default_numerical_strategy
                else:
                    strategy = self.default_categorical_strategy
            
            if strategy == 'mode':
                fill_value = df_remaining[col].mode()[0] if not df_remaining[col].mode().empty else None
            elif strategy == 'median':
                fill_value = df_remaining[col].median()
            elif strategy == 'mean':
                fill_value = df_remaining[col].mean()
            elif strategy == 'none':
                fill_value = 0 
            else:
                fill_value = strategy
            
            self._fill_values[col] = fill_value
        
        return self
    
        
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        df_filled = X.drop(columns=self._dropped_columns, errors='ignore')

        for col, fill_value in self._fill_values.items():
            if col in df_filled.columns and fill_value is not None:
                df_filled[col] = df_filled[col].fillna(fill_value)
        
        
        return df_filled

**FEATURE ENGINEERING**

In [11]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin


class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, ordinal_columns, one_hot_columns):
        self.ordinal_columns = ordinal_columns
        self.one_hot_columns = one_hot_columns
        self.ordinal_mappings_ = {}
        self.onehot_columns_ = None
        
    def fit(self, X, y=None):
        for col in self.ordinal_columns:
            if col in X.columns:
                unique_vals = X[col].dropna().unique()
                self.ordinal_mappings_[col] = {val: idx for idx, val in enumerate(unique_vals)}
        
        if self.one_hot_columns:
            temp = pd.get_dummies(X[self.one_hot_columns], drop_first=True)
            self.onehot_columns_ = temp.columns.tolist()
            
        return self
        
    def transform(self, X):
        X_transformed = X.copy()
        
        for col, mapping in self.ordinal_mappings_.items():
            if col in X_transformed.columns:
                X_transformed[col] = X_transformed[col].map(mapping).fillna(-1)
        
        if self.one_hot_columns and self.onehot_columns_:
            dummies = pd.get_dummies(X_transformed[self.one_hot_columns], drop_first=True)
            
            for col in self.onehot_columns_:
                if col not in dummies.columns:
                    dummies[col] = 0
            
            X_transformed = pd.concat([
                X_transformed.drop(self.one_hot_columns, axis=1),
                dummies[self.onehot_columns_]  # Maintain consistent column order
            ], axis=1)
            
        for col in X_transformed.columns:
            if X_transformed[col].dtype == object:
                X_transformed[col] = pd.to_numeric(X_transformed[col], errors='coerce').fillna(0)
                
        return X_transformed

**FEATURE SELECTION AND TRAINING**

In [12]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer

gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=7,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)


xgb = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=5,
    reg_lambda=1.0,  
    reg_alpha=0.5,   
    subsample=0.8,
    random_state=42
)
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler()]
}

cleaner = NullCleaner(
    fill_strategies=fill_strategies, 
    default_numerical_strategy='mean',
    default_categorical_strategy='mode',
    drop_threshold=0.8
)
x_cleaned = cleaner.fit_transform(x_train)


ordinal_columns = [col for col in ordinal_columns if col in x_cleaned]
one_hot_columns = [col for col in one_hot_columns if col in x_cleaned]

pipeline = Pipeline([
    ('cleaner', cleaner),
    ('preprocessor', CustomPreprocessor(
        ordinal_columns=ordinal_columns,
        one_hot_columns=one_hot_columns
    )),
    ('scaler', StandardScaler()),
    #('feature_select', RFE(estimator=gb, n_features_to_select=60, step=1)),
    #('feature_select', RFE(estimator=LinearRegression(), n_features_to_select=60, step=1)),
    ('feature_select', RFE(estimator=LinearRegression(), n_features_to_select=60, step=1)),
    ('model', LinearRegression()),
    #('feature_select', RFE(estimator=rf, n_features_to_select=60, step=1)),
    #('model', rf)
])



kfold = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=kfold,
    scoring='neg_mean_squared_error',  
    verbose=2,
    return_train_score=True
)


grid_search.fit(x_train, np.log1p(y_train))
best_pipeline = grid_search.best_estimator_
cv_results = grid_search.cv_results_

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] END ............................scaler=StandardScaler(); total time=   0.8s
[CV] END ............................scaler=StandardScaler(); total time=   0.3s
[CV] END ............................scaler=StandardScaler(); total time=   0.3s
[CV] END ............................scaler=StandardScaler(); total time=   0.3s
[CV] END ............................scaler=StandardScaler(); total time=   0.3s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s
[CV] END ..............................scaler=MinMaxScaler(); total time=   0.3s


In [13]:
def log_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

train_scores = -cv_results['mean_train_score']  
val_scores = -cv_results['mean_test_score']

train_rmse = np.sqrt(train_scores).mean()  
val_rmse = np.sqrt(val_scores).mean()     

y_test_pred = np.expm1(best_pipeline.predict(x_test))
test_rmse = log_rmse(y_test, y_test_pred)

print("\n=== Overfitting/Underfitting Analysis ===")
print(f"Training RMSE (avg CV folds): {train_rmse:.6f}")
print(f"Validation RMSE (avg CV folds): {val_rmse:.6f}")
print(f"Test RMSE: {test_rmse:.6f}")

if train_rmse < 0.7 * val_rmse:
    print("\n Training error much lower than validation error")
elif val_rmse > 1.3 * test_rmse:
    print("\n Validation error higher than test error")
elif train_rmse > 0.9 * val_rmse and val_rmse > 0.8:
    print("\n Both training and validation errors are high")
else:
    print("\n Balanced Model")


=== Overfitting/Underfitting Analysis ===
Training RMSE (avg CV folds): 0.129568
Validation RMSE (avg CV folds): 0.156797
Test RMSE: 0.148503

 Balanced Model


In [None]:
!pip install mlflow dagshub


In [None]:
import dagshub
dagshub.init(repo_owner='abarb22', repo_name='-House-Prices---Advanced-Regression-Techniques', mlflow=True)

In [None]:
import mlflow
import mlflow.sklearn
import platform
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt


with mlflow.start_run():
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("feature_selection", "RFE")
    mlflow.log_param("n_features_selected", 60)  
    mlflow.log_param("preprocessing", "OrdinalEncoding+OneHotEncoding")
    mlflow.log_param("scaler_options", [scaler.__class__.__name__ for scaler in param_grid['scaler']])

    grid_search.fit(x_train, np.log1p(y_train))
    best_pipeline = grid_search.best_estimator_
    
    cv_results = grid_search.cv_results_
    train_rmse = np.sqrt(-cv_results['mean_train_score']).mean()
    val_rmse = np.sqrt(-cv_results['mean_test_score']).mean()
    
    y_test_pred = np.expm1(best_pipeline.predict(x_test))
    test_rmse = log_rmse(y_test, y_test_pred)
    
    gap_train_val = abs(train_rmse - val_rmse)
    gap_val_test = abs(val_rmse - test_rmse)
    
    for param, value in grid_search.best_params_.items():
        mlflow.log_param(f"best_{param}", value.__class__.__name__)
    
    mlflow.log_metrics({
        "train_log_rmse": train_rmse,
        "validation_log_rmse": val_rmse,
        "test_log_rmse": test_rmse,
        "gap_train_validation": gap_train_val,
        "gap_validation_test": gap_val_test
    })
    
    mlflow.sklearn.log_model(best_pipeline, "pipeline_model")