In [133]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import mean_squared_error as mse

df = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
X_test = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
y_train = df['SalePrice']
df = df.drop('SalePrice', axis = 1)

In [134]:
category_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
len(category_cols) + len(num_cols) , len(df.columns), X_test.shape[1]

(79, 79, 79)

In [135]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=1.5, top=0.75, bottom=0.25):
        self.threshold = threshold
        self.top = top
        self.bottom = bottom
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_out = X.copy()
        for column in X_out.select_dtypes(include=[np.number]).columns:
            # Replace NaNs with mean
            mean_val = X_out[column].mean()
            X_out[column].fillna(mean_val, inplace=True)
            
            Q1 = X_out[column].quantile(self.bottom)
            Q3 = X_out[column].quantile(self.top)
            IQR = Q3 - Q1
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            X_out[column] = X_out[column].clip(lower=lower_bound, upper=upper_bound)
        return X_out
outlier = OutlierRemover()

In [136]:
num_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
category_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NaN')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers= [
        ('num', num_trans, num_cols),
        ('cat', category_trans, category_cols),
        ('outlier', outlier, num_cols)
    ]
)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge()),
])

In [137]:
for k in model_pipeline.get_params().keys():
    if 'outlier' in k : print (k)

preprocessor__outlier
preprocessor__outlier__bottom
preprocessor__outlier__threshold
preprocessor__outlier__top


In [138]:
y_train_log = np.log1p(y_train)

In [139]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

scorer = make_scorer(msle, greater_is_better = False)

params={ 'regressor__fit_intercept': [True], 'regressor__alpha': [ 26.868686 ] ,
         'preprocessor__outlier__top' : [0.23_5555], 'preprocessor__outlier__bottom': [0.73_2222],
         'preprocessor__outlier__threshold': [1.48_8888]}
gscv = GridSearchCV(model_pipeline, params, cv = 5, scoring=scorer)
gscv.fit(df, y_train_log)

In [140]:
print(gscv.best_params_)
print(gscv.best_score_)
preds_gs = np.expm1(gscv.predict(df))
rmsle_train = msle(y_train, preds_gs) ** 0.5
print(rmsle_train)

{'preprocessor__outlier__bottom': 0.732222, 'preprocessor__outlier__threshold': 1.488888, 'preprocessor__outlier__top': 0.235555, 'regressor__alpha': 26.868686, 'regressor__fit_intercept': True}
-8.930512814679732e-05
0.10229434027408181


In [141]:
y_pred_log_test = gscv.predict(X_test)
y_pred_test = np.expm1(y_pred_log_test)

In [142]:
sample_submission_df = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_submission_df['SalePrice'] = y_pred_test
sample_submission_df.to_csv('./kaggle/working/submission.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,119258.946301
1,1462,156676.660069
2,1463,183440.322044
3,1464,202024.957409
4,1465,192122.817074
