In [30]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import mean_squared_error as mse

# Read the Kaggle train/test tables, using the 'Id' column as the row index.
train_raw = pd.read_csv(
    './kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)
X_test = pd.read_csv(
    './kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    index_col='Id',
)

# Separate the regression target from the predictor columns.
y_train = train_raw['SalePrice']
df = train_raw.drop(columns='SalePrice')

In [31]:
# Partition the feature columns by dtype: numeric ones are imputed/scaled,
# object/category ones are one-hot encoded by the preprocessing pipeline.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
category_cols = list(df.select_dtypes(include=['object', 'category']).columns)

# Sanity check: the two groups together should account for every column, and
# the test frame should have the same number of columns as the training frame.
(len(category_cols) + len(num_cols), len(df.columns), X_test.shape[1])

(79, 79, 79)

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Clip numeric DataFrame columns to IQR-based fences.

    For each numeric column, values below ``Q1 - threshold * IQR`` or above
    ``Q3 + threshold * IQR`` are replaced by the nearest fence. Rows are never
    dropped; only values are clipped.

    The fences are learned in :meth:`fit` and reused in :meth:`transform`, so
    data passed to ``transform`` later (e.g. a test set) is clipped with the
    fences of the data seen in ``fit`` rather than with its own quartiles.

    Parameters
    ----------
    threshold : float, default=1.5
        Multiplier applied to the inter-quartile range when building fences.

    Attributes
    ----------
    bounds_ : dict or None
        Set by ``fit``. Maps column label -> ``(lower, upper)`` for every
        numeric column of the fitted DataFrame, or ``None`` when ``fit``
        received something that is not a DataFrame.
    """

    def __init__(self, threshold=1.5):
        self.threshold = threshold

    def _compute_bounds(self, X):
        """Return ``{column: (lower, upper)}`` for the numeric columns of ``X``."""
        bounds = {}
        for column in X.select_dtypes(include=[np.number]).columns:
            q1 = X[column].quantile(0.25)
            q3 = X[column].quantile(0.75)
            iqr = q3 - q1
            bounds[column] = (
                q1 - self.threshold * iqr,
                q3 + self.threshold * iqr,
            )
        return bounds

    def fit(self, X, y=None):
        """Learn the per-column clipping fences from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or other
            Data to learn fences from. Non-DataFrame input learns nothing.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        if isinstance(X, pd.DataFrame):
            self.bounds_ = self._compute_bounds(X)
        else:
            self.bounds_ = None
        return self

    def transform(self, X):
        """Return a copy of ``X`` with numeric columns clipped to the fences.

        Non-DataFrame input is returned as an unmodified copy. Numeric columns
        for which no fence was learned are left untouched.
        """
        X_out = X.copy()
        if not isinstance(X, pd.DataFrame):
            return X_out

        bounds = getattr(self, 'bounds_', None)
        if bounds is None:
            # Backward compatibility: when no fences were learned (transform
            # called without fit, or fit saw a non-DataFrame), fall back to the
            # previous stateless behaviour and derive fences from X itself.
            bounds = self._compute_bounds(X_out)

        for column in X_out.select_dtypes(include=[np.number]).columns:
            if column in bounds:
                lower_bound, upper_bound = bounds[column]
                X_out[column] = X_out[column].clip(lower=lower_bound, upper=upper_bound)
        return X_out
    
# Clip the numeric training features with OutlierRemover (default threshold).
# NOTE(review): no visible cell passes X_test through the same remover, so
# train and test features are preprocessed differently — confirm this is intended.
outlier_clipper = OutlierRemover()
df = outlier_clipper.fit_transform(df)

In [33]:
# Categorical branch: fill missing entries with the literal label 'NaN', then
# one-hot encode (categories not seen during fit are ignored when transforming).
category_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NaN')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: mean-impute missing values, then standardise.
num_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', num_trans, num_cols),
    ('cat', category_trans, category_cols),
])

# Full model: preprocessing followed by a Ridge regressor.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(max_iter=100000)),
])

In [34]:
# Train on log1p(SalePrice); predictions are mapped back to the price scale
# with np.expm1 in the later prediction cells.
y_train_log = np.log1p(y_train)

In [35]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV


# 5-fold grid search over the Ridge intercept setting.
#
# The search is fitted on log1p(SalePrice), so plain mean squared error on
# that target is already the mean squared *log* error on the price scale.
# Scoring with 'neg_mean_squared_log_error' here would apply a second log to
# the already log-transformed target and rank candidates by the wrong metric.
gscv = GridSearchCV(
    model_pipeline,
    param_grid={'regressor__fit_intercept': [True, False]},
    cv=5,
    scoring='neg_mean_squared_error',
)

In [37]:
# Run the cross-validated search on the training features and log1p target.
gscv.fit(X=df, y=y_train_log)

In [38]:
# In-sample RMSLE on the price scale. The model predicts log1p(SalePrice), so
# predictions are inverted with expm1 before scoring. This is measured on the
# same rows the search was fitted on, so it is an optimistic estimate.
#
# The former `preds_gs_mean = gscv_mean.predict(df)` line was removed:
# `gscv_mean` is not defined in any visible cell (it raised NameError on a
# fresh kernel) and its result was never used.
preds_gs = np.expm1(gscv.predict(df))
rmsle_train = msle(y_train, preds_gs) ** 0.5
print(rmsle_train)

0.09673435602904129


In [39]:
# Predict on the test features; the model outputs values on the log1p scale,
# which expm1 converts back to the original target scale.
y_pred_log_test = gscv.predict(X=X_test)
y_pred_test = np.expm1(y_pred_log_test)

In [40]:
# Build the submission file from the provided template.
sample_submission_df = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Attach predictions by Id rather than by row position, so a template whose
# row order differs from X_test cannot silently receive misaligned prices.
preds_by_id = pd.Series(y_pred_test, index=X_test.index)
sample_submission_df['SalePrice'] = sample_submission_df['Id'].map(preds_by_id)

# Ids absent from X_test would map to NaN; fail loudly instead of writing them.
assert sample_submission_df['SalePrice'].notna().all(), \
    'sample_submission contains Ids that are missing from X_test'

sample_submission_df.to_csv('./kaggle/working/submission.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,118159.755373
1,1462,158106.353701
2,1463,186260.453681
3,1464,203628.364112
4,1465,190167.246853


: 