In [496]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error as msle

df = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
df_test = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
y = df['SalePrice']
df = df.drop('SalePrice', axis = 1)

In [497]:
category_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
len(category_cols) + len(num_cols) , len(df.columns), X_test.shape[1]

(79, 79, 79)

In [498]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=1.5):
        self.threshold = threshold

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = X.copy()
        if isinstance(X, pd.DataFrame):
            for column in X_out.select_dtypes(include=[np.number]).columns:
                Q1 = X_out[column].quantile(0.25)
                Q3 = X_out[column].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - self.threshold * IQR
                upper_bound = Q3 + self.threshold * IQR
                X_out[column] = X_out[column].clip(lower=lower_bound, upper=upper_bound)
        return X_out
df = OutlierRemover().fit_transform(df)

In [499]:
num_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

category_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='NaN')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers= [
        ('num', num_trans, num_cols),
        ('cat', category_trans, category_cols),  # Используйте category_trans_one для one_ec
    ]
)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])


In [500]:
y_log = np.log1p(y)

In [505]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

gscv = GridSearchCV(model_pipeline, {
    'regressor__fit_intercept': [True, False]
}, cv = 30, scoring='neg_mean_squared_log_error')

gscv.fit(df, y_log)

In [506]:
preds_gs = gscv.predict(df)
y_pred = np.expm1(preds_gs)
rmsle_train_2 = msle(y_pred, y) ** 0.5
print(rmsle_train_2)

0.09404641491958816


In [503]:
y_pred_log_test = gscv.predict(df_test)
y_pred_test = np.expm1(y_pred_log_test)

In [504]:
sample_submission_df = pd.read_csv('./kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_submission_df['SalePrice'] = y_pred_test
sample_submission_df.to_csv('./kaggle/working/submission.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,119938.40609
1,1462,167153.321101
2,1463,187348.955418
3,1464,204822.576282
4,1465,189258.84586
