In [1]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os
import src.config as cfg
import src.features.build_features as build_features
import pickle
from sklearn.preprocessing import LabelEncoder

### Preprocessing

In [2]:
TARGET_COL = 'SalePrice'
ID_COL = 'Id'
SCALE_COLS = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',
    'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF','GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch','ScreenPorch', 'PoolArea', 'MiscVal','MoSold', 'YrSold'
]
OHE_COLS = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle','RoofStyle', 'RoofMatl', 'Exterior1st',
    'Exterior2nd', 'MasVnrType','ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond','BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC','CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType','GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'
]

In [3]:
train = pd.read_csv(str(os.getcwd() + "/../data/raw/train.csv"))
# test = pd.read_csv(str(os.getcwd() + "/../data/raw/test.csv"))

In [4]:
train, target = train.drop(TARGET_COL, axis=1), train[TARGET_COL]

In [5]:
print(train.shape)
# print(test.shape)
print(target.shape)

(1460, 80)
(1460,)


In [6]:
train = build_features.feautre_build(train)
# test = build_features.feautre_build(test)

In [7]:
print(train.shape)
# print(test.shape)
print(target.shape)

(1460, 81)
(1460,)


In [8]:
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    df[cfg.SCALE_COLS] = df[cfg.SCALE_COLS].astype(np.float32)

    ohe_int_cols = train[cfg.OHE_COLS].select_dtypes('number').columns
    df[ohe_int_cols] = df[ohe_int_cols].astype(np.int8)
    return df


def set_idx(df: pd.DataFrame, idx_col: str) -> pd.DataFrame:
    df = df.set_index(idx_col)
    return df


def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    df = set_idx(df, cfg.ID_COL)
    df = cast_types(df)
    return df

def preprocess_target(df: pd.DataFrame) -> pd.DataFrame:
    df[cfg.TARGET_COL] = df[cfg.TARGET_COL].astype(np.int8)
    return df

In [9]:
pickle.dump(preprocess_data(train), open('../data/processed/train_data.pkl', 'wb'))
# pickle.dump(preprocess_data(test), open('../data/processed/test_data.pkl', 'wb'))
pickle.dump(target, open('../data/processed/target_data.pkl', 'wb'))

### Model

In [10]:
from sklearn.svm import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.metrics import *
from sklearn.impute import *
from sklearn.multioutput import *

В качестве оcновной метрики оценки качества будем использовать **MSE**, поскольку нам важно заметить любые сильные отклнения от правильного ответа.

Кроме того, в качестве дополнительной прверки посчитаем метрику R2, чтобы посмотреть, насколько хорошом модель работате на новых данных.

In [11]:
def scoring_mse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return mean_absolute_error(y_true, y_pred)

def scoring_r2(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return r2_score(y_true, y_pred)

In [12]:
train_data = pd.read_pickle("../data/processed/train_data.pkl")
target_data = pd.read_pickle("../data/processed/target_data.pkl")

In [13]:
print(train_data.shape)
print(target_data.shape)

(1460, 80)
(1460,)


In [14]:
train_x, test_x, train_y, test_y = train_test_split(train_data, target_data, train_size=0.8, random_state=7) 

In [15]:
print(train_x.shape)
print(test_x.shape)

(1168, 80)
(292, 80)


In [16]:
scale_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
]
)

ohe_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocess_pipe = ColumnTransformer(transformers=[
    ('scale_cols', scale_pipe, cfg.SCALE_COLS),
    ('ohe_cols', ohe_pipe, cfg.OHE_COLS)
]
)

In [17]:
from catboost import CatBoostRegressor, metrics, Pool

In [18]:
model = CatBoostRegressor(iterations = 10000,
            verbose = 1000, learning_rate = 0.01, eval_metric = 'RMSE',
            random_seed = 37, loss_function = 'RMSE', l2_leaf_reg = 100,
            depth = 3, rsm = 0.6, random_strength = 2)

train_x_copy = train_x.copy()
my_pipe = preprocess_pipe.fit(train_x_copy)

train_x_copy = my_pipe.transform(train_x_copy)
model.fit(train_x_copy, train_y)

0:	learn: 77618.3265134	total: 192ms	remaining: 31m 57s
1000:	learn: 30424.3110955	total: 3.6s	remaining: 32.4s
2000:	learn: 26969.0337781	total: 6.98s	remaining: 27.9s
3000:	learn: 25544.5091471	total: 10.4s	remaining: 24.2s
4000:	learn: 24493.8869063	total: 13.7s	remaining: 20.6s
5000:	learn: 23587.9828723	total: 17.1s	remaining: 17.1s
6000:	learn: 22860.8869897	total: 20.4s	remaining: 13.6s
7000:	learn: 22225.6620588	total: 23.7s	remaining: 10.2s
8000:	learn: 21673.8284620	total: 27s	remaining: 6.75s
9000:	learn: 21170.1330282	total: 30.4s	remaining: 3.38s
9999:	learn: 20688.5316706	total: 33.8s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1e7d174fbe0>

In [19]:
model_pipe = Pipeline([
    ('preprocess', preprocess_pipe),
    ('model', model)
]
)

In [20]:
test_x_copy = test_x.copy()
test_x_copy = my_pipe.transform(test_x_copy)
y_pred = np.array(model.predict(test_x_copy))

In [21]:
print(train_x.shape)
print(train_x_copy.shape)

print(test_x.shape)
print(test_x_copy.shape)

(1168, 80)
(1168, 299)
(292, 80)
(292, 299)


In [22]:
print(f'MSE: {scoring_mse(test_y, y_pred)}')
print(f'R2 score: {scoring_r2(test_y, y_pred)}')


MSE: 16800.65137312084
R2 score: 0.8968406344097348


По критерию R2 видно, что модель хорошо работает на новых данных (т.к. значение достаточно близко к 1)

Для сравнения посмотрмм модель линейной регресии

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
model_lin_reg = LinearRegression()
model_lin_reg.fit(train_x_copy, train_y)

y_pred_lr = np.array(model_lin_reg.predict(test_x_copy))

In [25]:
print(f'MSE: {scoring_mse(test_y, y_pred_lr)}')
# print(f'R2 score: {scoring_r2(test_y, y_pred_lr)}')

MSE: 50790460607401.89


Очевидно, что результаты, полученные из модели линейной регрессии заметно хуже, чем реультаты CatBoost.

In [26]:
from sklearn.svm import SVR

In [27]:
model_svr = SVR(C=1.0, epsilon=0.2)
model_svr.fit(train_x_copy, train_y)

y_pred_svc = np.array(model_svr.predict(test_x_copy))

In [28]:
print(f'MSE: {scoring_mse(test_y, y_pred_svc)}')
# print(f'R2 score: {scoring_r2(test_y, y_pred_lr)}')

MSE: 59661.0293932256


Метрики качества этой модели намного лучше, однако CatBoost все-таки показал лучший результат