In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LassoCV, LinearRegression
%matplotlib inline

In [2]:
df_raw = pd.read_table("AmesHousing.csv", sep=",")
df_raw.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [3]:
(df_raw.isna().sum() / len(df_raw)).loc[lambda x: x > 0] # процент пропусков по столбцам

Lot Frontage      0.167235
Alley             0.932423
Mas Vnr Type      0.605802
Mas Vnr Area      0.007850
Bsmt Qual         0.027304
Bsmt Cond         0.027304
Bsmt Exposure     0.028328
BsmtFin Type 1    0.027304
BsmtFin SF 1      0.000341
BsmtFin Type 2    0.027645
BsmtFin SF 2      0.000341
Bsmt Unf SF       0.000341
Total Bsmt SF     0.000341
Electrical        0.000341
Bsmt Full Bath    0.000683
Bsmt Half Bath    0.000683
Fireplace Qu      0.485324
Garage Type       0.053584
Garage Yr Blt     0.054266
Garage Finish     0.054266
Garage Cars       0.000341
Garage Area       0.000341
Garage Qual       0.054266
Garage Cond       0.054266
Pool QC           0.995563
Fence             0.804778
Misc Feature      0.963823
dtype: float64

In [4]:
y = df_raw['SalePrice']
X = df_raw.drop(columns=['SalePrice', 'Order', 'PID'])  # удалим ID

### Масштабирование

In [5]:
num_features = X.select_dtypes(include=np.number).columns.tolist() # Выбираем числовые признаки

X_num = X[num_features].copy()
X_num.fillna(X_num.median(), inplace=True)

# 1. Стандартизация
X_standard = pd.DataFrame(StandardScaler().fit_transform(X_num), columns=X_num.columns)

# 2. Мин-Макс масштабирование
X_minmax = pd.DataFrame(MinMaxScaler().fit_transform(X_num), columns=X_num.columns)

# 3. Robust Scaling
X_robust = pd.DataFrame(RobustScaler().fit_transform(X_num), columns=X_num.columns)

In [6]:
X_standard.head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,-0.877005,3.375742,2.744381,-0.067254,-0.506718,-0.375537,-1.163488,0.061046,0.431223,-0.293918,...,0.256673,0.920121,0.214409,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-0.448057,1.678499
1,-0.877005,0.514952,0.187097,-0.776079,0.393091,-0.342468,-1.115542,-0.566039,0.05576,0.557582,...,1.196325,0.366061,-0.704493,-0.358838,-0.103134,1.85453,-0.063031,-0.089422,-0.079602,1.678499
2,-0.877005,0.56185,0.522814,-0.067254,0.393091,-0.441674,-1.25938,0.03865,1.0548,-0.293918,...,-0.748103,2.368594,-0.170937,-0.358838,-0.103134,-0.285354,-0.063031,21.985725,-0.079602,1.678499
3,-0.877005,1.124628,0.128458,0.641571,-0.506718,-0.110988,-0.779919,-0.566039,1.366588,-0.293918,...,0.228763,-0.74206,-0.704493,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-0.816513,1.678499
4,0.061285,0.233563,0.467348,-0.776079,-0.506718,0.848,0.658466,-0.566039,0.764969,-0.293918,...,0.042693,0.935952,-0.200579,-0.358838,-0.103134,-0.285354,-0.063031,-0.089422,-1.184969,1.678499


In [7]:
X_minmax.head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,0.0,0.410959,0.14242,0.555556,0.5,0.637681,0.166667,0.07,0.113218,0.0,...,0.354839,0.147472,0.083558,0.0,0.0,0.0,0.0,0.0,0.363636,1.0
1,0.0,0.202055,0.048246,0.444444,0.625,0.644928,0.183333,0.0,0.08292,0.094364,...,0.490591,0.098315,0.0,0.0,0.0,0.208333,0.0,0.0,0.454545,1.0
2,0.0,0.205479,0.060609,0.555556,0.625,0.623188,0.133333,0.0675,0.163536,0.0,...,0.209677,0.275983,0.048518,0.0,0.0,0.0,0.0,0.735294,0.454545,1.0
3,0.0,0.246575,0.046087,0.666667,0.5,0.695652,0.3,0.0,0.188696,0.0,...,0.350806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.272727,1.0
4,0.235294,0.181507,0.058566,0.444444,0.5,0.905797,0.8,0.0,0.140149,0.0,...,0.323925,0.148876,0.045822,0.0,0.0,0.0,0.0,0.0,0.181818,1.0


In [8]:
X_robust.head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,-0.6,4.055556,5.427339,0.0,0.0,-0.276596,-0.846154,0.688172,0.366485,0.0,...,0.1875,1.25,0.5,0.0,0.0,0.0,0.0,0.0,-0.25,1.0
1,-0.6,0.666667,0.531106,-0.5,1.0,-0.255319,-0.820513,0.0,0.133515,144.0,...,0.976562,0.833333,-0.385714,0.0,0.0,120.0,0.0,0.0,0.0,1.0
2,-0.6,0.722222,1.173876,0.0,1.0,-0.319149,-0.897436,0.663594,0.753406,0.0,...,-0.65625,2.339286,0.128571,0.0,0.0,0.0,0.0,12500.0,0.0,1.0
3,-0.6,1.388889,0.418834,0.5,0.0,-0.106383,-0.641026,0.0,0.946866,0.0,...,0.164062,0.0,-0.385714,0.0,0.0,0.0,0.0,0.0,-0.5,1.0
4,0.2,0.333333,1.067679,-0.5,0.0,0.510638,0.128205,0.0,0.573569,0.0,...,0.007812,1.261905,0.1,0.0,0.0,0.0,0.0,0.0,-0.75,1.0


In [10]:
# Выбросы по Z-оценке
from scipy.stats import zscore
z_scores = np.abs(zscore(X_num))
X_no_outliers = X_num[(z_scores < 3).all(axis=1)]

# Замена выбросов на медиану (пример для одного признака)
X_replace_outliers = X_num.copy()
q1 = X_replace_outliers['Lot Frontage'].quantile(0.25)
q3 = X_replace_outliers['Lot Frontage'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

median_value = X_replace_outliers['Lot Frontage'].median()
X_replace_outliers['Lot Frontage'] = np.where(
    (X_replace_outliers['Lot Frontage'] < lower_bound) | (X_replace_outliers['Lot Frontage'] > upper_bound),
    median_value,
    X_replace_outliers['Lot Frontage']
)

In [11]:
X_replace_outliers.head()

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,20,68.0,31770,6,5,1960,1960,112.0,639.0,0.0,...,528.0,210,62,0,0,0,0,0,5,2010
1,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,730.0,140,0,0,0,120,0,0,6,2010
2,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,312.0,393,36,0,0,0,0,12500,6,2010
3,20,93.0,11160,7,5,1968,1968,0.0,1065.0,0.0,...,522.0,0,0,0,0,0,0,0,4,2010
4,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,482.0,212,34,0,0,0,0,0,3,2010


In [12]:
X.columns

Index(['MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Gara

In [13]:
# Пример: Ordinal Encoding для качества кухни
quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
X['Kitchen Qual_encoded'] = X['Kitchen Qual'].map(quality_map)

In [15]:
X['Kitchen Qual_encoded']

0       3
1       3
2       4
3       5
4       3
       ..
2925    3
2926    3
2927    3
2928    3
2929    3
Name: Kitchen Qual_encoded, Length: 2930, dtype: int64

In [None]:
X_num = X_num.dropna(axis=1)  # удалим NaN
selector_filter = SelectKBest(score_func=f_regression, k=5)
X_filter = selector_filter.fit_transform(X_num, y)
selected_filter_cols = X_num.columns[selector_filter.get_support()]
print("Filter method selected:", selected_filter_cols.tolist())

Filter method selected: ['Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'Garage Cars', 'Garage Area']


In [None]:
estimator = LinearRegression()
selector_wrapper = RFE(estimator, n_features_to_select=5, step=1)
X_wrapper = selector_wrapper.fit_transform(X_num, y)
selected_wrapper_cols = X_num.columns[selector_wrapper.get_support()]
print("Wrapper method selected:", selected_wrapper_cols.tolist())

Wrapper method selected: ['Overall Qual', 'Bsmt Full Bath', 'Full Bath', 'Fireplaces', 'Garage Cars']


In [None]:
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_num.fillna(0), y)
selected_embedded_cols = X_num.columns[lasso.coef_ != 0]
print("Embedded method selected:", selected_embedded_cols.tolist())

Embedded method selected: ['MS SubClass', 'Lot Area', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'Total Bsmt SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Screen Porch', 'Misc Val']
