In [65]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import auc, roc_curve, roc_auc_score
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor 
from sklearn.svm import LinearSVR
from sklearn.linear_model import RidgeCV


In [19]:
df = pd.read_csv("train.csv")

#удаляю признаки и строки в которых больше 80 процентов пропусков
df.dropna(thresh=int(0.8*df.shape[1]), axis=0, inplace=True)
df.dropna(thresh=int(0.8*df.shape[0]), axis=1, inplace=True)


# заменяю на моду для категориальных и медиану для непрерывных признаков.
# можно было бы замениить на значение вне области значения фич, но в стекинге 
# буду использовать линейную модель , а для неё вроде такая замена не релевантна
cat_pr = list(df.dtypes[df.dtypes == object].index)
for column in cat_pr:
    df[column].fillna(df[column].mode()[0], inplace=True)


num_feat = [f for f in df if f not in (cat_pr)]
for column in num_feat:
    df[column].fillna(df[column].median(), inplace=True)

# print(df.info())

#у нас не так много данных, чтоб не плодить много признаков, буд заменять категориальные признаки на Frequency Encoding 
# как вариант можно было бы использовать one hot encoding c последующим применением PCA 
for column in cat_pr:
    fe = df.groupby(column).size()/len(df)
    df.loc[:,column]=df[column].map(fe)

X = df.iloc[:,1:-1]
y = df["SalePrice"]

X.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   float64
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   float64
 5   LotShape       1460 non-null   float64
 6   LandContour    1460 non-null   float64
 7   Utilities      1460 non-null   float64
 8   LotConfig      1460 non-null   float64
 9   LandSlope      1460 non-null   float64
 10  Neighborhood   1460 non-null   float64
 11  Condition1     1460 non-null   float64
 12  Condition2     1460 non-null   float64
 13  BldgType       1460 non-null   float64
 14  HouseStyle     1460 non-null   float64
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [24]:
#делю на трейн и тест выборку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 

#маштабирую трейн и тест выборку
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test= sc.transform(X_test)


In [45]:
mod = RandomForestRegressor(n_estimators=10, max_depth=5, min_samples_leaf=10 )
mod.fit(X_train, y_train)  
res_train = mod.score(X_train, y_train)
print("Train prediction result",res_train)
res_test = mod.score(X_test, y_test)
print("Test prediction result",res_test)

Train prediction result 0.8441206701194168
Test prediction result 0.844396553214504


In [46]:
# важность признаков
imp = pd.Series(mod.feature_importances_)
pd.DataFrame({"column":list(df.iloc[:,1:-1].columns), "importance":imp}).sort_values(by="importance", ascending=False)

Unnamed: 0,column,importance
15,OverallQual,0.693347
44,GrLivArea,0.129288
41,1stFlrSF,0.058453
36,TotalBsmtSF,0.025644
58,GarageCars,0.015280
...,...,...
34,BsmtFinSF2,0.000000
38,HeatingQC,0.000000
40,Electrical,0.000000
43,LowQualFinSF,0.000000


In [70]:
# стэкинг
st_mod = StackingRegressor(
    [
        ('svr', LinearSVR(random_state=42)),
        ('lr', RidgeCV()),
        ('dt', DecisionTreeRegressor(max_depth=10, min_samples_leaf=10))
    ],
LinearRegression())
st_mod.fit(X_train, y_train)
print("Final predict: ", st_mod.score(X_test, y_test))
print("LinearSVR predict: ",st_mod.named_estimators_['svr'].score(X_test, y_test))
print("RidgeCV predict: ",st_mod.named_estimators_['lr'].score(X_test, y_test))
print("DecisionTreeRegressor predict: ",st_mod.named_estimators_['dt'].score(X_test, y_test))

Final predict:  0.8343093270667657
LinearSVR predict:  -4.5909216823676715
RidgeCV predict:  0.842347539182106
DecisionTreeRegressor predict:  0.7409260262163782
