In [182]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy import stats

#Визуализация
import matplotlib.pyplot as plt
import seaborn as sns
#метрики и модели
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
#clustering
from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import rand_score, jaccard_score, adjusted_mutual_info_score, silhouette_score, mean_squared_error
from sklearn.cluster import KMeans, AffinityPropagation, DBSCAN

import optuna

In [183]:
test = pd.read_csv('/Users/andrei/repos/HousePricing/Data/test.csv')
train = pd.read_csv('/Users/andrei/repos/HousePricing/Data/train.csv')

In [184]:
train['SalePrice'] = np.log1p(train['SalePrice'])
target = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

fdf = pd.concat([train_features, test_features]).reset_index(drop=True)
print(fdf.shape)

(2919, 80)


In [185]:
#Избавляемся от нулевых значений
nonelist = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType')

zerolist = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtHalfBath', 'BsmtFullBath', 'MasVnrArea')

for col in nonelist:
    fdf[col].fillna('None', inplace=True)
    
for col in zerolist:
    fdf[col].fillna(0, inplace=True)
    
fdf["LotFrontage"] = fdf.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

fdf['MSZoning'].fillna('RL', inplace=True)
fdf['Functional'].fillna('Typ', inplace=True)
fdf['Electrical'].fillna('SBrkr', inplace=True)
fdf['KitchenQual'].fillna('TA', inplace=True)
fdf['Exterior1st'].fillna('TA', inplace=True)
fdf['Exterior2nd'].fillna('TA', inplace=True)
fdf['SaleType'].fillna('WD', inplace=True)
fdf['Utilities'].fillna('AllPub', inplace=True)

In [186]:
fdf['TotalSF']=fdf['TotalBsmtSF'] + fdf['1stFlrSF'] + fdf['2ndFlrSF']

fdf['Total_Bathrooms'] = (fdf['FullBath'] + (0.5 * fdf['HalfBath']) +
                               fdf['BsmtFullBath'] + (0.5 * fdf['BsmtHalfBath']))

fdf['Total_porch_sf'] = (fdf['OpenPorchSF'] + fdf['3SsnPorch'] +
                              fdf['EnclosedPorch'] + fdf['ScreenPorch'] +
                              fdf['WoodDeckSF'])

fdf['haspool'] = fdf['PoolArea'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['has2ndfloor'] = fdf['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['hasgarage'] = fdf['GarageArea'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['hasbsmt'] = fdf['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['hasfireplace'] = fdf['Fireplaces'].apply(lambda x: 1 if x > 0 else 0).apply(str)

In [187]:
fdf['MSSubClass'] = fdf['MSSubClass'].apply(str)
fdf['YrSold'] = fdf['YrSold'].astype(str)
fdf['MoSold'] = fdf['MoSold'].astype(str)
fdf['OverallCond'] = fdf['OverallCond'].astype(str)

In [188]:
cfdf = pd.concat([fdf[:1460], target], axis = 1)
corrmat = cfdf.corr()
corfeat = corrmat['SalePrice'][abs(corrmat['SalePrice']) >= 0.6].sort_values(ascending=False)
clist = corfeat.index.tolist()
clist.remove('SalePrice')
cldf = pd.DataFrame(fdf[clist])
scaler = StandardScaler()
scaledX = scaler.fit_transform(cldf)
cluster = KMeans(3, random_state=10)
cluster_labels = cluster.fit_predict(scaledX)
fdf['Cluster'] = cluster_labels
fdf

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,SaleCondition,TotalSF,Total_Bathrooms,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace,Cluster
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,Normal,2566.0,3.5,61,0,1,1,1,0,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,Normal,2524.0,2.5,298,0,0,1,1,1,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,Normal,2706.0,3.5,42,0,1,1,1,1,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,Abnorml,2473.0,2.0,307,0,1,1,1,1,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,Normal,3343.0,3.5,276,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,Normal,1638.0,1.5,0,0,1,0,1,0,2
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,Abnorml,1638.0,1.5,24,0,1,1,1,0,2
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,Abnorml,2448.0,2.0,474,0,0,1,1,1,0
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,Normal,1882.0,1.5,112,0,0,0,1,0,2


In [189]:
clabels = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope' \
,'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl' \
,'Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual' \
,'BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir' \
,'Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual' \
,'GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']

In [190]:
encoder = CatBoostEncoder()
enc_df = encoder.fit_transform(fdf[:1460], target)


test_encoded = encoder.transform(fdf[1460:])

In [191]:
scaler = StandardScaler()
scaledX = scaler.fit_transform(enc_df)
scaledtest = scaler.transform(test_encoded)

#xtrain, xvalid, ytrain, yvalid = train_test_split(scaledX, target, test_size = 0.25)

In [192]:
xtrain, xeval, ytrain, yeval = train_test_split(scaledX, target, test_size=0.25, random_state=RANDOM_SEED)

In [193]:
SAMPLE_RATE = 0.4
RANDOM_SEED = 666
EARLY_STOPPING_ROUND = 100

In [194]:
def objective(trial):
    param = {}
    param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.03, 0.08, 0.005)
    param['depth'] = trial.suggest_int('depth', 2, 5)
    #param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 2.0, 5.0, 0.5)
    #param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    param['grow_policy'] = 'Depthwise'
    param['iterations'] = trial.suggest_int('iterations', 1800, 2200)
    param['use_best_model'] = True
    param['eval_metric'] = 'MSLE'
    param['od_type'] = 'iter'
    param['od_wait'] = 20
    param['random_state'] = RANDOM_SEED
    param['logging_level'] = 'Silent'
    
    regressor = CatBoostRegressor(**param)

    regressor.fit(xtrain.copy(), ytrain.copy(),
                  eval_set=[(xeval.copy(), yeval.copy())],
                  early_stopping_rounds=EARLY_STOPPING_ROUND)
    loss = mean_squared_error(yvalid, regressor.predict(xvalid.copy()))
    return loss

In [195]:
study = optuna.create_study(study_name=f'catboost-seed{RANDOM_SEED}')
study.optimize(objective, n_trials=100, n_jobs=-1, timeout=24000)

[32m[I 2022-01-28 11:29:53,241][0m A new study created in memory with name: catboost-seed666[0m

`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.

Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m[I 2022-01-28 11:30:00,871][0m Trial 2 finished with value: 0.011667667959739399 and parameters: {'learning_rate': 0.03, 'depth': 2, 'iterations': 2047}. Best is trial 2 with value: 0.011667667959739399.[0m
[32m[I 2022-01-28 11:30:02,952][0m Trial 3 finished with value: 0.01029607415315263 and parameters: {'learning_rate': 0.04, 'depth': 3, 'iterations': 1925}. Best is trial 3 with value: 0.01029607415315263.[0m
[32m[I 2022-01-28 11:30:11,352][0

[32m[I 2022-01-28 11:33:17,889][0m Trial 35 finished with value: 0.00905781784045752 and parameters: {'learning_rate': 0.05, 'depth': 5, 'iterations': 2156}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:33:26,478][0m Trial 37 finished with value: 0.00914901786650638 and parameters: {'learning_rate': 0.06, 'depth': 4, 'iterations': 2197}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:33:28,143][0m Trial 38 finished with value: 0.00914901786650638 and parameters: {'learning_rate': 0.06, 'depth': 4, 'iterations': 2057}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:33:35,446][0m Trial 41 finished with value: 0.011667667959739399 and parameters: {'learning_rate': 0.03, 'depth': 2, 'iterations': 2110}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:33:42,024][0m Trial 40 finished with value: 0.009419118412094639 and parameters: {'learning_rate': 0.03, 'depth': 3,

[32m[I 2022-01-28 11:36:50,727][0m Trial 73 finished with value: 0.008762220806961025 and parameters: {'learning_rate': 0.05, 'depth': 4, 'iterations': 2100}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:36:57,008][0m Trial 75 finished with value: 0.008762220806961025 and parameters: {'learning_rate': 0.05, 'depth': 4, 'iterations': 2187}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:37:05,017][0m Trial 76 finished with value: 0.008762220806961025 and parameters: {'learning_rate': 0.05, 'depth': 4, 'iterations': 1940}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:37:09,928][0m Trial 74 finished with value: 0.00883683084136634 and parameters: {'learning_rate': 0.03, 'depth': 4, 'iterations': 2166}. Best is trial 15 with value: 0.008762220806961025.[0m
[32m[I 2022-01-28 11:37:11,011][0m Trial 77 finished with value: 0.008762220806961025 and parameters: {'learning_rate': 0.05, 'depth': 

In [196]:
study.best_params

{'learning_rate': 0.05, 'depth': 4, 'iterations': 2143}

In [197]:
study.best_params

{'learning_rate': 0.05, 'depth': 4, 'iterations': 2143}

In [198]:
optuna.visualization.plot_optimization_history(study)

In [199]:
optuna.visualization.plot_slice(study)

In [200]:
optuna.visualization.plot_contour(study, params=['learning_rate',
                                                 'depth'
                                                 ])

In [201]:
optuna.visualization.plot_param_importances(study)

In [202]:
ocbr = CatBoostRegressor(learning_rate=study.best_params['learning_rate'],
                                        depth=study.best_params['depth'],
                                        #l2_leaf_reg=study.best_params['l2_leaf_reg'],
                                        #min_child_samples=study.best_params['min_child_samples'],
                                        grow_policy='Depthwise',
                                        iterations=study.best_params['iterations'],
                                        use_best_model=True,
                                        eval_metric='MSLE',
                                        od_type='iter',
                                        od_wait=20,
                                        random_state=RANDOM_SEED,
                                        logging_level='Silent')

In [203]:
ocbr.fit(xtrain.copy(), ytrain.copy(),
                        eval_set=[(xeval.copy(), yeval.copy())],
                        early_stopping_rounds=EARLY_STOPPING_ROUND)

<catboost.core.CatBoostRegressor at 0x7febb9552670>

In [204]:
testresult = np.expm1(ocbr.predict(scaledtest))

In [205]:
t = pd.concat([test, pd.DataFrame(testresult)], axis=1)

In [206]:
t.rename(columns={0:'SalePrice'}, inplace=True)

In [207]:
t[['Id', 'SalePrice']].to_csv('submission.csv', sep=',', index=False)

In [208]:
testresult

array([122859.51760635, 152683.12258143, 188249.37704531, ...,
       156211.27611993, 121462.16740873, 204592.67434858])