In [257]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy import stats

#Визуализация
import matplotlib.pyplot as plt
import seaborn as sns
#метрики и модели
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
#clustering
from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import rand_score, jaccard_score, adjusted_mutual_info_score, silhouette_score, mean_squared_error
from sklearn.cluster import KMeans, AffinityPropagation, DBSCAN

import optuna

In [258]:
test = pd.read_csv('/Users/andrei/repos/HousePricing/Data/test.csv')
train = pd.read_csv('/Users/andrei/repos/HousePricing/Data/train.csv')

In [259]:
train['SalePrice'] = np.log1p(train['SalePrice'])
target = train['SalePrice'].reset_index(drop=True)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

fdf = pd.concat([train_features, test_features]).reset_index(drop=True)
print(fdf.shape)

(2919, 80)


In [260]:
#Избавляемся от нулевых значений
nonelist = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType')

zerolist = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtHalfBath', 'BsmtFullBath', 'MasVnrArea')

for col in nonelist:
    fdf[col].fillna('None', inplace=True)
    
for col in zerolist:
    fdf[col].fillna(0, inplace=True)
    
fdf["LotFrontage"] = fdf.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

fdf['MSZoning'].fillna('RL', inplace=True)
fdf['Functional'].fillna('Typ', inplace=True)
fdf['Electrical'].fillna('SBrkr', inplace=True)
fdf['KitchenQual'].fillna('TA', inplace=True)
fdf['Exterior1st'].fillna('TA', inplace=True)
fdf['Exterior2nd'].fillna('TA', inplace=True)
fdf['SaleType'].fillna('WD', inplace=True)
fdf['Utilities'].fillna('AllPub', inplace=True)

In [261]:
fdf['TotalSF']=fdf['TotalBsmtSF'] + fdf['1stFlrSF'] + fdf['2ndFlrSF']

fdf['Total_Bathrooms'] = (fdf['FullBath'] + (0.5 * fdf['HalfBath']) +
                               fdf['BsmtFullBath'] + (0.5 * fdf['BsmtHalfBath']))

fdf['Total_porch_sf'] = (fdf['OpenPorchSF'] + fdf['3SsnPorch'] +
                              fdf['EnclosedPorch'] + fdf['ScreenPorch'] +
                              fdf['WoodDeckSF'])

fdf['haspool'] = fdf['PoolArea'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['has2ndfloor'] = fdf['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['hasgarage'] = fdf['GarageArea'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['hasbsmt'] = fdf['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0).apply(str)
fdf['hasfireplace'] = fdf['Fireplaces'].apply(lambda x: 1 if x > 0 else 0).apply(str)

In [262]:
fdf['MSSubClass'] = fdf['MSSubClass'].apply(str)
fdf['YrSold'] = fdf['YrSold'].astype(str)
fdf['MoSold'] = fdf['MoSold'].astype(str)
fdf['OverallCond'] = fdf['OverallCond'].astype(str)

In [263]:
cfdf = pd.concat([fdf[:1460], target], axis = 1)
corrmat = cfdf.corr()
corfeat = corrmat['SalePrice'][abs(corrmat['SalePrice']) >= 0.6].sort_values(ascending=False)
clist = corfeat.index.tolist()
clist.remove('SalePrice')
cldf = pd.DataFrame(fdf[clist])
scaler = StandardScaler()
scaledX = scaler.fit_transform(cldf)
cluster = KMeans(3, random_state=10)
cluster_labels = cluster.fit_predict(scaledX)
fdf['Cluster'] = cluster_labels
fdf

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,SaleCondition,TotalSF,Total_Bathrooms,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace,Cluster
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,Normal,2566.0,3.5,61,0,1,1,1,0,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,Normal,2524.0,2.5,298,0,0,1,1,1,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,Normal,2706.0,3.5,42,0,1,1,1,1,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,Abnorml,2473.0,2.0,307,0,1,1,1,1,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,Normal,3343.0,3.5,276,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,Normal,1638.0,1.5,0,0,1,0,1,0,2
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,Abnorml,1638.0,1.5,24,0,1,1,1,0,2
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,Abnorml,2448.0,2.0,474,0,0,1,1,1,0
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,Normal,1882.0,1.5,112,0,0,0,1,0,2


In [264]:
clabels = ['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope' \
,'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl' \
,'Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual' \
,'BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir' \
,'Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual' \
,'GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']

In [265]:
encoder = CatBoostEncoder()
enc_df = encoder.fit_transform(fdf[:1460], target)


test_encoded = encoder.transform(fdf[1460:])

In [266]:
scaler = StandardScaler()
scaledX = scaler.fit_transform(enc_df)
scaledtest = scaler.transform(test_encoded)

#xtrain, xvalid, ytrain, yvalid = train_test_split(scaledX, target, test_size = 0.25)

In [267]:
xtrain, xeval, ytrain, yeval = train_test_split(scaledX, target, test_size=0.25, random_state=RANDOM_SEED)

In [268]:
SAMPLE_RATE = 0.4
RANDOM_SEED = 666
EARLY_STOPPING_ROUND = 100

In [269]:
cbm = CatBoostRegressor(depth=4, iterations=2000, learning_rate=0.05)
cbm.fit(xtrain, ytrain)

0:	learn: 0.3892665	total: 3.73ms	remaining: 7.46s
1:	learn: 0.3768492	total: 6.55ms	remaining: 6.54s
2:	learn: 0.3653433	total: 9.05ms	remaining: 6.02s
3:	learn: 0.3539064	total: 12.1ms	remaining: 6.05s
4:	learn: 0.3433328	total: 15.1ms	remaining: 6.03s
5:	learn: 0.3332133	total: 18.4ms	remaining: 6.13s
6:	learn: 0.3228088	total: 21ms	remaining: 5.99s
7:	learn: 0.3137100	total: 23.5ms	remaining: 5.85s
8:	learn: 0.3052852	total: 26.8ms	remaining: 5.94s
9:	learn: 0.2975100	total: 29.4ms	remaining: 5.86s
10:	learn: 0.2899944	total: 32.2ms	remaining: 5.82s
11:	learn: 0.2827955	total: 35.1ms	remaining: 5.81s
12:	learn: 0.2773907	total: 37.7ms	remaining: 5.76s
13:	learn: 0.2698895	total: 40.9ms	remaining: 5.8s
14:	learn: 0.2634374	total: 43.5ms	remaining: 5.75s
15:	learn: 0.2572233	total: 47.1ms	remaining: 5.83s
16:	learn: 0.2519887	total: 50.2ms	remaining: 5.86s
17:	learn: 0.2466961	total: 52.8ms	remaining: 5.82s
18:	learn: 0.2413892	total: 56.1ms	remaining: 5.85s
19:	learn: 0.2364190	tota

201:	learn: 0.0970358	total: 582ms	remaining: 5.18s
202:	learn: 0.0968513	total: 587ms	remaining: 5.2s
203:	learn: 0.0965210	total: 593ms	remaining: 5.22s
204:	learn: 0.0964162	total: 597ms	remaining: 5.23s
205:	learn: 0.0963546	total: 601ms	remaining: 5.23s
206:	learn: 0.0960487	total: 604ms	remaining: 5.23s
207:	learn: 0.0957984	total: 607ms	remaining: 5.22s
208:	learn: 0.0956387	total: 609ms	remaining: 5.22s
209:	learn: 0.0954391	total: 612ms	remaining: 5.22s
210:	learn: 0.0952494	total: 616ms	remaining: 5.22s
211:	learn: 0.0950603	total: 619ms	remaining: 5.22s
212:	learn: 0.0948730	total: 622ms	remaining: 5.21s
213:	learn: 0.0946697	total: 624ms	remaining: 5.21s
214:	learn: 0.0946047	total: 627ms	remaining: 5.21s
215:	learn: 0.0945183	total: 630ms	remaining: 5.2s
216:	learn: 0.0943613	total: 632ms	remaining: 5.19s
217:	learn: 0.0941479	total: 635ms	remaining: 5.19s
218:	learn: 0.0939997	total: 638ms	remaining: 5.18s
219:	learn: 0.0939459	total: 640ms	remaining: 5.18s
220:	learn: 0.

409:	learn: 0.0709488	total: 1.16s	remaining: 4.49s
410:	learn: 0.0708423	total: 1.16s	remaining: 4.5s
411:	learn: 0.0707666	total: 1.17s	remaining: 4.5s
412:	learn: 0.0706641	total: 1.17s	remaining: 4.5s
413:	learn: 0.0705238	total: 1.17s	remaining: 4.5s
414:	learn: 0.0703797	total: 1.18s	remaining: 4.5s
415:	learn: 0.0702972	total: 1.18s	remaining: 4.5s
416:	learn: 0.0701775	total: 1.18s	remaining: 4.49s
417:	learn: 0.0700993	total: 1.19s	remaining: 4.49s
418:	learn: 0.0700412	total: 1.19s	remaining: 4.48s
419:	learn: 0.0700052	total: 1.19s	remaining: 4.48s
420:	learn: 0.0698257	total: 1.19s	remaining: 4.48s
421:	learn: 0.0697531	total: 1.2s	remaining: 4.48s
422:	learn: 0.0697365	total: 1.2s	remaining: 4.47s
423:	learn: 0.0696782	total: 1.2s	remaining: 4.47s
424:	learn: 0.0695909	total: 1.2s	remaining: 4.46s
425:	learn: 0.0695656	total: 1.21s	remaining: 4.46s
426:	learn: 0.0694904	total: 1.21s	remaining: 4.46s
427:	learn: 0.0694023	total: 1.21s	remaining: 4.45s
428:	learn: 0.0693040	

628:	learn: 0.0558506	total: 1.73s	remaining: 3.78s
629:	learn: 0.0557848	total: 1.74s	remaining: 3.78s
630:	learn: 0.0557436	total: 1.74s	remaining: 3.78s
631:	learn: 0.0556810	total: 1.75s	remaining: 3.78s
632:	learn: 0.0556377	total: 1.75s	remaining: 3.78s
633:	learn: 0.0556121	total: 1.75s	remaining: 3.78s
634:	learn: 0.0554973	total: 1.75s	remaining: 3.77s
635:	learn: 0.0554464	total: 1.76s	remaining: 3.77s
636:	learn: 0.0553649	total: 1.76s	remaining: 3.77s
637:	learn: 0.0552794	total: 1.76s	remaining: 3.77s
638:	learn: 0.0552257	total: 1.77s	remaining: 3.76s
639:	learn: 0.0551732	total: 1.77s	remaining: 3.76s
640:	learn: 0.0550810	total: 1.77s	remaining: 3.76s
641:	learn: 0.0550020	total: 1.77s	remaining: 3.75s
642:	learn: 0.0549733	total: 1.78s	remaining: 3.75s
643:	learn: 0.0549487	total: 1.78s	remaining: 3.75s
644:	learn: 0.0548758	total: 1.78s	remaining: 3.74s
645:	learn: 0.0548390	total: 1.78s	remaining: 3.74s
646:	learn: 0.0547625	total: 1.79s	remaining: 3.74s
647:	learn: 

844:	learn: 0.0451489	total: 2.3s	remaining: 3.15s
845:	learn: 0.0451450	total: 2.31s	remaining: 3.15s
846:	learn: 0.0450974	total: 2.31s	remaining: 3.15s
847:	learn: 0.0450574	total: 2.31s	remaining: 3.15s
848:	learn: 0.0450033	total: 2.32s	remaining: 3.14s
849:	learn: 0.0449828	total: 2.32s	remaining: 3.14s
850:	learn: 0.0449338	total: 2.32s	remaining: 3.14s
851:	learn: 0.0449033	total: 2.33s	remaining: 3.13s
852:	learn: 0.0448606	total: 2.33s	remaining: 3.13s
853:	learn: 0.0448263	total: 2.33s	remaining: 3.13s
854:	learn: 0.0447525	total: 2.33s	remaining: 3.13s
855:	learn: 0.0446998	total: 2.34s	remaining: 3.12s
856:	learn: 0.0446427	total: 2.34s	remaining: 3.12s
857:	learn: 0.0446280	total: 2.34s	remaining: 3.12s
858:	learn: 0.0445857	total: 2.35s	remaining: 3.12s
859:	learn: 0.0445291	total: 2.35s	remaining: 3.11s
860:	learn: 0.0444570	total: 2.35s	remaining: 3.11s
861:	learn: 0.0444102	total: 2.35s	remaining: 3.11s
862:	learn: 0.0443667	total: 2.36s	remaining: 3.1s
863:	learn: 0.

1063:	learn: 0.0369670	total: 2.88s	remaining: 2.53s
1064:	learn: 0.0369414	total: 2.88s	remaining: 2.53s
1065:	learn: 0.0369302	total: 2.89s	remaining: 2.53s
1066:	learn: 0.0369217	total: 2.89s	remaining: 2.53s
1067:	learn: 0.0368672	total: 2.89s	remaining: 2.52s
1068:	learn: 0.0368605	total: 2.9s	remaining: 2.52s
1069:	learn: 0.0368236	total: 2.9s	remaining: 2.52s
1070:	learn: 0.0367920	total: 2.9s	remaining: 2.52s
1071:	learn: 0.0367748	total: 2.9s	remaining: 2.52s
1072:	learn: 0.0367367	total: 2.91s	remaining: 2.51s
1073:	learn: 0.0367086	total: 2.91s	remaining: 2.51s
1074:	learn: 0.0366665	total: 2.91s	remaining: 2.51s
1075:	learn: 0.0366031	total: 2.92s	remaining: 2.5s
1076:	learn: 0.0365660	total: 2.92s	remaining: 2.5s
1077:	learn: 0.0365049	total: 2.92s	remaining: 2.5s
1078:	learn: 0.0364846	total: 2.92s	remaining: 2.5s
1079:	learn: 0.0364625	total: 2.92s	remaining: 2.49s
1080:	learn: 0.0364184	total: 2.93s	remaining: 2.49s
1081:	learn: 0.0363883	total: 2.93s	remaining: 2.49s
1

1284:	learn: 0.0304137	total: 3.45s	remaining: 1.92s
1285:	learn: 0.0303981	total: 3.46s	remaining: 1.92s
1286:	learn: 0.0303800	total: 3.46s	remaining: 1.92s
1287:	learn: 0.0303605	total: 3.47s	remaining: 1.92s
1288:	learn: 0.0303303	total: 3.47s	remaining: 1.91s
1289:	learn: 0.0303016	total: 3.47s	remaining: 1.91s
1290:	learn: 0.0302810	total: 3.48s	remaining: 1.91s
1291:	learn: 0.0302679	total: 3.48s	remaining: 1.91s
1292:	learn: 0.0302386	total: 3.48s	remaining: 1.9s
1293:	learn: 0.0302208	total: 3.48s	remaining: 1.9s
1294:	learn: 0.0302143	total: 3.49s	remaining: 1.9s
1295:	learn: 0.0301771	total: 3.49s	remaining: 1.9s
1296:	learn: 0.0301369	total: 3.49s	remaining: 1.89s
1297:	learn: 0.0301287	total: 3.49s	remaining: 1.89s
1298:	learn: 0.0300882	total: 3.5s	remaining: 1.89s
1299:	learn: 0.0300445	total: 3.5s	remaining: 1.88s
1300:	learn: 0.0300127	total: 3.5s	remaining: 1.88s
1301:	learn: 0.0299688	total: 3.5s	remaining: 1.88s
1302:	learn: 0.0299411	total: 3.51s	remaining: 1.88s
1

1503:	learn: 0.0256562	total: 4.03s	remaining: 1.33s
1504:	learn: 0.0256521	total: 4.03s	remaining: 1.33s
1505:	learn: 0.0256335	total: 4.04s	remaining: 1.32s
1506:	learn: 0.0256317	total: 4.04s	remaining: 1.32s
1507:	learn: 0.0256223	total: 4.04s	remaining: 1.32s
1508:	learn: 0.0256121	total: 4.05s	remaining: 1.32s
1509:	learn: 0.0255915	total: 4.05s	remaining: 1.31s
1510:	learn: 0.0255629	total: 4.05s	remaining: 1.31s
1511:	learn: 0.0255287	total: 4.06s	remaining: 1.31s
1512:	learn: 0.0255064	total: 4.06s	remaining: 1.31s
1513:	learn: 0.0254841	total: 4.06s	remaining: 1.3s
1514:	learn: 0.0254607	total: 4.06s	remaining: 1.3s
1515:	learn: 0.0254312	total: 4.07s	remaining: 1.3s
1516:	learn: 0.0254267	total: 4.07s	remaining: 1.29s
1517:	learn: 0.0253985	total: 4.07s	remaining: 1.29s
1518:	learn: 0.0253918	total: 4.07s	remaining: 1.29s
1519:	learn: 0.0253709	total: 4.08s	remaining: 1.29s
1520:	learn: 0.0253608	total: 4.08s	remaining: 1.28s
1521:	learn: 0.0253308	total: 4.08s	remaining: 1.

1717:	learn: 0.0217862	total: 4.61s	remaining: 756ms
1718:	learn: 0.0217826	total: 4.61s	remaining: 754ms
1719:	learn: 0.0217721	total: 4.61s	remaining: 751ms
1720:	learn: 0.0217497	total: 4.62s	remaining: 749ms
1721:	learn: 0.0217183	total: 4.62s	remaining: 746ms
1722:	learn: 0.0217137	total: 4.62s	remaining: 743ms
1723:	learn: 0.0216864	total: 4.63s	remaining: 741ms
1724:	learn: 0.0216494	total: 4.63s	remaining: 738ms
1725:	learn: 0.0216353	total: 4.63s	remaining: 735ms
1726:	learn: 0.0216096	total: 4.63s	remaining: 733ms
1727:	learn: 0.0215953	total: 4.64s	remaining: 730ms
1728:	learn: 0.0215777	total: 4.64s	remaining: 727ms
1729:	learn: 0.0215577	total: 4.64s	remaining: 725ms
1730:	learn: 0.0215360	total: 4.64s	remaining: 722ms
1731:	learn: 0.0215211	total: 4.65s	remaining: 719ms
1732:	learn: 0.0215051	total: 4.65s	remaining: 716ms
1733:	learn: 0.0214859	total: 4.65s	remaining: 714ms
1734:	learn: 0.0214844	total: 4.66s	remaining: 711ms
1735:	learn: 0.0214655	total: 4.66s	remaining:

1935:	learn: 0.0182734	total: 5.18s	remaining: 171ms
1936:	learn: 0.0182612	total: 5.18s	remaining: 169ms
1937:	learn: 0.0182562	total: 5.19s	remaining: 166ms
1938:	learn: 0.0182442	total: 5.19s	remaining: 163ms
1939:	learn: 0.0182255	total: 5.2s	remaining: 161ms
1940:	learn: 0.0182022	total: 5.2s	remaining: 158ms
1941:	learn: 0.0181860	total: 5.2s	remaining: 155ms
1942:	learn: 0.0181725	total: 5.2s	remaining: 153ms
1943:	learn: 0.0181578	total: 5.21s	remaining: 150ms
1944:	learn: 0.0181456	total: 5.21s	remaining: 147ms
1945:	learn: 0.0181343	total: 5.21s	remaining: 145ms
1946:	learn: 0.0181272	total: 5.21s	remaining: 142ms
1947:	learn: 0.0181239	total: 5.22s	remaining: 139ms
1948:	learn: 0.0181035	total: 5.22s	remaining: 137ms
1949:	learn: 0.0180902	total: 5.22s	remaining: 134ms
1950:	learn: 0.0180740	total: 5.22s	remaining: 131ms
1951:	learn: 0.0180686	total: 5.23s	remaining: 129ms
1952:	learn: 0.0180478	total: 5.23s	remaining: 126ms
1953:	learn: 0.0180225	total: 5.23s	remaining: 123

<catboost.core.CatBoostRegressor at 0x7febb814c0a0>

In [270]:
testresult = np.expm1(cbm.predict(scaledtest))

In [271]:
t = pd.concat([test, pd.DataFrame(testresult)], axis=1)

In [272]:
t.rename(columns={0:'SalePrice'}, inplace=True)

In [273]:
t[['Id', 'SalePrice']].to_csv('pseudolabel1.csv', sep=',', index=False)

In [274]:
testresult

array([125396.29995734, 168560.91229262, 187936.08872831, ...,
       153548.69765196, 108739.25654903, 214676.79245005])

In [275]:
test2 = pd.read_csv('/Users/andrei/repos/HousePricing/Data/test.csv')
train2 = pd.read_csv('/Users/andrei/repos/HousePricing/Data/train.csv')

In [276]:
test2['SalePrice'] = testresult

In [277]:
mdf = pd.concat([train2, test2]).reset_index(drop=True)
print(mdf.shape)

(2919, 81)


In [278]:
mdf

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.000000
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.000000
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.000000
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.000000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,76624.371335
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,81718.543408
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,153548.697652
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,108739.256549


In [279]:
#Избавляемся от нулевых значений
nonelist = ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType')

zerolist = ('GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtHalfBath', 'BsmtFullBath', 'MasVnrArea')

for col in nonelist:
    mdf[col].fillna('None', inplace=True)
    
for col in zerolist:
    mdf[col].fillna(0, inplace=True)
    
mdf["LotFrontage"] = mdf.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

mdf['MSZoning'].fillna('RL', inplace=True)
mdf['Functional'].fillna('Typ', inplace=True)
mdf['Electrical'].fillna('SBrkr', inplace=True)
mdf['KitchenQual'].fillna('TA', inplace=True)
mdf['Exterior1st'].fillna('TA', inplace=True)
mdf['Exterior2nd'].fillna('TA', inplace=True)
mdf['SaleType'].fillna('WD', inplace=True)
mdf['Utilities'].fillna('AllPub', inplace=True)

In [280]:
mdf['TotalSF']=mdf['TotalBsmtSF'] + mdf['1stFlrSF'] + mdf['2ndFlrSF']

mdf['Total_Bathrooms'] = (mdf['FullBath'] + (0.5 * mdf['HalfBath']) +
                               mdf['BsmtFullBath'] + (0.5 * mdf['BsmtHalfBath']))

mdf['MSSubClass'] = mdf['MSSubClass'].apply(str)
mdf['YrSold'] = mdf['YrSold'].astype(str)
mdf['MoSold'] = mdf['MoSold'].astype(str)
mdf['OverallCond'] = mdf['OverallCond'].astype(str)

In [281]:
corrmat = mdf.corr()
corfeat = corrmat['SalePrice'][abs(corrmat['SalePrice']) >= 0.6].sort_values(ascending=False)
clist = corfeat.index.tolist()
clist.remove('SalePrice')
cldf = pd.DataFrame(mdf[clist])
scaler = StandardScaler()
scaledX = scaler.fit_transform(cldf)
cluster = KMeans(3, random_state=10)
cluster_labels = cluster.fit_predict(scaledX)
mdf['Cluster'] = cluster_labels
mdf

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,TotalSF,Total_Bathrooms,Cluster
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,0,2,2008,WD,Normal,208500.000000,2566.0,3.5,1
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,0,5,2007,WD,Normal,181500.000000,2524.0,2.5,1
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,0,9,2008,WD,Normal,223500.000000,2706.0,3.5,1
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,0,2,2006,WD,Abnorml,140000.000000,2473.0,2.0,1
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,0,12,2008,WD,Normal,250000.000000,3343.0,3.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,,0,6,2006,WD,Normal,76624.371335,1638.0,1.5,2
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,,0,4,2006,WD,Abnorml,81718.543408,1638.0,1.5,2
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,,0,9,2006,WD,Abnorml,153548.697652,2448.0,2.0,1
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,Shed,700,7,2006,WD,Normal,108739.256549,1882.0,1.5,2


In [282]:
mdf['SalePrice'] = np.log1p(mdf['SalePrice'])
target2 = mdf['SalePrice']
features2 = mdf.drop(['SalePrice'], axis=1)

In [283]:
encoder = CatBoostEncoder()
enc_mdf = encoder.fit_transform(features2, target2)

In [284]:
scaler = StandardScaler()
scaledX2 = scaler.fit_transform(enc_mdf)

x2train, x2valid, y2train, y2valid = train_test_split(scaledX2, target2, test_size = 0.25)

In [285]:
cbr2 = CatBoostRegressor(depth=4, iterations=2000, learning_rate=0.05)
cbr2.fit(xtrain, ytrain)

0:	learn: 0.3892665	total: 2.73ms	remaining: 5.46s
1:	learn: 0.3768492	total: 5.3ms	remaining: 5.29s
2:	learn: 0.3653433	total: 7.79ms	remaining: 5.18s
3:	learn: 0.3539064	total: 10.5ms	remaining: 5.23s
4:	learn: 0.3433328	total: 13.1ms	remaining: 5.22s
5:	learn: 0.3332133	total: 15.6ms	remaining: 5.19s
6:	learn: 0.3228088	total: 18.3ms	remaining: 5.2s
7:	learn: 0.3137100	total: 20.8ms	remaining: 5.18s
8:	learn: 0.3052852	total: 23.3ms	remaining: 5.16s
9:	learn: 0.2975100	total: 25.9ms	remaining: 5.15s
10:	learn: 0.2899944	total: 28.4ms	remaining: 5.14s
11:	learn: 0.2827955	total: 30.9ms	remaining: 5.12s
12:	learn: 0.2773907	total: 33.5ms	remaining: 5.11s
13:	learn: 0.2698895	total: 36ms	remaining: 5.1s
14:	learn: 0.2634374	total: 38.7ms	remaining: 5.12s
15:	learn: 0.2572233	total: 41.3ms	remaining: 5.12s
16:	learn: 0.2519887	total: 43.8ms	remaining: 5.11s
17:	learn: 0.2466961	total: 46.4ms	remaining: 5.11s
18:	learn: 0.2413892	total: 48.9ms	remaining: 5.1s
19:	learn: 0.2364190	total: 

207:	learn: 0.0957984	total: 580ms	remaining: 5s
208:	learn: 0.0956387	total: 585ms	remaining: 5.01s
209:	learn: 0.0954391	total: 590ms	remaining: 5.03s
210:	learn: 0.0952494	total: 593ms	remaining: 5.03s
211:	learn: 0.0950603	total: 597ms	remaining: 5.04s
212:	learn: 0.0948730	total: 600ms	remaining: 5.03s
213:	learn: 0.0946697	total: 602ms	remaining: 5.03s
214:	learn: 0.0946047	total: 606ms	remaining: 5.03s
215:	learn: 0.0945183	total: 608ms	remaining: 5.03s
216:	learn: 0.0943613	total: 611ms	remaining: 5.02s
217:	learn: 0.0941479	total: 614ms	remaining: 5.02s
218:	learn: 0.0939997	total: 617ms	remaining: 5.02s
219:	learn: 0.0939459	total: 619ms	remaining: 5.01s
220:	learn: 0.0937311	total: 622ms	remaining: 5.01s
221:	learn: 0.0936094	total: 625ms	remaining: 5s
222:	learn: 0.0934056	total: 628ms	remaining: 5s
223:	learn: 0.0932013	total: 630ms	remaining: 5s
224:	learn: 0.0931519	total: 633ms	remaining: 4.99s
225:	learn: 0.0929625	total: 635ms	remaining: 4.99s
226:	learn: 0.0929130	to

418:	learn: 0.0700412	total: 1.15s	remaining: 4.34s
419:	learn: 0.0700052	total: 1.16s	remaining: 4.35s
420:	learn: 0.0698257	total: 1.16s	remaining: 4.35s
421:	learn: 0.0697531	total: 1.16s	remaining: 4.35s
422:	learn: 0.0697365	total: 1.17s	remaining: 4.35s
423:	learn: 0.0696782	total: 1.17s	remaining: 4.35s
424:	learn: 0.0695909	total: 1.17s	remaining: 4.34s
425:	learn: 0.0695656	total: 1.18s	remaining: 4.34s
426:	learn: 0.0694904	total: 1.18s	remaining: 4.34s
427:	learn: 0.0694023	total: 1.18s	remaining: 4.33s
428:	learn: 0.0693040	total: 1.18s	remaining: 4.33s
429:	learn: 0.0691679	total: 1.19s	remaining: 4.33s
430:	learn: 0.0691050	total: 1.19s	remaining: 4.33s
431:	learn: 0.0690314	total: 1.19s	remaining: 4.32s
432:	learn: 0.0689325	total: 1.19s	remaining: 4.32s
433:	learn: 0.0688168	total: 1.2s	remaining: 4.32s
434:	learn: 0.0687964	total: 1.2s	remaining: 4.31s
435:	learn: 0.0686859	total: 1.2s	remaining: 4.31s
436:	learn: 0.0686317	total: 1.2s	remaining: 4.3s
437:	learn: 0.068

634:	learn: 0.0554973	total: 1.73s	remaining: 3.72s
635:	learn: 0.0554464	total: 1.73s	remaining: 3.72s
636:	learn: 0.0553649	total: 1.74s	remaining: 3.72s
637:	learn: 0.0552794	total: 1.74s	remaining: 3.72s
638:	learn: 0.0552257	total: 1.75s	remaining: 3.72s
639:	learn: 0.0551732	total: 1.75s	remaining: 3.72s
640:	learn: 0.0550810	total: 1.75s	remaining: 3.71s
641:	learn: 0.0550020	total: 1.75s	remaining: 3.71s
642:	learn: 0.0549733	total: 1.76s	remaining: 3.71s
643:	learn: 0.0549487	total: 1.76s	remaining: 3.71s
644:	learn: 0.0548758	total: 1.76s	remaining: 3.7s
645:	learn: 0.0548390	total: 1.76s	remaining: 3.7s
646:	learn: 0.0547625	total: 1.77s	remaining: 3.7s
647:	learn: 0.0547119	total: 1.77s	remaining: 3.69s
648:	learn: 0.0546984	total: 1.77s	remaining: 3.69s
649:	learn: 0.0546610	total: 1.78s	remaining: 3.69s
650:	learn: 0.0546024	total: 1.78s	remaining: 3.69s
651:	learn: 0.0545526	total: 1.78s	remaining: 3.68s
652:	learn: 0.0544772	total: 1.78s	remaining: 3.68s
653:	learn: 0.0

852:	learn: 0.0448606	total: 2.3s	remaining: 3.1s
853:	learn: 0.0448263	total: 2.31s	remaining: 3.1s
854:	learn: 0.0447525	total: 2.31s	remaining: 3.1s
855:	learn: 0.0446998	total: 2.32s	remaining: 3.1s
856:	learn: 0.0446427	total: 2.32s	remaining: 3.1s
857:	learn: 0.0446280	total: 2.33s	remaining: 3.1s
858:	learn: 0.0445857	total: 2.33s	remaining: 3.09s
859:	learn: 0.0445291	total: 2.33s	remaining: 3.09s
860:	learn: 0.0444570	total: 2.33s	remaining: 3.09s
861:	learn: 0.0444102	total: 2.33s	remaining: 3.08s
862:	learn: 0.0443667	total: 2.34s	remaining: 3.08s
863:	learn: 0.0443441	total: 2.34s	remaining: 3.08s
864:	learn: 0.0442957	total: 2.34s	remaining: 3.08s
865:	learn: 0.0442309	total: 2.35s	remaining: 3.07s
866:	learn: 0.0441834	total: 2.35s	remaining: 3.07s
867:	learn: 0.0441535	total: 2.35s	remaining: 3.07s
868:	learn: 0.0440887	total: 2.35s	remaining: 3.06s
869:	learn: 0.0440369	total: 2.36s	remaining: 3.06s
870:	learn: 0.0439881	total: 2.36s	remaining: 3.06s
871:	learn: 0.04391

1071:	learn: 0.0367748	total: 2.88s	remaining: 2.49s
1072:	learn: 0.0367367	total: 2.88s	remaining: 2.49s
1073:	learn: 0.0367086	total: 2.89s	remaining: 2.49s
1074:	learn: 0.0366665	total: 2.89s	remaining: 2.49s
1075:	learn: 0.0366031	total: 2.9s	remaining: 2.49s
1076:	learn: 0.0365660	total: 2.9s	remaining: 2.48s
1077:	learn: 0.0365049	total: 2.9s	remaining: 2.48s
1078:	learn: 0.0364846	total: 2.9s	remaining: 2.48s
1079:	learn: 0.0364625	total: 2.9s	remaining: 2.48s
1080:	learn: 0.0364184	total: 2.91s	remaining: 2.47s
1081:	learn: 0.0363883	total: 2.91s	remaining: 2.47s
1082:	learn: 0.0363806	total: 2.91s	remaining: 2.47s
1083:	learn: 0.0363375	total: 2.92s	remaining: 2.46s
1084:	learn: 0.0363034	total: 2.92s	remaining: 2.46s
1085:	learn: 0.0362774	total: 2.92s	remaining: 2.46s
1086:	learn: 0.0362392	total: 2.92s	remaining: 2.46s
1087:	learn: 0.0362101	total: 2.93s	remaining: 2.45s
1088:	learn: 0.0361523	total: 2.93s	remaining: 2.45s
1089:	learn: 0.0361454	total: 2.93s	remaining: 2.45

1290:	learn: 0.0302810	total: 3.46s	remaining: 1.9s
1291:	learn: 0.0302679	total: 3.46s	remaining: 1.9s
1292:	learn: 0.0302386	total: 3.46s	remaining: 1.89s
1293:	learn: 0.0302208	total: 3.47s	remaining: 1.89s
1294:	learn: 0.0302143	total: 3.47s	remaining: 1.89s
1295:	learn: 0.0301771	total: 3.47s	remaining: 1.89s
1296:	learn: 0.0301369	total: 3.48s	remaining: 1.88s
1297:	learn: 0.0301287	total: 3.48s	remaining: 1.88s
1298:	learn: 0.0300882	total: 3.48s	remaining: 1.88s
1299:	learn: 0.0300445	total: 3.48s	remaining: 1.88s
1300:	learn: 0.0300127	total: 3.49s	remaining: 1.87s
1301:	learn: 0.0299688	total: 3.49s	remaining: 1.87s
1302:	learn: 0.0299411	total: 3.49s	remaining: 1.87s
1303:	learn: 0.0299149	total: 3.5s	remaining: 1.86s
1304:	learn: 0.0298892	total: 3.5s	remaining: 1.86s
1305:	learn: 0.0298825	total: 3.5s	remaining: 1.86s
1306:	learn: 0.0298626	total: 3.5s	remaining: 1.86s
1307:	learn: 0.0298373	total: 3.5s	remaining: 1.85s
1308:	learn: 0.0297894	total: 3.51s	remaining: 1.85s


1511:	learn: 0.0255287	total: 4.03s	remaining: 1.3s
1512:	learn: 0.0255064	total: 4.03s	remaining: 1.3s
1513:	learn: 0.0254841	total: 4.04s	remaining: 1.3s
1514:	learn: 0.0254607	total: 4.04s	remaining: 1.29s
1515:	learn: 0.0254312	total: 4.05s	remaining: 1.29s
1516:	learn: 0.0254267	total: 4.05s	remaining: 1.29s
1517:	learn: 0.0253985	total: 4.05s	remaining: 1.29s
1518:	learn: 0.0253918	total: 4.05s	remaining: 1.28s
1519:	learn: 0.0253709	total: 4.06s	remaining: 1.28s
1520:	learn: 0.0253608	total: 4.06s	remaining: 1.28s
1521:	learn: 0.0253308	total: 4.06s	remaining: 1.28s
1522:	learn: 0.0252926	total: 4.07s	remaining: 1.27s
1523:	learn: 0.0252685	total: 4.07s	remaining: 1.27s
1524:	learn: 0.0252431	total: 4.07s	remaining: 1.27s
1525:	learn: 0.0252338	total: 4.07s	remaining: 1.26s
1526:	learn: 0.0252273	total: 4.08s	remaining: 1.26s
1527:	learn: 0.0252098	total: 4.08s	remaining: 1.26s
1528:	learn: 0.0252068	total: 4.08s	remaining: 1.26s
1529:	learn: 0.0251896	total: 4.08s	remaining: 1.

1731:	learn: 0.0215211	total: 4.61s	remaining: 713ms
1732:	learn: 0.0215051	total: 4.61s	remaining: 710ms
1733:	learn: 0.0214859	total: 4.62s	remaining: 708ms
1734:	learn: 0.0214844	total: 4.62s	remaining: 706ms
1735:	learn: 0.0214655	total: 4.62s	remaining: 703ms
1736:	learn: 0.0214495	total: 4.63s	remaining: 700ms
1737:	learn: 0.0214161	total: 4.63s	remaining: 698ms
1738:	learn: 0.0214065	total: 4.63s	remaining: 695ms
1739:	learn: 0.0213925	total: 4.63s	remaining: 692ms
1740:	learn: 0.0213686	total: 4.64s	remaining: 690ms
1741:	learn: 0.0213529	total: 4.64s	remaining: 687ms
1742:	learn: 0.0213278	total: 4.64s	remaining: 684ms
1743:	learn: 0.0213260	total: 4.64s	remaining: 682ms
1744:	learn: 0.0213022	total: 4.65s	remaining: 679ms
1745:	learn: 0.0212699	total: 4.65s	remaining: 676ms
1746:	learn: 0.0212548	total: 4.65s	remaining: 674ms
1747:	learn: 0.0212481	total: 4.65s	remaining: 671ms
1748:	learn: 0.0212193	total: 4.66s	remaining: 668ms
1749:	learn: 0.0211941	total: 4.66s	remaining:

1939:	learn: 0.0182255	total: 5.17s	remaining: 160ms
1940:	learn: 0.0182022	total: 5.18s	remaining: 157ms
1941:	learn: 0.0181860	total: 5.18s	remaining: 155ms
1942:	learn: 0.0181725	total: 5.19s	remaining: 152ms
1943:	learn: 0.0181578	total: 5.19s	remaining: 150ms
1944:	learn: 0.0181456	total: 5.19s	remaining: 147ms
1945:	learn: 0.0181343	total: 5.2s	remaining: 144ms
1946:	learn: 0.0181272	total: 5.2s	remaining: 142ms
1947:	learn: 0.0181239	total: 5.2s	remaining: 139ms
1948:	learn: 0.0181035	total: 5.21s	remaining: 136ms
1949:	learn: 0.0180902	total: 5.21s	remaining: 134ms
1950:	learn: 0.0180740	total: 5.21s	remaining: 131ms
1951:	learn: 0.0180686	total: 5.21s	remaining: 128ms
1952:	learn: 0.0180478	total: 5.21s	remaining: 126ms
1953:	learn: 0.0180225	total: 5.22s	remaining: 123ms
1954:	learn: 0.0180160	total: 5.22s	remaining: 120ms
1955:	learn: 0.0179842	total: 5.22s	remaining: 118ms
1956:	learn: 0.0179782	total: 5.22s	remaining: 115ms
1957:	learn: 0.0179684	total: 5.23s	remaining: 11

<catboost.core.CatBoostRegressor at 0x7febb74c8fa0>

In [286]:
testresult = np.expm1(cbr2.predict(scaledtest))
t = pd.concat([test, pd.DataFrame(testresult)], axis=1)
t.rename(columns={0:'SalePrice'}, inplace=True)
t[['Id', 'SalePrice']].to_csv('pseudolabel2.csv', sep=',', index=False)