In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, cv, sum_models
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import random

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load both CSVs and stack them into a single frame so preprocessing is applied to both at once.
train = pd.read_csv('/kaggle/input/realestatepriceprediction/train.csv')
test = pd.read_csv('/kaggle/input/realestatepriceprediction/test.csv')

# Test rows get an empty-string placeholder in the target column.
test['Price'] = ''
combine = pd.concat([train, test], ignore_index=True)

# Rotate the last column to the front (presumably the target `Price` — verify against the CSV layout).
cols = list(combine.columns)
cols.insert(0, cols.pop())
combine = combine[cols]

combine

In [None]:
# Print column dtypes and non-null counts of the combined frame.
combine.info()

In [None]:
# Histogram and KDE of `Square`, drawn once for the first 10000 rows
# (labelled "train") and once for the remaining rows (labelled "test").
square_parts = (
    ('train', combine['Square'][:10000]),
    ('test', combine['Square'][10000:]),
)
for part_name, square_part in square_parts:
    plt.figure(figsize=(16, 4))

    # Left panel: normalised histogram.
    plt.subplot(121)
    square_part.hist(density=True)
    plt.ylabel('count')
    plt.xlabel('Square / ' + part_name)

    # Right panel: kernel density estimate.
    plt.subplot(122)
    sns.kdeplot(square_part, shade=True, legend=False)
    plt.xlabel('Square / ' + part_name)

    plt.suptitle('Distribution of Square / ' + part_name)
    plt.show()

In [None]:
# Scatter plot of Price against Square for the first 10000 rows with Rooms < 200.
var = 'Square'
head_rows = combine[:10000]
head_rows = head_rows.loc[combine['Rooms'] < 200]
data = head_rows[['Price', var]]
data.plot.scatter(x=var, y='Price', ylim=(0,800000));

In [None]:
# Replace missing values with the column mean (computed over all rows of `combine`).
# The result is assigned back to the column instead of calling
# `combine[col].fillna(..., inplace=True)`: that form mutates a temporary
# selection, is deprecated in pandas, and leaves `combine` unchanged under copy-on-write.
for column in ('LifeSquare', 'Healthcare_1'):
    combine[column] = combine[column].fillna(combine[column].mean())

# Total number of missing cells left in the frame.
combine.isnull().sum().sum()

In [None]:
# Number of distinct values per column.
combine.nunique()

In [None]:
# Replace implausible room counts.
# `.loc` with a boolean mask is used because `.at` is a scalar accessor and
# does not accept an Index of several row labels.

# Rooms == 0 -> 1
combine.loc[combine['Rooms'] == 0, 'Rooms'] = 1

# Rooms > 6 -> 2
combine.loc[combine['Rooms'] > 6, 'Rooms'] = 2

In [None]:
# Histogram and KDE of `KitchenSquare` (values below 25 only), drawn once for
# the first 10000 rows ("train") and once for the remaining rows ("test").
#
# The positional split is taken BEFORE the `< 25` filter. Filtering first and
# slicing afterwards shifts the 10000-row boundary, so rows from one part
# would leak into the other part's plot.
kitchen_parts = (
    ('train', combine['KitchenSquare'].iloc[:10000]),
    ('test', combine['KitchenSquare'].iloc[10000:]),
)
for part_name, kitchen_part in kitchen_parts:
    kitchen_small = kitchen_part[kitchen_part < 25]

    plt.figure(figsize = (16, 4))

    # Left panel: normalised histogram.
    plt.subplot(121)
    kitchen_small.hist(density=True)
    plt.ylabel('count')
    plt.xlabel('KitchenSquare / ' + part_name)

    # Right panel: kernel density estimate.
    plt.subplot(122)
    sns.kdeplot(kitchen_small, shade=True, legend=False)
    plt.xlabel('KitchenSquare / ' + part_name)

    plt.suptitle('Distribution of KitchenSquare / ' + part_name)
    plt.show()

In [None]:
# Extra feature: area per room.
rooms = combine['Rooms']
combine['squ_room'] = combine['Square'].div(rooms).astype('float')

In [None]:
target = 'Price'

# Numeric column names, skipping the first numeric column
# (presumably the row identifier — verify against the column order).
numfeat = combine.select_dtypes(include=np.number).columns.tolist()[1:]

# Non-numeric column names, without the target.
catfeat = combine.select_dtypes(exclude=np.number).columns.tolist()
catfeat.remove(target)

print(numfeat)
print(catfeat)

In [None]:
# Remove outliers.
#
# Test rows are identified by the '' placeholder in `Price` rather than by the
# positional boundary 10000. Once labelled rows have been dropped, `combine[10000:]`
# no longer starts at the first test row and would skip some of them.
is_test = combine['Price'] == ''
labelled_rows = combine.loc[~is_test]

# Labelled rows matching any of these rules are dropped.
outlier_rules = (
    'Square > 200',
    'LifeSquare > 200 & Rooms < 4',
    'KitchenSquare > 200',
    'HouseYear > 2020',
    '(Square < 13) | (Square < 20 & Rooms > 1)',
)
outlier_index = labelled_rows.index[:0]
for rule in outlier_rules:
    outlier_index = outlier_index.union(labelled_rows.query(rule).index)

combine = combine.drop(index=outlier_index)

# Test rows cannot be dropped, so their oversized kitchens are capped instead.
# `.loc` with a boolean mask replaces `.at`, which only addresses a single cell.
is_test = combine['Price'] == ''
combine.loc[is_test & (combine['KitchenSquare'] > 200), 'KitchenSquare'] = 24

In [None]:
# Extra feature: area per room (recomputed on the current rows).
rooms = combine['Rooms']
combine['squ_room'] = combine['Square'].div(rooms).astype('float')
combine['squ_room']

In [None]:
# Sizes of the test / validation / train parts of `combine`.
len_test, len_validate = 5000, 3000
len_combine = len(combine)
len_train = len_combine - (len_test + len_validate)

for part_size in (len_combine, len_test, len_validate, len_train):
    print(part_size)

In [None]:
# Keep a numeric copy (`<name>_num`) of every column listed in `numfeat`.
for source_col in numfeat:
    combine[f'{source_col}_num'] = combine[source_col]

In [None]:
# Ecology_1 is scaled by 1e6 before the integer cast below.
combine['Ecology_1'] = combine['Ecology_1'] * 1000000

# Truncate these columns to integers.
int_columns = ('Rooms', 'KitchenSquare', 'HouseFloor', 'Square', 'Ecology_1', 'LifeSquare')
for column in int_columns:
    combine[column] = combine[column].astype('int')

# Every column listed in `numfeat` becomes a string column.
combine[numfeat] = combine[numfeat].astype('str')

In [None]:
# Rebuild the feature lists from the current dtypes.
# The first numeric column is skipped (presumably the row identifier — verify).
numfeat = combine.select_dtypes(include=np.number).columns.tolist()[1:]
catfeat = combine.select_dtypes(exclude=np.number).columns.tolist()
catfeat.remove(target)

print(numfeat)
print(catfeat)

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: recent pandas versions raise
# when `DataFrame.corr()` meets object columns instead of skipping them.
corr = combine.select_dtypes(include=np.number).corr()

fig, ax = plt.subplots(figsize=(10, 10))

# seaborn takes the tick labels from the frame and centres them on the cells.
# Setting ticks manually at integer positions would put the labels on cell edges.
# `xticklabels=True` / `yticklabels=True` force a label for every column.
sns.heatmap(corr, cmap='RdBu', annot=True, fmt=".2f",
            xticklabels=True, yticklabels=True, ax=ax)
plt.show()

### Удаляем из трейна неиспользуемые в тесте Square, HouseYear и DistrictId

In [None]:
# Compare the `Square` values present in the labelled (train) rows with those
# present in the test rows.
#
# Test rows are the ones carrying the '' placeholder in `Price`. Hard-coded
# positional boundaries (9972 / -5000) stop matching the real split as soon as
# rows are dropped. Membership is checked against a set instead of re-scanning
# the frame for every value.
is_test = combine['Price'] == ''
train_Square = combine.loc[~is_test, 'Square']
test_Square = combine.loc[is_test, 'Square']

# Values that occur in train but never in test (sorted).
test_Square_seen = set(test_Square)
notest_Square = [x for x in train_Square.sort_values().unique() if x not in test_Square_seen]
print('Нет в тесте')
print(notest_Square)

# Values that occur in test but never in train (sorted).
train_Square_seen = set(train_Square)
notrain_Square = [x for x in test_Square.sort_values().unique() if x not in train_Square_seen]
print('Нет в трейне')
print(notrain_Square)

In [None]:
# Rows whose numeric Square is below 14 get the most frequent value of each
# area column, with the mode taken over the first 10000 rows.
x14 = combine.index[combine.Square_num < 14]
head_rows = combine[:10000]
for column in ('Square', 'LifeSquare', 'KitchenSquare'):
    combine.loc[x14, column] = head_rows[column].mode()[0]

In [None]:
# Drop every row whose `Square_num` value is listed in `notest_Square`, then show the remaining row count.
# NOTE(review): `notest_Square` is collected from the `Square` column, while this filter
# tests `Square_num`. If the two columns hold different dtypes/values, isin() may match
# nothing — confirm which column is intended (the HouseYear and DistrictId drops filter
# on the same column their list was built from).
combine = combine.drop(combine.loc[combine.Square_num.isin(notest_Square)].index)
len(combine)

In [None]:
# Compare the `HouseYear` values present in the labelled (train) rows with
# those present in the test rows.
#
# Test rows are the ones carrying the '' placeholder in `Price`. Hard-coded
# positional boundaries (9972 / -5000) stop matching the real split as soon as
# rows are dropped. Membership is checked against a set instead of re-scanning
# the frame for every value.
is_test = combine['Price'] == ''
train_HouseYear = combine.loc[~is_test, 'HouseYear']
test_HouseYear = combine.loc[is_test, 'HouseYear']

# Values that occur in train but never in test (sorted).
test_HouseYear_seen = set(test_HouseYear)
notest_HouseYear = [x for x in train_HouseYear.sort_values().unique() if x not in test_HouseYear_seen]
print('Нет в тесте')
print(notest_HouseYear)

# Values that occur in test but never in train (sorted).
train_HouseYear_seen = set(train_HouseYear)
notrain_HouseYear = [x for x in test_HouseYear.sort_values().unique() if x not in train_HouseYear_seen]
print('Нет в трейне')
print(notrain_HouseYear)

In [None]:
# Drop rows whose HouseYear is listed in `notest_HouseYear`, then show the remaining row count.
unseen_year_rows = combine.index[combine.HouseYear.isin(notest_HouseYear)]
combine = combine.drop(index=unseen_year_rows)
len(combine)

In [None]:
# Compare the `DistrictId` values present in the labelled (train) rows with
# those present in the test rows.
#
# Test rows are the ones carrying the '' placeholder in `Price`. Hard-coded
# positional boundaries (9972 / -5000) stop matching the real split as soon as
# rows are dropped. Membership is checked against a set instead of re-scanning
# the frame for every value.
is_test = combine['Price'] == ''
train_DistrictId = combine.loc[~is_test, 'DistrictId']
test_DistrictId = combine.loc[is_test, 'DistrictId']

# Values that occur in train but never in test (sorted).
test_DistrictId_seen = set(test_DistrictId)
notest_DistrictId = [x for x in train_DistrictId.sort_values().unique() if x not in test_DistrictId_seen]
print('Нет в тесте')
print(notest_DistrictId)

# Values that occur in test but never in train (sorted).
train_DistrictId_seen = set(train_DistrictId)
notrain_DistrictId = [x for x in test_DistrictId.sort_values().unique() if x not in train_DistrictId_seen]
print('Нет в трейне')
print(notrain_DistrictId)

In [None]:
# Drop rows whose DistrictId is listed in `notest_DistrictId`, then show the remaining row count.
# The analogous drop for `notrain_DistrictId` is left commented out.
unseen_district_rows = combine.index[combine.DistrictId.isin(notest_DistrictId)]
combine = combine.drop(index=unseen_district_rows)
# combine = combine.drop(combine.loc[combine.DistrictId.isin(notrain_DistrictId)].index)
len(combine)

In [None]:
# Recompute the split sizes on the current rows.
# Test rows are counted via the '' placeholder in Price.
len_validate = 3000
len_combine = len(combine)
len_test = int((combine.Price == '').sum())
len_train = len_combine - (len_test + len_validate)

for part_size in (len_combine, len_test, len_validate, len_train):
    print(part_size)

____


In [None]:
def display_clusters_distribution(unique_labels, labels_counts):
    """Draw a bar chart of the number of samples per cluster.

    Args:
        unique_labels: sequence of cluster ids; used as bar positions and x ticks.
        labels_counts: sequence of sample counts, one per entry of ``unique_labels``.

    The chart is built from the arguments only. The function does not read
    module-level variables, so it works regardless of what the caller named
    its arrays.
    """
    plt.figure(figsize=(8,5))

    plt.bar(unique_labels, labels_counts)

    plt.xlabel('Clusters')
    plt.xticks(unique_labels)
    plt.ylabel('Count')
    plt.title('Clusters distribution')
    plt.show()

In [None]:
# Standardise the columns listed in `numfeat` (the scaler is fitted on all rows of `combine`).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(combine[numfeat])
combine[numfeat] = scaled_values

combine.head()

In [None]:
# Positional split of the scaled features: train+validation rows first, test rows after.
split_at = len_train + len_validate
scaled_features = combine[numfeat]
X_train_scaled = scaled_features[:split_at]
X_test_scaled = scaled_features[split_at:]

In [None]:
# 3-cluster KMeans: fit on the train+validation features, then label the test features.
kmeans_3 = KMeans(n_clusters=3, random_state=42).fit(X_train_scaled)
labels_clast_3 = pd.Series(kmeans_3.labels_, name='clusters_3')
labels_clast_3_test = pd.Series(kmeans_3.predict(X_test_scaled), name='clusters_3')

# Cluster sizes on the fitted part.
unique, counts = np.unique(labels_clast_3, return_counts=True)
display_clusters_distribution(unique, counts)

In [None]:
# 5-cluster KMeans: fit on the train+validation features, then label the test features.
kmeans_5 = KMeans(n_clusters=5, random_state=42).fit(X_train_scaled)
labels_clast_5 = pd.Series(kmeans_5.labels_, name='clusters_5')
labels_clast_5_test = pd.Series(kmeans_5.predict(X_test_scaled), name='clusters_5')

# Cluster sizes on the fitted part.
unique, counts = np.unique(labels_clast_5, return_counts=True)
display_clusters_distribution(unique, counts)

In [None]:
# 10-cluster KMeans: fit on the train+validation features, then label the test features.
kmeans_10 = KMeans(n_clusters=10, random_state=42).fit(X_train_scaled)
labels_clast_10 = pd.Series(kmeans_10.labels_, name='clusters_10')
labels_clast_10_test = pd.Series(kmeans_10.predict(X_test_scaled), name='clusters_10')

# Cluster sizes on the fitted part.
unique, counts = np.unique(labels_clast_10, return_counts=True)
display_clusters_distribution(unique, counts)

In [None]:
# One-hot encode the cluster labels.
#
# Labels of the fitted part and of the test part are concatenated BEFORE
# encoding, so both parts always share the same indicator columns. Encoding
# them separately yields mismatched columns (and NaN after concat) whenever
# some cluster has no member in one of the parts.
clusters_3 = pd.get_dummies(
    pd.concat([labels_clast_3, labels_clast_3_test], ignore_index=True),
    drop_first=False, prefix='clusters_3')
clusters_5 = pd.get_dummies(
    pd.concat([labels_clast_5, labels_clast_5_test], ignore_index=True),
    drop_first=False, prefix='clusters_5')
clusters_10 = pd.get_dummies(
    pd.concat([labels_clast_10, labels_clast_10_test], ignore_index=True),
    drop_first=False, prefix='clusters_10')

# Only the 5- and 10-cluster indicators are added as features.
# NOTE(review): `clusters_3` is built but not included — confirm this is intended.
clusters_all = pd.concat([clusters_5, clusters_10], axis=1)

# Assigned by position (`.values`), because `combine` no longer has a 0..n-1 index.
combine[clusters_all.columns] = clusters_all.values
combine.head()

In [None]:
# Drop the low-importance feature columns.
combine = combine.drop(columns=['Ecology_2', 'Shops_2'])

----

### Обучаем модель

In [None]:
# Shuffle the labelled rows (train + validation) and rebuild the frame with
# the test rows appended unchanged.
#
# The frame is rebuilt with `pd.concat` instead of assigning the shuffled rows
# back through `combine[:n] = ...`. Depending on the pandas version, that
# assignment realigns the right-hand side on the index, which silently
# restores the original order and makes the shuffle a no-op.
n_labelled = len_train + len_validate
shuffled_rows = combine.iloc[:n_labelled].sample(frac=1, random_state=5)
combine = pd.concat([shuffled_rows, combine.iloc[n_labelled:]])

# Positional split: train, validation, train+validation (for CV), test.
train_data = combine.iloc[:len_train]
validate_data = combine.iloc[len_train:n_labelled]
cv_data = combine.iloc[:n_labelled]
test_data = combine.iloc[n_labelled:]

In [None]:
# Feature matrices take every column from position 2 onwards (the first two
# columns are skipped — presumably the target and the row id; verify).
# Labels come from the `Price` column.
features, labels = train_data.iloc[:, 2:].values, train_data['Price'].values
features2, labels2 = validate_data.iloc[:, 2:].values, validate_data['Price'].values
features_cv, labels_cv = cv_data.iloc[:, 2:].values, cv_data['Price'].values
features_test = test_data.iloc[:, 2:].values

print(features.shape, labels.shape, features2.shape, labels2.shape, features_test.shape, features_cv.shape)

In [None]:
# Conventional X / y names for the arrays built above.
X_train, y_train = features, labels
X_validation, y_validation = features2, labels2
X_cv, y_cv = features_cv, labels_cv

print(X_train.shape, X_validation.shape, y_train.shape, y_validation.shape, X_cv.shape)

# from sklearn.model_selection import train_test_split
# X_train, X_validation, y_train, y_validation = train_test_split(features_cv, labels_cv, train_size=0.5, random_state=100, shuffle=True)
# print(X_train.shape, X_validation.shape, y_train.shape, y_validation.shape, X_cv.shape)

In [None]:
# Positions (within the feature matrix) of the columns CatBoost treats as categorical.
# NOTE(review): the first 17 feature columns are selected by position — verify that
# this still matches the column layout after any preprocessing change.
cat_features = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]


# Training pool.
train_dataset = Pool(data=X_train,
                     label=y_train,
                     cat_features=cat_features)

# Validation pool, used as the eval set during fitting.
eval_dataset = Pool(data=X_validation,
                    label=y_validation,
                    cat_features=cat_features)

# pool_cv = Pool(data=X_cv, 
#                label=y_cv, 
#                cat_features=cat_features)

# # Initialize CatBoostClassifier
# # for classifier use Logloss, CrossEntropy, MultiClass, MultiClassOneVsAll or custom objective object
# # for regressor use RMSE, MultiRMSE, MAE, Quantile, LogLinQuantile, Poisson, MAPE, Lq or custom objective object
model = CatBoostRegressor(iterations=20000,
                           #feature_weights = {"35":0.05},
                           #thread_count=5,
                           learning_rate = 0.03,
                           depth=6, # only for grow_policy="Depthwise"
                           l2_leaf_reg=3,
                           grow_policy="Depthwise", # non-symmetric tree ("Depthwise" as in xgboost; alternative: "Lossguide")
                           #loss_function = 'Logloss', #MultiClass
                           loss_function = 'RMSE', #RMSE
                           eval_metric = 'R2', #R2
                           early_stopping_rounds = 1000,
                           task_type="GPU", # comment out when not running on GPU
                           devices='0', # comment out when not running on GPU
                           #one_hot_max_size=10,
                           verbose = False,
                           use_best_model=True
                         )

# grid = {'learning_rate': [0.01, 0.03, 0.05],
#         'depth': [5, 6],
#         'l2_leaf_reg': [1, 3]}

# model.grid_search(grid, X=pool_cv, verbose = False, plot=True, cv = 5)

# Fit model
model.fit(train_dataset, eval_set=eval_dataset, logging_level='Silent', plot=True)

In [None]:
# Show the fitted model's `best_iteration_` attribute.
model.best_iteration_

In [None]:
# Feature importances of the fitted model, as a sorted table and a bar chart.
# Feature names are the columns from position 2 onwards, matching the feature matrices.
importances = (
    pd.DataFrame({
        'Features': combine.columns[2:],
        'Gini-Importance': model.feature_importances_,
    })
    .sort_values(by='Gini-Importance', ascending=False)
    .reset_index(drop=True)
)

# A single theme call: an earlier `sns.set(font_scale=5)` would be overridden by this one.
sns.set(style="whitegrid", color_codes=True, font_scale = 1.7)
fig, ax = plt.subplots()
fig.set_size_inches(30,15)
sns.barplot(x='Gini-Importance', y='Features', data=importances, color='skyblue', ax=ax)
ax.set_xlabel('Importance', fontsize=25, weight = 'bold')
ax.set_ylabel('Features', fontsize=25, weight = 'bold')
ax.set_title('Feature Importance', fontsize=25, weight = 'bold')
# `plt.show()` returns None, so it is not wrapped in display().
plt.show()

importances

In [None]:
from sklearn.metrics import r2_score

# Predictions on the training and validation features.
Y_hat_train = list(model.predict(X_train))
Y_hat = list(model.predict(X_validation))

# R2 on train, then on validation.
for y_true, y_pred in ((y_train, Y_hat_train), (y_validation, Y_hat)):
    print(r2_score(y_true, y_pred))

### Кросс-валидация

In [None]:
# 5-fold cross-validation on the train+validation rows, reusing the fitted model's parameters.
params = model.get_params()
# del params['use_best_model']
pool_cv = Pool(data=X_cv, 
               label=y_cv, 
               cat_features=cat_features)

# `plot` is passed as a real boolean; the string "True" only worked because
# any non-empty string is truthy.
cv_data = cv(
    params=params,
    pool=pool_cv,
    fold_count=5,
    inverted=False,
    shuffle=True,
    stratified=False,
    partition_random_seed=0,
    plot=True
)

In [None]:
# Report the best mean validation R2 across CV iterations, with its fold-to-fold
# standard deviation. The value after "±" is read from 'test-R2-std';
# printing 'test-R2-mean' there would just repeat the mean.
best_value = np.max(cv_data['test-R2-mean'])
best_iter = np.argmax(cv_data['test-R2-mean'])
print('Best validation R2 score: {:.4f}±{:.4f} on step {}'.format(
    best_value,
    cv_data['test-R2-std'][best_iter],
    best_iter
))

### Predict

In [None]:
# Predict prices for the test rows and write the submission file.
preds_class = model.predict(features_test)
pred = list(preds_class)

submission = pd.DataFrame({"Id": test_data["Id"], "Price": pred})
submission.to_csv('prediction.csv', index=False)
submission.values