In [8]:
!pip install catboost



In [9]:
!pip install pytorch-tabnet



In [10]:
# General libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler #check if we really need it
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold

# Tensorflow modules
from tensorflow.keras.models import Model, Sequential
import tensorflow.keras.layers as L
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import *

# Tabnet import
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetRegressor
import torch


from catboost import CatBoostRegressor

In [11]:
# Common variables
RANDOM_SEED = 42
VAL_SIZE = 0.15 # the size of a validation sample
np.random.seed(RANDOM_SEED)

In [12]:
# Common functions

# A target metric function
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not multiplied by 100).

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values. Entries equal to zero produce inf/nan
        terms, because every error is divided by the true value.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        mean(|y_pred - y_true| / |y_true|).
    """
    # Convert to plain arrays so that two pandas objects are compared by
    # position; subtracting Series with different indexes would align on
    # labels and silently produce NaN.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

# A function for distribution visualisation
def visualize_distributions(titles_values_dict):
    """Plot a 20-bin histogram for each entry of ``titles_values_dict``.

    Parameters
    ----------
    titles_values_dict : dict
        Maps a subplot title to the array-like of values to histogram.

    The subplots are laid out on a grid of at most three columns and the
    figure is shown with ``plt.show()``. An empty mapping draws nothing.
    """
    # Guard: with no entries the column count would be 0 and the row
    # computation below would divide by zero.
    if not titles_values_dict:
        return
    columns = min(3, len(titles_values_dict))
    rows = (len(titles_values_dict) - 1) // columns + 1
    fig = plt.figure(figsize=(columns * 5, rows * 3))
    for i, (title, values) in enumerate(titles_values_dict.items()):
        hist, bins = np.histogram(values, bins=20)
        ax = fig.add_subplot(rows, columns, i + 1)
        # Bars sit at the bins' left-edge coordinates, drawn at 70% of the bin width.
        ax.bar(bins[:-1], hist, width=(bins[1] - bins[0]) * 0.7)
        ax.set_title(title)
    plt.show()

In [13]:
df = pd.read_csv('land_lots_eda_available.csv')

In [14]:
df.status.value_counts()

3    359
Name: status, dtype: int64

In [15]:
df.koatuuLocation.values

array(['Іванків, Іванківська, Вишгородський, Київська, Україна',
       'Іванківська, Вишгородський, Київська, Україна',
       'Подо-Калинівка, Ювілейна, Херсонський, Херсонська, Україна',
       'Щасливе, Ювілейна, Херсонський, Херсонська, Україна',
       'Зеленопідська, Каховський, Херсонська, Україна',
       'Урожайне, Бериславська, Бериславський, Херсонська, Україна',
       'Урожайне, Бериславська, Бериславський, Херсонська, Україна',
       'Раківка, Бериславська, Бериславський, Херсонська, Україна',
       'Тягинська, Бериславський, Херсонська, Україна',
       'Демидівка, Демидівська, Дубенський, Рівненська, Україна',
       'Рудка, Демидівська, Дубенський, Рівненська, Україна',
       'Мар’їнська, Покровський, Донецька, Україна',
       'Придніпровське, Червоногригорівська, Нікопольський, Дніпропетровська, Україна',
       'Верхня Ланна, Ланнівська, Полтавський, Полтавська, Україна',
       'Тягинська, Бериславський, Херсонська, Україна',
       'Тягинська, Бериславський, Х

In [16]:
df.head()

Unnamed: 0,id,status,price,pricePerOne,estimatePrice,rentRate,rentalYield,purpose,koatuuLocation,ownerEdrpou,...,renterEdrpou,isAvailable,region_id,estimateMonth,estimateDay,estimateYear,daysDelta,daysRentPayDelta,daysRentPayDeltaSign,area_win
0,740,3,120000,61137,42477,3812,2.6,1,"Іванків, Іванківська, Вишгородський, Київська,...",3578305088,...,41102844,True,3222055100,12,28,2021,68,-66,0,1.9628
1,739,3,60000,53893,5917,967,1.3,1,"Іванківська, Вишгородський, Київська, Україна",3578305088,...,41102844,True,3222086800,12,28,2021,68,-66,0,1.1133
2,737,3,680000,67156,204989,21120,2.5,1,"Подо-Калинівка, Ювілейна, Херсонський, Херсонс...",2284411333,...,41101589,True,6525082500,9,13,2021,174,-66,0,10.1256
3,736,3,385000,67444,142833,22995,4.8,1,"Щасливе, Ювілейна, Херсонський, Херсонська, Ук...",2520006184,...,41101589,True,6525085600,9,13,2021,174,-66,0,5.7084
4,735,3,400000,56441,187952,22424,4.5,1,"Зеленопідська, Каховський, Херсонська, Україна",2762921072,...,41101589,True,6523584000,12,9,2021,87,299,1,7.087


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359 entries, 0 to 358
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    359 non-null    int64  
 1   status                359 non-null    int64  
 2   price                 359 non-null    int64  
 3   pricePerOne           359 non-null    int64  
 4   estimatePrice         359 non-null    int64  
 5   rentRate              359 non-null    int64  
 6   rentalYield           359 non-null    float64
 7   purpose               359 non-null    int64  
 8   koatuuLocation        359 non-null    object 
 9   ownerEdrpou           359 non-null    int64  
 10  renterCompany         359 non-null    float64
 11  renterEdrpou          359 non-null    int64  
 12  isAvailable           359 non-null    bool   
 13  region_id             359 non-null    int64  
 14  estimateMonth         359 non-null    int64  
 15  estimateDay           3

In [None]:
df.renterEdrpou.value_counts()

In [None]:
df.renterEdrpou.value_counts()

In [None]:
# Getting rid of the id feature.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError for the missing column.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Also, because the total price of the lot heavily depends on the area of the lot maybe it makes sense to 
# set the target variable to pricePerOne, not actually price

#### Let's now try to split features into numerical and categorical

In [None]:
df['area_win'].value_counts()

In [None]:
#Numerical columns
num_col = ['pricePerOne', 'estimatePrice', 'rentalYield', 'daysDelta', 'daysRentPayDelta', 'area_win', 'price']

In [None]:
#Categorical columns
# NOTE(review): 'cadastre' appears to be absent from the 21 columns reported by
# df.info(), and selecting a missing column raises KeyError. Only the
# candidates actually present in df are kept — confirm 'cadastre' is obsolete.
cat_col = [
    col for col in [
        'status', 'cadastre', 'rentRate', 'purpose', 'koatuuLocation', 'ownerEdrpou',
        'renterCompany', 'isAvailable', 'estimateMonth', 'estimateDay', 'estimateYear',
        'daysRentPayDeltaSign',
    ]
    if col in df.columns
]

In [None]:
# Setting target variable
y = df['price']

## Data Processing

#### Let's first figure out the correlation between numeric variables including target ones

In [None]:
df[num_col].corr()

#### Let's visualize correlations better with Seaborn

In [None]:
# Correlation matrix of the numerical columns, drawn as an annotated heatmap
corr_matrix = df[num_col].corr()
plt.figure(figsize=(16, 6))
heatmap = sns.heatmap(corr_matrix, annot=True, cmap='BrBG', vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', pad=12, fontdict={'fontsize': 18});
# Write the figure to a .png file:
#   dpi=300             -> output resolution in dots per inch
#   bbox_inches='tight' -> keeps the labels from being cropped
plt.savefig('heatmap.png', bbox_inches='tight', dpi=300)

As we can see, there is a very high correlation between **price** and **area_win**. So I suggest taking **pricePerOne** (price per one hectare) as the target variable and removing the **price** feature.

In [None]:
y = df['pricePerOne']

In [None]:
# Removing price and pricePerOne from numerical features list
spare_elem = {'pricePerOne', 'price'} 
num_col = [elem for elem in num_col if elem not in spare_elem]

In [None]:
num_col

## Model 0: Default model
Let's try to build a default naive model based only on the lot location. Later we will compare our results with this default model.

In [None]:
# Splitting data (hold-out fraction taken from the shared VAL_SIZE constant
# rather than a repeated literal, so every split in the notebook stays in sync)
data_train, data_test = train_test_split(df, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Default model: for every test lot, predict the median pricePerOne of the
# training lots that share its koatuuLocation.
# Because the total price of the lot depends heavily on the area of the lot,
# the price per 1 unit is used as the target.
# A groupby + map lookup replaces the per-row f-string query, which would break
# on a location containing an apostrophe and relied on positional row[0] access.
location_medians = data_train.groupby('koatuuLocation')['pricePerOne'].median()
predicts = pd.DataFrame(data_test['koatuuLocation'].map(location_medians).to_numpy())

# Locations never seen in training have no median; fall back to the median prediction
predicts = predicts.fillna(predicts.median())

# Rounding
predicts = (predicts // 1000) * 1000

# Evaluating precision
print(f"The precision of the default model based on MAPE metric is: {(mape(data_test['pricePerOne'], predicts.values[:, 0]))*100:0.2f}%")

## Data Analysis

Let's analyze the numerical data first

In [None]:
df[num_col].head()

As we can see, the scales of our numerical values are quite different. Let's apply Min-Max scaling (`MinMaxScaler`) to them.

In [None]:
scaled_features = MinMaxScaler().fit_transform(df[num_col].values)

In [None]:
df_num = pd.DataFrame(scaled_features, index=df[num_col].index, columns=df[num_col].columns)

In [None]:
df_num.head()

Let's see what the distributions of the numerical features look like after scaling.

In [None]:
visualize_distributions({
    'estimatePrice': df_num['estimatePrice'],
    'rentalYield': df_num['rentalYield'],
    'daysDelta': df_num['daysDelta'],
    'daysRentPayDelta': df_num['daysRentPayDelta'],
    'area_win': df_num['area_win']        
})

As we can see, only the **rentalYield** feature has a distribution that is close to normal. All the other features have different distributions, the strangest being **estimatePrice**. Let's see what their distributions look like after applying logarithmization.

In [None]:
# Take an explicit copy: the columns of df_num2 are overwritten afterwards, and
# assigning into a plain slice of df_num triggers SettingWithCopyWarning.
df_num2 = df_num[['estimatePrice', 'daysDelta', 'daysRentPayDelta', 'area_win']].copy()

In [None]:
# Log-transform each selected numerical feature; exact zeros are passed through
# unchanged so that np.log is never evaluated at 0.
# NOTE(review): df_num2 is taken from the Min-Max scaled frame, so values
# presumably lie in [0, 1]: a scaled 0 stays 0, every other value maps to
# log(x) <= 0, and the maximum (1) also maps to 0 — the transform is not
# monotonic. Consider np.log1p instead; confirm the intent.
for log_feature in ('estimatePrice', 'daysDelta', 'daysRentPayDelta', 'area_win'):
    df_num2[log_feature] = df_num2[log_feature].apply(
        lambda scaled: np.log(scaled) if scaled != 0 else scaled
    )

In [None]:
df_num2.head()

In [None]:
visualize_distributions({
    'estimatePrice': df_num2['estimatePrice'],    
    'daysDelta': df_num2['daysDelta'],
    'daysRentPayDelta': df_num2['daysRentPayDelta'],
    'area_win': df_num2['area_win']        
})

As we can see, all the features except **area_win** now look closer to a normal distribution, so it probably makes sense to apply logarithmization to these three features, but we will check this hypothesis later when doing experiments. For now we will keep the numerical features as is.

### Let's now take a look at categorical features

Basically, what we need to do is encode the categorical features, either with one-hot encoding or with label encoding.

In [None]:
# Take an explicit copy: the columns of df_cat are re-encoded afterwards, and
# assigning into a plain slice of df triggers SettingWithCopyWarning.
df_cat = df[cat_col].copy()

In [None]:
# Label Encoding
# Each categorical column is replaced, one column at a time, by the integer
# codes of its pandas 'category' representation.
for column in cat_col:
    df_cat[column] = df_cat[column].astype('category').cat.codes

In [None]:
# One-Hot Encoding
# dtype='uint8' pins numeric 0/1 indicator columns. pandas >= 2.0 defaults to
# bool, and a frame mixing bool and float columns converts to an object array,
# which the Keras model trained on these features cannot turn into a tensor.
df_cat = pd.get_dummies(df_cat, columns=cat_col, dummy_na=False, dtype='uint8')

In [None]:
df_cat.head()

In [None]:
df_cat.shape

### Let's now concatenate both categorical and numerical feature after preprocessing into one DataFrame

In [None]:
df_result = pd.concat([df_num, df_cat], axis=1)

In [None]:
df_result.shape

In [None]:
df_result.head()

In [None]:
#Checking for the NaN values just in case
df_result.isnull().sum().sum()

Well, I guess we're ready to experiment with the models

## Models

In [None]:
# Splitting data
X_train, X_valid, y_train, y_valid = train_test_split(df_result, y, test_size=VAL_SIZE, shuffle = True, random_state=RANDOM_SEED)

## Model 1: Simple Linear Regression

In [None]:
linreg_model = LinearRegression().fit(X_train, y_train)

In [None]:
y_pred = linreg_model.predict(X_valid)
print(f"The precision of the Linear Regression model on MAPE metric is: {(mape(y_valid, y_pred))*100:0.2f}%")

Obviously, the default linear regression model doesn't fit this dataset. Let's try to build a linear regression model based only on the numeric features.

### Model 1.2: Simple Linear Regression Numeric Features Only

In [None]:
df_res_num = df_result[['estimatePrice', 'daysDelta', 'daysRentPayDelta', 'area_win']]

In [None]:
# Splitting data
X_train_num, X_valid_num, y_train_num, y_valid_num = train_test_split(df_res_num, y, test_size=VAL_SIZE, shuffle = True, random_state=RANDOM_SEED)

In [None]:
linreg_num_model = LinearRegression().fit(X_train_num, y_train_num)

In [None]:
y_pred_num = linreg_num_model.predict(X_valid_num)
print(f"The precision of the Linear Regression model on MAPE metric is: {(mape(y_valid_num, y_pred_num))*100:0.2f}%")

Well, the result which is based only on numeric features is definitely better than the one with all the features, but it is still worse than the results of the default model. It looks like Linear Regression is definitely not the best model to use in this case.

## Model 2: CatboostRegressor

In [None]:
# CatBoost regressor: up to 5000 boosting iterations, seeded with RANDOM_SEED.
# MAPE is the evaluation metric; RMSE and MAE are tracked as additional metrics.
# od_wait=500 configures the overfitting detector's patience (see CatBoost docs).
model_catboost = CatBoostRegressor(iterations = 5000,                       
                          random_seed = RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['RMSE', 'MAE'],
                          od_wait=500                          
                         )
# NOTE(review): (X_valid, y_valid) is used here as eval_set with
# use_best_model=True and is also the set the reported MAPE is computed on,
# so that score is likely optimistic — consider a separate test split.
# Progress is logged every 100 iterations.
model_catboost.fit(X_train, y_train,
         eval_set=(X_valid, y_valid),
         verbose_eval=100,
         use_best_model=True       
         )

In [None]:
test_predict_catboost = model_catboost.predict(X_valid)
print(f"The precision of the Catboosting Regressor by the MAPE metrics is: {(mape(y_valid, test_predict_catboost))*100:0.2f}%")

So, with initial CatBoost setup we were able to improve the default model metrics and get pretty good result. Let's think how to improve it

## Model 3: Gradient Boosting

In [None]:
# Default Gradient Bossting Regressor
model_gbr = GradientBoostingRegressor(n_estimators=300, random_state=RANDOM_SEED)
model_gbr.fit(X_train, y_train)
y_pred_gbr = model_gbr.predict(X_valid)
print(f"The precision of the default Gradient Bossting Regressor model on MAPE metric is: {(mape(y_valid, y_pred_gbr))*100:0.2f}%")

As we can see the result of the default Gradient Boosting Regressor model is much better than default model, however, not as good as CatBoosting. Let's try to fine-tune the Gradient Boosting with hyperparameters

### Model 3.1: Gradient Boosting with Hyperparameters Tuned

In [None]:
# Base estimator and parameter grid for the Gradient Boosting grid search.
# random_state is fixed, as for the other seeded estimators in this notebook,
# so that repeating the search gives the same result.
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)
parameters = {
    "n_estimators":[5,50,250,500],
    "max_depth":[1,3,5,7,9],
    "learning_rate":[0.01,0.1,1,10,100]
}

In [None]:
# Using Grid Search to find the best parameters
'''cv_gbr = GridSearchCV(gbr,parameters,cv=5, n_jobs=-1)
cv_gbr.fit(X_train,y_train)'''

In [None]:
#cv_gbr.best_params_

In [None]:
# Gradient Bossting Regressor with hyperparameters tuned
model_gbr_1 = GradientBoostingRegressor(n_estimators=250, learning_rate= 0.1, max_depth=5, random_state=RANDOM_SEED)
model_gbr_1.fit(X_train, y_train)
y_pred_gbr_1 = model_gbr_1.predict(X_valid)
print(f"The precision of the Gradient Bossting Regressor with hyperparameters tuned on MAPE metric is: {(mape(y_valid, y_pred_gbr_1))*100:0.2f}%")

Wow, with the hyperparameters tuned for Gradient Booster Regressor we were able to achieve even better score than CatBoosting Regressor. 

## Model 4:  Default Decision Tree Regressor

In [None]:
# Default Decision Tree Regressor
model_dectree = DecisionTreeRegressor(random_state = RANDOM_SEED)
model_dectree.fit(X_train, y_train)
y_pred_dectree = model_dectree.predict(X_valid)
print(f"The precision of the DecisionTreeRegressor model by the MAPE metrics is: {(mape(y_valid, y_pred_dectree))*100:0.2f}%")

### Model 4.1 Decision Tree Regressor With Hyperparameters Tuned

Well, the results of the default Decision Tree model are not very promising, but let's try hyperparameter tuning anyway.

In [None]:
# Base estimator and parameter grid for the Decision Tree grid search.
dtr = DecisionTreeRegressor(random_state = RANDOM_SEED)

# "auto" is omitted from max_features: for DecisionTreeRegressor it meant
# "use all features" (the same as None, which stays in the grid); it was
# deprecated in scikit-learn 1.1 and is rejected from release 1.3 on.
parameters_dtr = {"splitter":["best","random"],
            "max_depth" : [7,9,11,12],
           "min_samples_leaf":[1,2,3,6,7,8],
           "min_weight_fraction_leaf":[0.1,0.3, 0.5],
           "max_features":["log2","sqrt",None],
           "max_leaf_nodes":[None,10,20,30,40,50] }

In [None]:
# Using Grid Search to find the best parameters
#cv_dtr = GridSearchCV(dtr,parameters_dtr, cv=5, n_jobs=-1)
#cv_dtr.fit(X_train,y_train)

In [None]:
# best hyperparameters
#cv_dtr.best_params_

In [None]:
# Decision Tree Regressor with the best hyperparameters
# max_features=None replaces 'auto': for DecisionTreeRegressor both mean "use
# all features", but 'auto' was deprecated in scikit-learn 1.1 and raises an
# invalid-parameter error from release 1.3 on.
model_dectree_1 = DecisionTreeRegressor(max_depth = 7, max_features = None, min_samples_leaf = 1,
                                      min_weight_fraction_leaf = 0.1, splitter = 'best', random_state = RANDOM_SEED)
model_dectree_1.fit(X_train, y_train)
y_pred_dectree_1 = model_dectree_1.predict(X_valid)
print(f"The precision of the DecisionTreeRegressor model by the MAPE metrics is: {(mape(y_valid, y_pred_dectree_1))*100:0.2f}%")

The Decision Tree regressor with hyperparameters tuned showed even worse result than without hyperparameters. So, we would probably refuse using this model

## Model 5: Random Forest Regressor

In [None]:
# Default Random Forest Regressor
model_randfor = RandomForestRegressor(n_estimators = 100, random_state = RANDOM_SEED)
model_randfor.fit(X_train, y_train)  
y_pred_randfor = model_randfor.predict(X_valid)
print(f"The precision of the default Random Forest Regressor on the MAPE metrics is: {(mape(y_valid, y_pred_randfor))*100:0.2f}%")

As we can see, the default Random Forest Regressor showed approximately the same accuracy as the default Gradient Boosting Regressor. So, let's try to tune the hyperparameters to improve the result.

### Model 5.1: Random Forest Regressor with the hyperparameters tuned

In [None]:
# Base estimator and parameter grid for the Random Forest grid search.
# random_state is fixed, as for the other seeded estimators in this notebook,
# so that repeating the search gives the same result.
rf = RandomForestRegressor(random_state=RANDOM_SEED)

parameters_rf = {
    'bootstrap': [True, False],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3, 5, 10],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [None]:
# Using Grid Search to find the best parameters
#cv_randf = GridSearchCV(rf, parameters_rf,cv = 5, n_jobs = -1, verbose = 2)
#cv_randf.fit(X_train, y_train)

In [None]:
# best parameters
#cv_randf.best_params_

In [None]:
# Random Forest Regressor with the best hyperparameters
# bootstrap must be the boolean False: the string 'False' is truthy, so it
# either left bootstrapping switched on or is rejected outright by
# scikit-learn's parameter validation, depending on the installed version.
model_randfor_1 = RandomForestRegressor(bootstrap = False, max_depth = 110, max_features = 10, min_samples_leaf = 3, 
                                        min_samples_split = 10, n_estimators = 100, random_state = RANDOM_SEED)
model_randfor_1.fit(X_train, y_train)  
y_pred_randfor_1 = model_randfor_1.predict(X_valid)
print(f"The precision of the Random Forest Regressor with the hyperparameters on the MAPE metrics is: {(mape(y_valid, y_pred_randfor_1))*100:0.2f}%")

As we can see the result of the Random Forest Regressor with the best hyperparameters is worse than the result of the default model. So, we will probably stick to the default model if using this algorithm at all

## Model 6: K-Nearest Neighbors Regressor

In [None]:
# K-Nearest Neighbours Regressor with n_neighbors=2 (no other settings are
# overridden); fitted on the training split and scored on the validation split.
model_knn = KNeighborsRegressor(n_neighbors=2)
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_valid)
print(f"The precision of the default K-Nearest Neighbours Regressor on the MAPE metrics is: {(mape(y_valid, y_pred_knn))*100:0.2f}%")

The result of the default K-Nearest Neighbors Regressor doesn't look promising, so we probably won't even tune its hyperparameters.

## Model 7: Support Vector Machine Regressor

In [None]:
# Default Support Vector Machine Regressor
model_svm = SVR(kernel = 'rbf')
model_svm.fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_valid)
print(f"The precision of the default Support Vector Machine Regressor on the MAPE metrics is: {(mape(y_valid, y_pred_svm))*100:0.2f}%")

The result of the default Support Vector Machine Regressor doesn't look promising, so we probably won't even tune its hyperparameters.

## Model 8: Ensemble of the Regressors

Let's now try to train an ensemble of the regression models that proved to be the best.

In [None]:
# Define basic models
# Each base learner is seeded with RANDOM_SEED so that the stacked ensemble
# gives the same result on every run, like the individually trained models.
level0 = [
    ('catb', CatBoostRegressor(random_seed=RANDOM_SEED)),
    ('rand', RandomForestRegressor(random_state=RANDOM_SEED)),
    ('gradb', GradientBoostingRegressor(random_state=RANDOM_SEED)),
]

In [None]:
# Define meta-model
level1 = LinearRegression()

In [None]:
# Ensemble model
model_ens = StackingRegressor(estimators=level0, final_estimator=level1, cv=5, verbose = 100)
model_ens.fit(X_train, y_train)  
y_pred_ens = model_ens.predict(X_valid)
print(f"The precision of the Ensemble Model by the MAPE metrics is: {(mape(y_valid, y_pred_ens))*100:0.2f}%")

Unfortunately, this particular ensemble of regressors didn't show the score improvement that we saw with the CatBoost and Gradient Boosting regressors.

## Model 9: Simple Dense Neural Network

In [None]:
# Simple Dense Neural Network
model_dnn = Sequential()
model_dnn.add(L.Dense(512, input_dim=X_train.shape[1], activation="relu"))
model_dnn.add(L.Dropout(0.5))
model_dnn.add(L.Dense(256, activation="relu"))
model_dnn.add(L.Dropout(0.5))
model_dnn.add(L.Dense(1, activation="linear"))

In [None]:
model_dnn.summary()

In [None]:
# Compile the network with Adam (learning rate 0.01) and MAPE as both loss and metric.
optimizer = tf.keras.optimizers.Adam(0.01)
model_dnn.compile(loss='MAPE', optimizer = optimizer, metrics = ['MAPE'])
# monitor must be a metric-name string (it was a one-element list), and
# save_best_only=True makes the checkpoint file hold the best epoch; without it
# the file is overwritten after every epoch and ends up holding the last one.
checkpoint_dnn = ModelCheckpoint('../working/best_model_dnn.hdf5', monitor='val_MAPE', verbose=0,
                                 mode='min', save_best_only=True)
# Stop after 50 epochs without improvement of val_MAPE and restore the best weights.
earlystop_dnn = EarlyStopping(monitor='val_MAPE', patience=50, restore_best_weights=True)
callbacks_list_dnn = [checkpoint_dnn,earlystop_dnn]

In [None]:
history_dnn = model_dnn.fit(X_train, y_train,
                   batch_size=512,
                   epochs = 500,
                   validation_data = (X_valid, y_valid),
                   callbacks = callbacks_list_dnn,
                   verbose = 0,
                   )

In [None]:
plt.title('Loss')
plt.plot(history_dnn.history['MAPE'], label='train')
plt.plot(history_dnn.history['val_MAPE'], label='test')
plt.show();

In [None]:
model_dnn.load_weights('../working/best_model_dnn.hdf5')
model_dnn.save('../working/nn_1.hdf5')

In [None]:
test_predict_nn1 = model_dnn.predict(X_valid)[:, 0]
print(f"TEST MAPE:{(mape(y_valid, test_predict_nn1))*100:0.2f}%")

Unfortunately, simple Dense Neural Network with initial parameters wasn't able to produce any better results than regressors did

## Model 10: TabNet 

In [None]:
# Reshaping data to the needed format
'''X = X_train.to_numpy()
y = y_train.to_numpy().reshape(-1, 1)
X_valid = X_valid.to_numpy()'''

In [None]:
# TabNet Neural Network
'''model_tabnet = TabNetRegressor(verbose=0,seed=42)
model_tabnet.fit(X_train=X, y_train=y,
              eval_set=[(X_valid, y_valid)],
              patience=300, max_epochs=2000)'''

In [None]:
'''test_predict_tabnet = model_tabnet.predict(X_valid)
print(f"The precision of TabNet Neural Network by the MAPE metrics is:{(mape(y_valid, test_predict_tabnet))*100:0.2f}%")'''

In [None]:
'''kf = KFold(n_splits=5, random_state=42, shuffle=True)
predictions_array =[]
CV_score_array    =[]
for train_index, test_index in kf.split(X):
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    regressor = TabNetRegressor(verbose=0,seed=42)
    regressor.fit(X_train=X_train, y_train=y_train,
              eval_set=[(X_valid, y_valid)],
              patience=300, max_epochs=2000,
              eval_metric=['rmse'])
    CV_score_array.append(regressor.best_cost)
    predictions_array.append(np.expm1(regressor.predict(X_valid)))

predictions = np.mean(predictions_array,axis=0)'''

In [None]:
'''test_predict_tabnet_1 = regressor.predict(X_valid)
print(f"The precision of TabNet Neural Network by the MAPE metrics is:{(mape(y_valid, test_predict_tabnet_1))*100:0.2f}%")'''

## Model 11: Blend of the best models

Let's try now to use blend of the best models

In [None]:
# Doing blend prediction of the Catboost algorithm and Gradiend Boosting
blend_predict_1 = (test_predict_catboost + y_pred_gbr_1) / 2
print(f"The precision of the blend of the best models by the MAPE metric is: {(mape(y_valid, blend_predict_1))*100:0.2f}%")

In [None]:
# Doing blend prediction of the Catboost algorithm, Gradient Boosting and Random Forest
blend_predict_2 = (test_predict_catboost + y_pred_gbr_1 + y_pred_randfor) / 3
print(f"The precision of the blend of the best models by the MAPE metric is: {(mape(y_valid, blend_predict_2))*100:0.2f}%")

## Checking the results

In [None]:
prices_real = y_valid.to_numpy()

In [None]:
# NOTE(review): this selects the predictions of the default Gradient Boosting
# model (y_pred_gbr), not the tuned model (y_pred_gbr_1) or the blends computed
# in this notebook — confirm this is the model whose results should be exported.
prices_pred = y_pred_gbr

In [None]:
d = {'Prices Real': prices_real, 'Prices Predicted': prices_pred}

In [None]:
result = pd.DataFrame(d)

In [None]:
# Truncate every predicted price to a whole number with the built-in int()
result['Prices Predicted'] = result['Prices Predicted'].map(int)

In [None]:
result

In [None]:
# Export the real vs. predicted prices to an Excel workbook.
# The `encoding` argument is dropped: to_excel deprecated it in pandas 1.5 and
# removed it in 2.0, where passing it raises TypeError.
result.to_excel('prices_comp_v3_available.xlsx', index=False)

### Outcomes: 
As we can see, when we took only the available lots we got much better results than when we were using all the lots from the dataset. The best score was shown by the blend of the CatBoost and Gradient Boosting algorithms. This is the best result so far.

#### To Do: <br>
<li> Experiment with logarithmization </li>
<li> Try different layers for TNN </li>
<li> Try removing those variables that have too many options </li>

#### Open questions: <br>
<li> Why linear regression gives such insane values </li>
<li> What metric to choose </li>
<li> Apply linear regression </li>
<li> What to use as a meta-algorithm in an ensemble </li>
<li> When exactly to use different regressors </li>