# Case with every city as a dummy

Import basic libraries. Some of them might be unused.

In [81]:
#base + visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Preprocessing and metric calculation
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

#The models
import lightgbm as lgb
from sklearn.base import RegressorMixin
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor


## EDA and Cleaning

The text below uses RegEx to convert comething like "12 - 17" to integer with value of 17. The lower one trims the last two characters of a string and creates a float. "4 m" -> 4.0

In [48]:
import re
def extract_highest_number(text):
    numbers = re.findall(r'\d+', text)
    numbers = [int(num) for num in numbers]
    return max(numbers)

def extract_float(text):
    number = float(text[:-2])
    return number

In [49]:
#read file
data = pd.read_csv('df_eng.csv')

In [50]:
#delete observations with NaN target
data = data.dropna(subset=['price'])

In [52]:
#Drop irrelevant columns
data_new = data.drop(columns=["Название", "Статус", "Адрес", "Ссылка", 'ads', "Застройщик", "Квартир.в.продаже", "Лифт", "Фасад", "Количество.квартир", "date"])

In [118]:
titleplt.figure(figsize=(10, 6))
plt.hist(data['price'], bins=20, edgecolor='k', alpha=0.7)
plt.title('Distribution of Property Prices', fontsize=14)
plt.xlabel('Price per Square Meter (KZT)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Format x-axis labels to include KZT with commasax = plt.gca()
ax.get_xaxis().set_major_formatter(ticker.StrMethodFormatter('{x:,.0f} KZT'))
plt.show()

NameError: name 'titleplt' is not defined

In [51]:
#create copy of data in order to be able to rollback
data_new = data

In [53]:
#Check the remains
data_new.columns

Index(['price', 'class', 'floor', 'ceiling', 'structure', 'finishing',
       'kitchen', 'heating', 'parking', 'city'],
      dtype='object')

In [54]:
#Recoding the features to tackle the problem of explicit duplicates
data_new.loc[data['parking']=='na', 'parking'] = "no"
data_new.loc[data['parking']=='нет', 'parking'] = "no"

#check
data_new["parking"].value_counts()

Unnamed: 0_level_0,count
parking,Unnamed: 1_level_1
surface,300
underground,130
both,82
no,76


In [55]:
#same as above
data_new.loc[data['heating']=='na', 'heating'] = "central"
data_new["heating"].value_counts()

Unnamed: 0_level_0,count
heating,Unnamed: 1_level_1
central,432
autonomous,156


In [56]:
#String to Integer conversion with a function outlined above
data_new['ceiling'] = data_new['ceiling'].apply(extract_float)


In [57]:
data_new['ceiling'].unique()
#check

array([3.  , 3.3 , 3.2 , 3.4 , 3.1 , 2.8 , 2.7 , 2.9 , 3.6 , 2.85, 3.15,
       3.25, 2.62, 2.95, 2.75, 5.  , 3.7 , 3.35, 3.5 , 2.65, 3.45, 8.  ,
       4.  , 2.6 , 3.05, 4.3 ])

In [58]:
#String to Float
data_new['floor'] = data_new['floor'].apply(extract_highest_number)

In [59]:
#check
data_new['floor'].unique()

array([ 5,  3, 10, 16,  6, 22, 12,  2,  8, 11,  7,  9, 15, 13, 17,  4,  1,
       33, 20, 18, 21, 24, 14, 23, 27, 19, 25, 28])

In [60]:
#check the results
data_new.head()

Unnamed: 0,price,class,floor,ceiling,structure,finishing,kitchen,heating,parking,city
0,235000.0,comfort,5,3.0,monolithic,pre-finish,na,central,no,aksaj
1,300000.0,business,3,3.0,brick,shell,discrete,central,both,aksu
2,400000.0,business,10,3.3,monolithic,shell,discrete,central,both,aktau
3,560000.0,elite,16,3.2,monolithic,shell,discrete,autonomous,underground,aktau
4,215000.0,comfort,6,3.0,brick,shell,discrete,central,surface,aktau


In [61]:
#Save the cleaned version
data_new.to_csv('df_eng_new3.csv', index=False)

## Forming the train and the test/ Encoding the features

In [62]:
#Divide into Features matrix and Target vector
X = data_new.drop(columns=['price'])
y = data_new['price']

In [63]:
#Divide into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=None)
for a in [X_train, X_test]:
  a.reset_index(inplace=True, drop=True)

In [64]:
#columns to create dummy variables for
categorical_columns = ['class','structure', 'finishing',
       'kitchen', 'heating', 'parking', 'city']

In [65]:
#Initialize the encoder
ohe = OneHotEncoder(handle_unknown='ignore')

#Let the encoder see the columns and transform
#We fit on the train set, so that the information from the test does not leak into the model
ohe_feats_train = ohe.fit_transform(X_train[categorical_columns ])
#Encoder knows what to encode, so only transform
ohe_feats_test = ohe.transform(X_test[categorical_columns ])

#Create the panels of encoded data with the column names
df_ohe_train = pd.DataFrame(ohe_feats_train.toarray(), columns=ohe.get_feature_names_out())
df_ohe_test = pd.DataFrame(ohe_feats_test.toarray(), columns=ohe.get_feature_names_out())

#Add the encoded data to the Test and Train sets, drop the original(not encoded) columns
X_train = pd.concat([X_train, df_ohe_train], axis=1).drop(columns=categorical_columns )
X_test = pd.concat([X_test, df_ohe_test], axis=1).drop(columns=categorical_columns )


In [66]:
#check if it works
X_train.head()

Unnamed: 0,floor,ceiling,class_business,class_comfort,class_economy,class_elite,structure_brick,structure_monolithic,structure_panel,finishing_finish,...,city_shymkent,city_taldykorgan,city_talgar,city_taraz,city_turkestan,city_tuzdybastau.kalinino,city_uralsk,city_ust.kamenogorsk,city_zhana.kuat,city_zhanaozen
0,6,2.85,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6,2.7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3.3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22,3.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
data_new.to_csv('df_eng_new35.csv', index=False)

## Training and checking the Gradient Boosting

In [87]:
X1 = pd.concat([X_train, X_test])
y1 = pd.concat([y_train, y_test])

In [88]:
#The grid was respecified so it does not take as much time, some values that
#would be important only in production are dropped
param_grid = {
    'max_depth': [3, 4, 5], #We do not have too many of variables, many of them are mutually exclusive, so we want short trees
    'num_leaves': [8, 16, 32], #Standard conventional values
    'learning_rate': [0.01, 0.05, 0.1],
    'min_data_in_leaf': [10, 20, 30],
}

# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # Change metric if needed
    cv=5,  # Stratified K-Fold Cross Validation
    verbose=2,
    n_jobs=-1
)

# Perform the search
grid_search.fit(X1, y1)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 83
[LightGBM] [Info] Number of data points in the train set: 588, number of used features: 23
[LightGBM] [Info] Start training from score 441303.767007


Running Gridsearch, presentes a model specified as
LGBMRegressor(learning_rate=0.05, max_depth=3,    
              metric='mae',
              min_data_in_leaf=30, n_jobs=-1, num_leaves=8,
              objective='regression')



This however, is much much better than untuned Tree, is still non-optimal. Slight tweaking yields slightly better mse score. This fact, though, is not of any importance as the tree already outperforms other methods

In [95]:
#Specify the model, fit and predict.
#The model here is slightly hand-tweaked,
lgbm = lgb.LGBMRegressor(learning_rate=0.05, max_depth=5, metric='rmse',
              min_data_in_leaf=30, n_jobs=-1, num_leaves=16,
              objective='regression')
lgbm.fit(X_train, y_train)
y_pred_lgbm = lgbm.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 82
[LightGBM] [Info] Number of data points in the train set: 470, number of used features: 23
[LightGBM] [Info] Start training from score 435112.259574


In [96]:
rmse_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
print(rmse_lgbm)
print(mae_lgbm)

107976.79616344199
73959.24801534494


## Manual mean regressor

Here is the MeanRegressor that was written manually. The idea to use it as a baseline is taken from Yandex ML Handbook.

In [98]:
#this weird orange text, is, for some reason, included when people write their classifiers

class MeanRegressor(RegressorMixin):
    # Predicts the mean of y_train
    def fit(self, X=None, y=None):
        '''
        Parameters
        ----------
        X : array like, shape = (n_samples, n_features)
        Training data features
        y : array like, shape = (_samples,)
        Training data targets
        '''
        self.pred = np.mean(y)
        return self

    def predict(self, X=None):
        '''
        Parameters
        ----------
        X : array like, shape = (n_samples, n_features)
        Data to predict
        '''
        #return the vector of length y_test, with every component being the mean of train
        return np.full(shape=X.shape[0], fill_value=self.pred)

In [99]:
mr = MeanRegressor()
mr.fit(X_train, y_train)
y_pred_base = mr.predict(X_test)

In [100]:
rmse_mr = np.sqrt(mean_squared_error(y_test, y_pred_base))
mae_mr = mean_absolute_error(y_test, y_pred_base)
print(rmse_mr)
print(mae_mr)

229454.27179986093
165719.7649837721


## Linear models

In [101]:
lin = LinearRegression()
lin.fit(X_train, y_train)
y_pred_lin = lin.predict(X_test)

In [102]:
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))
mae_lin = mean_absolute_error(y_test, y_pred_lin)
print(rmse_lin)
print(mae_lin)

126486.14840210708
90839.72895012867


In [103]:
coefficients = pd.Series(lin.coef_, index=X_train.columns)
coefficients

Unnamed: 0,0
floor,2279.186143
ceiling,135248.697014
class_business,21462.723402
class_comfort,-116679.592741
class_economy,-138131.493551
class_elite,233348.362889
structure_brick,21645.334119
structure_monolithic,18931.721612
structure_panel,-40577.055731
finishing_finish,81470.825202


Create the dicts that will contain {regularization rate : the resulting mae}.

Create the list of possible lambda values to iterate through

In [105]:
lister = {}
lister2 = {}
#Fill the array with 100 equally-distant from 0 to 5
range_las = np.linspace(0, 5, 100)

In [111]:
#check the lambdas
for i in range_las:
  #Create lasso with lambda equal to one currently iterated
  lasso = Lasso(alpha=i)
  #fit, train, predict, record the results to the dict
  lasso.fit(X_train, y_train)
  y_pred_lasso = lasso.predict(X_test)
  mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
  lister[i] = mae_lasso

  #Finding the regressor that is LAD with Lasso reg did not
  #result in anything, so, Custom stochastic gradient descent was
  #the most suitable method. The exact intuition behind the specification
  #can be found in documentation

  #fit, train, predict, record to the dict
  lasso_mae = SGDRegressor(loss='epsilon_insensitive', #Looks like a generalization of LAD
                           penalty='l1', #lasso
                           alpha=i, #reg
                           random_state=42, # for reproductobility
                           epsilon=0, #For zero epsilon, the loss function is just sum of absolute errors
                           max_iter=25000, tol=1e-3)
  lasso_mae.fit(X_train, y_train)
  y_pred_lasso_mae = lasso_mae.predict(X_test)
  mae_lasso_mae = mean_absolute_error(y_test, y_pred_lasso_mae)
  lister2[i] = mae_lasso_mae

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  mode

In [112]:
#function to retrieve the minimal value in the dict
def lowest_value(dictionary):
  min_key = min(dictionary, key=dictionary.get)
  min_value = dictionary[min_key]

  return min_key, min_value

Rerunning the code on slightly altered dataset has showed quite different results. Though, Gradient Descent is known to sometimes get stuck in some local minima and not findgin the global minima of loss functions, this is especially true for Stochastic GD, which can be seen here

In [113]:
print(lowest_value(lister))
print(lowest_value(lister2))

(5.0, 90808.26693111443)
(0.0, 235308.40974327963)


The 5.0 lambda in lasso shows that regularization gives the best results. Though, it barely outperforms the OLS, so the go-to move is to use the simpler model, so OLS it is

## Results

In [114]:
print(f'lgbm mae: {mae_lgbm}')
print(f'linear regression mae: {mae_lin}')
print(f'mean regressor mae: {mae_mr}')

lgbm mae: 73959.24801534494
linear regression mae: 90839.72895012867
mean regressor mae: 165719.7649837721


# Case for Almaty dummy only. It is meant to be run successively, after the previous one

## EDA once again

In [183]:
#Once again imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


import lightgbm as lgb
from sklearn.base import RegressorMixin

from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

In [184]:
data = pd.read_csv('df_eng.csv')

In [185]:
data = data.dropna(subset=['price'])

In [186]:
data_new = data.drop(columns=["Название", "Статус", "Адрес", "Ссылка", 'ads', "Застройщик", "Квартир.в.продаже", "Лифт", "Фасад", "Количество.квартир", "date"])

In [187]:
data_new.loc[data['parking']=='na', 'parking'] = "no"
data_new.loc[data['parking']=='нет', 'parking'] = "no"

data_new["parking"].value_counts()

Unnamed: 0_level_0,count
parking,Unnamed: 1_level_1
surface,300
underground,130
both,82
no,76


In [188]:
data_new.loc[data['heating']=='na', 'heating'] = "central"
data_new["heating"].value_counts()

Unnamed: 0_level_0,count
heating,Unnamed: 1_level_1
central,432
autonomous,156


In [189]:
data_new['ceiling'] = data_new['ceiling'].apply(extract_float)

In [190]:
data_new['ceiling'].unique()

array([3.  , 3.3 , 3.2 , 3.4 , 3.1 , 2.8 , 2.7 , 2.9 , 3.6 , 2.85, 3.15,
       3.25, 2.62, 2.95, 2.75, 5.  , 3.7 , 3.35, 3.5 , 2.65, 3.45, 8.  ,
       4.  , 2.6 , 3.05, 4.3 ])

In [191]:
data_new['floor'] = data_new['floor'].apply(extract_highest_number)

## Splits

In [192]:
X = data_new.drop(columns=['price'])
y = data_new['price']

In [193]:
#Create dummy for Almaty
X['city'] = (X['city'] == 'almaty').astype(int)

In [194]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=None)
for a in [X_train, X_test]:
  a.reset_index(inplace=True, drop=True)

In [195]:
#columns to create dummy variables for. City is no longer here
categorical_columns = ['class','structure', 'finishing',
       'kitchen', 'heating', 'parking']

In [196]:
#Same as before, no city encoding though
ohe = OneHotEncoder(handle_unknown='ignore')
ohe_feats_train = ohe.fit_transform(X_train[categorical_columns ])
ohe_feats_test = ohe.transform(X_test[categorical_columns ])
df_ohe_train = pd.DataFrame(ohe_feats_train.toarray(), columns=ohe.get_feature_names_out())
df_ohe_test = pd.DataFrame(ohe_feats_test.toarray(), columns=ohe.get_feature_names_out())
X_train = pd.concat([X_train, df_ohe_train], axis=1).drop(columns=categorical_columns )
X_test = pd.concat([X_test, df_ohe_test], axis=1).drop(columns=categorical_columns )


## Models

---



In [197]:
lgbm = lgb.LGBMRegressor(learning_rate=0.05, max_depth=5, metric='mae',
              min_data_in_leaf=30, n_jobs=-1, num_leaves=16,
              objective='regression')
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 470, number of used features: 20
[LightGBM] [Info] Start training from score 435112.259574


In [198]:
y_pred_lgbm = lgbm.predict(X_test)



In [199]:
rmse_lgbm = np.sqrt(mean_squared_error(y_test, y_pred_lgbm))
mae_lgbm = mean_absolute_error(y_test, y_pred_lgbm)
print(rmse_lgbm)
print(mae_lgbm)

114899.75153788763
80388.53141291392


In [200]:
lin = LinearRegression()
lin.fit(X_train, y_train)
y_pred_lin = lin.predict(X_test)

In [201]:
rmse_lin = np.sqrt(mean_squared_error(y_test, y_pred_lin))
mae_lin = mean_absolute_error(y_test, y_pred_lin)
print(rmse_lin)
print(mae_lin)

136000.88670862303
97972.67550163779


In [202]:
meanreg = MeanRegressor()
meanreg.fit(X_train, y_train)
y_pred_meanreg = meanreg.predict(X_test)
rmse_meanreg = np.sqrt(mean_squared_error(y_test, y_pred_meanreg))
mae_meanreg = mean_absolute_error(y_test, y_pred_meanreg)
print(rmse_meanreg)
print(mae_meanreg)

229454.27179986093
165719.7649837721


In [203]:
lister = {}

In [204]:
lister2 = {}

In [205]:
range_las = np.linspace(0, 5, 100)

In [206]:
range_las

array([0.        , 0.05050505, 0.1010101 , 0.15151515, 0.2020202 ,
       0.25252525, 0.3030303 , 0.35353535, 0.4040404 , 0.45454545,
       0.50505051, 0.55555556, 0.60606061, 0.65656566, 0.70707071,
       0.75757576, 0.80808081, 0.85858586, 0.90909091, 0.95959596,
       1.01010101, 1.06060606, 1.11111111, 1.16161616, 1.21212121,
       1.26262626, 1.31313131, 1.36363636, 1.41414141, 1.46464646,
       1.51515152, 1.56565657, 1.61616162, 1.66666667, 1.71717172,
       1.76767677, 1.81818182, 1.86868687, 1.91919192, 1.96969697,
       2.02020202, 2.07070707, 2.12121212, 2.17171717, 2.22222222,
       2.27272727, 2.32323232, 2.37373737, 2.42424242, 2.47474747,
       2.52525253, 2.57575758, 2.62626263, 2.67676768, 2.72727273,
       2.77777778, 2.82828283, 2.87878788, 2.92929293, 2.97979798,
       3.03030303, 3.08080808, 3.13131313, 3.18181818, 3.23232323,
       3.28282828, 3.33333333, 3.38383838, 3.43434343, 3.48484848,
       3.53535354, 3.58585859, 3.63636364, 3.68686869, 3.73737

In [207]:
for i in range_las:
  lasso = Lasso(alpha=i)
  lasso.fit(X_train, y_train)
  y_pred_lasso = lasso.predict(X_test)
  mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
  lister[i] = mae_lasso


  lasso_mae = SGDRegressor(loss='epsilon_insensitive',
                           penalty='l1',
                           alpha=i,
                           random_state=42,
                           epsilon=0,
                           max_iter=25000, tol=1e-3)
  lasso_mae.fit(X_train, y_train)
  y_pred_lasso_mae = lasso_mae.predict(X_test)
#
  mae_lasso_mae = mean_absolute_error(y_test, y_pred_lasso_mae)
  lister2[i] = mae_lasso_mae

  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

Same issue with Stochastic Gradient Descent is evident here.

In [None]:
print(lowest_value(lister))
print(lowest_value(lister2))

In [208]:
print(f'lgbm mae: {mae_lgbm}')
print(f'linear regression mae: {mae_lin}')
print(f'mean regressor mae: {mae_meanreg}')

lgbm mae: 80388.53141291392
linear regression mae: 97972.67550163779
mean regressor mae: 165719.7649837721
