# 03 Hedonic Modeling
Semi-log OLS with structural controls and district fixed effects. Uses `data/processed/processed.csv`.
Includes basic filtering to avoid singular design matrices (drops zero/NA key fields and districts with very few listings). Uses EUR prices for interpretation consistency.

### Load processed data

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Base directory for relative paths: if the working directory is the
# `notebooks/` folder, step up one level; otherwise use it as-is.
ROOT = Path.cwd()
ROOT = ROOT.parent if ROOT.name == 'notebooks' else ROOT

# Read the processed listings table and preview its first rows.
data_path = ROOT.joinpath('data/processed/processed.csv')
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,url,listing_id,price_bgn,price_eur,area_m2,rooms,floor,max_floor,is_ground_floor,is_top_floor,heat,construction_type,newbuild,district
0,https://www.imot.bg/obiava-1a176060746046054-p...,ida1a176060746046054,224920.45,115000.0,55.0,1,6.0,10.0,0,0,other,brick,1.0,Овча купел 2
1,https://www.imot.bg/obiava-1a176352251944193-p...,ida1a176352251944193,244478.75,125000.0,44.0,1,12.0,14.0,0,0,gas,epk,0.0,Банишора
2,https://www.imot.bg/obiava-1a176276279674731-p...,ida1a176276279674731,318800.29,163000.0,50.0,1,6.0,6.0,0,1,other,brick,0.0,Борово
3,https://www.imot.bg/obiava-1a176183615194066-p...,ida1a176183615194066,192062.506,98200.0,42.0,1,3.0,4.0,0,0,gas,brick,1.0,Витоша
4,https://www.imot.bg/obiava-1a176170780994189-p...,ida1a176170780994189,201450.49,103000.0,47.0,1,6.0,10.0,0,0,gas,brick,1.0,Витоша


### Filter: keep positive price/area and drop missing key fields

In [None]:
# Minimum number of usable listings a district needs to stay in the sample.
MIN_DISTRICT_LISTINGS = 3

# Every raw column referenced by the regression formulas further down.
# Rows missing any of them are dropped by the formula interface at fit time,
# so they are removed here first: the district-size threshold below must be
# evaluated on the rows that will really be used for estimation.
model_columns = [
    'price_eur', 'area_m2', 'rooms', 'district',
    'floor', 'is_ground_floor', 'is_top_floor', 'newbuild',
    'heat', 'construction_type',
]
df_model = df.dropna(subset=model_columns)

# Logs are taken below, so price and area must be strictly positive.
df_model = df_model[(df_model['price_eur'] > 0) & (df_model['area_m2'] > 0)]

# Keep only districts with enough listings.
district_counts = df_model['district'].value_counts()
keep_districts = district_counts[district_counts >= MIN_DISTRICT_LISTINGS].index
df_model = df_model[df_model['district'].isin(keep_districts)].copy()

# Log transforms used by the semi-log specification.
df_model['log_price'] = np.log(df_model['price_eur'])
df_model['log_area'] = np.log(df_model['area_m2'])
df_model[['log_price', 'log_area', 'rooms', 'district']].head()

Unnamed: 0,log_price,log_area,rooms,district
0,11.652687,4.007333,1,Овча купел 2
1,11.736069,3.78419,1,Банишора
2,12.001505,3.912023,1,Борово
3,11.494761,3.73767,1,Витоша
4,11.542484,3.850148,1,Витоша


### Baseline structural model (no district FE)

In [None]:
# Baseline specification as (formula term, underlying df_model column) pairs,
# in the order the terms appear in the formula.
base_spec = [
    ('log_area', 'log_area'),
    ('rooms', 'rooms'),
    ('floor', 'floor'),
    ('is_ground_floor', 'is_ground_floor'),
    ('is_top_floor', 'is_top_floor'),
    ('newbuild', 'newbuild'),
    ('C(heat)', 'heat'),
    ('C(construction_type)', 'construction_type'),
]

# A regressor with a single distinct value is collinear with the intercept and
# makes the design matrix rank-deficient. The previously recorded summary for
# this cell showed exactly that: an is_ground_floor coefficient of about
# -3e-17 and a condition number of 3.5e+17. Such terms are left out; when
# every column varies the formula is identical to the full specification.
# NOTE(review): if is_ground_floor really has no variation, check how the
# flag is derived upstream.
base_terms = [term for term, col in base_spec if df_model[col].nunique() > 1]

# Fall back to an intercept-only model if nothing varies.
formula_base = 'log_price ~ ' + ' + '.join(base_terms or ['1'])
model_base = smf.ols(formula_base, data=df_model).fit()
model_base.summary()

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.735
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,863.4
Date:,"Tue, 25 Nov 2025",Prob (F-statistic):,0.0
Time:,23:15:40,Log-Likelihood:,18.068
No. Observations:,2807,AIC:,-16.14
Df Residuals:,2797,BIC:,43.26
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.0630,0.082,110.944,0.000,8.903,9.223
C(heat)[T.gas],-0.0380,0.034,-1.113,0.266,-0.105,0.029
C(heat)[T.other],-0.0337,0.035,-0.975,0.330,-0.101,0.034
C(construction_type)[T.epk],-0.1898,0.025,-7.693,0.000,-0.238,-0.141
C(construction_type)[T.panel],-0.2308,0.016,-14.402,0.000,-0.262,-0.199
log_area,0.7068,0.022,32.490,0.000,0.664,0.749
rooms,0.1286,0.011,11.933,0.000,0.108,0.150
floor,0.0075,0.002,4.039,0.000,0.004,0.011
is_ground_floor,-3.211e-17,7.37e-18,-4.355,0.000,-4.66e-17,-1.77e-17

0,1,2,3
Omnibus:,84.116,Durbin-Watson:,0.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,130.262
Skew:,0.285,Prob(JB):,5.17e-29
Kurtosis:,3.888,Cond. No.,3.5e+17


### District fixed-effects model

In [None]:
# Fixed-effects specification as (formula term, underlying df_model column)
# pairs: the structural controls plus one dummy per district.
fe_spec = [
    ('log_area', 'log_area'),
    ('rooms', 'rooms'),
    ('floor', 'floor'),
    ('is_ground_floor', 'is_ground_floor'),
    ('is_top_floor', 'is_top_floor'),
    ('newbuild', 'newbuild'),
    ('C(heat)', 'heat'),
    ('C(construction_type)', 'construction_type'),
    ('C(district)', 'district'),
]

# Leave out regressors with a single distinct value: they are collinear with
# the intercept and make the design matrix rank-deficient (the previously
# recorded summary for this cell reported a condition number of about 1e+16).
# When every column varies the formula is identical to the full specification.
fe_terms = [term for term, col in fe_spec if df_model[col].nunique() > 1]

# Fall back to an intercept-only model if nothing varies.
formula_fe = 'log_price ~ ' + ' + '.join(fe_terms or ['1'])
model_fe = smf.ols(formula_fe, data=df_model).fit()
model_fe.summary()

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.844
Model:,OLS,Adj. R-squared:,0.838
Method:,Least Squares,F-statistic:,134.1
Date:,"Tue, 25 Nov 2025",Prob (F-statistic):,0.0
Time:,23:15:51,Log-Likelihood:,761.97
No. Observations:,2807,AIC:,-1304.0
Df Residuals:,2697,BIC:,-650.6
Df Model:,109,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.9899,0.072,125.718,0.000,8.850,9.130
C(heat)[T.gas],-0.0314,0.027,-1.144,0.253,-0.085,0.022
C(heat)[T.other],-0.0338,0.028,-1.220,0.223,-0.088,0.021
C(construction_type)[T.epk],-0.0451,0.021,-2.148,0.032,-0.086,-0.004
C(construction_type)[T.panel],-0.0248,0.015,-1.630,0.103,-0.055,0.005
C(district)[T.Белите брези],0.3041,0.065,4.668,0.000,0.176,0.432
C(district)[T.Борово],0.1646,0.044,3.783,0.000,0.079,0.250
C(district)[T.Бояна],0.1122,0.034,3.315,0.001,0.046,0.179
C(district)[T.Бъкстон],0.1166,0.048,2.449,0.014,0.023,0.210

0,1,2,3
Omnibus:,116.394,Durbin-Watson:,1.211
Prob(Omnibus):,0.0,Jarque-Bera (JB):,329.467
Skew:,0.142,Prob(JB):,2.86e-72
Kurtosis:,4.654,Cond. No.,1.06e+16


### Extract district effects

In [None]:
# Select the district dummy coefficients from the fixed-effects fit by
# matching the 'C(district)' substring in the parameter labels.
district_terms = [name for name in model_fe.params.index if 'C(district)' in name]
district_effects = model_fe.params.loc[district_terms]

# Show the five largest district coefficients.
district_effects.sort_values(ascending=False).iloc[:5]

C(district)[T.Докторски паметник]            0.633212
C(district)[T.в.з.Симеоново - Драгалевци]    0.571932
C(district)[T.Медицинска академия]           0.507205
C(district)[T.Яворов]                        0.503165
C(district)[T.Иван Вазов]                    0.496023
dtype: float64

### Extract district mean residuals (over/undervaluation signal)

In [None]:
# Listing-level residual from the district fixed-effects model: the part of a
# listing's log price not explained by its structure or its district.
df_model['resid'] = model_fe.resid

# District averages must NOT be taken over the fixed-effects residuals: OLS
# residuals are orthogonal to every district dummy, so those means are zero by
# construction and any ranking of them is floating-point noise (~1e-13).
# Use the residuals of the structure-only model instead, whose district means
# show how far each district sits above or below what structure alone predicts.
df_model['resid_base'] = model_base.resid
district_resid = (
    df_model.groupby('district')['resid_base']
    .mean()
    .sort_values(ascending=False)
)
district_resid.head()

district
Експериментален    9.651539e-14
Люлин 4            8.112030e-14
Мусагеница         7.993606e-14
Зона Б-5           7.769630e-14
Младост 1А         7.673862e-14
Name: resid, dtype: float64