In [208]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error


In [209]:
#Load the trainig and test dataset, 

apartments_train = pd.read_csv("appartments_train.csv")
apartments_test = pd.read_csv("appartments_test.csv")


print(apartments_train.head())
print(apartments_train.info())

            unit_id  obj_type  dim_m2  n_rooms  floor_no  floor_max  \
0  a3a463617a5c0439  0d6c4dfc   45.89      2.0       1.0        4.0   
1  23a92531fcb238b4  0c238f18   27.64      1.0       1.0        2.0   
2  d158671401f9fc34  0d6c4dfc   62.18      2.0       1.0        2.0   
3  280aced4655b7a96  2a6d5c01   53.68      2.0       NaN        4.0   
4  2315fa621e746fe4  2a6d5c01   70.89      3.0       2.0        3.0   

   year_built  dist_centre  n_poi  dist_sch  ...     price_z  src_month  \
0      1999.0       13.496   11.0     0.541  ...   519626.21    2023-09   
1      1940.0        2.440   16.0     0.377  ...   162959.26    2024-04   
2      2000.0       10.284    8.0     0.391  ...  1167571.51    2023-10   
3      2018.0       10.589   30.0     0.729  ...   907071.16    2024-01   
4      2015.0        8.305    7.0     1.226  ...  1080383.19    2023-11   

   loc_code  market_volatility  infrastructure_quality  \
0  693f303c          501710.76                   14.02   
1  8d5

In [210]:
# I am removing the non-predictive features

apartments_train = apartments_train.drop(columns=['unit_id', 'obj_type', 'own_type', 'build_mat', 'cond_class', 'src_month', 'loc_code'])
apartments_test = apartments_test.drop(columns=['unit_id', 'obj_type', 'own_type', 'build_mat', 'cond_class', 'src_month', 'loc_code'])


In [211]:
# Handling missing values 


num_missing = apartments_train.select_dtypes(include=[np.number]).isnull().sum()
cat_missing = apartments_train.select_dtypes(exclude=[np.number]).isnull().sum()

num_missing1 = apartments_test.select_dtypes(include=[np.number]).isnull().sum()
cat_missing1 = apartments_test.select_dtypes(exclude=[np.number]).isnull().sum()

print(apartments_train.isnull().sum().sort_values(ascending=False))


floor_no                      27698
year_built                    25713
infrastructure_quality        25713
has_lift                       7727
dist_uni                       4317
floor_max                      1921
dist_clinic                     571
dist_rest                       391
dist_pharma                     219
dist_post                       188
dist_kind                       163
dist_sch                        130
price_z                           0
popularity_index                  0
neighborhood_crime_rate           0
green_space_ratio                 0
estimated_maintenance_cost        0
market_volatility                 0
dim_m2                            0
has_store                         0
has_sec                           0
has_balcony                       0
has_park                          0
n_rooms                           0
n_poi                             0
dist_centre                       0
global_economic_index             0
dtype: int64


In [212]:
# For every numerical column in apartments_train and apartment_test with missing values - i fill the missing values with 
# median from apartment_train 

#For categorical columns with missing values i fill them with the mode from the training set.


for col in num_missing[num_missing > 0].index:
    median_value = apartments_train[col].median()
    apartments_train[col] = apartments_train[col].fillna(median_value)
    apartments_test[col] = apartments_test[col].fillna(median_value)

for col in cat_missing[cat_missing > 0].index:
    mode_value = apartments_train[col].mode()[0]
    apartments_train[col] = apartments_train[col].fillna(mode_value)
    apartments_test[col] = apartments_test[col].fillna(mode_value)

print(apartments_train.isnull().sum().sort_values(ascending=False))


dim_m2                        0
has_park                      0
estimated_maintenance_cost    0
green_space_ratio             0
popularity_index              0
neighborhood_crime_rate       0
infrastructure_quality        0
market_volatility             0
price_z                       0
has_store                     0
has_sec                       0
has_lift                      0
has_balcony                   0
dist_pharma                   0
n_rooms                       0
dist_uni                      0
dist_rest                     0
dist_kind                     0
dist_post                     0
dist_clinic                   0
dist_sch                      0
n_poi                         0
dist_centre                   0
year_built                    0
floor_max                     0
floor_no                      0
global_economic_index         0
dtype: int64


In [213]:
# I decided that I will just encode them using 0 and 1 to be more efficient 

binary_columns = ['has_park', 'has_balcony', 'has_lift', 'has_sec', 'has_store']

for col in binary_columns:
    apartments_train[col] = apartments_train[col].map({'yes': 1, 'no': 0})

for col in binary_columns:
    apartments_test[col] = apartments_test[col].map({'yes': 1, 'no': 0})


apartments_train.head()


Unnamed: 0,dim_m2,n_rooms,floor_no,floor_max,year_built,dist_centre,n_poi,dist_sch,dist_clinic,dist_post,...,has_sec,has_store,price_z,market_volatility,infrastructure_quality,neighborhood_crime_rate,popularity_index,green_space_ratio,estimated_maintenance_cost,global_economic_index
0,45.89,2.0,1.0,4.0,1999.0,13.496,11.0,0.541,0.878,0.438,...,0,1,519626.21,501710.76,14.02,95.39,44.51,0.999,13.99,100.291946
1,27.64,1.0,1.0,2.0,1940.0,2.44,16.0,0.377,0.979,0.062,...,0,0,162959.26,147763.87,110.55,46.17,56.25,1.0,7.59,91.315644
2,62.18,2.0,1.0,2.0,2000.0,10.284,8.0,0.391,1.242,0.563,...,0,1,1167571.51,1042847.59,31.15,18.94,50.36,0.999,21.14,93.681619
3,53.68,2.0,3.0,4.0,2018.0,10.589,30.0,0.729,1.911,0.045,...,0,0,907071.16,728839.39,7.52,11.84,46.69,0.999,7.91,94.192062
4,70.89,3.0,2.0,3.0,2015.0,8.305,7.0,1.226,1.974,1.39,...,0,0,1080383.19,1263171.15,11.2,89.64,45.6,0.999,8.63,96.166051


In [214]:
# Handling skewness 

numeric_cols = apartments_train.select_dtypes(include=['number']).columns

skewness = apartments_train[numeric_cols].skew().sort_values(ascending=False)


print("Skewness of variables:\n")
print(skewness)

skewed_cols = skewness[skewness.abs() > 0.75].index.tolist()

print("\nHighly skewed columns (|skew| > 0.75):")
print(skewed_cols)


Skewness of variables:

dist_rest                     5.101114
dist_kind                     5.077459
dist_pharma                   5.058296
dist_sch                      4.446922
dist_post                     3.848432
n_poi                         2.927133
has_sec                       2.455248
floor_no                      2.372522
market_volatility             1.909276
floor_max                     1.865876
price_z                       1.840627
dist_clinic                   1.759203
infrastructure_quality        1.554280
estimated_maintenance_cost    1.362868
dim_m2                        1.278990
dist_uni                      1.150679
has_park                      1.026185
dist_centre                   0.866919
n_rooms                       0.701467
has_store                     0.266990
has_lift                      0.107022
neighborhood_crime_rate       0.002969
popularity_index             -0.001736
global_economic_index        -0.038939
has_balcony                  -0.295347
g

In [215]:
skewed_cols_to_log = [
    'dist_rest', 'dist_kind', 'dist_pharma', 'dist_sch', 'dist_post',
    'n_poi', 'floor_no', 'floor_max', 'dist_clinic', 'infrastructure_quality',
    'estimated_maintenance_cost', 'dim_m2', 'dist_uni', 'dist_centre'
]


for col in skewed_cols_to_log:
    if (apartments_train[col] >= 0).all():
        apartments_train[col] = np.log1p(apartments_train[col])
        print(f"Transformed: {col}")

for col in skewed_cols_to_log:
    if (apartments_test[col] >= 0).all():
        apartments_test[col] = np.log1p(apartments_test[col])
        print(f"Transformed: {col}")

max_year = apartments_train['year_built'].max()
apartments_train['year_built'] = np.log1p(max_year + 1 - apartments_train['year_built'])

max_year = apartments_test['year_built'].max()
apartments_test['year_built'] = np.log1p(max_year + 1 - apartments_test['year_built'])

Transformed: dist_rest
Transformed: dist_kind
Transformed: dist_pharma
Transformed: dist_sch
Transformed: dist_post
Transformed: n_poi
Transformed: floor_no
Transformed: floor_max
Transformed: dist_clinic
Transformed: infrastructure_quality
Transformed: estimated_maintenance_cost
Transformed: dim_m2
Transformed: dist_uni
Transformed: dist_centre
Transformed: dist_rest
Transformed: dist_kind
Transformed: dist_pharma
Transformed: dist_sch
Transformed: dist_post
Transformed: n_poi
Transformed: floor_no
Transformed: floor_max
Transformed: dist_clinic
Transformed: infrastructure_quality
Transformed: estimated_maintenance_cost
Transformed: dim_m2
Transformed: dist_uni
Transformed: dist_centre


In [216]:
columns_to_drop = [
    'n_rooms',             # redundant with dim_m2
    'dist_sch',            # redundant with dist_pharma
    'dist_clinic',         # same as earlier
    'dist_kind',           # highly correlated with many
    'dist_post',           # redundant with dist_kind
    'dist_rest',           # redundant with infrastructure_quality
    'dist_centre',         # redundant with green_space_ratio
]

apartments_train.drop(columns=columns_to_drop, inplace=True)
apartments_test.drop(columns=columns_to_drop, inplace=True)



In [217]:
X_full = apartments_train.drop(columns=['price_z'])
y_full = apartments_train['price_z']

X_temp, X_test, y_temp, y_test = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)


X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42  
)


pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

cv = KFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    pipeline, X_train, y_train, cv=cv, scoring='neg_root_mean_squared_error'
)
print("Cross-validated RMSE(train set):", round(-cv_scores.mean(), 2))
pipeline.fit(X_train, y_train)

y_val_pred = pipeline.predict(X_val)
val_rmse = mean_squared_error(y_val, y_val_pred, squared=False)
print("Validation RMSE:", round(val_rmse, 2))


y_test_pred = pipeline.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print("Final Test RMSE:", round(test_rmse, 2))


Cross-validated RMSE(train set): 96807.51
Validation RMSE: 97304.67
Final Test RMSE: 97935.09




In [218]:

X_final_test = apartments_test.copy()


predictions = pipeline.predict(X_final_test)


results = pd.DataFrame({
    "observation_id": range(len(predictions)), 
    "predicted_price_z": predictions
})


results.to_csv("test_predictions.csv", index=False)

print(results.head())




   observation_id  predicted_price_z
0               0      573243.317524
1               1      638530.884129
2               2      581157.964659
3               3      256058.655108
4               4      304464.647552
