# Not Standardizated Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("Preprocessed_before_standardization_and_normalization.csv")
df.head()

Unnamed: 0,price,m2_price,square,rooms,district,micro_district,building_type,floor,floors,condition,building_age,floor_to_floors,is_good_floor
0,110880,720,154,4,1,0,2,12,14,2,2,0.857143,0
1,57000,740,77,2,1,0,2,5,12,2,2,0.416667,1
2,57750,750,77,2,1,0,1,5,12,2,2,0.416667,1
3,57000,740,77,2,1,0,2,5,12,2,2,0.416667,1
4,55000,1279,43,1,2,3,2,1,12,6,12,0.083333,0


### troubleshooting unnoticed problems

In [3]:

df['rooms'] = df['rooms'].replace({
    '6 –∏ –±–æ–ª–µ–µ': 6,
    '—Å–≤–æ–±–æ–¥–Ω–∞—è –ø–ª–∞–Ω–∏—Ä–æ–≤–∫–∞': 7
}).astype(int)

In [4]:
df.drop(["floor", "floors"], axis=1, inplace=True)# no need columns

## Dividing into train and test data

In [5]:
X = df.drop(columns=['price'])  # goal variable
y = df['price']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Trying a models

### DecisionTreeRegressor

In [7]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)

print("Decision Tree Regressor:")
print("R¬≤ Score:", r2_score(y_test, y_pred_dt))
print("MSE:", mean_squared_error(y_test, y_pred_dt))

Decision Tree Regressor:
R¬≤ Score: 0.9956338784789327
MSE: 5838587.156765535


### GradientBoostingRegressor

In [8]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

print("Gradient Boosting Regressor:")
print("R¬≤ Score:", r2_score(y_test, y_pred_gb))
print("MSE:", mean_squared_error(y_test, y_pred_gb))

Gradient Boosting Regressor:
R¬≤ Score: 0.9974636423265404
MSE: 3391739.1592905014


### cross-validation of gb_model

In [9]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validation
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='r2')

print("Cross-validation R¬≤ Scores:", cv_scores)
print("Mean Cross-validation R¬≤ Score:", np.mean(cv_scores))


Cross-validation R¬≤ Scores: [0.99675298 0.9968115  0.99716674 0.99724117 0.99760398]
Mean Cross-validation R¬≤ Score: 0.9971152739671416


In [10]:
# deleting m2_price from tainig data
X_train_new = X_train.drop(columns=['m2_price'])
X_test_new = X_test.drop(columns=['m2_price'])

from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train_new, y_train)

In [11]:
from scipy.stats import ttest_ind

# –ü—Ä–∏–º–µ—Ä –¥–∞–Ω–Ω—ã—Ö
sample1 = X_train['square']  # –í—ã–±–æ—Ä–∫–∞ 1 (–Ω–∞–ø—Ä–∏–º–µ—Ä, –ø–ª–æ—â–∞–¥—å –∫–≤–∞—Ä—Ç–∏—Ä –≤ –æ–±—É—á–∞—é—â–µ–π –≤—ã–±–æ—Ä–∫–µ)
sample2 = X_test['square']   # –í—ã–±–æ—Ä–∫–∞ 2 (–Ω–∞–ø—Ä–∏–º–µ—Ä, –ø–ª–æ—â–∞–¥—å –∫–≤–∞—Ä—Ç–∏—Ä –≤ —Ç–µ—Å—Ç–æ–≤–æ–π –≤—ã–±–æ—Ä–∫–µ)

# t-—Ç–µ—Å—Ç
t_stat, p_value = ttest_ind(sample1, sample2)

print("t-Statistic:", t_stat)
print("p-Value:", p_value)

if p_value < 0.05:
    print("–ì–∏–ø–æ—Ç–µ–∑–∞ –æ —Ä–∞–≤–µ–Ω—Å—Ç–≤–µ —Å—Ä–µ–¥–Ω–∏—Ö –æ—Ç–≤–µ—Ä–≥–∞–µ—Ç—Å—è.")
else:
    print("–ù–µ—Ç –æ—Å–Ω–æ–≤–∞–Ω–∏–π –æ—Ç–≤–µ—Ä–≥–Ω—É—Ç—å –≥–∏–ø–æ—Ç–µ–∑—É –æ —Ä–∞–≤–µ–Ω—Å—Ç–≤–µ —Å—Ä–µ–¥–Ω–∏—Ö.")

t-Statistic: -0.0428688750179904
p-Value: 0.9658069458939689
–ù–µ—Ç –æ—Å–Ω–æ–≤–∞–Ω–∏–π –æ—Ç–≤–µ—Ä–≥–Ω—É—Ç—å –≥–∏–ø–æ—Ç–µ–∑—É –æ —Ä–∞–≤–µ–Ω—Å—Ç–≤–µ —Å—Ä–µ–¥–Ω–∏—Ö.


In [12]:
from scipy.stats import chi2_contingency
import numpy as np

# –ü—Ä–∏–º–µ—Ä —Ç–∞–±–ª–∏—Ü—ã —Å–æ–ø—Ä—è–∂–µ–Ω–Ω–æ—Å—Ç–∏
contingency_table = pd.crosstab(df['district'], df['building_type'])

# –•–∏-–∫–≤–∞–¥—Ä–∞—Ç —Ç–µ—Å—Ç
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print("Chi-Square Statistic:", chi2_stat)
print("p-Value:", p_value)
print("Degrees of Freedom:", dof)
print("Expected Frequencies:\n", expected)

if p_value < 0.05:
    print("–ì–∏–ø–æ—Ç–µ–∑–∞ –æ –Ω–µ–∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç–≤–µ—Ä–≥–∞–µ—Ç—Å—è.")
else:
    print("–ù–µ—Ç –æ—Å–Ω–æ–≤–∞–Ω–∏–π –æ—Ç–≤–µ—Ä–≥–Ω—É—Ç—å –≥–∏–ø–æ—Ç–µ–∑—É –æ –Ω–µ–∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏.")

Chi-Square Statistic: 615.2550660951492
p-Value: 1.1937236949656734e-129
Degrees of Freedom: 6
Expected Frequencies:
 [[ 224.48468543  491.68201573  328.83329884]
 [1078.60057947 2362.42622103 1579.9731995 ]
 [ 422.11713576  924.55039321  618.33247103]
 [ 350.79759934  768.34137003  513.86103063]]
–ì–∏–ø–æ—Ç–µ–∑–∞ –æ –Ω–µ–∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç–≤–µ—Ä–≥–∞–µ—Ç—Å—è.


In [108]:
feature_columns = [
    'square', 'rooms', 'district', 'micro_district',
    'building_type', 'condition', 'building_age',
    'floor_to_floors', 'is_good_floor'
]

micro_district_options = {
    4: ['–ó–æ–ª–æ—Ç–æ–π –∫–≤–∞–¥—Ä–∞—Ç', '–ê—Å–∞–Ω–±–∞–π –º-–Ω', '–ë–∏—à–∫–µ–∫-–ü–∞—Ä–∫', '–¶–£–ú', '–û—Ä–æ–∑–±–µ–∫–æ–≤–∞ - –ñ–∏–±–µ–∫-–ñ–æ–ª—É',
        '–ì–æ—Ä—å–∫–æ–≥–æ - –ü–∞–Ω—Ñ–∏–ª–æ–≤–∞', '–ú–æ—Å–∫–æ–≤—Å–∫–∞—è - –ë–µ–ª–∏–Ω–∫–∞', '–ú–æ—Å–∫–æ–≤—Å–∫–∞—è - –£–º–µ—Ç–∞–ª–∏–µ–≤–∞',
        '–ü–∞—Ä–∫ –ü–∞–Ω—Ñ–∏–ª–æ–≤–∞/–°–ø–∞—Ä—Ç–∞–∫', '–§–∏–ª–∞—Ä–º–æ–Ω–∏—è'],
    3: ['–ê–∫–∞–¥–µ–º–∏—è –ù–∞—É–∫', '–ö–ù–£', '–ê–£–¶–ê', '–î–≤–æ—Ä–µ—Ü —Å–ø–æ—Ä—Ç–∞', '–ú–µ–¥. –∞–∫–∞–¥–µ–º–∏—è', '–ê–∫ –ö–µ–º–µ',
        '–ú–æ–ª–æ–¥–∞—è –ì–≤–∞—Ä–¥–∏—è', '–í–æ—Å—Ç–æ–∫-5 –º-–Ω', '–Æ–≥-2 –º-–Ω', '–ö–æ–∫-–ñ–∞—Ä –º-–Ω', '–¶–µ–Ω—Ç—Ä–∞–ª—å–Ω–∞—è –º–µ—á–µ—Ç—å',
        '–ü–∞—Ä–∫ –ê—Ç–∞—Ç—é—Ä–∫', '–ì–µ–Ω –ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä–∞'],
    2: ['4 –º-–Ω', '5 –º-–Ω', '6 –º-–Ω', '7 –º-–Ω', '8 –º-–Ω', '9 –º-–Ω', '10 –º-–Ω', '11 –º-–Ω', '12 –º-–Ω',
        '–î–∂–∞–ª 15 –º-–Ω', '–î–∂–∞–ª-23 –º-–Ω', '–î–∂–∞–ª-29 –º-–Ω', '–î–∂–∞–ª-30 –º-–Ω', '–°—Ä–µ–¥–Ω–∏–π –î–∂–∞–ª –º-–Ω',
        '–í–µ—Ä—Ö–Ω–∏–π –î–∂–∞–ª –º-–Ω', '–£–ª–∞–Ω –º-–Ω', '–¢—É–Ω–≥—É—á –º-–Ω', '–ê–ª–∞–º–µ–¥–∏–Ω-1 –º-–Ω', '–ü–æ–ª–∏—Ç–µ—Ö',
        '–ö–ì–£–°–¢–ê', '–ë–ì–£', '–ê–∑–∏—è –ú–æ–ª–ª', '–°–æ–≤–µ—Ç—Å–∫–∞—è - –°–∫—Ä—è–±–∏–Ω–∞', '–ö–∞—Ä–ø–∏–Ω–∫–∞', '–ú–∞—Ç—Ä–æ—Å–æ–≤–∞'],
    1: ['–ñ–î –≤–æ–∫–∑–∞–ª', '–í–æ–µ–Ω—Ç–æ—Ä–≥', '–ê–Æ Grand', '–ò–ø–ø–æ–¥—Ä–æ–º', '–ü–ª–æ—â–∞–¥—å –ü–æ–±–µ–¥—ã', '–ú–æ—Å—Å–æ–≤–µ—Ç',
        '–°—Ç–∞—Ä—ã–π –∞—ç—Ä–æ–ø–æ—Ä—Ç', '–ì–æ–∏–Ω', '–î—É—à–∞–Ω–±–∏–Ω–∫–∞', '–í–æ—Å—Ç–æ—á–Ω—ã–π –∞–≤—Ç–æ–≤–æ–∫–∑–∞–ª', '–Æ–±–∏–ª–µ–π–∫–∞',
        '–ö–æ—Å–º–æ—Å', '–ö–∞—Ä–∞-–ñ—ã–≥–∞—á –∂/–º', '–î–∂–∞–ª—å—Å–∫–∞—è –±–æ–ª—å–Ω–∏—Ü–∞', '–ù–∏–∂–Ω–∏–π –¢–æ–∫–æ–ª—å–¥–æ—à',
        '–ö—É–¥–∞–π–±–µ—Ä–≥–µ–Ω', '–ü–∏—à–ø–µ–∫ –∂/–º', '–¢–≠–¶', 'VEFA', '–©–µ—Ä–±–∞–∫–æ–≤–∞ –∂/–º', '–ê–∫ –≠–º–∏—Ä —Ä—ã–Ω–æ–∫',
        '–ì–æ—Å—Ä–µ–≥–∏—Å—Ç—Ä', '–¶–µ—Ä–∫–æ–≤—å', '–ß—É–π - –ê–ª–º–∞—Ç–∏–Ω–∫–∞', '–ó–∞–ø–∞–¥–Ω—ã–π –∞–≤—Ç–æ–≤–æ–∫–∑–∞–ª', '–¢–∞–∞—Ç–∞–Ω',
        '–ì–∞–≥–∞—Ä–∏–Ω–∞', '–£—á–∫—É–Ω –º-–Ω', '–ì–æ—Ä–æ–¥–æ–∫ —ç–Ω–µ—Ä–≥–µ—Ç–∏–∫–æ–≤', '–ê–ª–∞–º–µ–¥–∏–Ω—Å–∫–∏–π —Ä—ã–Ω–æ–∫',
        '–ñ–∏–ª–≥–æ—Ä–æ–¥–æ–∫ –°–æ–≤–º–∏–Ω–∞ –∂/–º', '–£–ª–∞–Ω-2 –º-–Ω'],
    0: ['–î—Ä—É–≥–∏–µ/—Å–∞–º—ã–µ –¥–µ—à—ë–≤—ã–µ —Ä–∞–π–æ–Ω—ã']
}

district_options = {
    3: ['–ü–µ—Ä–≤–æ–º–∞–π—Å–∫–∏–π —Ä–∞–π–æ–Ω'],
    2: ['–õ–µ–Ω–∏–Ω—Å–∫–∏–π —Ä–∞–π–æ–Ω'],
    1: ['–û–∫—Ç—è–±—Ä—å—Å–∫–∏–π —Ä–∞–π–æ–Ω'],
    0: ['–°–≤–µ—Ä–¥–ª–æ–≤—Å–∫–∏–π —Ä–∞–π–æ–Ω']
}

building_type_map = {
    2: '–∫–∏—Ä–ø–∏—á–Ω—ã–π',
    1: '–º–æ–Ω–æ–ª–∏—Ç–Ω—ã–π',
    0: '–ø–∞–Ω–µ–ª—å–Ω—ã–π'
}

condition_map = {
    0: '–Ω–µ –¥–æ—Å—Ç—Ä–æ–µ–Ω–æ',
    1: '—á–µ—Ä–Ω–æ–≤–∞—è –æ—Ç–¥–µ–ª–∫–∞',
    2: '–ø–æ–¥ —Å–∞–º–æ–æ—Ç–¥–µ–ª–∫—É',
    3: '—Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞',
    4: '—Å—Ä–µ–¥–Ω–µ–µ',
    5: '–µ–≤—Ä–æ—Ä–µ–º–æ–Ω—Ç',
    6: '—Ö–æ—Ä–æ—à–µ–µ',
    7: '—Å–≤–æ–±–æ–¥–Ω–∞—è –ø–ª–∞–Ω–∏—Ä–æ–≤–∫–∞'
}

#input function
def get_user_input():
    data = {}

    print("\n Select *micro_district* group:")
    for k, v in micro_district_options.items():
        print(f"\n{k}:")
        for name in v:
            print(f"   - {name}")
    data['micro_district'] = int(input("\nEnter micro_district group (0-4): "))

    print("\n Select *district* group:")
    for k, v in district_options.items():
        print(f"{k}: {', '.join(v)}")
    data['district'] = int(input("\nEnter district group (0-3): "))

    data['square'] = float(input(" Enter apartment area in m¬≤: "))
    data['rooms'] = int(input(" Enter number of rooms: "))

    print("\n Select building type:")
    for k, v in building_type_map.items():
        print(f"{k}: {v}")
    data['building_type'] = int(input("Enter building type (0-2): "))

    print("\n Select apartment condition:")
    for k, v in condition_map.items():
        print(f"{k}: {v}")
    data['condition'] = int(input("Enter condition (0-7): "))

    data['building_age'] = int(input(" Enter building age (years): "))
    data['floor_to_floors'] = float(input(" Enter floor/floors ratio (e.g. 4/9): "))
    
    print("\n Is it a good floor? (1 = yes, 0 = no)")
    data['is_good_floor'] = int(input("Enter 1 or 0: "))

    return pd.DataFrame([data])
user_df = get_user_input()
user_df = user_df[feature_columns]

# prediction
predicted_price = gb_model.predict(user_df)[0]
square = user_df.iloc[0]['square']
predicted_m2_price = predicted_price / square

# result
print(f"\nüîÆ Predicted apartment price: {int(predicted_price):,} $")
print(f"üìê Price per m¬≤: {int(predicted_m2_price):,} $")


üó∫Ô∏è Select *micro_district* group:

4:
   - –ó–æ–ª–æ—Ç–æ–π –∫–≤–∞–¥—Ä–∞—Ç
   - –ê—Å–∞–Ω–±–∞–π –º-–Ω
   - –ë–∏—à–∫–µ–∫-–ü–∞—Ä–∫
   - –¶–£–ú
   - –û—Ä–æ–∑–±–µ–∫–æ–≤–∞ - –ñ–∏–±–µ–∫-–ñ–æ–ª—É
   - –ì–æ—Ä—å–∫–æ–≥–æ - –ü–∞–Ω—Ñ–∏–ª–æ–≤–∞
   - –ú–æ—Å–∫–æ–≤—Å–∫–∞—è - –ë–µ–ª–∏–Ω–∫–∞
   - –ú–æ—Å–∫–æ–≤—Å–∫–∞—è - –£–º–µ—Ç–∞–ª–∏–µ–≤–∞
   - –ü–∞—Ä–∫ –ü–∞–Ω—Ñ–∏–ª–æ–≤–∞/–°–ø–∞—Ä—Ç–∞–∫
   - –§–∏–ª–∞—Ä–º–æ–Ω–∏—è

3:
   - –ê–∫–∞–¥–µ–º–∏—è –ù–∞—É–∫
   - –ö–ù–£
   - –ê–£–¶–ê
   - –î–≤–æ—Ä–µ—Ü —Å–ø–æ—Ä—Ç–∞
   - –ú–µ–¥. –∞–∫–∞–¥–µ–º–∏—è
   - –ê–∫ –ö–µ–º–µ
   - –ú–æ–ª–æ–¥–∞—è –ì–≤–∞—Ä–¥–∏—è
   - –í–æ—Å—Ç–æ–∫-5 –º-–Ω
   - –Æ–≥-2 –º-–Ω
   - –ö–æ–∫-–ñ–∞—Ä –º-–Ω
   - –¶–µ–Ω—Ç—Ä–∞–ª—å–Ω–∞—è –º–µ—á–µ—Ç—å
   - –ü–∞—Ä–∫ –ê—Ç–∞—Ç—é—Ä–∫
   - –ì–µ–Ω –ø—Ä–æ–∫—É—Ä–∞—Ç—É—Ä–∞

2:
   - 4 –º-–Ω
   - 5 –º-–Ω
   - 6 –º-–Ω
   - 7 –º-–Ω
   - 8 –º-–Ω
   - 9 –º-–Ω
   - 10 –º-–Ω
   - 11 –º-–Ω
   - 12 –º-–Ω
   - –î–∂–∞–ª 15 –º-–Ω
   - –î–∂–∞–ª-23 –º-–Ω
   - –î–∂–∞–ª-29 –º-–Ω
   - –î–∂–∞–ª-30 –º-–Ω
   - –°—Ä–µ–¥–Ω–∏–π –î–∂–∞–ª –º-–


Enter micro_district group (0-4):  1



üèôÔ∏è Select *district* group:
3: –ü–µ—Ä–≤–æ–º–∞–π—Å–∫–∏–π —Ä–∞–π–æ–Ω
2: –õ–µ–Ω–∏–Ω—Å–∫–∏–π —Ä–∞–π–æ–Ω
1: –û–∫—Ç—è–±—Ä—å—Å–∫–∏–π —Ä–∞–π–æ–Ω
0: –°–≤–µ—Ä–¥–ª–æ–≤—Å–∫–∏–π —Ä–∞–π–æ–Ω



Enter district group (0-3):  3
üìê Enter apartment area in m¬≤:  76
üõèÔ∏è Enter number of rooms:  3



üèóÔ∏è Select building type:
2: –∫–∏—Ä–ø–∏—á–Ω—ã–π
1: –º–æ–Ω–æ–ª–∏—Ç–Ω—ã–π
0: –ø–∞–Ω–µ–ª—å–Ω—ã–π


Enter building type (0-2):  0



üé® Select apartment condition:
0: –Ω–µ –¥–æ—Å—Ç—Ä–æ–µ–Ω–æ
1: —á–µ—Ä–Ω–æ–≤–∞—è –æ—Ç–¥–µ–ª–∫–∞
2: –ø–æ–¥ —Å–∞–º–æ–æ—Ç–¥–µ–ª–∫—É
3: —Ç—Ä–µ–±—É–µ—Ç —Ä–µ–º–æ–Ω—Ç–∞
4: —Å—Ä–µ–¥–Ω–µ–µ
5: –µ–≤—Ä–æ—Ä–µ–º–æ–Ω—Ç
6: —Ö–æ—Ä–æ—à–µ–µ
7: —Å–≤–æ–±–æ–¥–Ω–∞—è –ø–ª–∞–Ω–∏—Ä–æ–≤–∫–∞


Enter condition (0-7):  4
üè¢ Enter building age (years):  4
üìä Enter floor/floors ratio (e.g. 4/9):  0.9



‚¨ÜÔ∏è Is it a good floor? (1 = yes, 0 = no)


Enter 1 or 0:  1



üîÆ Predicted apartment price: 71,754 $
üìê Price per m¬≤: 944 $
