In [135]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [136]:
df = pd.read_csv('data_cleaned.csv')

In [137]:
df.describe()

Unnamed: 0,room,living_room,area,age,floor,price
count,1084.0,1084.0,1084.0,1084.0,1084.0,1084.0
mean,2.075646,1.0,100.20572,14.154059,2.26107,21044.182657
std,0.756503,0.0,33.027803,11.30241,1.624807,7951.782145
min,1.0,1.0,30.0,0.0,-3.0,4000.0
25%,1.0,1.0,70.0,4.0,1.0,15000.0
50%,2.0,1.0,100.0,11.0,2.0,20000.0
75%,3.0,1.0,125.0,25.0,3.0,26000.0
max,3.0,1.0,170.0,35.0,5.0,43000.0


In [138]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')

In [139]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [140]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [141]:
X = df.drop('price', axis=1)
y = df['price']

In [142]:
bins = [x for x in range(0, 50001, 5000)]       
labels = [x for x in range(1, 11)]              


In [143]:
y = pd.cut(y, bins=bins, labels= labels)
print(y.unique())

[6, 5, 4, 9, 8, 7, 3, 2, 1]
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]


In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [145]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', RandomForestClassifier(n_estimators=100))
])

In [146]:
model.fit(X_train, y_train)

In [147]:
y_pred=model.predict(X_test)
print(confusion_matrix(y_test,y_pred))

[[ 3 10  1  0  0  0  0  0]
 [ 5 31 14  2  1  0  0  0]
 [ 0 10 23  8  9  2  0  0]
 [ 0  1 15 12  7  2  0  0]
 [ 0  2  6  9  6  5  0  0]
 [ 0  0  2  6  9  2  2  0]
 [ 0  0  0  2  3  3  1  0]
 [ 0  0  0  0  0  1  2  0]]


In [148]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           2       0.38      0.21      0.27        14
           3       0.57      0.58      0.58        53
           4       0.38      0.44      0.41        52
           5       0.31      0.32      0.32        37
           6       0.17      0.21      0.19        28
           7       0.13      0.10      0.11        21
           8       0.20      0.11      0.14         9
           9       0.00      0.00      0.00         3

    accuracy                           0.36       217
   macro avg       0.27      0.25      0.25       217
weighted avg       0.35      0.36      0.35       217



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
