In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [41]:
df = pd.read_csv('data_cleaned.csv')

In [63]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living_room'] = df['living_room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')
df['area_per_room'] = df['area_per_room'].astype('int')

In [64]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor',"area_per_room"]

In [65]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [66]:
X = df.drop('price', axis=1)
y = df['price']

In [67]:
bins = [x for x in range(0, 80000, 10000)]
labels = [x for x in range(1, 8)]
print(bins)
print(labels)

[0, 10000, 20000, 30000, 40000, 50000, 60000, 70000]
[1, 2, 3, 4, 5, 6, 7]


In [68]:
y = pd.cut(y, bins=bins, labels=labels)

In [69]:
print(y.unique())

[2, 3, 7, 5, 6, 4, 1]
Categories (7, int64): [1 < 2 < 3 < 4 < 5 < 6 < 7]


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [71]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', RandomForestClassifier(n_estimators=100))
])

In [72]:
model.fit(X_train, y_train)

In [73]:
y_pred = model.predict(X_test)

In [74]:
print(confusion_matrix(y_test, y_pred))


[[  1  10   0   1   0   1   0]
 [  0 166  59   9   0   2   0]
 [  0  51 195  30   6   2   0]
 [  0  13  60  58  11   2   0]
 [  0   5  26  27  24   8   4]
 [  0   2   7  23   9  13   6]
 [  0   1   1   4   4   2   7]]


In [75]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.08      0.14        13
           2       0.67      0.70      0.69       236
           3       0.56      0.69      0.62       284
           4       0.38      0.40      0.39       144
           5       0.44      0.26      0.32        94
           6       0.43      0.22      0.29        60
           7       0.41      0.37      0.39        19

    accuracy                           0.55       850
   macro avg       0.56      0.39      0.41       850
weighted avg       0.54      0.55      0.53       850

