## Verilerin sınıflandırma ile tahmin edilmesi

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('data_cleaned.csv')

In [3]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living_room'] = df['living_room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [4]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [5]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [7]:
X = df.drop('price', axis=1)
y = df['price']

In [9]:
bins = [x for x in range(0, 70000, 10000)]
labels = [x for x in range(1, 7)]
print(bins)
print(labels)

[0, 10000, 20000, 30000, 40000, 50000, 60000]
[1, 2, 3, 4, 5, 6]


In [10]:
y = pd.cut(y, bins=bins, labels=labels)

In [11]:
print(y.unique())

[1, 2, 3, 4, 5, 6]
Categories (6, int64): [1 < 2 < 3 < 4 < 5 < 6]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', RandomForestClassifier(n_estimators=100))
])

In [14]:
model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)

In [16]:
print(confusion_matrix(y_test, y_pred))

[[153 103   1   0   0   0]
 [ 44 544  27   7   2   0]
 [  2 112  80   9   6   1]
 [  2  27  15  18   7   2]
 [  1   9  15   4   8   3]
 [  0   9   5   2   3   3]]


In [17]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.76      0.60      0.67       257
           2       0.68      0.87      0.76       624
           3       0.56      0.38      0.45       210
           4       0.45      0.25      0.32        71
           5       0.31      0.20      0.24        40
           6       0.33      0.14      0.19        22

    accuracy                           0.66      1224
   macro avg       0.51      0.41      0.44      1224
weighted avg       0.64      0.66      0.64      1224

