## Verilerin Sınıflandırma ile Tahmin Edilmesi

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report , confusion_matrix


In [2]:
# Load the cleaned dataset from the notebook's working directory.
csv_path = "data_cleaned.csv"
df = pd.read_csv(csv_path)

In [3]:
# Cast every column to its intended dtype in one call: the three location
# columns become categoricals, all remaining columns become 64-bit integers.
df = df.astype({
    "city": "category",
    "district": "category",
    "neighborhood": "category",
    "room": "int64",
    "living_room": "int64",
    "area": "int64",
    "age": "int64",
    "floor": "int64",
    "price": "int64",
})

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          211 non-null    category
 1   district      211 non-null    category
 2   neighborhood  211 non-null    category
 3   room          211 non-null    int64   
 4   living_room   211 non-null    int64   
 5   area          211 non-null    int64   
 6   age           211 non-null    int64   
 7   floor         211 non-null    int64   
 8   price         211 non-null    int64   
dtypes: category(3), int64(6)
memory usage: 16.4 KB
None


In [4]:
# Input columns grouped by the kind of preprocessing they receive.
numerical_features = [
    "room",
    "living_room",
    "area",
    "age",
    "floor",
]
categorical_features = [
    "city",
    "district",
    "neighborhood",
]

In [5]:
# Column-wise preprocessing: numeric columns are standardized, categorical
# columns are one-hot encoded. handle_unknown="ignore" lets the encoder accept
# categories at transform time that were not present when it was fitted.
numeric_step = ("num", StandardScaler(), numerical_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

full_pipeline = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [6]:
# Separate the target column ("price") from the predictor columns.
y = df["price"]
X = df.drop(columns=["price"])

In [8]:
# Price-class boundaries: 0 to 60000 in steps of 10000.
bins = list(range(0, 70000, 10000))
# One integer label per interval between consecutive edges. Deriving the labels
# from `bins` keeps the two lists in step, since pd.cut requires exactly
# len(bins) - 1 labels.
labels = list(range(1, len(bins)))
print(bins)
print(labels)

[0, 10000, 20000, 30000, 40000, 50000, 60000]
[1, 2, 3, 4, 5, 6]


In [9]:
# Turn the continuous price into ordered class labels. The bins are read from
# df["price"] rather than from `y`, so re-running this cell does not try to
# bin labels that were already binned.
# pd.cut uses right-closed intervals by default, so a price equal to the lowest
# edge or above the highest edge gets no label (NaN).
y = pd.cut(df["price"], bins=bins, labels=labels)
assert y.notna().all(), "some prices fall outside the configured bins"

In [11]:
# Show which price classes actually occur in the data after binning.
observed_classes = y.unique()
print(observed_classes)

[2, 1, 3]
Categories (6, int64): [1 < 2 < 3 < 4 < 5 < 6]


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# End-to-end estimator: column preprocessing followed by a random forest.
# The forest is seeded so that fitting, and therefore the reported metrics,
# is reproducible across kernel restarts (the split above is already seeded).
model = Pipeline([
    ("preparation", full_pipeline),
    ("model", RandomForestClassifier(n_estimators=100, random_state=42)),
])

In [16]:
# Fit the preprocessing step and the classifier together on the training split.
model.fit(X=X_train, y=y_train)

In [17]:
# Predict the price class for every held-out row.
y_pred = model.predict(X=X_test)

In [18]:
# Compare true and predicted classes on the test split.
test_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(test_confusion)

[[ 0  4  0]
 [ 1 33  0]
 [ 0  4  1]]


In [19]:
# Per-class precision, recall, F1 and support on the test split.
test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(test_report)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         4
           2       0.80      0.97      0.88        34
           3       1.00      0.20      0.33         5

    accuracy                           0.79        43
   macro avg       0.60      0.39      0.40        43
weighted avg       0.75      0.79      0.73        43

