#### Verilerin Lineer Regresyon İle Tahmin Edilmesi

In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [41]:
df = pd.read_csv("data_cleaned.csv")

In [42]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int64')
df['living_room'] = df['living_room'].astype('int64')
df['area'] = df['area'].astype('int64')
df['age'] = df['age'].astype('int64')
df['floor'] = df['floor'].astype('int64')
df['price'] = df['price'].astype('int64')

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int64   
 4   living_room   6116 non-null   int64   
 5   area          6116 non-null   int64   
 6   age           6116 non-null   int64   
 7   floor         6116 non-null   int64   
 8   price         6116 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 335.0 KB


In [44]:
categorical_features = ["city", "district", "neighborhood"]
numerical_features = ["room", "living_room", "area", "age", "floor"]

In [45]:
full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

In [46]:
X = df.drop("price", axis=1)
y = df["price"]

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = Pipeline([
    ("preparation",full_pipeline),
    ("model", LinearRegression())
])

In [49]:
model.fit(X_train, y_train)

In [53]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [55]:
print(f"MSE: {mse} \nRMSE: {rmse} \nr2_Score: {r2}")

MSE: 42478779.90781519 
RMSE: 6517.574695223308 
r2_Score: 0.5815664192199554


In [57]:
feature_importances = model.named_steps["model"].coef_
print(feature_importances)

[ 9.23482232e+02  0.00000000e+00  3.28046866e+03 -2.21252598e+03
  1.72127409e+02 -3.21669840e+03  1.12673449e+03 -3.73761551e+03
  5.45072675e+03 -2.19256206e+03  2.56941472e+03 -8.66766024e+02
 -1.25027863e+03 -1.44244285e+03 -5.45333596e+03  5.02024938e+03
 -9.67706749e+03 -7.00514575e+03 -7.23528128e+03  1.71022453e+04
 -2.41301954e+03 -2.45285215e+03 -2.43918728e+03  3.06211175e+03
 -1.89953791e+03 -1.27611274e+03  1.74260196e+04 -4.97182190e+03
 -2.19816462e+03 -2.59328475e+03 -3.05822770e+03  6.24375259e+03
  1.17000163e+03 -9.47259726e+02 -1.58885219e+03 -2.91956473e+03
  9.27475227e+02  1.08683061e+04 -2.24181877e+03  9.11701946e+03
  3.14974553e+03  7.00691673e+02 -7.52579812e+03 -7.90355513e+02
 -7.35612570e+02  1.63935161e+03 -9.51454330e+03  1.44279724e+04
 -4.38208285e+03  3.74075151e+03 -1.03046735e+03  1.28226620e+04
  8.62094762e+03  8.35179944e+02 -4.24938517e+03  3.76014661e+03
 -1.94058566e+03  1.68514403e+03 -4.03648610e+03  7.40014726e+03
 -5.10122277e+03 -8.27426

In [58]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room 923.4822323428315
living_room 0.0
area 3280.4686550047313
age -2212.525984229208
floor 172.12740907180205


In [65]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
afyonkarahisar -3216.6983963923126
aydin 1126.7344914514806
denizli -3737.6155055618888
izmir 5450.726754893333
manisa -2192.562064375657
mugla 2569.4147197616844
acipayam -3216.6983963923126
akhisar 1126.7344914514806
alasehir -3737.6155055618888
aliaga 5450.726754893333
balcova -2192.562064375657
bayindir 2569.4147197616844
bayrakli -866.7660238274397
bergama -1250.278634382346
bodrum -1442.4428497602985
bornova -5453.335964281247
buca 5020.249376148315
buharkent -9677.06749244021
cameli -7005.145750300285
cardak -7235.281278053838
cay 17102.24526020252
cesme -2413.0195385429124
cigli -2452.8521530841563
cine -2439.187281496804
civril 3062.1117530779816
dalaman -1899.5379125095637
datca -1276.1127376342183
demirci 17426.019629580005
didim -4971.821898995815
dikili -2198.164623289573
efeler -2593.2847520724545
fethiye -3058.227701351554
foca 6243.752589065143
gaziemir 1170.0016288914167
germencik -947.2597256951983
guzelbahce -1588.8521936255272
honaz -2919.564734

In [66]:
new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [4],
    'living_room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

In [67]:
print(model.predict(new_data))

[30086.85056045]


In [69]:
df[(df['city'] == 'manisa') & (df['district'] == 'yunusemre') & (df['neighborhood'] == 'guzelyurt')]

Unnamed: 0,city,district,neighborhood,room,living_room,area,age,floor,price
2712,manisa,yunusemre,guzelyurt,1,1,65,13,5,15000
2759,manisa,yunusemre,guzelyurt,2,1,85,2,3,15000
2783,manisa,yunusemre,guzelyurt,4,1,196,5,1,36000
2800,manisa,yunusemre,guzelyurt,1,1,60,11,5,11000


In [70]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

In [71]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.5815664192199554
0.7230225374026313
0.8210906628047111
