## Verilerin lineer regresyon ile tahmin edilmesi

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Read the cleaned dataset from a CSV file located relative to the working directory.
DATA_FILE = 'data_cleaned.csv'
df = pd.read_csv(DATA_FILE)

In [3]:
# Store the three location columns with the pandas 'category' dtype.
for column in ('city', 'district', 'neighborhood'):
    df[column] = df[column].astype('category')

In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22927 entries, 0 to 22926
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          22927 non-null  category
 1   district      22927 non-null  category
 2   neighborhood  22927 non-null  category
 3   room          22927 non-null  int64   
 4   livingroom    22927 non-null  int64   
 5   area          22927 non-null  int64   
 6   age           22927 non-null  int64   
 7   floor         22927 non-null  int64   
 8   price         22927 non-null  int64   
dtypes: category(3), int64(6)
memory usage: 1.3 MB
None


In [5]:
# Input columns grouped by type so each group can be preprocessed differently.
categorical_features = [
    'city',
    'district',
    'neighborhood',
]
numerical_features = [
    'room',
    'livingroom',
    'area',
    'age',
    'floor',
]

In [6]:
# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes the encoder skip category
# values it did not see while fitting instead of raising an error.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(handle_unknown='ignore')
full_pipeline = ColumnTransformer(transformers=[
    ('num', numeric_scaler, numerical_features),
    ('cat', category_encoder, categorical_features),
])

In [7]:
# Separate the target column from the predictor columns.
target_column = 'price'
Y = df[target_column]
X = df.drop(columns=[target_column])

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Chain preprocessing and the regressor so that both are fitted together and
# applied consistently at prediction time.
linear_regressor = LinearRegression()
model = Pipeline(steps=[
    ('preparation', full_pipeline),
    ('model', linear_regressor),
])

In [10]:
# Fit the preprocessing steps and the linear regression on the training split.
model.fit(X_train, y=Y_train)

In [11]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

r2 = r2_score(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
# RMSE is the square root of the MSE.
rmse = np.sqrt(mse)

In [12]:
# Report the three evaluation metrics, one per line.
print(
    f"MSE: {mse}",
    f"RMSE: {rmse}",
    f"R^2: {r2}",
    sep="\n",
)

MSE: 48302766.2703995
RMSE: 6950.01915611745
R^2: 0.6177973815371638


In [13]:
# Coefficients of the fitted linear regression. The 'num' transformer comes
# first in the ColumnTransformer, so the leading len(numerical_features)
# coefficients belong to the scaled numeric columns, in the same order.
feature_importances = model.named_steps['model'].coef_
for feature_name, coefficient in zip(numerical_features, feature_importances):
    print(feature_name, coefficient)

room 500.4098744770494
livingroom 0.0
area 4005.129000614165
age -3321.080399290208
floor 757.373488577145


In [14]:
# Print the fitted coefficient of every one-hot encoded category value.
#
# The coefficient vector is laid out as: numeric columns first, then one column
# per category of the first categorical feature, then those of the second, and
# so on. A running offset is therefore advanced by the number of categories of
# each feature; restarting at len(numerical_features) for every feature would
# pair the categories of later features with the first feature's coefficients.
fitted_encoder = model.named_steps['preparation'].named_transformers_['cat']
coefficient_offset = len(numerical_features)
for feature_categories in fitted_encoder.categories_:
    for position, category in enumerate(feature_categories):
        print(category, feature_importances[coefficient_offset + position])
    coefficient_offset += len(feature_categories)

adana -2067.261893990987
adiyaman -3810.9839232270792
afyonkarahisar 851.5727585730834
agri -4019.5954741747696
aksaray -605.6920365397061
amasya -6339.7989200670845
ankara 5258.2576788728165
antalya 5235.304115523732
ardahan -543.2926139772171
artvin 202.496738017421
aydin 3617.5470550712657
balikesir 3451.4476799754293
bartin 2353.616607219782
batman -4474.3182597344485
bayburt -1467.7196416398779
bilecik -76.5319540848257
bitlis -2424.468885577641
bolu -933.241659402646
burdur -2521.6611461956945
bursa 897.5517747048066
canakkale 8805.39299143298
corum -6846.7932479235715
denizli -333.2488695389204
diyarbakir -8245.1070976773
duzce -1323.8298999473445
edirne 5283.2190938231415
elazig -7403.525770349293
erzincan 1823.0431162976001
erzurum -2082.4403557458822
eskisehir 1012.1135933706546
gaziantep -2729.268594940336
giresun -4329.797776901361
hakkari 126.15120997841103
hatay -3724.130515516633
igdir 1868.1146384587112
isparta -481.925261821683
istanbul 15185.064958337387
izmir 11114.2

In [20]:
# Predict the price of a single hand-written example row.
new_data = pd.DataFrame({
    'city': ['istanbul'],
    'district': ['bayrampaşa'],
    'neighborhood': ['cevatpaşa'],
    'room': [2],
    'livingroom': [1],
    'area': [100],
    'age': [5],
    'floor': [2]
})

# The encoder was created with handle_unknown='ignore', so a category value it
# did not see while fitting is encoded as all zeros without raising. Report
# such values explicitly so that a misspelled or differently-encoded name
# (for example one containing non-ASCII characters) is not dropped unnoticed.
fitted_encoder = model.named_steps['preparation'].named_transformers_['cat']
for column, known_values in zip(categorical_features, fitted_encoder.categories_):
    unseen_values = sorted(set(new_data[column]) - set(known_values))
    if unseen_values:
        print(f"Warning: {column} value(s) {unseen_values} were not seen during training and are ignored")

print(model.predict(new_data))

[31667.25523939]


In [16]:
def tolerance_r2(y_true, y_pred, tolerance):
    """Compute an R^2 score in which small errors count as exact predictions.

    Residuals whose absolute value is at most ``tolerance`` are replaced by
    zero before the residual sum of squares is computed; the total sum of
    squares is computed from ``y_true`` as usual.

    Both inputs are converted to float NumPy arrays and compared element by
    element in positional order. This accepts lists, arrays and pandas Series
    alike, and avoids pandas index alignment, which would pair values by index
    label instead of by position when two Series are passed.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, with the same shape as ``y_true``.
    tolerance : float
        Largest absolute error that is treated as zero.

    Returns
    -------
    float
        ``1 - SSR / SST``. If ``y_true`` is constant, SST is zero and the
        result is not finite.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape:
        # Guard against silent broadcasting, e.g. (n,) against (n, 1).
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    residuals = y_pred - y_true
    # Errors inside the tolerance band contribute nothing to the residual sum.
    residuals = np.where(np.abs(residuals) <= tolerance, 0.0, residuals)
    ssr = np.sum(residuals ** 2)
    sst = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ssr / sst)

In [19]:
# Compare the ordinary R^2 with the tolerant variant, in which residuals of at
# most 10000 (in the units of the price column) are treated as zero.
plain_r2 = r2_score(Y_test, y_pred)
tolerant_r2 = tolerance_r2(Y_test, y_pred, 10000)
print(plain_r2)
print(tolerant_r2)

0.6177973815371638
0.750733564606252
