## Verilerin lineer regresyon ile tahmin edilmesi

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
df = pd.read_csv('data_cleaned.csv')

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          6116 non-null   object
 1   district      6116 non-null   object
 2   neighborhood  6116 non-null   object
 3   room          6116 non-null   int64 
 4   living_room   6116 non-null   int64 
 5   area          6116 non-null   int64 
 6   age           6116 non-null   int64 
 7   floor         6116 non-null   int64 
 8   price         6116 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 430.2+ KB
None


In [5]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living_room'] = df['living_room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int64   
 4   living_room   6116 non-null   int64   
 5   area          6116 non-null   int64   
 6   age           6116 non-null   int64   
 7   floor         6116 non-null   int64   
 8   price         6116 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 335.0 KB
None


In [7]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [8]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [9]:
X = df.drop('price', axis=1)
y = df['price']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

In [12]:
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [14]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 48099254.80915588
RMSE: 6935.3626299679445
R^2: 0.5897261079812823


In [16]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

732
[ 7.71391459e+02  0.00000000e+00  3.45061116e+03 -2.07541173e+03
  1.62026211e+02 -5.56049894e+03  1.78427804e+03 -3.44421030e+03
  6.39056361e+03 -1.90385521e+03  2.73372279e+03 -9.41501985e+02
  1.01403883e+01 -2.32070376e+03  1.55624584e+03  6.35209287e+03
 -7.42669908e+03 -4.36583692e+03 -5.01442430e+03  1.63302077e+04
 -5.33002845e+02 -2.02670631e+03 -2.08519786e+03 -1.82292601e+03
 -1.46021509e+03 -5.96652093e+03  1.75276414e+04 -4.63259752e+03
 -1.75790099e+03 -2.36539755e+03 -7.74143025e+02 -9.34324566e+02
 -2.95926288e+03 -2.82226617e+03 -2.32341138e+03  3.10588156e+03
  9.43255308e+03 -1.03686309e+03  9.02704284e+03  2.20569454e+03
  2.23731891e+03 -7.71437242e+03 -1.85128796e+03 -7.25365478e+01
  1.42301661e+03 -9.11882029e+03 -4.24896626e+03 -4.35026362e+03
  4.82572567e+03  1.05156650e+02  1.25817744e+04  7.99752164e+03
  8.94781971e+02 -3.87216338e+03  2.52975431e+03  9.39024839e+02
  1.75594353e+03 -2.90483498e+03  1.03372908e+04 -3.53342091e+03
 -8.13730005e+03 -3.5

In [17]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room 771.39145891298
living_room 0.0
area 3450.611155474665
age -2075.411728546521
floor 162.0262111943727


In [18]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
afyonkarahisar -5560.498937824713
aydin 1784.278043556758
denizli -3444.2102982140586
izmir 6390.563613032616
manisa -1903.855211795589
mugla 2733.7227911882187
acipayam -5560.498937824713
akhisar 1784.278043556758
alasehir -3444.2102982140586
aliaga 6390.563613032616
balcova -1903.855211795589
bayindir 2733.7227911882187
bayrakli -941.501985129468
bergama 10.140388312072075
bodrum -2320.7037609002145
bolvadin 1556.2458441940946
bornova 6352.092870125386
buca -7426.699081356405
buharkent -4365.836919110257
cardak -5014.424300118075
cay 16330.207659839378
cesme -533.0028452142279
cigli -2026.7063102723137
cine -2085.1978629265836
civril -1822.9260118159589
dalaman -1460.2150917354727
datca -5966.520931464428
didim 17527.641408221094
dikili -4632.597519711589
efeler -1757.900986061785
fethiye -2365.397553255053
foca -774.1430252043201
gaziemir -934.324565893728
germencik -2959.262876182134
guzelbahce -2822.266171316525
honaz -2323.411383534112
incirliova 3105.8815597

In [21]:
new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [4],
    'living_room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[30324.76198656]


In [20]:
print(df[(df['city'] == 'manisa') & (df['district'] == 'yunusemre') & (df['neighborhood'] == 'guzelyurt')])

        city   district neighborhood  room  living_room  area  age  floor  \
5151  manisa  yunusemre    guzelyurt     1            1    65   13      5   
5198  manisa  yunusemre    guzelyurt     2            1    85    2      3   
5222  manisa  yunusemre    guzelyurt     4            1   196    5      1   
5239  manisa  yunusemre    guzelyurt     1            1    60   11      5   

      price  
5151  15000  
5198  15000  
5222  36000  
5239  11000  


In [29]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

In [35]:
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.5897261079812823
0.708510246988932
0.8225924396926299
