## Verilerin regresyon kullanılarak açıklanması

Gerekli paketlerin yüklenmesi

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso, Ridge, ElasticNet

Verilerin yüklenmesi ve değişken tiplerinin ayarlanması

In [2]:
df = pd.read_csv('hepsiemlak_cleaned.csv')

In [3]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living_room'] = df['living_room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int64   
 4   living_room   6116 non-null   int64   
 5   area          6116 non-null   int64   
 6   age           6116 non-null   int64   
 7   floor         6116 non-null   int64   
 8   price         6116 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 335.0 KB
None


Nitel ve nicel değişkenlerin ayrılması

In [4]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']

In [5]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

X matrisi ve y vektörlerinin belirlenmesi

In [6]:
X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Regresyon modellerinin oluşturulması

In [7]:
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    SVR(),
    Lasso(),
    Ridge(),
    ElasticNet()
]

mse = []
rmse = []
r2 = []
mape = []

for model in models:
    model_name = model.__class__.__name__
    reg_model = Pipeline([
        ('preprocessor', full_pipeline),
        ('model', model)
    ])
    reg_model.fit(X_train, y_train)
    y_train_pred = reg_model.predict(X_train)
    y_test_pred = reg_model.predict(X_test)
    mse.append(mean_squared_error(y_test, y_test_pred))
    rmse.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))
    r2.append(r2_score(y_test, y_test_pred))
    mape.append(np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100)
    print(f'{model_name} - Train R2: {r2_score(y_train, y_train_pred):.2f}, Test R2: {r2_score(y_test, y_test_pred):.2f}')


results = pd.DataFrame({
    'Model': [model.__class__.__name__ for model in models],
    'MSE': mse,
    'RMSE': rmse,
    'R2': r2,
    'MAPE': mape
})

print(results)

LinearRegression - Train R2: 0.69, Test R2: 0.59
DecisionTreeRegressor - Train R2: 1.00, Test R2: 0.31
RandomForestRegressor - Train R2: 0.94, Test R2: 0.57
GradientBoostingRegressor - Train R2: 0.62, Test R2: 0.53
SVR - Train R2: -0.08, Test R2: -0.11
Lasso - Train R2: 0.68, Test R2: 0.60
Ridge - Train R2: 0.68, Test R2: 0.60
ElasticNet - Train R2: 0.27, Test R2: 0.24
                       Model           MSE          RMSE        R2       MAPE
0           LinearRegression  4.809925e+07   6935.362630  0.589726  26.320159
1      DecisionTreeRegressor  8.094814e+07   8997.118513  0.309534  30.633952
2      RandomForestRegressor  5.085494e+07   7131.265066  0.566221  24.030137
3  GradientBoostingRegressor  5.514433e+07   7425.922842  0.529633  28.328731
4                        SVR  1.297062e+08  11388.864443 -0.106360  40.407244
5                      Lasso  4.719705e+07   6870.010830  0.597422  25.152388
6                      Ridge  4.717677e+07   6868.534490  0.597595  25.295359
7   

  model = cd_fast.sparse_enet_coordinate_descent(


Regresyon modelinin oluşturulması ve fit işleminin gerçekleştirilmesi

In [8]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])

model.fit(X_train, y_train)

Modelin değerlendirilmesi

In [9]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

MSE: 48099254.80915588
RMSE: 6935.3626299679445
R2: 0.5897261079812823


In [10]:
def percentage_tolerance_accuracy(y_true, y_pred, tolerance_percent=5):
    tolerance = tolerance_percent / 100.0 * y_true
    return np.mean(np.abs(y_true - y_pred) <= tolerance)

print(percentage_tolerance_accuracy(y_test, y_pred))

0.14950980392156862


In [11]:
def tolerance_r2(y_true, y_pred, tolerance):
    """
    Custom R^2 score implementation with tolerance.

    Parameters:
    - y_true: actual values
    - y_pred: predicted values
    - tolerance: fixed tolerance value within which predictions are considered accurate

    Returns:
    - Custom R^2 score
    """
    # Calculate residuals, setting those within tolerance to zero
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0

    # Residual sum of squares (only counting errors outside the tolerance)
    rss = np.sum(residuals**2)

    # Total sum of squares
    tss = np.sum((y_true - np.mean(y_true))**2)

    # Custom R^2 score
    return 1 - (rss / tss)

print(tolerance_r2(y_test, y_pred, 5000))

0.6271787151309125


Değişkenlerin önemlerinin gösterilmesi

In [13]:
if hasattr(model.named_steps['model'], 'coef_'):
    feature_importances = model.named_steps['model'].coef_
    print('Linear Regression Feature Importances')
    print('Intercept:', feature_importances[0])
    print()
    print('Numerical Features')
    for i in range(len(numerical_features)):
        print(numerical_features[i], feature_importances[i])
    print()
    print('Categorical Features')
    for i in range(len(categorical_features)):
        for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
            print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Linear Regression Feature Importances
Intercept: 771.39145891298

Numerical Features
room 771.39145891298
living_room 0.0
area 3450.611155474665
age -2075.411728546521
floor 162.0262111943727

Categorical Features
afyonkarahisar -5560.498937824713
aydin 1784.278043556758
denizli -3444.2102982140586
izmir 6390.563613032616
manisa -1903.855211795589
mugla 2733.7227911882187
acipayam -5560.498937824713
akhisar 1784.278043556758
alasehir -3444.2102982140586
aliaga 6390.563613032616
balcova -1903.855211795589
bayindir 2733.7227911882187
bayrakli -941.501985129468
bergama 10.140388312072075
bodrum -2320.7037609002145
bolvadin 1556.2458441940946
bornova 6352.092870125386
buca -7426.699081356405
buharkent -4365.836919110257
cardak -5014.424300118075
cay 16330.207659839378
cesme -533.0028452142279
cigli -2026.7063102723137
cine -2085.1978629265836
civril -1822.9260118159589
dalaman -1460.2150917354727
datca -5966.520931464428
didim 17527.641408221094
dikili -4632.597519711589
efeler -1757.90098

Örnek bir tahmin

In [14]:
print(df[(df['city'] == 'manisa') & (df['district'] == 'yunusemre') & (df['neighborhood'] == 'guzelyurt')])

        city   district neighborhood  room  living_room  area  age  floor  \
5151  manisa  yunusemre    guzelyurt     1            1    65   13      5   
5198  manisa  yunusemre    guzelyurt     2            1    85    2      3   
5222  manisa  yunusemre    guzelyurt     4            1   196    5      1   
5239  manisa  yunusemre    guzelyurt     1            1    60   11      5   

      price  
5151  15000  
5198  15000  
5222  36000  
5239  11000  


In [15]:
new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [3],
    'living_room': [1],
    'area': [120],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[22447.11554738]


In [16]:
# save the model using joblib
import joblib

joblib.dump(model, 'hepsiemlak_model.pkl')

# load the model
# model = joblib.load('hepsiemlak_model.pkl')

['hepsiemlak_model.pkl']

Modelin başarısını arttırmak için düzenlileştirme denemesi

In [17]:
lasso_model = Pipeline([
    ('preparation', full_pipeline),
    ('model', Lasso(alpha=2.0))
])
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
print('Lasso Regression')
print('Mean Squared Error:', mse_lasso)
print('Root Mean Squared Error:', rmse_lasso)
print('R^2 Score:', r2_lasso)

Lasso Regression
Mean Squared Error: 46891781.59460322
Root Mean Squared Error: 6847.757413533515
R^2 Score: 0.6000255343904535


In [18]:
ridge_model = Pipeline([
    ('preparation', full_pipeline),
    ('model', Ridge(alpha=1.5))
])
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
rmse_ridge = np.sqrt(mse_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print('Ridge Regression')
print('Mean Squared Error:', mse_ridge)
print('Root Mean Squared Error:', rmse_ridge)
print('R^2 Score:', r2_ridge)

Ridge Regression
Mean Squared Error: 47240468.01160983
Root Mean Squared Error: 6873.170157329864
R^2 Score: 0.5970513316929902


In [19]:
elasticnet_model = Pipeline([
    ('preparation', full_pipeline),
    ('model', ElasticNet(alpha=0.1, l1_ratio=0.5))
])
elasticnet_model.fit(X_train, y_train)
y_pred_elasticnet = elasticnet_model.predict(X_test)
mse_elasticnet = mean_squared_error(y_test, y_pred_elasticnet)
rmse_elasticnet = np.sqrt(mse_elasticnet)
r2_elasticnet = r2_score(y_test, y_pred_elasticnet)
print('ElasticNet Regression')
print('Mean Squared Error:', mse_elasticnet)
print('Root Mean Squared Error:', rmse_elasticnet)
print('R^2 Score:', r2_elasticnet)

ElasticNet Regression
Mean Squared Error: 65249973.27056528
Root Mean Squared Error: 8077.745556191113
R^2 Score: 0.4434350263003186


Lineer olmayan ilişkileri açıklayabilmek için diğer regresyon modellerinin denenmesi

In [20]:
random_forest_model = Pipeline([
    ('preparation', full_pipeline),
    ('model', RandomForestRegressor(random_state=42))
])
random_forest_model.fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)
mse_random_forest = mean_squared_error(y_test, y_pred_random_forest)
rmse_random_forest = np.sqrt(mse_random_forest)
r2_random_forest = r2_score(y_test, y_pred_random_forest)
print('Random Forest Regression')
print('Mean Squared Error:', mse_random_forest)
print('Root Mean Squared Error:', rmse_random_forest)
print('R^2 Score:', r2_random_forest)

Random Forest Regression
Mean Squared Error: 50233815.66995165
Root Mean Squared Error: 7087.581792822686
R^2 Score: 0.5715188697281275


In [21]:
gradient_boosting_model = Pipeline([
    ('preparation', full_pipeline),
    ('model', GradientBoostingRegressor(random_state=42))
])
gradient_boosting_model.fit(X_train, y_train)
y_pred_gradient_boosting = gradient_boosting_model.predict(X_test)
mse_gradient_boosting = mean_squared_error(y_test, y_pred_gradient_boosting)
rmse_gradient_boosting = np.sqrt(mse_gradient_boosting)
r2_gradient_boosting = r2_score(y_test, y_pred_gradient_boosting)
print('Gradient Boosting Regression')
print('Mean Squared Error:', mse_gradient_boosting)
print('Root Mean Squared Error:', rmse_gradient_boosting)
print('R^2 Score:', r2_gradient_boosting)

Gradient Boosting Regression
Mean Squared Error: 55366933.13919568
Root Mean Squared Error: 7440.895990349259
R^2 Score: 0.5277347385466369


In [22]:
support_vector_model = Pipeline([
    ('preparation', full_pipeline),
    ('model', SVR(kernel='linear'))
])
support_vector_model.fit(X_train, y_train)
y_pred_support_vector = support_vector_model.predict(X_test)
mse_support_vector = mean_squared_error(y_test, y_pred_support_vector)
rmse_support_vector = np.sqrt(mse_support_vector)
r2_support_vector = r2_score(y_test, y_pred_support_vector)
print('Support Vector Regression')
print('Mean Squared Error:', mse_support_vector)
print('Root Mean Squared Error:', rmse_support_vector)
print('R^2 Score:', r2_support_vector)

Support Vector Regression
Mean Squared Error: 115902329.03916754
Root Mean Squared Error: 10765.794398889826
R^2 Score: 0.011383859945338193


Çapraz geçerleme ve hiper parametre analizi ile model başarısını arttırma denemesi

In [23]:
param_grid = [
    {'model__n_estimators': [3, 10, 30, 50, 100, 200, 500, 1000], 'model__max_features': [2, 4, 6, 8, 10, 12]},
    {'model__bootstrap': [False], 'model__n_estimators': [3, 10], 'model__max_features': [2, 3, 4]}
]

In [24]:
grid_search = GridSearchCV(random_forest_model, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
print('Random Forest Regression Best Parameters:', grid_search.best_params_)
print('Random Forest Regression Best Estimator:', grid_search.best_estimator_)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

y_pred_random_forest_best = grid_search.best_estimator_.predict(X_test)
mse_random_forest_best = mean_squared_error(y_test, y_pred_random_forest_best)
rmse_random_forest_best = np.sqrt(mse_random_forest_best)
r2_random_forest_best = r2_score(y_test, y_pred_random_forest_best)
print('Random Forest Regression Best Estimator')
print('Mean Squared Error:', mse_random_forest_best)
print('Root Mean Squared Error:', rmse_random_forest_best)
print('R^2 Score:', r2_random_forest_best)

Random Forest Regression Best Parameters: {'model__max_features': 12, 'model__n_estimators': 1000}
Random Forest Regression Best Estimator: Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['room', 'living_room',
                                                   'area', 'age', 'floor']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['city', 'district',
                                                   'neighborhood'])])),
                ('model',
                 RandomForestRegressor(max_features=12, n_estimators=1000,
                                       random_state=42))])
7514.177799658233 {'model__max_features': 2, 'model__n_estimators': 3}
6773.54604449615 {'model__max_features': 2, 'model__n_estimators': 10}
6517.66754

In [29]:
param_grid = [
    {'model__n_estimators': [3, 10, 30, 50, 100, 200, 500, 1000], 'model__learning_rate': [0.01, 0.03, 0.1, 0.3, 1.0], 'model__max_depth': [2, 3, 4, 5, 6, 7, 8]}
]
grid_search = GridSearchCV(gradient_boosting_model, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
print('Gradient Boosting Regression Best Parameters:', grid_search.best_params_)
print('Gradient Boosting Regression Best Estimator:', grid_search.best_estimator_)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

y_pred_gradient_boosting_best = grid_search.best_estimator_.predict(X_test)
mse_gradient_boosting_best = mean_squared_error(y_test, y_pred_gradient_boosting_best)
rmse_gradient_boosting_best = np.sqrt(mse_gradient_boosting_best)
r2_gradient_boosting_best = r2_score(y_test, y_pred_gradient_boosting_best)
print('Gradient Boosting Regression Best Estimator')
print('Mean Squared Error:', mse_gradient_boosting_best)
print('Root Mean Squared Error:', rmse_gradient_boosting_best)
print('R^2 Score:', r2_gradient_boosting_best)

Gradient Boosting Regression Best Parameters: {'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__n_estimators': 500}
Gradient Boosting Regression Best Estimator: Pipeline(steps=[('preparation',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['room', 'living_room',
                                                   'area', 'age', 'floor']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['city', 'district',
                                                   'neighborhood'])])),
                ('model',
                 GradientBoostingRegressor(max_depth=4, n_estimators=500,
                                           random_state=42))])
10094.381738364043 {'model__learning_rate': 0.01, 'model__max_depth': 2, 'model__n_estimators': 3}
9893.269975858766 