In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [39]:
df= pd.read_csv('data_cleaned.csv')

In [40]:
df["city"]= df["city"].astype("category")
df["district"]= df["district"].astype("category")
df["neighborhood"]= df["neighborhood"].astype("category")

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1084 entries, 0 to 1083
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          1084 non-null   category
 1   district      1084 non-null   category
 2   neighborhood  1084 non-null   category
 3   room          1084 non-null   int64   
 4   living_room   1084 non-null   int64   
 5   area          1084 non-null   int64   
 6   age           1084 non-null   int64   
 7   floor         1084 non-null   int64   
 8   price         1084 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 78.2 KB


In [42]:
categorical_features=["city","district","neighborhood"]
numerical_features= ["room","living_room","area","age","floor"]

In [43]:
full_pipeline = ColumnTransformer([
    ("num", StandardScaler(),numerical_features),
    ("cat", OneHotEncoder(handle_unknown ='ignore'),categorical_features)
])

In [44]:
X= df.drop('price',axis=1)
y=df['price']

In [45]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [46]:
model = Pipeline([
    ("preparation", full_pipeline),
    ("model", LinearRegression())
])
    

In [47]:
model.fit(X_train,y_train)

In [48]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,y_pred)

In [49]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 44313335.48075247
RMSE: 6656.826231827932
R^2: 0.3328876804183547


In [50]:
feature_importences = model.named_steps['model'].coef_
print(feature_importences)

[ 4.76054990e+02  0.00000000e+00  2.27327174e+03 -1.74621420e+03
  5.73613073e+02 -1.51336035e+03  1.31821297e+03 -2.02189331e+03
  3.43967477e+03  5.88735839e+03 -7.10999247e+03  3.53319674e+03
  1.11268263e+04 -2.89710838e+03 -6.92460767e+02  3.50834153e+03
 -4.53591552e+03  5.81750310e+03 -4.51797404e+02 -2.05657041e+04
  2.20508930e+02  1.16008778e+04 -3.61082586e+03  6.58884943e+02
  6.09169847e+02  6.53618958e+03  9.90364948e+03 -1.62210298e+03
 -2.24871636e+03  1.53464973e+04 -1.32270518e+04  4.40776396e+03
 -7.10828039e+03  2.32895149e+03 -5.10754865e+03 -1.33808078e+03
 -3.64335823e+03  1.52072441e+03 -1.33275372e+04  4.76943689e+03
  6.63072867e+03  1.98110154e+03 -2.12610102e+03  1.50121378e+04
 -1.23471938e+04  3.08590216e+03 -1.33069480e+04  1.66146516e+03
 -8.42139921e+03  1.11860111e+03  3.18113628e+03 -1.12218976e+03
  5.72350416e+03  1.19408447e+04  4.14288498e+03 -1.43933850e+04
 -2.76305076e+03 -1.86870672e+03  1.66214015e+03 -4.36284528e+03
  1.20187288e+04 -1.28556

In [51]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importences[i])

Numerical Features
room 476.05499031052403
living_room 0.0
area 2273.2717382387577
age -1746.2142042179478
floor 573.6130726568441


In [52]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importences[len(numerical_features) + j])

Categorical Features
ankara -1513.3603452592338
antalya 1318.2129653586858
bursa -2021.8933112623688
istanbul 3439.6747695120985
izmir 5887.358387079531
konya -7109.992465473761
aksehir -1513.3603452592338
aksu 1318.2129653586858
alanya -2021.8933112623688
aliaga 3439.6747695120985
altindag 5887.358387079531
arnavutkoy -7109.992465473761
atasehir 3533.196741264369
avcilar 11126.826322084818
bagcilar -2897.108383791108
bahcelievler -692.4607674763674
balcova 3508.341533176553
bayrakli -4535.915520174775
beykoz 5817.503103370316
beylikduzu -451.7974035631422
beyoglu -20565.704100007006
bornova 220.50893003199656
buca 11600.877800972034
buyukcekmece -3610.8258646630816
cankaya 658.8849429895376
cekmekoy 609.1698473247025
cesme 6536.1895792572
cigli 9903.649484377185
dikili -1622.1029821032726
dosemealti -2248.7163628230305
elmali 15346.49732025697
eregli -13227.051814227765
esenler 4407.763957971242
esenyurt -7108.280393351298
etimesgut 2328.951493177212
eyupsultan -5107.548646095431
fati

In [65]:
new_data = pd.DataFrame({
    'city': ['ankara'],
    'district': ['kecioren'],
    'neighborhood': ['basinevleri'],
    'room': [3],
    'living_room': [1],
    'area': [120],
    'age': [5],
    'floor': [2]
})

print(model.predict(new_data))

[23427.03681065]


In [56]:
print(df[(df['city'] == 'ankara') & (df['district'] == 'kecioren') & (df['neighborhood'] == 'basinevleri')])

      city  district neighborhood  room  living_room  area  age  floor  price
75  ankara  kecioren  basinevleri     3            1   120   27      2  20000
