In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [10]:
df = pd.read_csv('datas_cleaned.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1377 entries, 0 to 1376
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          1377 non-null   object
 1   district      1377 non-null   object
 2   neighborhood  1377 non-null   object
 3   room          1377 non-null   int64 
 4   living room   1377 non-null   int64 
 5   area          1377 non-null   int64 
 6   age           1377 non-null   int64 
 7   floor         1377 non-null   int64 
 8   price         1377 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 96.9+ KB


In [12]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living room'] = df['living room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1377 entries, 0 to 1376
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          1377 non-null   category
 1   district      1377 non-null   category
 2   neighborhood  1377 non-null   category
 3   room          1377 non-null   int32   
 4   living room   1377 non-null   int32   
 5   area          1377 non-null   int32   
 6   age           1377 non-null   int32   
 7   floor         1377 non-null   int32   
 8   price         1377 non-null   int32   
dtypes: category(3), int32(6)
memory usage: 50.0 KB


In [14]:
categorical_features = ['city','district','neighborhood']
numerical_features = ['room','living room','area','age','floor']

In [20]:
# Standart Scaler ve OneHotEncoder ile verileri eğitime hazır hale getirdik

In [17]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(),numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'),categorical_features)
])

In [18]:
X = df.drop('price',axis=1)
y = df['price']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=31)

In [21]:
# Modeli Pipeline ile 2 aşamalı olacak şekilde bölüp eğitiyoruz

In [22]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model',LinearRegression())
])

In [23]:
model.fit(X_train,y_train)

In [24]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [25]:
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2: {r2}")


Mean Squared Error: 12729.320757250005
Root Mean Squared Error: 112.8242915211525
R^2: 0.6533547201058754


In [26]:
feature_importances = model.named_steps['model'].coef_
feature_importances

array([-1.46311308e+01,  0.00000000e+00,  8.86298415e+01, -5.70754460e+01,
        8.53741397e+00, -1.04540441e-10,  1.32035553e+02, -3.03471318e+02,
        1.22424180e+02, -3.62230238e+02,  3.63332014e+01,  1.52178793e+02,
        9.20160929e+01, -1.47974385e+01, -2.24440249e+02, -4.46870174e+01,
       -6.68404816e+01,  6.35154046e+01,  3.13188074e+02, -5.74544552e+01,
       -2.91586404e+01, -1.35529004e+02,  2.62190400e+01,  6.73946487e+01,
       -2.55135309e+00, -1.10978227e+02,  2.19028815e+02,  1.87911411e+01,
        9.28711386e+01,  6.67202948e+01,  1.71342923e+01, -4.48668379e+01,
       -2.31896785e+01,  3.56242372e+01, -4.59574383e+02, -6.69828200e+01,
        1.24730746e+02,  1.54394861e+02,  6.04002087e+01, -9.68227956e+01,
       -4.79294987e+01,  1.15346361e+02,  1.81157352e+02, -6.72902527e+01,
       -3.10945213e+02, -2.16967678e+02, -1.73423244e+02,  1.25027955e+02,
        4.14019674e+01,  4.35909323e+02,  4.57359401e+01, -3.91981187e+01,
       -1.12455756e+02,  

In [27]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room -14.631130830455758
living room 0.0
area 88.62984149138903
age -57.07544599862072
floor 8.537413971905266


In [28]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
İstanbul -1.0454044084844534e-10
Adalar -1.0454044084844534e-10
Arnavutköy 132.03555272187543
Ataşehir -303.47131794609896
Avcılar 122.42418027826469
Bahçelievler -362.2302384782098
Bakırköy 36.33320138161385
Bayrampaşa 152.17879308530138
Bağcılar 92.01609287400105
Başakşehir -14.797438497621046
Beykoz -224.44024899625254
Beylikdüzü -44.68701736266263
Beyoğlu -66.84048159228563
Beşiktaş 63.515404610106906
Büyükçekmece 313.1880735158724
Esenler -57.45445516754784
Esenyurt -29.158640369066134
Eyüpsultan -135.52900398430495
Fatih 26.219039959641922
Gaziosmanpaşa 67.39464866460558
Güngören -2.551353089574206
Kadıköy -110.978227125273
Kartal 219.02881511267114
Kağıthane 18.791141105095644
Küçükçekmece 92.871138558023
Maltepe 66.72029481219815
Pendik 17.13429227438825
Sancaktepe -44.86683789059777
Sarıyer -23.189678545391256
Silivri 35.62423721232148
Sultanbeyli -459.5743828262663
Sultangazi -66.98282004858292
Tuzla 124.73074623199508
Zeytinburnu 154.3948614402179
Çekmek

In [30]:
new_data = pd.DataFrame({
    'city': ['İstanbul'],
    'district': ['Gaziosmanpaşa'],
    'neighborhood': [' Kazım Karabekir Mah.'],
    'room': [4],
    'living room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[474.52743942]
