In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [3]:
df = pd.read_csv("data_cleaned.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          6116 non-null   object
 1   district      6116 non-null   object
 2   neighborhood  6116 non-null   object
 3   room          6116 non-null   int64 
 4   living_room   6116 non-null   int64 
 5   area          6116 non-null   int64 
 6   age           6116 non-null   int64 
 7   floor         6116 non-null   int64 
 8   price         6116 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 430.2+ KB


In [5]:
df["city"] = df["city"].astype("category")
df["district"] = df["district"].astype("category")
df["neighborhood"] = df["neighborhood"].astype("category")
df["room"] = df["room"].astype("int")
df["living_room"] = df["living_room"].astype("int")
df["area"] = df["area"].astype("int")
df["age"] = df["age"].astype("int")
df["floor"] = df["floor"].astype("int")
df["price"] = df["price"].astype("int")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int64   
 4   living_room   6116 non-null   int64   
 5   area          6116 non-null   int64   
 6   age           6116 non-null   int64   
 7   floor         6116 non-null   int64   
 8   price         6116 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 335.0 KB


In [7]:
categorical_features = ["city", "district", "neighborhood"]
numerical_features = ["room", "living_room", "area", "age", "floor"]

In [9]:
full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

In [10]:
X = df.drop("price", axis=1)
y = df["price"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

In [12]:
model = Pipeline([
    ("preparation", full_pipeline),
    ("model", LinearRegression())
])

In [13]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preparation', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [14]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [15]:
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 48389250.2249592
RMSE: 6956.238223706776
R^2: 0.5584039311773256


In [16]:
feature_importances = model.named_steps["model"].coef_

In [17]:
feature_importances

array([ 9.80218388e+02,  0.00000000e+00,  3.12063619e+03, -2.06725616e+03,
        1.53157518e+02, -3.13789754e+03,  9.02548883e+02, -3.86611123e+03,
        6.32192547e+03, -1.66628049e+03,  1.44581491e+03, -1.95532537e+03,
       -1.41378397e+03, -7.56942619e+02, -1.14211233e+03,  7.12335858e+03,
       -1.06526229e+04, -6.48460631e+03, -7.38498978e+03,  1.77280367e+04,
       -2.50379992e+03, -2.47322209e+03, -3.19984193e+03, -2.09597063e+03,
        1.77231501e+03, -2.16068184e+03,  1.71862496e+03,  1.60434580e+04,
       -5.40815428e+03, -6.48358529e+03, -2.42003793e+03, -5.12974975e+02,
       -2.84918486e+02,  2.05288802e+02, -1.49073841e+03, -3.66115829e+03,
       -2.46240163e+03,  2.38185221e+03,  9.68697925e+03, -3.20369024e+03,
        1.20303204e+04,  3.65827244e+03,  1.40281038e+03, -7.92965774e+03,
        1.48533978e+03, -4.72787199e+01,  1.75428279e+03, -9.03410206e+03,
        1.41217178e+04, -4.58839797e+03,  3.92214008e+03, -1.74765010e+03,
        1.47114774e+04,  

In [18]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room 980.2183880321326
living_room 0.0
area 3120.6361893545673
age -2067.256156631684
floor 153.1575177629303


In [24]:
print("Categorical Features")
offset = len(numerical_features)
for i in range(len(categorical_features)):
    cats = model.named_steps['preparation'].transformers_[1][1].categories_[i]
    for j in range(len(cats)):
        print(categorical_features[i], cats[j], feature_importances[offset + j])
    offset += len(cats)

Categorical Features
city afyonkarahisar -3137.897540558705
city aydin 902.5488829308497
city denizli -3866.1112258552
city izmir 6321.925467715472
city manisa -1666.280490730254
city mugla 1445.8149070781828
district acipayam -1955.3253661993276
district akhisar -1413.7839676882295
district alasehir -756.9426189429522
district aliaga -1142.1123272461502
district balcova 7123.358583557684
district bayindir -10652.62286388964
district bayrakli -6484.606313046627
district bergama -7384.989776221383
district bodrum 17728.03666546321
district bolvadin -2503.799919357562
district bornova -2473.2220887803487
district buca -3199.841933393313
district buharkent -2095.9706347474603
district cameli 1772.3150080889193
district cardak -2160.681843029287
district cay 1718.62496103122
district cesme 16043.458009391841
district cigli -5408.154275654247
district cine -6483.585294284004
district civril -2420.037930743773
district dalaman -512.9749754827404
district datca -284.9184862032212
district dem

In [27]:
new_data = pd.DataFrame({
    "city": ["manisa"],
    "district": ["yunusemre"],
    "neighborhood": ["güzelyurt"],
    "room": [3],
    "living_room": [1],
    "area": [120],
    "age": [5],
    "floor": [3]
})
print(model.predict(new_data))

[19062.32869133]
