### Ege Bölgesi Kira Uygulaması

In [2]:
import pandas as pd

In [4]:
df = pd.read_csv("data.csv")

In [6]:
import numpy as np
import matplotlib.pyplot as plt

In [7]:
df.head()

Unnamed: 0,city,district,neighborhood,room,living_room,area,age,floor,price
0,usak,banaz,cumhuriyet,3,1,150,20,-2,70
1,usak,merkez,kemaloz,2,1,110,8,1,85
2,usak,merkez,kemaloz,1,1,60,0,4,70
3,usak,merkez,durak,1,1,50,10,5,70
4,usak,merkez,cumhuriyet,4,1,300,11,7,165


In [12]:
df["district"]=df["district"].astype("category")

In [14]:
df["neighborhood"]=df["neighborhood"].astype("category")

In [16]:
df["room"]=df["room"].astype(int)

In [18]:
df["living_room"]=df["living_room"].astype(int)

In [20]:
df["area"]=df["area"].astype(int)

In [22]:
df["age"]=df["age"].astype(int)

In [24]:
df["floor"]=df["floor"].astype(int)

In [26]:
df["price"]=df["price"].astype(int)

In [28]:
columns=df.select_dtypes(include=[np.number]).columns
min_values=[]
max_values=[]
for column in columns:
    Q1=df[column].quantile(0.25)
    Q3=df[column].quantile(0.75)
    IQR=Q3-Q1
    min_value =Q1-1.5*IQR
    max_value =Q3+1.5*IQR
    min_values.append(min_value)
    max_values.append(max_value)
    print(f"Column: {column}, Min Value: {min_value}, Max Value: {max_value}")


Column: room, Min Value: 0.5, Max Value: 4.5
Column: living_room, Min Value: 1.0, Max Value: 1.0
Column: area, Min Value: -17.5, Max Value: 242.5
Column: age, Min Value: -20.0, Max Value: 44.0
Column: floor, Min Value: -2.0, Max Value: 6.0
Column: price, Min Value: -18000.0, Max Value: 62000.0


In [30]:
# Aykırı verilerin temizlenmesi

In [32]:
for i,column in enumerate(columns):
    df=df[(df[column]>= min_values[i]) & (df[column]<=max_values[i])]

In [34]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 6212 entries, 0 to 8134
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6212 non-null   category
 1   district      6212 non-null   category
 2   neighborhood  6212 non-null   category
 3   room          6212 non-null   int32   
 4   living_room   6212 non-null   int32   
 5   area          6212 non-null   int32   
 6   age           6212 non-null   int32   
 7   floor         6212 non-null   int32   
 8   price         6212 non-null   int32   
dtypes: category(3), int32(6)
memory usage: 243.3 KB
None


In [36]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
room,6212.0,2.176272,0.826815,1.0,2.0,2.0,3.0,4.0
living_room,6212.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
area,6212.0,104.66935,39.442494,5.0,75.0,100.0,130.0,240.0
age,6212.0,12.653896,10.451565,0.0,4.0,10.0,20.0,44.0
floor,6212.0,2.199614,1.589618,-2.0,1.0,2.0,3.0,6.0
price,6212.0,17900.975853,10467.582893,1.0,11000.0,15000.0,21000.0,60000.0


#### Linear Regresyon

In [50]:
df = pd.read_csv("data_cleaned.csv")

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [56]:
df["city"]=df["city"].astype("category")
df["district"]=df["district"].astype("category")
df["neighborhood"]=df["neighborhood"].astype("category")
df["room"]=df["room"].astype(int)
df["living_room"]=df["living_room"].astype(int)
df["area"]=df["area"].astype(int)
df["age"]=df["age"].astype(int)
df["floor"]=df["floor"].astype(int)
df["price"]=df["price"].astype(int)

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6116 entries, 0 to 6115
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          6116 non-null   category
 1   district      6116 non-null   category
 2   neighborhood  6116 non-null   category
 3   room          6116 non-null   int32   
 4   living_room   6116 non-null   int32   
 5   area          6116 non-null   int32   
 6   age           6116 non-null   int32   
 7   floor         6116 non-null   int32   
 8   price         6116 non-null   int32   
dtypes: category(3), int32(6)
memory usage: 191.7 KB


In [60]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living_room', 'area', 'age', 'floor']
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [62]:
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', LinearRegression())
])
model.fit(X_train, y_train)

In [66]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")

MSE: 48099119.38813986
RMSE: 6935.3528668799445
R^2: 0.5897272630866477


In [68]:
feature_importances = model.named_steps['model'].coef_
print(len(feature_importances))
print(feature_importances)

732
[ 7.71405166e+02  0.00000000e+00  3.45059950e+03 -2.07541757e+03
  1.62024352e+02 -5.56054893e+03  1.78416123e+03 -3.44413148e+03
  6.39060564e+03 -1.90378287e+03  2.73369641e+03 -9.41651427e+02
  1.01512899e+01 -2.32074887e+03  1.55641012e+03  6.35197078e+03
 -7.42666844e+03 -4.36561706e+03 -5.01437502e+03  1.63302276e+04
 -5.33109083e+02 -2.02688122e+03 -2.08525137e+03 -1.82286609e+03
 -1.46020220e+03 -5.96613156e+03  1.75278272e+04 -4.63262031e+03
 -1.75776315e+03 -2.36544888e+03 -7.73994523e+02 -9.34568184e+02
 -2.95932150e+03 -2.82220130e+03 -2.32330684e+03  3.10563473e+03
  9.43231173e+03 -1.03692057e+03  9.02634473e+03  2.20558042e+03
  2.23739247e+03 -7.71452913e+03 -1.85136848e+03 -7.21950848e+01
  1.42294087e+03 -9.11864743e+03 -4.24911332e+03 -4.35022766e+03
  4.82566585e+03  1.04809232e+02  1.25821554e+04  7.99773469e+03
  8.94722339e+02 -3.87216711e+03  2.52976189e+03  9.38691706e+02
  1.75607780e+03 -2.90454500e+03  1.03379684e+04 -3.53337272e+03
 -8.13727822e+03 -3.5

In [70]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i], feature_importances[i])

Numerical Features
room 771.4051664065751
living_room 0.0
area 3450.599496408383
age -2075.4175700883648
floor 162.0243522853012


In [72]:
print("Categorical Features")
for i in range(len(categorical_features)):
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

Categorical Features
afyonkarahisar -5560.548934801228
aydin 1784.1612277457966
denizli -3444.131475045823
izmir 6390.605638413451
manisa -1903.7828714441714
mugla 2733.6964148737575
acipayam -5560.548934801228
akhisar 1784.1612277457966
alasehir -3444.131475045823
aliaga 6390.605638413451
balcova -1903.7828714441714
bayindir 2733.6964148737575
bayrakli -941.6514270687211
bergama 10.151289891698728
bodrum -2320.748869334971
bolvadin 1556.410120064455
bornova 6351.970779834926
buca -7426.668444246511
buharkent -4365.6170575042315
cardak -5014.375017465215
cay 16330.227626665477
cesme -533.1090826965236
cigli -2026.8812241897126
cine -2085.2513723031852
civril -1822.866094298657
dalaman -1460.2022007459173
datca -5966.131557726155
didim 17527.827173561098
dikili -4632.620312017531
efeler -1757.7631525553954
fethiye -2365.4488799614974
foca -773.994522717595
gaziemir -934.5681839268108
germencik -2959.3215014663747
guzelbahce -2822.201303030334
honaz -2323.30684376292
incirliova 3105.6347

In [74]:
new_data = pd.DataFrame({
    'city': ['manisa'],
    'district': ['yunusemre'],
    'neighborhood': ['guzelyurt'],
    'room': [4],
    'living_room': [1],
    'area': [200],
    'age': [5],
    'floor': [3]
})

print(model.predict(new_data))

[30324.92783586]


In [76]:
print(df[(df['city'] == 'manisa') & (df['district'] == 'yunusemre') & (df['neighborhood'] == 'guzelyurt')])

        city   district neighborhood  room  living_room  area  age  floor  \
5151  manisa  yunusemre    guzelyurt     1            1    65   13      5   
5198  manisa  yunusemre    guzelyurt     2            1    85    2      3   
5222  manisa  yunusemre    guzelyurt     4            1   196    5      1   
5239  manisa  yunusemre    guzelyurt     1            1    60   11      5   

      price  
5151  15000  
5198  15000  
5222  36000  
5239  11000  


In [78]:
def tolerance_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[np.abs(residuals) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)

def tolerance_percentage_r2(y_true, y_pred, tolerance):
    residuals = y_pred - y_true
    residuals[(np.abs(residuals) / y_true) <= tolerance] = 0
    ssr = np.sum(residuals**2)
    sst = np.sum((y_true - np.mean(y_true))**2)
    return 1 - (ssr / sst)
print(r2_score(y_test, y_pred))
print(tolerance_r2(y_test, y_pred, 10000))
print(tolerance_percentage_r2(y_test, y_pred, 0.50))

0.5897272630866477
0.7085110164468329
0.8225924735729502


#### Sınıflandırma

In [81]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [83]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])
X = df.drop('price', axis=1)
y = df['price']
bins = [x for x in range(0, 70000, 10000)]
labels = [x for x in range(1, 7)]
print(bins)
print(labels)

[0, 10000, 20000, 30000, 40000, 50000, 60000]
[1, 2, 3, 4, 5, 6]


In [85]:
y = pd.cut(y, bins=bins, labels=labels)
print(y.unique())

[1, 2, 3, 4, 5, 6]
Categories (6, int64): [1 < 2 < 3 < 4 < 5 < 6]


In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', RandomForestClassifier(n_estimators=100))
])
model.fit(X_train, y_train)

In [89]:
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[152 104   1   0   0   0]
 [ 51 539  26   5   3   0]
 [  1 111  83   9   5   1]
 [  2  29  18  14   6   2]
 [  0  10  12   5   9   4]
 [  0   6   6   5   2   3]]


In [91]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.74      0.59      0.66       257
           2       0.67      0.86      0.76       624
           3       0.57      0.40      0.47       210
           4       0.37      0.20      0.26        71
           5       0.36      0.23      0.28        40
           6       0.30      0.14      0.19        22

    accuracy                           0.65      1224
   macro avg       0.50      0.40      0.43      1224
weighted avg       0.63      0.65      0.63      1224

