In [1]:
# Report the installed TensorFlow version.
# NOTE(review): none of the cells visible in this export use `tf` afterwards —
# confirm whether this (heavy) import is still needed.
import tensorflow as tf
print(tf.__version__)

2.20.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the listings CSV into `df`; the path is relative to the notebook's
# working directory, so the file must sit next to the notebook (or the kernel's cwd).
df = pd.read_csv("processed_turkish_house_sales.csv")

In [4]:
# Preview the first five rows as a quick sanity check of the load.
print("Veri Seti İlk 5 Satır:", df.head(), sep="\n")

Veri Seti İlk 5 Satır:
   satici_tip  Metrekare Oda_Sayisi        il         Ilce     Mahalle  \
0  Sahibinden      160.0      3.5+1  Istanbul        Şişli    Esentepe   
1  Sahibinden       85.0        2+1  Istanbul       Kartal    Topselvi   
2  Sahibinden      110.0        3+1  Istanbul        Tuzla     Aydınlı   
3  Sahibinden      200.0        6+1  Istanbul  Sultanbeyli  Necipfazıl   
4  Sahibinden      120.0        2+1  Istanbul     Esenyurt  Bilinmiyor   

            Tarih    fiyat  
0  25 Mayıs  2025  8750000  
1  25 Mayıs  2025  4899000  
2  24 Mayıs  2025  5900000  
3  24 Mayıs  2025  6999999  
4  24 Mayıs  2025  1625000  


In [5]:
# Structural overview: dtype and non-null count of every column.
print("\n--- Veri Özeti (Info) ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() — that would append a stray "None" line.
df.info()

# Missing values per column.
print("\n--- Eksik Veri Sayıları ---")
print(df.isnull().sum())

# Special check: rows whose neighbourhood is the literal placeholder
# 'Bilinmiyor' rather than a real name. These are not NaN, so the
# isnull() summary above cannot reveal them.
bilinmeyen_mahalle = int((df['Mahalle'] == 'Bilinmiyor').sum())
print(f"\n'Bilinmiyor' olarak girilmiş mahalle sayısı: {bilinmeyen_mahalle}")


--- Veri Özeti (Info) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15276 entries, 0 to 15275
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   satici_tip  15276 non-null  object 
 1   Metrekare   15276 non-null  float64
 2   Oda_Sayisi  15276 non-null  object 
 3   il          15276 non-null  object 
 4   Ilce        15276 non-null  object 
 5   Mahalle     15276 non-null  object 
 6   Tarih       15276 non-null  object 
 7   fiyat       15276 non-null  int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 954.9+ KB
None

--- Eksik Veri Sayıları ---
satici_tip    0
Metrekare     0
Oda_Sayisi    0
il            0
Ilce          0
Mahalle       0
Tarih         0
fiyat         0
dtype: int64

'Bilinmiyor' olarak girilmiş mahalle sayısı: 5987


In [6]:
# Listing counts of the ten most frequent provinces.
print("\n--- İl Dağılımı (İlk 10) ---", df['il'].value_counts().iloc[:10], sep="\n")

# District cardinality, followed by the five most frequent districts.
print(
    "\n--- İlçe Çeşitliliği ---",
    f"Toplam Farklı İlçe Sayısı: {df['Ilce'].nunique()}",
    df['Ilce'].value_counts().iloc[:5],
    sep="\n",
)

# Also inspect the room-layout labels (are there unusual ones such as 3.5+1?).
print("\n--- Oda Sayısı Tipleri ---", df['Oda_Sayisi'].unique(), sep="\n")


--- İl Dağılımı (İlk 10) ---
il
Bilecik           1557
Usak              1240
Istanbul          1110
Erzurum           1030
Bursa             1027
Diyarbakir        1025
Osmaniye          1024
Van               1017
Samsun            1015
Afyonkarahisar    1010
Name: count, dtype: int64

--- İlçe Çeşitliliği ---
Toplam Farklı İlçe Sayısı: 193
Ilce
Merkez       4829
İpekyolu      725
Bozüyük       541
Kayapınar     529
Nilüfer       441
Name: count, dtype: int64

--- Oda Sayısı Tipleri ---
['3.5+1' '2+1' '3+1' '6+1' '5+1' '1+1' '4+2' '4+1' '5+2' '3+2' '2+2'
 'Stüdyo (1+0)' '2+0' '1.5+1' '3+3' '2.5+1' '5.5+1' '6+2' '7+1' '4.5+1'
 '3+0' '10 Üzeri' '5+3' '4+3' '6+3' '7+2' '6.5+1' '5+4' '8+1' '4+0']


In [7]:
# Exact-match tally of the 'Bilinmiyor' placeholder.
unknown_neighborhood_count = df['Mahalle'].eq('Bilinmiyor').sum()

print(f"Total count of 'Bilinmiyor' in Mahalle column: {unknown_neighborhood_count}")

# Looser tally: trim surrounding whitespace, then look for the placeholder
# anywhere in the value, ignoring case; missing values count as non-matches.
robust_unknown_count = (
    df['Mahalle']
    .str.strip()
    .str.contains('Bilinmiyor', case=False, na=False)
    .sum()
)

print(f"Robust count (case-insensitive & stripped): {robust_unknown_count}")

Total count of 'Bilinmiyor' in Mahalle column: 5987
Robust count (case-insensitive & stripped): 5987


In [8]:
# How often does the open-ended '10 Üzeri' label occur, in absolute and relative terms?
total_rows = df.shape[0]
over_10_count = df['Oda_Sayisi'].eq('10 Üzeri').sum()
ratio = over_10_count / total_rows * 100

print(f"Count of '10 Üzeri': {over_10_count}")
print(f"Percentage in Dataset: {ratio:.2f}%")

Count of '10 Üzeri': 3
Percentage in Dataset: 0.02%


In [9]:
# 1. Remove the rows labelled '10 Üzeri'; the label is open-ended and does not
#    follow the 'N+M' layout pattern. `.copy()` detaches the result from the
#    original frame.
df = df.loc[df['Oda_Sayisi'].ne('10 Üzeri')].copy()

# 2. Rewrite 'Stüdyo (1+0)' as plain '1+0' so it can be parsed like the other
#    labels (literal replacement, not a regex).
df = df.assign(
    Oda_Sayisi=df['Oda_Sayisi'].str.replace('Stüdyo (1+0)', '1+0', regex=False)
)

print(f"Dataset size after dropping outliers: {len(df)}")

Dataset size after dropping outliers: 15273


In [10]:
# 2. Split function to create numeric Room and Salon columns
def split_room_info(room_string):
    """Parse a room-layout label such as ``'3.5+1'`` into two numbers.

    Parameters
    ----------
    room_string : object
        Layout label. It is converted with ``str()`` before parsing, so
        non-string values (``None``, NaN, numbers) are accepted.

    Returns
    -------
    tuple
        ``(rooms, salons)`` as floats, taken from the first two
        ``'+'``-separated fields. Floats are used so that half rooms
        (e.g. 3.5) survive. ``(None, None)`` is returned when the label
        contains no ``'+'`` or when either field is not a valid number.
    """
    text = str(room_string)
    if '+' not in text:
        return None, None

    parts = text.split('+')
    try:
        return float(parts[0]), float(parts[1])
    except ValueError:
        # Only a failed numeric conversion means "unparseable label".
        # A bare ``except`` here would also swallow KeyboardInterrupt and
        # hide genuine programming errors.
        return None, None

# 3. Apply the split. The labels are parsed into a plain list of
#    (rooms, salons) tuples and turned into ONE DataFrame; building a separate
#    pd.Series for every row inside .apply() is needlessly slow.
room_columns = ['Room_Count', 'Living_Room_Count']
parsed_rooms = [split_room_info(label) for label in df['Oda_Sayisi']]
df[room_columns] = pd.DataFrame(parsed_rooms, index=df.index, columns=room_columns)

# 4. Final clean-up: drop any rows that couldn't be parsed (their parts are
#    missing). Reassigning instead of inplace=True keeps the step explicit.
df = df.dropna(subset=room_columns)

print("Feature Engineering Success!")
print(df[['Oda_Sayisi', 'Room_Count', 'Living_Room_Count']].head())

Feature Engineering Success!
  Oda_Sayisi  Room_Count  Living_Room_Count
0      3.5+1         3.5                1.0
1        2+1         2.0                1.0
2        3+1         3.0                1.0
3        6+1         6.0                1.0
4        2+1         2.0                1.0


In [11]:
# How many distinct values does each location column hold?
for label, column in (
    ("Cities", "il"),
    ("Districts", "Ilce"),
    ("Neighborhoods", "Mahalle"),
):
    print(f"Unique {label} ({column}): {df[column].nunique()}")

Unique Cities (il): 23
Unique Districts (Ilce): 193
Unique Neighborhoods (Mahalle): 314


In [12]:
# Target-encoding helper (meant to be used AFTER the train/test split)
def apply_target_encoding(data, column, target, weight=20):
    """Compute smoothed per-category means of ``target``.

    For every distinct value of ``column`` the category mean is blended with
    the overall mean of ``target``:

        (n * category_mean + weight * overall_mean) / (n + weight)

    where ``n`` is the category's count of non-null target values. A larger
    ``weight`` pulls small categories more strongly toward the overall mean.

    Returns
    -------
    tuple
        ``(mapping, overall_mean)`` — ``mapping`` is a dict from category
        value to its smoothed mean; ``overall_mean`` is the mean of
        ``data[target]``.
    """
    overall_mean = data[target].mean()
    stats = data.groupby(column)[target].agg(['count', 'mean'])

    numerator = stats['count'] * stats['mean'] + weight * overall_mean
    denominator = stats['count'] + weight

    return (numerator / denominator).to_dict(), overall_mean

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# 0) Split first, so that nothing below can learn from the test rows.
#    The explicit copies make the column assignments further down write to
#    independent frames; assigning to the objects returned by the split can
#    raise SettingWithCopyWarning on some pandas versions.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.copy()
test_df = test_df.copy()

# 1) Learn the target encoding from the TRAIN rows only
ilce_map, global_avg = apply_target_encoding(train_df, 'Ilce', 'fiyat', weight=20)

# Districts that are missing from the map fall back to the training mean.
train_df['Ilce_Encoded'] = train_df['Ilce'].map(ilce_map).fillna(global_avg)
test_df['Ilce_Encoded']  = test_df['Ilce'].map(ilce_map).fillna(global_avg)

# 2) One-hot encoding (train and test encoded separately).
#    The baseline category is dropped on the TRAIN side only. Using
#    drop_first on the test side as well is unsafe: if the test rows lack the
#    first category, a *different* category's column is dropped there, the
#    reindex in step 4 zero-fills it, and those rows silently look like the
#    baseline. Encoding every test category and letting step 4 select the
#    training columns avoids that.
dummy_cols = ['il', 'satici_tip']
train_final = pd.get_dummies(train_df, columns=dummy_cols, drop_first=True)
test_final  = pd.get_dummies(test_df,  columns=dummy_cols)

# 3) Drops — errors='ignore' skips names that are not present in the frame.
cols_to_drop = ['Ilce', 'Mahalle', 'Oda_Sayisi', 'Tarih', 'Tarih_Formatli', 'Mahalle_Encoded']
train_final = train_final.drop(columns=cols_to_drop, errors='ignore')
test_final = test_final.drop(columns=cols_to_drop, errors='ignore')

# 4) Column alignment (critical): the test frame gets exactly the training
#    columns in the training order; dummy columns absent from the test rows
#    are created and filled with 0, extra test-only columns are discarded.
test_final = test_final.reindex(columns=train_final.columns, fill_value=0)

print("Train sütun sayısı:", train_final.shape[1])
print("Test  sütun sayısı:", test_final.shape[1])

# 5) X/y — the target is modelled as log1p(price).
X_train = train_final.drop('fiyat', axis=1)
y_train = np.log1p(train_final['fiyat'])

X_test  = test_final.drop('fiyat', axis=1)
y_test  = np.log1p(test_final['fiyat'])

Train sütun sayısı: 28
Test  sütun sayısı: 28


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error


# Columns that get standardised; every other column is passed through as-is.
numeric_cols = ['Metrekare', 'Room_Count', 'Living_Room_Count', 'Ilce_Encoded']

preprocess = ColumnTransformer(
    [("num", StandardScaler(), numeric_cols)],
    remainder="passthrough",
)

# Ridge regression whose alpha is chosen by 5-fold cross-validation from
# 13 log-spaced candidates between 1e-3 and 1e3.
model = Pipeline([
    ("prep", preprocess),
    ("reg", RidgeCV(alphas=np.logspace(-3, 3, 13), cv=5)),
])
model.fit(X_train, y_train)

# Predictions (log space)
y_pred_log = model.predict(X_test)
print("R2 (log):", r2_score(y_test, y_pred_log))

# Back to actual price space (TL) via expm1
y_test_real = np.expm1(y_test)
y_pred_real = np.expm1(y_pred_log)

print("R2 (real):", r2_score(y_test_real, y_pred_real))
print("MAE (TL):", mean_absolute_error(y_test_real, y_pred_real))

R2 (log): 0.5361999399468333
R2 (real): 0.43004795501054915
MAE (TL): 1119759.2256616026


In [15]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# Random forest fitted on X_train / y_train.
# Rebinds `model`, so any later cell that reads `model` sees this estimator.
model = RandomForestRegressor(
    n_estimators=600,
    min_samples_leaf=2,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)

# Predictions come out on the same scale as y_train.
y_pred_log = model.predict(X_test)
print("R2 (log):", r2_score(y_test, y_pred_log))

# Apply expm1 to both truth and prediction before the price-space metrics.
y_test_real = np.expm1(y_test)
y_pred_real = np.expm1(y_pred_log)

print("R2 (real):", r2_score(y_test_real, y_pred_real))
print("MAE (TL):", mean_absolute_error(y_test_real, y_pred_real))

R2 (log): 0.6068333269604684
R2 (real): 0.49876617675851254
MAE (TL): 997916.6809699676


In [16]:
import joblib

# NOTE: the names below are this notebook's own variables.
# `model` is whichever estimator was fitted LAST under that name — the
# RandomForestRegressor cell, not the earlier RidgeCV pipeline.
model_path = "final_real_estate_model_v1.pkl"

artifacts = {
    "model": model,                       # most recently fitted estimator bound to `model`
    "ilce_map": ilce_map,                 # district -> smoothed mean price (from the split/encoding cell)
    "global_avg": global_avg,             # fallback value for districts missing from ilce_map
    "columns": X_train.columns.tolist(),  # feature order used at fit time (must be reproduced at inference)
    "target_transform": "log1p",          # model was fitted on log1p(fiyat); apply np.expm1 to predictions to get TL
}

# Persist everything needed for inference in a single file.
joblib.dump(artifacts, model_path)

print(f"--- SUCCESS: Model saved as '{model_path}' ---")
print(f"Kaydedilen Özellik Sayısı: {len(X_train.columns)}")

--- SUCCESS: Model saved as 'final_real_estate_model_v1.pkl' ---
Kaydedilen Özellik Sayısı: 27
