In [230]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, r2_score

data_path = os.path.join("..", "data", "processed", "ilanlar_final.csv")
df = pd.read_csv(data_path)

print(f"Ham Veri Sayısı: {len(df)}")

Ham Veri Sayısı: 17736


In [232]:
def clean_text_to_numeric(text, remove_str):
    if pd.isna(text): return np.nan
    clean_text = str(text).replace(remove_str, '').replace('.', '').replace(',', '.')
    try:
        return float(clean_text)
    except:
        return np.nan

df['Motor Hacmi'] = df['Motor Hacmi'].apply(lambda x: clean_text_to_numeric(x, ' cc'))
df['Motor Gücü'] = df['Motor Gücü'].apply(lambda x: clean_text_to_numeric(x, ' hp'))
df['Tramer'] = df['Tramer'].apply(lambda x: clean_text_to_numeric(x, ' TL')).fillna(0)

fill_cols = ['Motor Hacmi', 'Motor Gücü']
for col in fill_cols:
    df[col] = df[col].fillna(df.groupby('Model')[col].transform('median')).fillna(0)

def remove_model_outliers(group):
    if len(group) < 5: return group
    Q1 = group['Fiyat'].quantile(0.10) 
    Q3 = group['Fiyat'].quantile(0.90)
    return group[(group['Fiyat'] >= Q1) & (group['Fiyat'] <= Q3)]

print("Model bazlı temizlik yapılıyor...")
df = df.groupby('Model', group_keys=False).apply(remove_model_outliers)
print(f"Temizlik Sonrası Veri: {len(df)}")

Model bazlı temizlik yapılıyor...
Temizlik Sonrası Veri: 14413


  df = df.groupby('Model', group_keys=False).apply(remove_model_outliers)


In [234]:
current_year = 2025
df['Yas'] = current_year - df['Yıl']
df['Yas'] = df['Yas'].replace(0, 1)
df['Yillik_KM'] = df['Kilometre'] / df['Yas']

def count_parts(text):
    if pd.isna(text) or str(text) in ['Yok', 'Belirtilmemiş', 'Orijinal', 'Tamamı orjinal', 'Hatasız', 'nan']: return 0
    return len(str(text).split(','))

df['Boyali_Sayisi'] = df['Boyalı Parçalar'].apply(count_parts) + df['Lokal Boyalı Parçalar'].apply(count_parts)
df['Degisen_Sayisi'] = df['Değişen Parçalar'].apply(count_parts)
df['Hasar_Skoru'] = (df['Boyali_Sayisi'] * 1) + (df['Degisen_Sayisi'] * 2)

In [236]:
cat_cols = ['Marka', 'Seri', 'Model', 'Vites Tipi', 'Yakıt Tipi', 'Kasa Tipi', 'Renk', 'Kimden', 'Çekiş']
for col in cat_cols:
    if col in df.columns:
        df[col] = df[col].fillna("Bilinmiyor").astype(str)

drop_cols = [
    'Link', 'İlan No', 'İlan Tarihi', 
    'Boyalı Parçalar', 'Lokal Boyalı Parçalar', 'Değişen Parçalar', 
    'Araç Durumu', 'Takasa Uygun', 'Boya-değişen', 'Ağır Hasarlı',
    'Ort. Yakıt Tüketimi', 'Yakıt Deposu'
]
X = df.drop(columns=['Fiyat'] + [c for c in drop_cols if c in df.columns], errors='ignore')
y = df['Fiyat']

In [238]:
y_log = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [240]:
common_params = {
    'iterations': 3000,
    'learning_rate': 0.03,
    'depth': 8,
    'random_seed': 42,
    'verbose': 500,
    'cat_features': cat_cols,
    'allow_writing_files': False,
    'l2_leaf_reg': 3
}

print("--- 1/3: Ana Model Eğitiliyor... ---")
model_main = CatBoostRegressor(loss_function='MAE', **common_params)
model_main.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=200)

print("\n--- 2/3: Alt Sınır Modeli Eğitiliyor... ---")
model_low = CatBoostRegressor(loss_function='Quantile:alpha=0.10', **common_params)
model_low.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=200)

print("\n--- 3/3: Üst Sınır Modeli Eğitiliyor... ---")
model_high = CatBoostRegressor(loss_function='Quantile:alpha=0.90', **common_params)
model_high.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=200)

--- 1/3: Ana Model Eğitiliyor... ---
0:	learn: 0.6475726	test: 0.6542876	best: 0.6542876 (0)	total: 107ms	remaining: 5m 20s
500:	learn: 0.1009724	test: 0.1190198	best: 0.1190198 (500)	total: 59s	remaining: 4m 54s
1000:	learn: 0.0808338	test: 0.1123361	best: 0.1123361 (1000)	total: 1m 57s	remaining: 3m 55s
1500:	learn: 0.0704907	test: 0.1097210	best: 0.1097143 (1499)	total: 2m 57s	remaining: 2m 57s
2000:	learn: 0.0639632	test: 0.1086786	best: 0.1086786 (2000)	total: 3m 57s	remaining: 1m 58s
2500:	learn: 0.0590795	test: 0.1081078	best: 0.1081007 (2473)	total: 4m 56s	remaining: 59.2s
2999:	learn: 0.0557307	test: 0.1077282	best: 0.1077269 (2996)	total: 5m 57s	remaining: 0us

bestTest = 0.107726926
bestIteration = 2996

Shrink model to first 2997 iterations.

--- 2/3: Alt Sınır Modeli Eğitiliyor... ---
0:	learn: 0.1487581	test: 0.1501677	best: 0.1501677 (0)	total: 135ms	remaining: 6m 44s
500:	learn: 0.0234270	test: 0.0297461	best: 0.0297461 (500)	total: 55s	remaining: 4m 34s
Stopped by over

<catboost.core.CatBoostRegressor at 0x2a58e1fdd90>

In [242]:
y_pred_log = model_main.predict(X_test)
y_pred = np.expm1(y_pred_log) # Log -> TL
y_test_orig = np.expm1(y_test) # Log -> TL

mae = mean_absolute_error(y_test_orig, y_pred)
r2 = r2_score(y_test_orig, y_pred)

print("\n--- FİNAL MODEL BAŞARISI ---")
print(f"Ortalama Hata (MAE): {mae:,.0f} TL")
print(f"Başarı Skoru (R²): {r2:.4f}")


--- FİNAL MODEL BAŞARISI ---
Ortalama Hata (MAE): 233,053 TL
Başarı Skoru (R²): 0.7757


In [244]:
model_folder = os.path.join("..", "models")
joblib.dump(model_main, os.path.join(model_folder, "catboost_main.pkl"))
joblib.dump(model_low, os.path.join(model_folder, "catboost_low.pkl"))
joblib.dump(model_high, os.path.join(model_folder, "catboost_high.pkl"))
print("Modeller kaydedildi!")

Modeller kaydedildi!
