# V8 Modeli

**Veri Hazırlama**

In [None]:
import pandas as pd
import numpy as np
import gc
import os

print("Adım 1 (v8 - Gelişmiş Zamansal ve Sıralama Özellikleri): Veri Hazırlama Başladı.")

# Raw event-level inputs.
IN_TRAIN_PATH = '/content/datathon/train.csv'
IN_TEST_PATH = '/content/datathon/test.csv'

# Folder that receives the processed, session-level tables.
OUT_DIR = '/content/datathon/processed/'
os.makedirs(OUT_DIR, exist_ok=True)

OUT_TRAIN_PATH = f'{OUT_DIR}train_processed_v8.csv'
OUT_TEST_PATH = f'{OUT_DIR}test_processed_v8.csv'

# --- Load the data and pre-compute the sets used by later steps (same as v7) ---
try:
    # Both files are read the same way; event_time is parsed to datetimes.
    df_train, df_test = (
        pd.read_csv(csv_path, parse_dates=['event_time'])
        for csv_path in (IN_TRAIN_PATH, IN_TEST_PATH)
    )
    print("Ham veri setleri başarıyla yüklendi.")

    # Users present in both train and test.
    train_users = set(df_train['user_id'])
    test_users = set(df_test['user_id'])
    common_users = train_users & test_users

    # Products that occur in test but never in train.
    train_products = set(df_train['product_id'])
    new_products_in_test = set(df_test['product_id']).difference(train_products)

    # Session ids that occur in both files with exactly the same set of users.
    train_session_users = df_train.groupby('user_session')['user_id'].apply(set)
    test_session_users = df_test.groupby('user_session')['user_id'].apply(set)
    common_sessions = set(train_session_users.index) & set(test_session_users.index)
    verified_leaked_sessions = {
        sid
        for sid in common_sessions
        if train_session_users[sid] == test_session_users[sid]
    }
    print("Analizler için ön bilgiler hesaplandı.")
except Exception as e:
    # Any failure above is reported and the script stops here.
    print(f"Hata: {e}.")
    exit()

# --- Anomali Temizleme, Kullanıcı Özellikleri (v7 ile aynı) ---
def fix_anomalous_sessions(df, data_type='train'):
    """Split session ids that are shared by more than one user.

    A session id is treated as anomalous when its rows carry more than one
    distinct ``user_id``. For the rows of such sessions the id is replaced
    by ``"<user_session>_<user_id>"``; all other rows keep their id.

    Args:
        df: Event-level frame with ``user_session`` and ``user_id`` columns.
            It is modified in place.
        data_type: Label of the data set; accepted for call-site
            compatibility and not used by the function.

    Returns:
        The same ``df`` object. When at least one anomalous session exists,
        ``user_session`` is rewritten and ends up as the last column;
        otherwise the frame is returned untouched.
    """
    session_user_counts = df.groupby('user_session')['user_id'].nunique()
    anomalous_sessions = session_user_counts[session_user_counts > 1].index
    if len(anomalous_sessions) > 0:
        shared = df['user_session'].isin(anomalous_sessions)
        sessions = df.pop('user_session')
        # Cast both parts to str so numeric ids do not break the concatenation.
        suffixed = sessions.astype(str) + '_' + df['user_id'].astype(str)
        # Only rows of anomalous sessions receive the user suffix.
        df['user_session'] = sessions.where(~shared, suffixed)
    return df
df_train = fix_anomalous_sessions(df_train, 'train')
df_test = fix_anomalous_sessions(df_test, 'test')

# User-level aggregates are computed over train and test events together;
# the target column exists only in train and is dropped before stacking.
df_combined = pd.concat([df_train.drop('session_value', axis=1), df_test], ignore_index=True)
user_features = df_combined.groupby('user_id').agg(
    user_total_events=('event_type', 'count'), user_unique_products_viewed=('product_id', 'nunique'),
    user_first_seen=('event_time', 'min'), user_last_seen=('event_time', 'max')
)
# Whole days between a user's first and last event.
user_features['user_lifespan_days'] = (user_features['user_last_seen'] - user_features['user_first_seen']).dt.days
user_buy_counts = df_combined[df_combined['event_type'] == 'BUY'].groupby('user_id').size()
# Users without any BUY event are absent from user_buy_counts and get 0.
# A plain assignment replaces the chained `fillna(..., inplace=True)`, which
# pandas warns will stop updating the parent frame in pandas 3.0.
user_features['user_buy_count'] = user_buy_counts.reindex(user_features.index).fillna(0)
user_features['user_purchase_rate'] = user_features['user_buy_count'] / user_features['user_total_events']
# The raw timestamps were only needed for the lifespan feature.
user_features.drop(['user_first_seen', 'user_last_seen'], axis=1, inplace=True)
del df_combined
gc.collect()

# --- 4. ADIM: SEANS BAZLI ÖZELLİKLER (v8 Güncellemesi) ---
def create_session_features_v8(df, data_type='train'):
    """Aggregate event-level rows into one feature row per ``user_session``.

    Events are sorted by ``event_time`` inside each session before any
    order-dependent feature is derived (time gaps, event rank, first/last
    event). Without that ordering an unsorted frame yields negative gaps,
    which become NaN under ``sqrt``/``log1p``, and "first"/"last" would
    refer to file order rather than to time.

    Args:
        df: Event-level frame. The columns read are ``user_session``,
            ``user_id``, ``event_time`` (datetime), ``event_type``,
            ``product_id`` and ``category_id``.
        data_type: Used in the progress messages; ``has_new_product`` is
            only computed when it equals ``'test'`` and is 0 otherwise.

    Returns:
        DataFrame indexed by ``user_session`` holding the session summary
        statistics, per-event-type counts, first/last event descriptors and
        the interaction rates built from the counts.

    Note:
        Reads the module-level names ``common_users``,
        ``verified_leaked_sessions`` and ``new_products_in_test``. The
        function works on a sorted copy, so the caller's frame is not
        modified.
    """
    print(f"\n{data_type} verisi için seans bazlı özellik mühendisliği (v8) başlıyor...")

    # Chronological order inside every session; sort_values returns a new frame.
    df = df.sort_values(['user_session', 'event_time'])

    # Row-level calendar flags (same as v7).
    # NOTE(review): these two columns are not aggregated into the output below.
    df['is_weekend'] = (df['event_time'].dt.dayofweek >= 5).astype(int)
    df['time_of_day'] = df['event_time'].dt.hour // 6  # 0: night, 1: morning, 2: noon, 3: evening

    # Seconds elapsed since the previous event of the same session (0 for the first event).
    df['time_diff'] = df.groupby('user_session')['event_time'].diff().dt.total_seconds().fillna(0)

    # 1-based rank of the event inside its session, and the rank as a fraction of session length.
    df['event_order'] = df.groupby('user_session').cumcount() + 1
    session_event_counts = df['user_session'].map(df['user_session'].value_counts())
    df['event_order_pct'] = df['event_order'] / session_event_counts

    # Additional transforms of the time gap (new in v8).
    df['time_diff_log'] = np.log1p(df['time_diff'])
    df['time_diff_sqrt'] = np.sqrt(df['time_diff'])

    # Descriptors of the chronologically first event of each session (new in v8).
    first_event_features = df.groupby('user_session').agg(
        first_event_category=('category_id', lambda x: x.iloc[0]),
        first_event_product=('product_id', lambda x: x.iloc[0]),
        first_event_type=('event_type', lambda x: x.iloc[0]),
        first_event_hour=('event_time', lambda x: x.iloc[0].hour),
    )

    # Descriptors of the chronologically last event of each session (new in v8).
    last_event_features = df.groupby('user_session').agg(
        last_event_category=('category_id', lambda x: x.iloc[-1]),
        last_event_product=('product_id', lambda x: x.iloc[-1]),
        last_event_type=('event_type', lambda x: x.iloc[-1]),
        last_event_hour=('event_time', lambda x: x.iloc[-1].hour),
    )

    session_products = df.groupby('user_session')['product_id'].apply(set)

    # Event counts per type; guarantee a column for every expected type so
    # the rate features below never hit a missing column.
    event_type_counts = pd.crosstab(df['user_session'], df['event_type'])
    all_event_types = ['VIEW', 'ADD_CART', 'REMOVE_CART', 'BUY']
    for event in all_event_types:
        if event not in event_type_counts.columns:
            event_type_counts[event] = 0
    event_type_counts.columns = [f'{col.lower()}_count' for col in event_type_counts.columns]

    session_features = df.groupby('user_session').agg(
        user_id=('user_id', 'first'),
        event_count=('event_type', 'count'),
        unique_products=('product_id', 'nunique'),
        unique_categories=('category_id', 'nunique'),
        session_duration_seconds=('event_time', lambda x: (x.max() - x.min()).total_seconds()),
        avg_day_of_week=('event_time', lambda x: x.dt.dayofweek.mean()),
        avg_hour=('event_time', lambda x: x.dt.hour.mean()),

        # Summary statistics of the time gaps.
        avg_time_diff=('time_diff', 'mean'),            # from v7
        max_time_diff=('time_diff', 'max'),             # from v7
        min_time_diff=('time_diff', 'min'),             # from v7
        std_time_diff=('time_diff', 'std'),             # new in v8
        median_time_diff=('time_diff', 'median'),       # new in v8
        avg_time_diff_log=('time_diff_log', 'mean'),    # new in v8
        avg_time_diff_sqrt=('time_diff_sqrt', 'mean'),  # new in v8

        # Summary statistics of the event rank.
        avg_event_order=('event_order', 'mean'),          # from v7
        avg_event_order_pct=('event_order_pct', 'mean'),  # from v7

        # Popularity features from v7 are intentionally not included in v8.
    )

    # Flags derived from the train/test overlap analysis (same as v7).
    session_features['is_common_user'] = session_features['user_id'].isin(common_users).astype(int)
    session_features['is_leaked_session'] = session_features.index.isin(verified_leaked_sessions).astype(int)
    if data_type == 'test':
        session_features['has_new_product'] = [1 if not session_products[sid].isdisjoint(new_products_in_test) else 0 for sid in session_features.index]
    else:
        session_features['has_new_product'] = 0

    # All parts share the user_session index, so a column-wise concat aligns them.
    df_session = pd.concat([session_features, event_type_counts, first_event_features, last_event_features], axis=1)

    # Funnel rates; epsilon keeps the division defined when a count is zero.
    epsilon = 1e-6
    df_session['view_to_add_cart_rate'] = df_session['add_cart_count'] / (df_session['view_count'] + epsilon)
    df_session['add_cart_to_buy_rate'] = df_session['buy_count'] / (df_session['add_cart_count'] + epsilon)
    df_session['view_to_buy_rate'] = df_session['buy_count'] / (df_session['view_count'] + epsilon)
    df_session['net_cart_additions'] = df_session['add_cart_count'] - df_session['remove_cart_count']
    df_session['did_purchase'] = (df_session['buy_count'] > 0).astype(int)

    print(f"{data_type} verisi için özellik mühendisliği (v8) tamamlandı.")
    return df_session

df_session_train = create_session_features_v8(df_train, 'train')
df_session_test = create_session_features_v8(df_test, 'test')


# --- Step 5 onwards (same as v7) ---
def _attach_user_features(session_frame):
    """Left-join the user-level features and restore the session index."""
    joined = session_frame.merge(user_features, on='user_id', how='left')
    # merge() resets the index; put the session ids back in place.
    joined = joined.set_index(session_frame.index)
    joined.drop('user_id', axis=1, inplace=True)
    return joined


print("\nKullanıcı ve seans özellikleri birleştiriliyor...")
df_session_train = _attach_user_features(df_session_train)
df_session_test = _attach_user_features(df_session_test)

print("\nEn önemli etkileşim özellikleri oluşturuluyor...")
# Each interaction is buy_count multiplied by the listed partner column.
buy_interactions = (
    ('buy_x_hour', 'avg_hour'),
    ('buy_x_unique_products', 'unique_products'),
    ('buy_x_user_purchase_rate', 'user_purchase_rate'),
    ('buy_x_avg_time_diff', 'avg_time_diff'),      # new in v8
    ('buy_x_avg_event_order', 'avg_event_order'),  # new in v8
)
for session_frame in (df_session_train, df_session_test):
    for target_col, partner_col in buy_interactions:
        session_frame[target_col] = session_frame['buy_count'] * session_frame[partner_col]


# Target: the first session_value seen for each session, aligned on the index.
session_value = df_train.groupby('user_session')['session_value'].first()
df_session_train['session_value'] = session_value

# Release everything that is no longer needed before writing the outputs.
del df_train, df_test, session_value, user_features, common_users, train_users, test_users, train_products, new_products_in_test, train_session_users, test_session_users, common_sessions, verified_leaked_sessions
gc.collect()

df_session_train.to_csv(OUT_TRAIN_PATH)
df_session_test.to_csv(OUT_TEST_PATH)

print("\nAdım 1 (v8) Tamamlandı: 'train_processed_v8.csv' ve 'test_processed_v8.csv' dosyaları oluşturuldu.")

Adım 1 (v8 - Gelişmiş Zamansal ve Sıralama Özellikleri): Veri Hazırlama Başladı.
Ham veri setleri başarıyla yüklendi.
Analizler için ön bilgiler hesaplandı.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  user_features['user_buy_count'].fillna(0, inplace=True)



train verisi için seans bazlı özellik mühendisliği (v8) başlıyor...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


train verisi için özellik mühendisliği (v8) tamamlandı.

test verisi için seans bazlı özellik mühendisliği (v8) başlıyor...


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


test verisi için özellik mühendisliği (v8) tamamlandı.

Kullanıcı ve seans özellikleri birleştiriliyor...

En önemli etkileşim özellikleri oluşturuluyor...

Adım 1 (v8) Tamamlandı: 'train_processed_v8.csv' ve 'test_processed_v8.csv' dosyaları oluşturuldu.


**Model Eğitme**

In [None]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

print("Adım 2: Model Eğitimi Başlandı.")

MODEL_DIR = "/content/models/V8"
os.makedirs(MODEL_DIR, exist_ok=True)

IN_TRAIN_PATH = "/content/datathon/processed/train_processed_v8.csv"

FEATURE_IMPORTANCE_DIR_OUT = MODEL_DIR + "/feature_importance/IN/"
os.makedirs(FEATURE_IMPORTANCE_DIR_OUT, exist_ok=True)

OUT_FEATURES_PATH = FEATURE_IMPORTANCE_DIR_OUT + "importance.json"

# --- Load the processed training table ---
try:
    df_train = pd.read_csv(IN_TRAIN_PATH, index_col='user_session')
    print("İşlenmiş train verisi ('train_processed_v8.csv') yüklendi.")
except FileNotFoundError:
    # The message names the file that is actually read above.
    print("Hata: 'train_processed_v8.csv' bulunamadı. Lütfen önce '1_data_preparation_v8.py' scriptini çalıştırın.")
    exit()

# --- Split into features and target ---
y = df_train['session_value']
X = df_train.drop(['session_value'], axis=1)


# The model is trained on log1p(target); predictions are mapped back with expm1.
y_log = np.log1p(y)

# --- Categorical features ---
# Columns from this list that are present in X are passed to CatBoost as
# categorical, identified by their positional index.
categorical_features_indices = []
for i, col in enumerate(X.columns):
    if col in ['first_event_category', 'first_event_product', 'first_event_type',
               'last_event_category', 'last_event_product', 'last_event_type',
               'is_weekend', 'time_of_day']:
        categorical_features_indices.append(i)

print(f"CatBoost için belirlenen kategorik özellik indeksleri: {categorical_features_indices}")


# --- Hold-out split without shuffling ---
# The last 20% of the rows, in CSV order, form the validation set.
# NOTE(review): this is only a time-based split if the processed CSV is in
# chronological order - confirm against the preparation step.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, shuffle=False
)
print(f"Train seti boyutu: {X_train.shape[0]}, Validation seti boyutu: {X_val.shape[0]}")

# --- Train and evaluate CatBoost ---
print("CatBoost Modeli eğitimi başlıyor...")


best_params = {
    'learning_rate': 0.05705622600719216,
    'depth': 4,
    'l2_leaf_reg': 2.768927236825974,
    'colsample_bylevel': 0.8234334604424713,
    'min_child_samples': 56,
    'objective': 'RMSE',
    'random_seed': 42,
    'verbose': 500
}

cat_model = CatBoostRegressor(
    **best_params,
    iterations=4500,
    eval_metric='RMSE',
    early_stopping_rounds=300,
    cat_features=categorical_features_indices
)


cat_model.fit(
    X_train, y_train_log,
    eval_set=(X_val, y_val_log),
    cat_features=categorical_features_indices
)

# --- Validation score on the original target scale ---
val_preds_log = cat_model.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0  # clip negative predictions to zero
y_val = np.expm1(y_val_log)

validation_mse = mean_squared_error(y_val, val_preds)
print(f"\nValidation Seti Üzerindeki MSE Skoru (CatBoost): {validation_mse:.4f}")
print(f"Validation Seti Üzerindeki RMSE Skoru (CatBoost): {np.sqrt(validation_mse):.4f}")

# --- Refit on all training rows and save ---
print("\nFinal CatBoost modeli tüm train verisi üzerinde eğitiliyor...")
# get_best_iteration() is a zero-based index, so the best model contains
# best_iteration + 1 trees; passing the bare index would drop the last one.
final_model = CatBoostRegressor(
    **best_params,
    iterations=cat_model.get_best_iteration() + 1,
    cat_features=categorical_features_indices
)
final_model.fit(X, y_log, cat_features=categorical_features_indices)

features = X.columns.tolist()

# Feature importances of the final model, sorted from most to least important.
fi = final_model.get_feature_importance(prettified=False)
df_fi = pd.DataFrame({"feature": features, "importance": fi})
df_fi = df_fi.sort_values("importance", ascending=False).reset_index(drop=True)

# Stored as a JSON list of {"feature", "importance"} records.
df_fi.to_json(OUT_FEATURES_PATH, orient="records")



out_dir = MODEL_DIR + "/"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "catboost_model_v8.cbm")

# Persist the final model.
final_model.save_model(out_path)

print("\nAdım 2 Tamamlandı: 'catboost_model_v8.cbm' dosyası olarak model kaydedildi.")

Adım 2: Model Eğitimi Başlandı.
İşlenmiş train verisi ('train_processed.csv') yüklendi.
CatBoost için belirlenen kategorik özellik indeksleri: [22, 23, 24, 26, 27, 28]
Train seti boyutu: 56605, Validation seti boyutu: 14152
CatBoost Modeli eğitimi başlıyor...
0:	learn: 0.7440925	test: 0.7440154	best: 0.7440154 (0)	total: 195ms	remaining: 14m 38s
500:	learn: 0.4365973	test: 0.4343283	best: 0.4343283 (500)	total: 50.6s	remaining: 6m 43s
1000:	learn: 0.4337088	test: 0.4336672	best: 0.4336672 (1000)	total: 1m 21s	remaining: 4m 45s
1500:	learn: 0.4316438	test: 0.4335475	best: 0.4335354 (1488)	total: 1m 52s	remaining: 3m 44s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.4334927689
bestIteration = 1698

Shrink model to first 1699 iterations.

Validation Seti Üzerindeki MSE Skoru (CatBoost): 240.5440
Validation Seti Üzerindeki RMSE Skoru (CatBoost): 15.5095

Final CatBoost modeli tüm train verisi üzerinde eğitiliyor...
0:	learn: 0.7440795	total: 70.9ms	remaining: 2m
500:

**HPO Kodu (Hiperparametre Optimizasyonu)**

In [None]:
import pandas as pd
import numpy as np
import optuna
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import gc

print("Adım 2b: Hiperparametre Optimizasyonu (Optuna ile) Başladı.")

# --- Optuna storage and study settings ---
DB_FILENAME = "optuna_studies.db"
STUDY_NAME = "catboost_v8_features"  # change this name for every new feature set

train_path = "/content/datathon/processed/train_processed_v8.csv"

feature_path = "/content/models/V8/feature_importance/OUT/importance.json"

# --- Load the processed training table and the feature-importance list ---
try:
    df_train = pd.read_csv(train_path, index_col='user_session')
    print("İşlenmiş train verisi ('train_processed_v8.csv') yüklendi.")

    with open(feature_path, 'r') as f:
        features_importance = json.load(f)
    print(f"Özelikler Başarıyla yüklendi... {len(features_importance)}")
except FileNotFoundError as err:
    # Either input may be the missing one, so report the path that failed
    # instead of always blaming the training CSV.
    print(f"Hata: '{err.filename}' bulunamadı. Lütfen önce veri hazırlama ve model eğitimi adımlarını çalıştırın.")
    exit()

# --- Step 1: feature selection ---
# Keep the features whose stored importance reaches the threshold.
print("\nÖzelikler Seçiliyor...")
importance_threshold = 0.0001
selected_features = [item['feature'] for item in features_importance if item['importance'] >= importance_threshold]
X = df_train[selected_features]
print(f"Önemi >= {importance_threshold} olan {len(selected_features)} adet özellik seçildi.")


# --- Target ---
# The model is fitted on log1p(target).
y = df_train['session_value']
y_log = np.log1p(y)

# --- Categorical features ---
# Columns from this list that are present in X are passed to CatBoost as
# categorical, identified by their positional index.
categorical_features_indices = []
for i, col in enumerate(X.columns):
    if col in ['first_event_category', 'first_event_product', 'first_event_type',
               'last_event_category', 'last_event_product', 'last_event_type',
               'is_weekend', 'time_of_day']:
        categorical_features_indices.append(i)

print(f"CatBoost için belirlenen kategorik özellik indeksleri: {categorical_features_indices}")


# Unshuffled 80/20 split: the last 20% of the rows form the validation set.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, shuffle=False
)

# Objective fonksiyonu (GÜNCELLENDİ)
def objective(trial):
    """Optuna objective: fit CatBoost for one trial and return validation MSE.

    The model is trained on the log1p target with early stopping while both
    the training and the validation split are monitored. The returned MSE is
    computed on the original target scale, with negative predictions set to 0.
    """
    # Hyperparameters sampled by Optuna for this trial.
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    # Settings shared by every trial.
    fixed = {
        'objective': 'RMSE',
        'iterations': 4500,
        'random_seed': 42,
        'verbose': 0,
    }

    model = CatBoostRegressor(**fixed, **tuned, cat_features=categorical_features_indices)

    # Monitor the train split (validation_0) and the hold-out split (validation_1).
    monitored_sets = [(X_train, y_train_log), (X_val, y_val_log)]
    model.fit(
        X_train, y_train_log,
        eval_set=monitored_sets,
        early_stopping_rounds=300,
        verbose=0,
        cat_features=categorical_features_indices
    )

    # Best scores recorded during training.
    best_scores = model.get_best_score()
    best_learn_rmse = best_scores['learn']['RMSE']
    val_rmse_0 = best_scores['validation_0']['RMSE']
    val_rmse_1 = best_scores['validation_1']['RMSE']

    # Validation MSE on the original scale; negative predictions become 0.
    val_predictions = np.clip(np.expm1(model.predict(X_val)), 0, None)
    val_targets = np.expm1(y_val_log)
    mse = mean_squared_error(val_targets, val_predictions)

    # Per-trial report.
    print(f"✅ Trial {trial.number} bitti | MSE: {mse:.4f} | Learn RMSE: {best_learn_rmse:.4f} | Val_0 RMSE: {val_rmse_0:.4f} | Val_1 RMSE : {val_rmse_1:.4f} | İterasyon: {model.get_best_iteration()}")
    gc.collect()
    return mse


# --- Run the optimisation, persisted in a SQLite database ---
# Connection string of the SQLite storage.
storage_name = f"sqlite:///{DB_FILENAME}"

# Create the study, or load it from the database when it already exists.
study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=storage_name,
    direction='minimize',
    load_if_exists=True  # continue an existing study instead of starting over
)

# Run the optimisation.
n_trials = 50
print(f"Optimizasyon başlıyor... Sonuçlar '{DB_FILENAME}' dosyasına kaydedilecek.")
# optimize() runs n_trials additional trials on top of those already stored,
# so the message reports new trials rather than a total.
print(f"Mevcut deneme sayısı: {len(study.trials)}. Bu çalıştırmada {n_trials} yeni deneme yapılacak.")
study.optimize(objective, n_trials=n_trials)

# --- Report the results ---
print("\nOptimizasyon Tamamlandı!")
print(f"Toplam deneme sayısı: {len(study.trials)}")
print(f"En iyi denemenin skoru (MSE): {study.best_value}")
print("En iyi denemenin parametreleri:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

Adım 2b: Hiperparametre Optimizasyonu (Optuna ile) Başladı.


[I 2025-08-27 21:27:41,575] Using an existing study with name 'catboost_v8_features' instead of creating a new one.


İşlenmiş train verisi ('train_processed_v8.csv') yüklendi.
Özelikler Başarıyla yüklendi... 40

Özelikler Seçiliyor...
Önemi >= 0.0001 olan 40 adet özellik seçildi.
CatBoost için belirlenen kategorik özellik indeksleri: [1, 2, 30, 32, 33, 34]
Optimizasyon başlıyor... Sonuçlar 'optuna_studies.db' dosyasına kaydedilecek.
Mevcut deneme sayısı: 2. Toplamda 50 denemeye ulaşılacak.


[I 2025-08-27 21:30:07,094] Trial 2 finished with value: 323.74163680808306 and parameters: {'learning_rate': 0.05567894414571388, 'depth': 7, 'l2_leaf_reg': 1.0567970035254877, 'colsample_bylevel': 0.5246516723137997, 'min_child_samples': 69}. Best is trial 2 with value: 323.74163680808306.


✅ Trial 2 bitti | MSE: 323.7416 | Learn RMSE: 0.4189 | Val_0 RMSE: 0.4302 | Val_1 RMSE : 0.4335 | İterasyon: 723


[I 2025-08-27 21:33:26,205] Trial 3 finished with value: 308.1041016319068 and parameters: {'learning_rate': 0.028837841174583177, 'depth': 8, 'l2_leaf_reg': 4.690516999359494, 'colsample_bylevel': 0.6078890168900097, 'min_child_samples': 20}. Best is trial 3 with value: 308.1041016319068.


✅ Trial 3 bitti | MSE: 308.1041 | Learn RMSE: 0.4247 | Val_0 RMSE: 0.4322 | Val_1 RMSE : 0.4339 | İterasyon: 858
✅ Trial 4 bitti | MSE: 326.9889 | Learn RMSE: 0.4175 | Val_0 RMSE: 0.4296 | Val_1 RMSE : 0.4340 | İterasyon: 1513


[I 2025-08-27 21:40:17,019] Trial 4 finished with value: 326.9888509242429 and parameters: {'learning_rate': 0.01572170740815751, 'depth': 9, 'l2_leaf_reg': 2.093945848507491, 'colsample_bylevel': 0.6340791593926816, 'min_child_samples': 68}. Best is trial 3 with value: 308.1041016319068.
[I 2025-08-27 21:43:11,655] Trial 5 finished with value: 351.642499104365 and parameters: {'learning_rate': 0.07082984348646865, 'depth': 10, 'l2_leaf_reg': 7.79658367395018, 'colsample_bylevel': 0.5841911735435026, 'min_child_samples': 35}. Best is trial 3 with value: 308.1041016319068.


✅ Trial 5 bitti | MSE: 351.6425 | Learn RMSE: 0.4074 | Val_0 RMSE: 0.4269 | Val_1 RMSE : 0.4345 | İterasyon: 321


[I 2025-08-27 21:45:38,512] Trial 6 finished with value: 349.5436629884279 and parameters: {'learning_rate': 0.06014270439798817, 'depth': 9, 'l2_leaf_reg': 2.2984653716438466, 'colsample_bylevel': 0.6214218781208825, 'min_child_samples': 55}. Best is trial 3 with value: 308.1041016319068.


✅ Trial 6 bitti | MSE: 349.5437 | Learn RMSE: 0.4098 | Val_0 RMSE: 0.4270 | Val_1 RMSE : 0.4343 | İterasyon: 370


[I 2025-08-27 21:54:15,379] Trial 7 finished with value: 295.6557327547748 and parameters: {'learning_rate': 0.015299248235424464, 'depth': 7, 'l2_leaf_reg': 4.31695854709011, 'colsample_bylevel': 0.8223437708319301, 'min_child_samples': 73}. Best is trial 7 with value: 295.6557327547748.


✅ Trial 7 bitti | MSE: 295.6557 | Learn RMSE: 0.4244 | Val_0 RMSE: 0.4323 | Val_1 RMSE : 0.4334 | İterasyon: 2950


[I 2025-08-27 21:58:58,367] Trial 8 finished with value: 307.40918489792773 and parameters: {'learning_rate': 0.027063249464671996, 'depth': 8, 'l2_leaf_reg': 1.4902327885440494, 'colsample_bylevel': 0.927762180574875, 'min_child_samples': 59}. Best is trial 7 with value: 295.6557327547748.


✅ Trial 8 bitti | MSE: 307.4092 | Learn RMSE: 0.4189 | Val_0 RMSE: 0.4297 | Val_1 RMSE : 0.4339 | İterasyon: 1080


[I 2025-08-27 22:02:16,594] Trial 9 finished with value: 334.86125711483055 and parameters: {'learning_rate': 0.03452208222206153, 'depth': 9, 'l2_leaf_reg': 1.2796382606577357, 'colsample_bylevel': 0.5422344327456619, 'min_child_samples': 75}. Best is trial 7 with value: 295.6557327547748.


✅ Trial 9 bitti | MSE: 334.8613 | Learn RMSE: 0.4116 | Val_0 RMSE: 0.4276 | Val_1 RMSE : 0.4339 | İterasyon: 667


[I 2025-08-27 22:07:39,963] Trial 10 finished with value: 298.8624889402533 and parameters: {'learning_rate': 0.020668681692170694, 'depth': 7, 'l2_leaf_reg': 1.6245950099610458, 'colsample_bylevel': 0.584527317741043, 'min_child_samples': 71}. Best is trial 7 with value: 295.6557327547748.


✅ Trial 10 bitti | MSE: 298.8625 | Learn RMSE: 0.4224 | Val_0 RMSE: 0.4309 | Val_1 RMSE : 0.4335 | İterasyon: 2037


[I 2025-08-27 22:10:19,543] Trial 11 finished with value: 268.4577564158097 and parameters: {'learning_rate': 0.07862352556881896, 'depth': 5, 'l2_leaf_reg': 3.283153937082837, 'colsample_bylevel': 0.9896632529287785, 'min_child_samples': 100}. Best is trial 11 with value: 268.4577564158097.


✅ Trial 11 bitti | MSE: 268.4578 | Learn RMSE: 0.4255 | Val_0 RMSE: 0.4327 | Val_1 RMSE : 0.4332 | İterasyon: 1136


[I 2025-08-27 22:12:31,385] Trial 12 finished with value: 258.55420751843866 and parameters: {'learning_rate': 0.09718452345001552, 'depth': 4, 'l2_leaf_reg': 8.520925473672282, 'colsample_bylevel': 0.9913100221565152, 'min_child_samples': 100}. Best is trial 12 with value: 258.55420751843866.


✅ Trial 12 bitti | MSE: 258.5542 | Learn RMSE: 0.4289 | Val_0 RMSE: 0.4342 | Val_1 RMSE : 0.4335 | İterasyon: 1258


[I 2025-08-27 22:14:37,089] Trial 13 finished with value: 263.31076489075053 and parameters: {'learning_rate': 0.09499731474122255, 'depth': 4, 'l2_leaf_reg': 8.681009409323153, 'colsample_bylevel': 0.9894855607812741, 'min_child_samples': 100}. Best is trial 12 with value: 258.55420751843866.


✅ Trial 13 bitti | MSE: 263.3108 | Learn RMSE: 0.4295 | Val_0 RMSE: 0.4344 | Val_1 RMSE : 0.4335 | İterasyon: 1183
✅ Trial 14 bitti | MSE: 248.3322 | Learn RMSE: 0.4288 | Val_0 RMSE: 0.4337 | Val_1 RMSE : 0.4336 | İterasyon: 1380


[I 2025-08-27 22:16:49,446] Trial 14 finished with value: 248.33219630473624 and parameters: {'learning_rate': 0.0963743849659102, 'depth': 4, 'l2_leaf_reg': 9.496117162230513, 'colsample_bylevel': 0.8666314642300947, 'min_child_samples': 100}. Best is trial 14 with value: 248.33219630473624.
[I 2025-08-27 22:21:12,336] Trial 15 finished with value: 245.6544632660239 and parameters: {'learning_rate': 0.04509987812095391, 'depth': 4, 'l2_leaf_reg': 6.467720823580776, 'colsample_bylevel': 0.8349418850477369, 'min_child_samples': 88}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 15 bitti | MSE: 245.6545 | Learn RMSE: 0.4287 | Val_0 RMSE: 0.4337 | Val_1 RMSE : 0.4332 | İterasyon: 3078


[I 2025-08-27 22:25:46,381] Trial 16 finished with value: 253.55283680139527 and parameters: {'learning_rate': 0.040320858192304763, 'depth': 5, 'l2_leaf_reg': 6.225963708446461, 'colsample_bylevel': 0.7974746534255553, 'min_child_samples': 86}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 16 bitti | MSE: 253.5528 | Learn RMSE: 0.4266 | Val_0 RMSE: 0.4333 | Val_1 RMSE : 0.4332 | İterasyon: 2415


[I 2025-08-27 22:29:21,657] Trial 17 finished with value: 273.20414231284224 and parameters: {'learning_rate': 0.046604640149513585, 'depth': 5, 'l2_leaf_reg': 5.858180918661651, 'colsample_bylevel': 0.866856930239367, 'min_child_samples': 86}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 17 bitti | MSE: 273.2041 | Learn RMSE: 0.4278 | Val_0 RMSE: 0.4339 | Val_1 RMSE : 0.4333 | İterasyon: 1752


[I 2025-08-27 22:38:14,786] Trial 18 finished with value: 278.4271090049249 and parameters: {'learning_rate': 0.012067541772649204, 'depth': 6, 'l2_leaf_reg': 9.887423607875338, 'colsample_bylevel': 0.7234242125526023, 'min_child_samples': 39}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 18 bitti | MSE: 278.4271 | Learn RMSE: 0.4297 | Val_0 RMSE: 0.4346 | Val_1 RMSE : 0.4332 | İterasyon: 4453


[I 2025-08-27 22:41:40,495] Trial 19 finished with value: 251.18313972412466 and parameters: {'learning_rate': 0.044612381858249416, 'depth': 4, 'l2_leaf_reg': 6.041026279480231, 'colsample_bylevel': 0.7232950730696212, 'min_child_samples': 87}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 19 bitti | MSE: 251.1831 | Learn RMSE: 0.4295 | Val_0 RMSE: 0.4337 | Val_1 RMSE : 0.4333 | İterasyon: 2624


[I 2025-08-27 22:44:28,571] Trial 20 finished with value: 284.9660134191306 and parameters: {'learning_rate': 0.06801978529580795, 'depth': 6, 'l2_leaf_reg': 4.31571783885787, 'colsample_bylevel': 0.8888746338720241, 'min_child_samples': 5}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 20 bitti | MSE: 284.9660 | Learn RMSE: 0.4235 | Val_0 RMSE: 0.4317 | Val_1 RMSE : 0.4334 | İterasyon: 949


[I 2025-08-27 22:50:25,682] Trial 21 finished with value: 257.3361210362106 and parameters: {'learning_rate': 0.02330169645917244, 'depth': 5, 'l2_leaf_reg': 3.2083733209952285, 'colsample_bylevel': 0.7788200516935997, 'min_child_samples': 87}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 21 bitti | MSE: 257.3361 | Learn RMSE: 0.4281 | Val_0 RMSE: 0.4333 | Val_1 RMSE : 0.4331 | İterasyon: 3270


[I 2025-08-27 22:53:12,860] Trial 22 finished with value: 269.45615155945563 and parameters: {'learning_rate': 0.050078537765027825, 'depth': 6, 'l2_leaf_reg': 7.014943306054554, 'colsample_bylevel': 0.6826979329325821, 'min_child_samples': 43}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 22 bitti | MSE: 269.4562 | Learn RMSE: 0.4261 | Val_0 RMSE: 0.4332 | Val_1 RMSE : 0.4332 | İterasyon: 1125


[I 2025-08-27 22:57:20,909] Trial 23 finished with value: 252.97920044189038 and parameters: {'learning_rate': 0.0410409014967393, 'depth': 4, 'l2_leaf_reg': 5.759810356819774, 'colsample_bylevel': 0.7192268553761905, 'min_child_samples': 90}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 23 bitti | MSE: 252.9792 | Learn RMSE: 0.4289 | Val_0 RMSE: 0.4337 | Val_1 RMSE : 0.4331 | İterasyon: 3136


[I 2025-08-27 23:01:18,474] Trial 24 finished with value: 272.37584196341874 and parameters: {'learning_rate': 0.03708697196991385, 'depth': 4, 'l2_leaf_reg': 9.919754656640519, 'colsample_bylevel': 0.8504005606487381, 'min_child_samples': 80}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 24 bitti | MSE: 272.3758 | Learn RMSE: 0.4310 | Val_0 RMSE: 0.4349 | Val_1 RMSE : 0.4334 | İterasyon: 2749


[I 2025-08-27 23:03:22,807] Trial 25 finished with value: 255.68956520689443 and parameters: {'learning_rate': 0.08280574372013957, 'depth': 4, 'l2_leaf_reg': 5.112124570356889, 'colsample_bylevel': 0.9266932086209808, 'min_child_samples': 92}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 25 bitti | MSE: 255.6896 | Learn RMSE: 0.4300 | Val_0 RMSE: 0.4344 | Val_1 RMSE : 0.4334 | İterasyon: 1179


[I 2025-08-27 23:05:54,177] Trial 26 finished with value: 268.5619103521975 and parameters: {'learning_rate': 0.060656339372542015, 'depth': 5, 'l2_leaf_reg': 7.398720367319801, 'colsample_bylevel': 0.7608020936912503, 'min_child_samples': 62}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 26 bitti | MSE: 268.5619 | Learn RMSE: 0.4286 | Val_0 RMSE: 0.4341 | Val_1 RMSE : 0.4333 | İterasyon: 1225


[I 2025-08-27 23:09:07,558] Trial 27 finished with value: 261.148701376083 and parameters: {'learning_rate': 0.04658126284913638, 'depth': 4, 'l2_leaf_reg': 3.740358255544383, 'colsample_bylevel': 0.8179754968994565, 'min_child_samples': 80}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 27 bitti | MSE: 261.1487 | Learn RMSE: 0.4298 | Val_0 RMSE: 0.4339 | Val_1 RMSE : 0.4334 | İterasyon: 2214


[I 2025-08-27 23:14:20,955] Trial 28 finished with value: 287.0656358847187 and parameters: {'learning_rate': 0.031815861778764616, 'depth': 6, 'l2_leaf_reg': 6.549259437650841, 'colsample_bylevel': 0.911829121749149, 'min_child_samples': 95}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 28 bitti | MSE: 287.0656 | Learn RMSE: 0.4257 | Val_0 RMSE: 0.4326 | Val_1 RMSE : 0.4331 | İterasyon: 2023


[I 2025-08-27 23:21:22,605] Trial 29 finished with value: 286.4004467488808 and parameters: {'learning_rate': 0.02063890098648222, 'depth': 5, 'l2_leaf_reg': 2.436443990996021, 'colsample_bylevel': 0.688164175893589, 'min_child_samples': 80}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 29 bitti | MSE: 286.4004 | Learn RMSE: 0.4269 | Val_0 RMSE: 0.4321 | Val_1 RMSE : 0.4331 | İterasyon: 4431


[I 2025-08-27 23:24:51,102] Trial 30 finished with value: 248.12210933725677 and parameters: {'learning_rate': 0.05573286159730011, 'depth': 4, 'l2_leaf_reg': 5.162669444637468, 'colsample_bylevel': 0.84883936806068, 'min_child_samples': 47}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 30 bitti | MSE: 248.1221 | Learn RMSE: 0.4285 | Val_0 RMSE: 0.4335 | Val_1 RMSE : 0.4332 | İterasyon: 2377


[I 2025-08-27 23:27:23,539] Trial 31 finished with value: 250.0560927208526 and parameters: {'learning_rate': 0.05400175907271928, 'depth': 5, 'l2_leaf_reg': 5.117851953961595, 'colsample_bylevel': 0.8425306666900582, 'min_child_samples': 46}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 31 bitti | MSE: 250.0561 | Learn RMSE: 0.4291 | Val_0 RMSE: 0.4333 | Val_1 RMSE : 0.4333 | İterasyon: 1171


[I 2025-08-27 23:29:56,336] Trial 32 finished with value: 250.21911960843232 and parameters: {'learning_rate': 0.08215646875818132, 'depth': 4, 'l2_leaf_reg': 8.571918010408716, 'colsample_bylevel': 0.9549793531250812, 'min_child_samples': 29}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 32 bitti | MSE: 250.2191 | Learn RMSE: 0.4290 | Val_0 RMSE: 0.4337 | Val_1 RMSE : 0.4331 | İterasyon: 1587


[I 2025-08-27 23:32:47,409] Trial 33 finished with value: 268.17815997685904 and parameters: {'learning_rate': 0.05434388724465587, 'depth': 5, 'l2_leaf_reg': 5.436395716253481, 'colsample_bylevel': 0.8472240817115197, 'min_child_samples': 46}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 33 bitti | MSE: 268.1782 | Learn RMSE: 0.4279 | Val_0 RMSE: 0.4339 | Val_1 RMSE : 0.4332 | İterasyon: 1374


[I 2025-08-27 23:35:22,245] Trial 34 finished with value: 267.6202416862611 and parameters: {'learning_rate': 0.0648415287716285, 'depth': 4, 'l2_leaf_reg': 4.803408127337772, 'colsample_bylevel': 0.8816870462471669, 'min_child_samples': 50}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 34 bitti | MSE: 267.6202 | Learn RMSE: 0.4296 | Val_0 RMSE: 0.4335 | Val_1 RMSE : 0.4332 | İterasyon: 1647


[I 2025-08-27 23:38:44,331] Trial 35 finished with value: 263.01158180879384 and parameters: {'learning_rate': 0.05385978681194557, 'depth': 5, 'l2_leaf_reg': 3.88948060269792, 'colsample_bylevel': 0.8135892689060457, 'min_child_samples': 23}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 35 bitti | MSE: 263.0116 | Learn RMSE: 0.4261 | Val_0 RMSE: 0.4324 | Val_1 RMSE : 0.4334 | İterasyon: 1730


[I 2025-08-27 23:40:57,703] Trial 36 finished with value: 281.1260891647535 and parameters: {'learning_rate': 0.07322315421044558, 'depth': 6, 'l2_leaf_reg': 7.36932879845895, 'colsample_bylevel': 0.8407384719603137, 'min_child_samples': 64}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 36 bitti | MSE: 281.1261 | Learn RMSE: 0.4261 | Val_0 RMSE: 0.4330 | Val_1 RMSE : 0.4334 | İterasyon: 731


[I 2025-08-27 23:44:09,612] Trial 37 finished with value: 246.72090365442577 and parameters: {'learning_rate': 0.054388927628454244, 'depth': 4, 'l2_leaf_reg': 4.952389987737656, 'colsample_bylevel': 0.7896351139665998, 'min_child_samples': 34}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 37 bitti | MSE: 246.7209 | Learn RMSE: 0.4289 | Val_0 RMSE: 0.4334 | Val_1 RMSE : 0.4332 | İterasyon: 2283


[I 2025-08-27 23:48:36,508] Trial 38 finished with value: 322.0041906231694 and parameters: {'learning_rate': 0.028129201854692358, 'depth': 8, 'l2_leaf_reg': 2.7370744158056066, 'colsample_bylevel': 0.7881889216561735, 'min_child_samples': 31}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 38 bitti | MSE: 322.0042 | Learn RMSE: 0.4198 | Val_0 RMSE: 0.4303 | Val_1 RMSE : 0.4336 | İterasyon: 1107


[I 2025-08-27 23:50:53,341] Trial 39 finished with value: 250.2047772424403 and parameters: {'learning_rate': 0.05932581812842508, 'depth': 4, 'l2_leaf_reg': 4.1453350939085905, 'colsample_bylevel': 0.7508123333691782, 'min_child_samples': 11}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 39 bitti | MSE: 250.2048 | Learn RMSE: 0.4303 | Val_0 RMSE: 0.4339 | Val_1 RMSE : 0.4333 | İterasyon: 1595


[I 2025-08-27 23:55:36,768] Trial 40 finished with value: 368.3230160927863 and parameters: {'learning_rate': 0.038428353621106644, 'depth': 10, 'l2_leaf_reg': 7.737126350334536, 'colsample_bylevel': 0.8981688527919165, 'min_child_samples': 55}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 40 bitti | MSE: 368.3230 | Learn RMSE: 0.4161 | Val_0 RMSE: 0.4291 | Val_1 RMSE : 0.4344 | İterasyon: 544


[I 2025-08-27 23:59:59,922] Trial 41 finished with value: 293.9720243361502 and parameters: {'learning_rate': 0.03318824888167874, 'depth': 7, 'l2_leaf_reg': 6.628476134350515, 'colsample_bylevel': 0.9509172212249041, 'min_child_samples': 36}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 41 bitti | MSE: 293.9720 | Learn RMSE: 0.4251 | Val_0 RMSE: 0.4330 | Val_1 RMSE : 0.4334 | İterasyon: 1261


[I 2025-08-28 00:01:55,991] Trial 42 finished with value: 256.7931456171273 and parameters: {'learning_rate': 0.08981877190456083, 'depth': 4, 'l2_leaf_reg': 3.5027056353577484, 'colsample_bylevel': 0.7670405731460332, 'min_child_samples': 19}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 42 bitti | MSE: 256.7931 | Learn RMSE: 0.4286 | Val_0 RMSE: 0.4330 | Val_1 RMSE : 0.4334 | İterasyon: 1267


[I 2025-08-28 00:04:22,851] Trial 43 finished with value: 260.18748210983523 and parameters: {'learning_rate': 0.05239559245800792, 'depth': 5, 'l2_leaf_reg': 4.689602408491179, 'colsample_bylevel': 0.8388802486628076, 'min_child_samples': 45}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 43 bitti | MSE: 260.1875 | Learn RMSE: 0.4293 | Val_0 RMSE: 0.4336 | Val_1 RMSE : 0.4334 | İterasyon: 1140


[I 2025-08-28 00:06:32,724] Trial 44 finished with value: 256.8014660189471 and parameters: {'learning_rate': 0.06491456154537105, 'depth': 4, 'l2_leaf_reg': 5.169926892269327, 'colsample_bylevel': 0.8759838901817196, 'min_child_samples': 51}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 44 bitti | MSE: 256.8015 | Learn RMSE: 0.4308 | Val_0 RMSE: 0.4346 | Val_1 RMSE : 0.4334 | İterasyon: 1352


[I 2025-08-28 00:08:40,524] Trial 45 finished with value: 262.58472438402833 and parameters: {'learning_rate': 0.07472776134676867, 'depth': 4, 'l2_leaf_reg': 4.704784268359444, 'colsample_bylevel': 0.8114555581988306, 'min_child_samples': 41}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 45 bitti | MSE: 262.5847 | Learn RMSE: 0.4294 | Val_0 RMSE: 0.4337 | Val_1 RMSE : 0.4334 | İterasyon: 1393


[I 2025-08-28 00:12:52,575] Trial 46 finished with value: 265.4991754903706 and parameters: {'learning_rate': 0.04343092931873186, 'depth': 5, 'l2_leaf_reg': 8.651188739092461, 'colsample_bylevel': 0.8629909168564932, 'min_child_samples': 31}. Best is trial 15 with value: 245.6544632660239.


✅ Trial 46 bitti | MSE: 265.4992 | Learn RMSE: 0.4274 | Val_0 RMSE: 0.4335 | Val_1 RMSE : 0.4334 | İterasyon: 2136


[I 2025-08-28 00:15:52,336] Trial 47 finished with value: 234.08094384953523 and parameters: {'learning_rate': 0.05705622600719216, 'depth': 4, 'l2_leaf_reg': 2.768927236825974, 'colsample_bylevel': 0.8234334604424713, 'min_child_samples': 56}. Best is trial 47 with value: 234.08094384953523.


✅ Trial 47 bitti | MSE: 234.0809 | Learn RMSE: 0.4287 | Val_0 RMSE: 0.4330 | Val_1 RMSE : 0.4332 | İterasyon: 2048


[I 2025-08-28 00:18:34,904] Trial 48 finished with value: 254.68790051631066 and parameters: {'learning_rate': 0.05984415987208251, 'depth': 4, 'l2_leaf_reg': 2.863924666674996, 'colsample_bylevel': 0.7979180008220116, 'min_child_samples': 96}. Best is trial 47 with value: 234.08094384953523.


✅ Trial 48 bitti | MSE: 254.6879 | Learn RMSE: 0.4292 | Val_0 RMSE: 0.4336 | Val_1 RMSE : 0.4334 | İterasyon: 1825


[I 2025-08-28 00:20:34,771] Trial 49 finished with value: 253.9174859393951 and parameters: {'learning_rate': 0.08758442319673514, 'depth': 4, 'l2_leaf_reg': 1.85994513298214, 'colsample_bylevel': 0.8279071804389915, 'min_child_samples': 68}. Best is trial 47 with value: 234.08094384953523.


✅ Trial 49 bitti | MSE: 253.9175 | Learn RMSE: 0.4284 | Val_0 RMSE: 0.4331 | Val_1 RMSE : 0.4334 | İterasyon: 1245


[I 2025-08-28 00:23:50,588] Trial 50 finished with value: 319.72536560597877 and parameters: {'learning_rate': 0.048784711769565024, 'depth': 9, 'l2_leaf_reg': 1.9837821376996554, 'colsample_bylevel': 0.9191875226170217, 'min_child_samples': 57}. Best is trial 47 with value: 234.08094384953523.


✅ Trial 50 bitti | MSE: 319.7254 | Learn RMSE: 0.4110 | Val_0 RMSE: 0.4278 | Val_1 RMSE : 0.4343 | İterasyon: 452


[I 2025-08-28 00:28:35,538] Trial 51 finished with value: 267.22850933732946 and parameters: {'learning_rate': 0.035621561485923665, 'depth': 5, 'l2_leaf_reg': 2.9254999757874605, 'colsample_bylevel': 0.8630944492155944, 'min_child_samples': 23}. Best is trial 47 with value: 234.08094384953523.


✅ Trial 51 bitti | MSE: 267.2285 | Learn RMSE: 0.4268 | Val_0 RMSE: 0.4330 | Val_1 RMSE : 0.4333 | İterasyon: 2400

Optimizasyon Tamamlandı!
Toplam deneme sayısı: 52
En iyi denemenin skoru (MSE): 234.08094384953523
En iyi denemenin parametreleri:
    learning_rate: 0.05705622600719216
    depth: 4
    l2_leaf_reg: 2.768927236825974
    colsample_bylevel: 0.8234334604424713
    min_child_samples: 56


**Model Eğitimi - Özellik Seçilimi**

In [None]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# Step 2 configuration: announce the step, define every path it uses and make
# sure the directories it touches exist.
print("Adım 2: Model Eğitimi Başlandı.")

MODEL_DIR = "/content/models/V8"
IN_TRAIN_PATH = "/content/datathon/processed/train_processed_v8.csv"

# IN holds the importance file read by this step; OUT receives the importance
# file this step writes.
FEATURE_IMPORTANCE_DIR_IN = f"{MODEL_DIR}/feature_importance/IN/"
FEATURE_IMPORTANCE_DIR_OUT = f"{MODEL_DIR}/feature_importance/OUT/"
IN_FEATURES_PATH = f"{FEATURE_IMPORTANCE_DIR_IN}importance.json"
OUT_FEATURES_PATH = f"{FEATURE_IMPORTANCE_DIR_OUT}importance.json"

for _directory in (MODEL_DIR, FEATURE_IMPORTANCE_DIR_IN, FEATURE_IMPORTANCE_DIR_OUT):
    os.makedirs(_directory, exist_ok=True)
# --- Load the processed data ---
# Reads the processed training table (indexed by 'user_session') and the
# feature-importance records used for feature selection further down.
try:
    df_train = pd.read_csv(IN_TRAIN_PATH, index_col='user_session')
    print("İşlenmiş train verisi ('train_processed.csv') yüklendi.")

    with open(IN_FEATURES_PATH, 'r') as f:
        features_in = json.load(f)

    print("Özellikler yüklendi.")
except FileNotFoundError as e:
    # Either input can be the missing one, so report the path carried by the
    # exception instead of always blaming the training CSV.
    print(f"Hata: {e}. Lütfen önce bu dosyayı üreten adımı çalıştırın.")
    exit()

# --- Prepare the data for modelling ---
# Separate the target column from the candidate feature columns.
X = df_train.drop(columns=['session_value'])
y = df_train['session_value']

# The model is fitted on log1p(target).
y_log = np.log1p(y)

# Feature selection: keep the features whose stored importance reaches the
# threshold, in the order they appear in the importance file.
importance_threshold = 0.08
print(f"Toplam özellik sayısı: {len(features_in)}")
features_in = [
    record['feature']
    for record in features_in
    if record['importance'] >= importance_threshold
]
print(f"Seçilen özellik sayısı: {len(features_in)}")
X = X.loc[:, features_in]

# --- Determine the categorical features ---
# Columns from the V8 data preparation that are treated as categorical
# (first/last event category, product and type, plus 'is_weekend' and
# 'time_of_day'). CatBoost receives their positions within X.
_categorical_candidates = {
    'first_event_category', 'first_event_product', 'first_event_type',
    'last_event_category', 'last_event_product', 'last_event_type',
    'is_weekend', 'time_of_day',
}
categorical_features_indices = [
    position
    for position, column in enumerate(X.columns)
    if column in _categorical_candidates
]

print(f"CatBoost için belirlenen kategorik özellik indeksleri: {categorical_features_indices}")


# --- Time-based validation ---
# shuffle=False keeps the row order of X, so the last 20% of the rows form the
# validation set.
# NOTE(review): this is only a time-based split if the rows are already in
# chronological order - confirm against the data-preparation step.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    shuffle=False,
)
print(f"Train seti boyutu: {len(X_train)}, Validation seti boyutu: {len(X_val)}")

# --- Train and evaluate the CatBoost model ---
print("CatBoost Modeli eğitimi başlıyor...")

# Hyper-parameters of the best tuning trial plus fixed training settings.
best_params = dict(
    learning_rate=0.05705622600719216,
    depth=4,
    l2_leaf_reg=2.768927236825974,
    colsample_bylevel=0.8234334604424713,
    min_child_samples=56,
    objective='RMSE',
    random_seed=42,
    verbose=500,
)

cat_model = CatBoostRegressor(
    iterations=4500,
    eval_metric='RMSE',
    early_stopping_rounds=300,
    cat_features=categorical_features_indices,  # positions of the categorical columns
    **best_params,
)

cat_model.fit(
    X_train,
    y_train_log,
    eval_set=(X_val, y_val_log),
    cat_features=categorical_features_indices,
)

# --- Performance evaluation ---
# Predictions come out in log1p space; map them back and floor negatives at 0
# before scoring against the original-scale target.
val_preds_log = cat_model.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds = np.where(val_preds < 0, 0, val_preds)
y_val = np.expm1(y_val_log)

validation_mse = mean_squared_error(y_val, val_preds)
print(f"\nValidation Seti Üzerindeki MSE Skoru (CatBoost): {validation_mse:.4f}")
print(f"Validation Seti Üzerindeki RMSE Skoru (CatBoost): {np.sqrt(validation_mse):.4f}")

# --- Train the final model on all training data and save it ---
print("\nFinal CatBoost modeli tüm train verisi üzerinde eğitiliyor...")

# get_best_iteration() is a zero-based index: the recorded training log shows
# bestIteration = 1892 followed by "Shrink model to first 1893 iterations".
# The final model therefore needs best_iteration + 1 trees to match the
# validated one. If no best iteration is available, fall back to the tree
# count of the validated model.
best_iteration = cat_model.get_best_iteration()
if best_iteration is None:
    final_iterations = cat_model.tree_count_
else:
    final_iterations = best_iteration + 1

final_model = CatBoostRegressor(
    **best_params,
    iterations=final_iterations,
    cat_features=categorical_features_indices  # positions of the categorical columns
)
final_model.fit(X, y_log, cat_features=categorical_features_indices)

features_out = X.columns.tolist()

# Pair every training column with its importance and sort from most to least
# important.
fi = final_model.get_feature_importance(prettified=False)
df_fi = pd.DataFrame({"feature": features_out, "importance": fi})
df_fi = df_fi.sort_values("importance", ascending=False).reset_index(drop=True)

# The importances are written as JSON records only. The sorted order is an
# importance ranking, not the column order the model was trained with.
df_fi.to_json(OUT_FEATURES_PATH, orient="records")

out_dir = "/content/models/V8/"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "catboost_model_v8_selected.cbm")

# Save the model
final_model.save_model(out_path)

print("\nAdım 2 Tamamlandı: 'catboost_model_v8_selected.cbm' dosyası olarak model kaydedildi.")

Adım 2: Model Eğitimi Başlandı.
İşlenmiş train verisi ('train_processed.csv') yüklendi.
Özellikler yüklendi.
Toplam özellik sayısı: 45
Seçilen özellik sayısı: 40
CatBoost için belirlenen kategorik özellik indeksleri: [0, 4, 31, 32, 34, 36]
Train seti boyutu: 56605, Validation seti boyutu: 14152
CatBoost Modeli eğitimi başlıyor...
0:	learn: 0.7437963	test: 0.7437897	best: 0.7437897 (0)	total: 119ms	remaining: 8m 56s
500:	learn: 0.4367338	test: 0.4345061	best: 0.4345024 (499)	total: 32.7s	remaining: 4m 20s
1000:	learn: 0.4336994	test: 0.4337858	best: 0.4337787 (996)	total: 1m 1s	remaining: 3m 34s
1500:	learn: 0.4315701	test: 0.4335246	best: 0.4335240 (1499)	total: 1m 31s	remaining: 3m 1s
2000:	learn: 0.4298465	test: 0.4334069	best: 0.4333759 (1892)	total: 2m 5s	remaining: 2m 36s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.4333759443
bestIteration = 1892

Shrink model to first 1893 iterations.

Validation Seti Üzerindeki MSE Skoru (CatBoost): 250.9356
Validation S

**Submission Oluşturma**

In [None]:
import pandas as pd
import os
import json
import numpy as np
from catboost import CatBoostRegressor, Pool

# Step 3 settings: announce the step and define every input/output path.
print("Adım 3 (Akıllı Doldurma ile): Tahmin ve Gönderim Başladı.")

# --- Settings ---
_DATA_DIR = '/content/datathon'
_MODEL_DIR = '/content/models/V8'

PROCESSED_TEST_FILE = f'{_DATA_DIR}/processed/test_processed_v8.csv'
# NOTE(review): the training step in this notebook saves
# 'catboost_model_v8_selected.cbm'; this path points at 'catboost_model_v8.cbm'.
# Confirm that this is the model meant to be used for the submission.
MODEL_FILE = f'{_MODEL_DIR}/catboost_model_v8.cbm'
SELECTED_FEATURES_PATH = f'{_MODEL_DIR}/feature_importance/IN/importance.json'
TRAIN_RAW_PATH = f'{_DATA_DIR}/train.csv'  # needed for leak detection
TEST_RAW_PATH = f'{_DATA_DIR}/test.csv'    # needed for leak and anomaly detection

SUBMISSION_DIR = "/content/submissions"
SUBMISSION_FILE = f'{SUBMISSION_DIR}/submission_v8.csv'
os.makedirs(SUBMISSION_DIR, exist_ok=True)


# --- Load the required files ---
# Reads the processed test table, the sample submission, both raw tables, the
# feature records and the trained model. A missing file ends the step.
try:
    # The processed test table is read without index_col; 'user_session' is
    # turned into the index later, just before prediction.
    df_test_processed, df_submission, df_train_raw, df_test_raw = [
        pd.read_csv(csv_path)
        for csv_path in (
            PROCESSED_TEST_FILE,
            '/content/datathon/sample_submission.csv',
            TRAIN_RAW_PATH,
            TEST_RAW_PATH,
        )
    ]

    with open(SELECTED_FEATURES_PATH, 'r') as feature_file:
        selected_features_dict = json.load(feature_file)

    model = CatBoostRegressor()
    model.load_model(MODEL_FILE)

    print(f"Gerekli dosyalar ve {len(selected_features_dict)} adet seçilmiş özellik başarıyla yüklendi.")
except FileNotFoundError as missing:
    print(f"Hata: {missing}.")
    exit()

# --- STEP 1: DETECT THE VERIFIED DATA LEAK ---
print("\nDoğrulanmış veri sızıntısı tespit ediliyor...")

# For every session id, the set of user ids it appears with in each raw file.
train_session_users = df_train_raw.groupby('user_session')['user_id'].apply(set)
test_session_users = df_test_raw.groupby('user_session')['user_id'].apply(set)

# A session counts as a verified leak when its id occurs in both files with
# exactly the same set of users.
common_sessions = set(train_session_users.index) & set(test_session_users.index)
verified_leaked_sessions = set()
for sid in common_sessions:
    if train_session_users[sid] == test_session_users[sid]:
        verified_leaked_sessions.add(sid)

# Known target value (first one per session) of every verified leaked session.
_leaked_rows = df_train_raw[df_train_raw['user_session'].isin(verified_leaked_sessions)]
verified_leak_map = _leaked_rows.groupby('user_session')['session_value'].first().to_dict()
print(f"Tespit edilen DOĞRULANMIŞ sızıntı seans sayısı: {len(verified_leaked_sessions)}")


# --- STEP 2: MAKE THE MODEL PREDICTIONS ---
selected_features = [item['feature'] for item in selected_features_dict]

# Make the column order match what the model expects. The feature file only
# provides a list of names, while model.feature_names_ holds the names in the
# order stored in the model. When every model feature is available in the
# list, use the model's own order; otherwise keep the list as loaded.
model_feature_names = list(model.feature_names_)
if model_feature_names and set(model_feature_names) <= set(selected_features):
    selected_features = model_feature_names
else:
    print("⚠️ UYARI: Modeldeki özellik adları seçilmiş özellik listesiyle eşleşmiyor; liste sırası kullanılıyor.")
print(f"Seçilen Özellikler : {selected_features}")

# Guard against missing columns before indexing the test table.
missing_cols = set(selected_features) - set(df_test_processed.columns)
if missing_cols:
    print(f"HATA: Test verisinde şu sütunlar eksik: {missing_cols}")
    exit()

print("Selected Feature Sayısı : ", len(selected_features))
# Index the test table by session id and keep only the model's features.
df_test_processed.set_index('user_session', inplace=True)
df_test_selected = df_test_processed[selected_features]

# --- Determine the categorical features ---
# Columns from the V8 data preparation that are treated as categorical
# (first/last event category, product and type, plus 'is_weekend' and
# 'time_of_day'). Their positions are computed on the final column order.
categorical_columns = {
    'first_event_category', 'first_event_product', 'first_event_type',
    'last_event_category', 'last_event_product', 'last_event_type',
    'is_weekend', 'time_of_day',
}
categorical_features_indices = [
    i for i, col in enumerate(df_test_selected.columns) if col in categorical_columns
]

print(f"CatBoost için belirlenen kategorik özellik indeksleri: {categorical_features_indices}")

# Wrap the test data in a CatBoost Pool so the categorical columns are declared.
test_pool = Pool(
    data=df_test_selected,
    cat_features=categorical_features_indices
)

# Predict in log1p space, map back with expm1 and floor negatives at 0.
print("Test seti üzerinde model tahminleri yapılıyor...")
test_preds_log = model.predict(test_pool)
final_predictions = np.expm1(test_preds_log)
final_predictions[final_predictions < 0] = 0
df_predictions = pd.DataFrame({
    'user_session': df_test_selected.index,
    'predicted_value': final_predictions
})

# --- STEP 3: BUILD THE SUBMISSION FILE ---
# First map the plain model predictions onto the submission rows.
submission_map = dict(zip(df_predictions['user_session'], df_predictions['predicted_value']))
df_submission['session_value'] = df_submission['user_session'].map(submission_map)

# Then the "smart fill": a test session seen with more than one user id is
# predicted in parts whose ids start with "<session_id>_"; the parts are summed
# back into the original session id.
session_user_counts = df_test_raw.groupby('user_session')['user_id'].nunique()
anomalous_sessions_orig = session_user_counts[session_user_counts > 1].index.tolist()
print(f"\nAkıllı doldurma yapılacak anormal seanslar: {anomalous_sessions_orig}")
for session_id in anomalous_sessions_orig:
    constituent_preds = df_predictions[df_predictions['user_session'].str.startswith(f"{session_id}_")]
    if constituent_preds.empty:
        # Summing zero parts would silently overwrite the session with 0, so
        # keep whatever value the plain mapping produced.
        print(f"⚠️ UYARI: '{session_id}' için parça tahmini bulunamadı; mevcut değer korunuyor.")
        continue
    total_value = constituent_preds['predicted_value'].sum()
    df_submission.loc[df_submission['user_session'] == session_id, 'session_value'] = total_value
    print(f"'{session_id}' için {len(constituent_preds)} parçanın tahmini toplandı: {total_value:.4f}")

# Last and most important step: overwrite predictions with the known values of
# the verified leaked sessions.
print(f"\n{len(verified_leaked_sessions)} adet seansın değeri, sızıntıdan gelen gerçek değerlerle güncelleniyor...")
leaked_updates = df_submission['user_session'].map(verified_leak_map)
df_submission['session_value'] = np.where(leaked_updates.notna(), leaked_updates, df_submission['session_value'])

# --- Final checks ---
nan_count = df_submission['session_value'].isnull().sum()
if nan_count > 0:
    print(f"⚠️ UYARI: Hala {nan_count} adet null değer var! Bunlar 0 ile dolduruluyor.")
    # Assign the result back: an in-place fillna on a selected column is a
    # chained operation that does not update the frame under pandas
    # Copy-on-Write.
    df_submission['session_value'] = df_submission['session_value'].fillna(0)

# The submission must contain exactly this many rows.
excepted_row = 30789
if excepted_row != len(df_submission):
    print(f"Satır sayıları eşleşmiyor. Beklenen : {excepted_row}. Bulunan : {len(df_submission)}")
    exit()

# --- Save the file ---
df_submission.to_csv(SUBMISSION_FILE, index=False)
print(f"\n'{SUBMISSION_FILE}' dosyası başarıyla oluşturuldu! Bu son gönderim için bol şans!")
print("Dosyanın ilk 5 satırı:")
print(df_submission.head())

Adım 3 (Akıllı Doldurma ile): Tahmin ve Gönderim Başladı.
Gerekli dosyalar ve 45 adet seçilmiş özellik başarıyla yüklendi.

Doğrulanmış veri sızıntısı tespit ediliyor...
Tespit edilen DOĞRULANMIŞ sızıntı seans sayısı: 209
Seçilen Özlliklerin Sayısı : ['first_event_type', 'buy_count', 'did_purchase', 'view_to_buy_rate', 'last_event_type', 'buy_x_unique_products', 'buy_x_user_purchase_rate', 'first_event_hour', 'add_cart_count', 'add_cart_to_buy_rate', 'avg_day_of_week', 'user_buy_count', 'unique_categories', 'remove_cart_count', 'last_event_hour', 'user_unique_products_viewed', 'avg_hour', 'buy_x_hour', 'unique_products', 'view_to_add_cart_rate', 'event_count', 'net_cart_additions', 'user_total_events', 'avg_event_order_pct', 'user_purchase_rate', 'view_count', 'avg_event_order', 'std_time_diff', 'is_leaked_session', 'session_duration_seconds', 'user_lifespan_days', 'last_event_product', 'first_event_product', 'avg_time_diff_log', 'first_event_category', 'min_time_diff', 'last_event_cat