# V7

In [2]:
!pip install catboost

Collecting catboost
  Using cached catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading graphviz-0.21-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.3/47.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz, catboost
Successfully installed catboost-1.2.8 graphviz-0.21


In [5]:
import pandas as pd

df_submission = pd.read_csv('./submission.csv')
session_values = df_submission['session_value']

print("Ortalama:", session_values.mean())
print("Standart Sapma:", session_values.std())
print("Maksimum:", session_values.max())
print("Minimum:", session_values.min())

Ortalama: 41.438740199599046
Standart Sapma: 44.11533396262052
Maksimum: 713.970248534592
Minimum: 9.140855995108964


In [None]:
# -*- coding: utf-8 -*-
"""
V8_Pseudo_Labeling_Integrated_Script.py

This script combines data preparation, a teacher-student pseudo-labeling framework,
hyperparameter optimization for the pseudo-labeling process, and submission generation
into a single, cohesive workflow.

Workflow:
1.  **Data Preparation**: Processes raw data and engineers features as in v7. It also
    identifies and separates "leaked" sessions present in both train and test sets.
2.  **Teacher Model Training**: A baseline CatBoost model is trained on the clean
    (non-leaked) training data.
3.  **Pseudo-Labeling HPO**: Optuna is used to find the best hyperparameters for the
    student model's training process. This HPO tunes the *weight* of pseudo-labels
    and a *filtering threshold*, not the model's architecture.
4.  **Final Student Model Training**: Using the best HPO parameters, a final student
    model is trained on the combination of clean training data and high-quality
    pseudo-labeled test data.
5.  **Feature Selection**: Feature importance is calculated from the final student model,
    and a new, leaner model is trained using only the most impactful features.
6.  **Submission Generation**: The final predictions are made using the feature-selected
    student model. The predictions for the "leaked" sessions are then overwritten
    with their known true values to maximize accuracy.
"""

# ==============================================================================
# 0. SETUP AND IMPORTS
# ==============================================================================
import pandas as pd
import numpy as np
import os
import gc
import json
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

print("Script V8: Full Workflow with Pseudo-Labeling Initiated.")

# --- Configuration ---
# Input Paths
IN_TRAIN_PATH = '/content/datathon/train.csv'
IN_TEST_PATH = '/content/datathon/test.csv'
SAMPLE_SUBMISSION_PATH = '/content/datathon/sample_submission.csv'

# Output Directories
OUT_DIR = '/content/datathon/processed/'
MODEL_DIR = "/content/models/V8/"
SUBMISSION_DIR = '/content/submissions/'
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Processed File Paths
OUT_TRAIN_PATH = os.path.join(OUT_DIR, 'train_processed_v8.csv')
OUT_TEST_PATH = os.path.join(OUT_DIR, 'test_processed_v8.csv')

# Model & Submission Paths
TEACHER_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model_v8_teacher.cbm")
STUDENT_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model_v8_student_selected.cbm")
SUBMISSION_FILE = os.path.join(SUBMISSION_DIR, 'submission_v8.csv')

# HPO Database
DB_FILENAME = "optuna_studies_v8.db"
STUDY_NAME = "catboost_v8_pseudo_labeling"


# ==============================================================================
# 1. DATA PREPARATION & LEAK IDENTIFICATION
# ==============================================================================
print("\n--- Adım 1: Veri Hazırlama ve Sızıntı Tespiti Başladı ---")

# --- Load Raw Data ---
try:
    df_train_raw = pd.read_csv(IN_TRAIN_PATH, parse_dates=['event_time'])
    df_test_raw = pd.read_csv(IN_TEST_PATH, parse_dates=['event_time'])
    print("Ham veri setleri başarıyla yüklendi.")
except Exception as e:
    print(f"Hata: Ham veri yüklenemedi. {e}")
    exit()

# --- Identify Leaked Sessions ---
print("Ortak (sızıntılı) seanslar tespit ediliyor...")
train_session_users = df_train_raw.groupby('user_session')['user_id'].apply(set)
test_session_users = df_test_raw.groupby('user_session')['user_id'].apply(set)
common_sessions = set(train_session_users.index).intersection(set(test_session_users.index))
verified_leaked_sessions = {sid for sid in common_sessions if train_session_users.get(sid) == test_session_users.get(sid)}

# Create a map of leaked session_id -> session_value
leak_map = df_train_raw[
    df_train_raw['user_session'].isin(verified_leaked_sessions)
].groupby('user_session')['session_value'].first().to_dict()
print(f"Tespit edilen DOĞRULANMIŞ sızıntı seans sayısı: {len(verified_leaked_sessions)}")


# --- Feature Engineering (Based on v7) ---
def fix_anomalous_sessions(df):
    session_user_counts = df.groupby('user_session')['user_id'].nunique()
    anomalous_sessions = session_user_counts[session_user_counts > 1].index
    if not anomalous_sessions.empty:
        anomalous_indices = df['user_session'].isin(anomalous_sessions)
        # Use .loc to avoid SettingWithCopyWarning
        df_copy = df.copy()
        df_copy.loc[anomalous_indices, 'user_session'] = df_copy.loc[anomalous_indices, 'user_session'] + '_' + df_copy.loc[anomalous_indices, 'user_id'].astype(str)
        return df_copy
    return df

df_train_raw_fixed = fix_anomalous_sessions(df_train_raw)
df_test_raw_fixed = fix_anomalous_sessions(df_test_raw)

df_combined = pd.concat([df_train_raw_fixed.drop('session_value', axis=1), df_test_raw_fixed], ignore_index=True)

# User-level features
user_features = df_combined.groupby('user_id').agg(
    user_total_events=('event_type', 'count'),
    user_unique_products_viewed=('product_id', 'nunique'),
    user_first_seen=('event_time', 'min'),
    user_last_seen=('event_time', 'max')
)
user_features['user_lifespan_days'] = (user_features['user_last_seen'] - user_features['user_first_seen']).dt.days
user_buy_counts = df_combined[df_combined['event_type'] == 'BUY'].groupby('user_id').size()
user_features['user_buy_count'] = user_buy_counts
user_features['user_buy_count'].fillna(0, inplace=True)
user_features['user_purchase_rate'] = user_features['user_buy_count'] / user_features['user_total_events']
user_features.drop(['user_first_seen', 'user_last_seen'], axis=1, inplace=True)
del df_combined
gc.collect()

def create_session_features(df, data_type='train'):
    print(f"{data_type} verisi için seans bazlı özellik mühendisliği...")
    df['event_order'] = df.groupby('user_session').cumcount() + 1
    session_event_counts = df.groupby('user_session')['event_type'].transform('count')
    df['event_order_pct'] = df['event_order'] / session_event_counts

    event_type_counts = pd.crosstab(df['user_session'], df['event_type'])
    all_event_types = ['VIEW', 'ADD_CART', 'REMOVE_CART', 'BUY']
    for event in all_event_types:
        if event not in event_type_counts.columns:
            event_type_counts[event] = 0
    event_type_counts.columns = [f'{col.lower()}_count' for col in event_type_counts.columns]

    session_features = df.groupby('user_session').agg(
        user_id=('user_id', 'first'),
        event_count=('event_type', 'count'),
        unique_products=('product_id', 'nunique'),
        unique_categories=('category_id', 'nunique'),
        session_duration_seconds=('event_time', lambda x: (x.max() - x.min()).total_seconds())
    )

    df_session = pd.concat([session_features, event_type_counts], axis=1)

    epsilon = 1e-6
    df_session['view_to_add_cart_rate'] = df_session['add_cart_count'] / (df_session['view_count'] + epsilon)
    df_session['add_cart_to_buy_rate'] = df_session['buy_count'] / (df_session['add_cart_count'] + epsilon)
    df_session['net_cart_additions'] = df_session['add_cart_count'] - df_session['remove_cart_count']
    
    # Merge with user features
    df_session = df_session.merge(user_features, on='user_id', how='left')
    df_session.drop('user_id', axis=1, inplace=True)
    
    return df_session

df_session_train = create_session_features(df_train_raw_fixed, 'train')
df_session_test = create_session_features(df_test_raw_fixed, 'test')

# Add target variable to train set
session_value = df_train_raw_fixed.groupby('user_session')['session_value'].first()
df_session_train['session_value'] = session_value

# --- Separate Leaked Data from Training Set ---
df_session_train_clean = df_session_train[~df_session_train.index.isin(verified_leaked_sessions)]
print(f"Orijinal train seti boyutu: {len(df_session_train)}")
print(f"Sızıntılı veriler ayıklandıktan sonraki train seti boyutu: {len(df_session_train_clean)}")

# Align columns - crucial for combining later
train_cols = df_session_train_clean.drop('session_value', axis=1).columns
test_cols = df_session_test.columns
common_cols = list(set(train_cols) & set(test_cols))
df_session_train_clean = df_session_train_clean[common_cols + ['session_value']]
df_session_test = df_session_test[common_cols]

print("Adım 1 Tamamlandı: Veri işlendi ve sızıntılı seanslar eğitimden ayrıldı.")


# ==============================================================================
# 2. TEACHER MODEL TRAINING
# ==============================================================================
print("\n--- Adım 2: Teacher Model Eğitimi Başladı ---")

# Prepare data for the teacher model
y_teacher = df_session_train_clean['session_value']
X_teacher = df_session_train_clean.drop('session_value', axis=1)
y_teacher_log = np.log1p(y_teacher)

# Using pre-optimized parameters from v7
teacher_params = {
    'learning_rate': 0.06007502637954865,
    'depth': 4,
    'l2_leaf_reg': 2.5440079813104126,
    'colsample_bylevel': 0.8424763199702767,
    'min_child_samples': 29,
    'objective': 'RMSE',
    'random_seed': 42,
    'verbose': 0 # Silent during HPO runs
}

teacher_model = CatBoostRegressor(
    **teacher_params,
    iterations=4500 # A sufficiently large number
)

# Time-based split for early stopping validation
X_train_teacher, X_val_teacher, y_train_teacher_log, y_val_teacher_log = train_test_split(
    X_teacher, y_teacher_log, test_size=0.2, shuffle=False
)

print("Teacher model eğitiliyor...")
teacher_model.fit(
    X_train_teacher, y_train_teacher_log,
    eval_set=(X_val_teacher, y_val_teacher_log),
    early_stopping_rounds=300,
    verbose=1000
)

teacher_model.save_model(TEACHER_MODEL_PATH)
print(f"Teacher model eğitildi ve '{TEACHER_MODEL_PATH}' olarak kaydedildi.")


# ==============================================================================
# 3. HYPERPARAMETER OPTIMIZATION FOR PSEUDO-LABELING
# ==============================================================================
print("\n--- Adım 3: Pseudo-Labeling için HPO Başladı ---")

# The data for HPO comes from the clean training set
X_hpo = X_teacher
y_hpo_log = y_teacher_log
X_test_processed = df_session_test

# This validation set will be used inside Optuna to get a realistic score
X_train_hpo, X_val_hpo, y_train_hpo_log, y_val_hpo_log = train_test_split(
    X_hpo, y_hpo_log, test_size=0.2, shuffle=False
)

def objective(trial):
    # --- 1. Define HPO parameters for the pseudo-labeling process ---
    pseudo_label_weight = trial.suggest_float('pseudo_label_weight', 0.1, 1.0)
    pseudo_label_quantile_filter = trial.suggest_float('pseudo_label_quantile_filter', 0.0, 0.1) # Filter 0% to 20% of data (10% from each tail)

    # --- 2. Generate and filter pseudo-labels ---
    test_preds_log = teacher_model.predict(X_test_processed)
    
    df_pseudo = pd.DataFrame({
        'pseudo_label_log': test_preds_log
    }, index=X_test_processed.index)

    # Filtering
    if pseudo_label_quantile_filter > 0:
        lower_bound = df_pseudo['pseudo_label_log'].quantile(pseudo_label_quantile_filter)
        upper_bound = df_pseudo['pseudo_label_log'].quantile(1 - pseudo_label_quantile_filter)
        df_pseudo = df_pseudo[(df_pseudo['pseudo_label_log'] >= lower_bound) & (df_pseudo['pseudo_label_log'] <= upper_bound)]

    X_test_pseudo = X_test_processed.loc[df_pseudo.index]
    y_test_pseudo_log = df_pseudo['pseudo_label_log']

    # --- 3. Create the student training set ---
    # Combine original training data with pseudo-labeled data
    X_student_train = pd.concat([X_train_hpo, X_test_pseudo], axis=0)
    y_student_train_log = pd.concat([y_train_hpo_log, y_test_pseudo_log], axis=0)
    
    # Create sample weights: 1 for real data, tuned weight for pseudo-data
    weights = np.concatenate([
        np.ones(len(X_train_hpo)),
        np.full(len(X_test_pseudo), pseudo_label_weight)
    ])

    # --- 4. Train and evaluate the student model ---
    student_model = CatBoostRegressor(**teacher_params, iterations=3000)
    student_model.fit(
        X_student_train, y_student_train_log,
        sample_weight=weights,
        eval_set=(X_val_hpo, y_val_hpo_log), # Evaluate on unseen REAL data
        early_stopping_rounds=150,
        verbose=0
    )
    
    # --- 5. Calculate final score ---
    val_preds_log = student_model.predict(X_val_hpo)
    y_val_original = np.expm1(y_val_hpo_log)
    val_preds_original = np.expm1(val_preds_log)
    val_preds_original[val_preds_original < 0] = 0
    
    mse = mean_squared_error(y_val_original, val_preds_original)
    
    gc.collect()
    return mse

# --- Run the HPO study ---
storage_name = f"sqlite:///{DB_FILENAME}"
study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=storage_name,
    direction='minimize',
    load_if_exists=True
)

print(f"Optimizasyon başlıyor... Sonuçlar '{DB_FILENAME}' dosyasına kaydedilecek.")
study.optimize(objective, n_trials=30) # Run for 30 trials

best_pseudo_params = study.best_params
print("\nOptimizasyon Tamamlandı!")
print(f"En iyi denemenin skoru (MSE): {study.best_value}")
print("En iyi pseudo-labeling parametreleri:")
print(best_pseudo_params)


# ==============================================================================
# 4. FINAL STUDENT MODEL TRAINING & FEATURE SELECTION
# ==============================================================================
print("\n--- Adım 4: Final Student Model Eğitimi ve Özellik Seçimi Başladı ---")

# --- 4a. Generate final pseudo-labels with best params ---
print("En iyi parametrelerle final pseudo-label'lar oluşturuluyor...")
final_quantile = best_pseudo_params['pseudo_label_quantile_filter']
final_weight = best_pseudo_params['pseudo_label_weight']

test_preds_log = teacher_model.predict(X_test_processed)
df_pseudo_final = pd.DataFrame({'pseudo_label_log': test_preds_log}, index=X_test_processed.index)

if final_quantile > 0:
    lower_bound = df_pseudo_final['pseudo_label_log'].quantile(final_quantile)
    upper_bound = df_pseudo_final['pseudo_label_log'].quantile(1 - final_quantile)
    df_pseudo_final = df_pseudo_final[(df_pseudo_final['pseudo_label_log'] >= lower_bound) & (df_pseudo_final['pseudo_label_log'] <= upper_bound)]

X_test_pseudo_final = X_test_processed.loc[df_pseudo_final.index]
y_test_pseudo_final_log = df_pseudo_final['pseudo_label_log']

# --- 4b. Create final combined training set ---
# Use ALL clean training data this time
X_student_final = pd.concat([X_teacher, X_test_pseudo_final], axis=0)
y_student_final_log = pd.concat([y_teacher_log, y_test_pseudo_final_log], axis=0)
final_weights = np.concatenate([
    np.ones(len(X_teacher)),
    np.full(len(X_test_pseudo_final), final_weight)
])

# --- 4c. Train the student model on ALL data to get feature importances ---
print("Özellik önem skorlarını almak için final student model eğitiliyor...")
student_model_for_fi = CatBoostRegressor(
    **teacher_params,
    iterations=teacher_model.get_best_iteration() # Use teacher's best iteration
)
student_model_for_fi.fit(
    X_student_final, y_student_final_log,
    sample_weight=final_weights,
    verbose=1000
)

# --- 4d. Perform feature selection ---
fi = student_model_for_fi.get_feature_importance()
df_fi = pd.DataFrame({"feature": X_student_final.columns, "importance": fi})
df_fi = df_fi.sort_values("importance", ascending=False)

importance_threshold = 0.2
selected_features = df_fi[df_fi['importance'] >= importance_threshold]['feature'].tolist()
print(f"Toplam {len(df_fi)} özellikten, önemi >= {importance_threshold} olan {len(selected_features)} adet özellik seçildi.")

# --- 4e. Train final, feature-selected student model ---
print("Seçilmiş özelliklerle nihai student model eğitiliyor...")
X_student_selected = X_student_final[selected_features]

final_student_model = CatBoostRegressor(
    **teacher_params,
    iterations=teacher_model.get_best_iteration() + 500 # Add a few more iterations
)
final_student_model.fit(
    X_student_selected, y_student_final_log,
    sample_weight=final_weights,
    verbose=1000
)

final_student_model.save_model(STUDENT_MODEL_PATH)
print(f"Final Student model eğitildi ve '{STUDENT_MODEL_PATH}' olarak kaydedildi.")


# ==============================================================================
# 5. SUBMISSION GENERATION
# ==============================================================================
print("\n--- Adım 5: Submission Dosyası Oluşturuluyor ---")

# --- 5a. Load necessary components ---
df_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
model = CatBoostRegressor()
model.load_model(STUDENT_MODEL_PATH)

# --- 5b. Make predictions with the final model ---
print("Test seti üzerinde tahminler yapılıyor...")
X_test_selected = df_session_test[selected_features]
test_preds_log = model.predict(X_test_selected)
final_predictions = np.expm1(test_preds_log)
final_predictions[final_predictions < 0] = 0

df_predictions = pd.DataFrame({
    'user_session': X_test_selected.index,
    'predicted_value': final_predictions
})

# --- 5c. Fill submission with model predictions ---
submission_map = dict(zip(df_predictions['user_session'], df_predictions['predicted_value']))
df_submission['session_value'] = df_submission['user_session'].map(submission_map)

# --- 5d. Handle anomalous sessions (smart filling) ---
session_user_counts = df_test_raw.groupby('user_session')['user_id'].nunique()
anomalous_sessions_orig = session_user_counts[session_user_counts > 1].index.tolist()
if anomalous_sessions_orig:
    print(f"\n{len(anomalous_sessions_orig)} adet anormal seans için akıllı doldurma yapılıyor...")
    for session_id in anomalous_sessions_orig:
        constituent_preds = df_predictions[df_predictions['user_session'].str.startswith(f"{session_id}_")]
        if not constituent_preds.empty:
            total_value = constituent_preds['predicted_value'].sum()
            df_submission.loc[df_submission['user_session'] == session_id, 'session_value'] = total_value

# --- 5e. Overwrite with leaked data (CRITICAL STEP) ---
print(f"\n{len(leak_map)} adet seansın değeri, sızıntıdan gelen gerçek değerlerle güncelleniyor...")
leaked_updates = df_submission['user_session'].map(leak_map)
df_submission['session_value'] = np.where(leaked_updates.notna(), leaked_updates, df_submission['session_value'])

# --- 5f. Final checks and save ---
nan_count = df_submission['session_value'].isnull().sum()
if nan_count > 0:
    print(f"UYARI: Hala {nan_count} adet null değer var! Bunlar 0 ile dolduruluyor.")
    df_submission['session_value'].fillna(0, inplace=True)

df_submission.to_csv(SUBMISSION_FILE, index=False)
print(f"\n'{SUBMISSION_FILE}' dosyası başarıyla oluşturuldu!")
print("Dosyanın ilk 5 satırı:")
print(df_submission.head())

print("\n--- V8 SÜRECİ TAMAMLANDI ---")

In [None]:
!unzip /content/datathon-2025.zip

Archive:  /content/datathon-2025.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


**Veri Hazırlama**

In [None]:
import pandas as pd
import numpy as np
import gc
import os
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
import pandas as pd
import numpy as np
import gc
import os

print("Adım 1 (v8 - Zero-Shot Kategorizasyon ile): Veri Hazırlama Başladı.")

IN_TRAIN_PATH = './train.csv'
IN_TEST_PATH = './test.csv'

OUT_DIR = '/processed'

OUT_TRAIN_PATH = OUT_DIR + 'train_processed_v7.csv'
OUT_TEST_PATH = OUT_DIR + 'test_processed_v7.csv'


df_train = pd.read_csv(IN_TRAIN_PATH, parse_dates=['event_time'])
df_test = pd.read_csv(IN_TEST_PATH, parse_dates=['event_time'])
print("Ham veri setleri başarıyla yüklendi.")

# Analiz için ön bilgiler
train_users = set(df_train['user_id'])
test_users = set(df_test['user_id'])
common_users = train_users.intersection(test_users)
train_products = set(df_train['product_id'])
new_products_in_test = set(df_test['product_id']) - train_products
train_session_users = df_train.groupby('user_session')['user_id'].apply(set)
test_session_users = df_test.groupby('user_session')['user_id'].apply(set)
common_sessions = set(train_session_users.index).intersection(set(test_session_users.index))
verified_leaked_sessions = {sid for sid in common_sessions if train_session_users[sid] == test_session_users[sid]}
print("Analizler için ön bilgiler hesaplandı.")


# --- Anomali Temizleme, Kullanıcı Özellikleri (v6 ile aynı) ---
# ... (Bu kısımlar önceki script ile birebir aynı, o yüzden tekrar eklemiyorum)
def fix_anomalous_sessions(df, data_type='train'):
    session_user_counts = df.groupby('user_session')['user_id'].nunique()
    anomalous_sessions = session_user_counts[session_user_counts > 1].index
    if len(anomalous_sessions) > 0:
        anomalous_indices = df['user_session'].isin(anomalous_sessions)
        df['user_session_corrected'] = df['user_session']
        df.loc[anomalous_indices, 'user_session_corrected'] = df.loc[anomalous_indices, 'user_session'] + '_' + df.loc[anomalous_indices, 'user_id']
        df.drop('user_session', axis=1, inplace=True)
        df.rename(columns={'user_session_corrected': 'user_session'}, inplace=True)
    return df

# Anormal seansları düzelt
df_train = fix_anomalous_sessions(df_train, 'train')
df_test = fix_anomalous_sessions(df_test, 'test')

# Train ve Test verilerini birleştirerek genel özellikler için hazırla
df_combined = pd.concat([df_train.drop('session_value', axis=1), df_test], ignore_index=True)
print("\nTrain ve test verileri birleştirildi.")

# Kullanıcı bazlı özellikleri tüm veriden hesapla
print("Kullanıcı bazlı özellikler (user_features) oluşturuluyor...")
user_features = df_combined.groupby('user_id').agg(
    user_total_events=('event_type', 'count'),
    user_unique_products_viewed=('product_id', 'nunique'),
    user_first_seen=('event_time', 'min'),
    user_last_seen=('event_time', 'max')
)
user_features['user_lifespan_days'] = (user_features['user_last_seen'] - user_features['user_first_seen']).dt.days
user_buy_counts = df_combined[df_combined['event_type'] == 'BUY'].groupby('user_id').size()
user_features['user_buy_count'] = user_buy_counts
user_features['user_buy_count'].fillna(0, inplace=True)
user_features['user_purchase_rate'] = user_features['user_buy_count'] / user_features['user_total_events']
user_features.drop(['user_first_seen', 'user_last_seen'], axis=1, inplace=True)
print("   -> Kullanıcı bazlı özellikler tamamlandı.")
import pandas as pd
import numpy as np
import gc
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans

# ------------------------------------------------------------------------------------
# --- YENİ ADIM (v8): AĞIRLIKLANDIRILMIŞ KULLANICI TERCİHLERİNE GÖRE ZERO-SHOT KATEGORİZASYON ---
# ------------------------------------------------------------------------------------
print("\nAğırlıklandırılmış kullanıcı tercihlerine göre Zero-Shot kategorizasyon başlıyor...")

# 1. Her olay türü için bir ağırlık (önem) tanımla.
# REMOVE_CART, negatif bir sinyal olduğu için negatif bir ağırlığa sahip.
event_weights = {
    'VIEW': 0.07,
    'ADD_CART': 0.18,
    'BUY': 0.75,
    'REMOVE_CART': -0.18
}
print(f"   -> Olay ağırlıkları tanımlandı: {event_weights}")

# 2. Ağırlıkları kullanarak her bir etkileşim için bir skor oluştur.
df_combined['interaction_score'] = df_combined['event_type'].map(event_weights).fillna(0)

# 3. Ağırlıklandırılmış Kullanıcı-Ürün matrisini 'pivot_table' ile oluştur.
# Her hücre, bir kullanıcının bir ürünle olan toplam etkileşim skorunu içerir.
print("   -> Ağırlıklandırılmış Kullanıcı-Ürün matrisi oluşturuluyor...")
user_product_matrix = df_combined.pivot_table(
    index='user_id',
    columns='product_id',
    values='interaction_score',
    aggfunc='sum'
).fillna(0)
print(f"   -> Kullanıcı-Ürün matrisi oluşturuldu. Boyut: {user_product_matrix.shape}")

# 4. Ürünlerin davranışsal vektörlerini (embeddings) SVD ile oluştur
N_COMPONENTS_SVD = 100
svd = TruncatedSVD(n_components=N_COMPONENTS_SVD, random_state=42)
product_embeddings = svd.fit_transform(user_product_matrix) # Matris zaten (kullanıcı x ürün) formatında, SVD'ye bu şekilde verilir.
# SVD ürün bazlı vektörler için matrisin transpozunu bekler, biz de (ürün x kullanıcı) matrisinin transpozunu alıyoruz.
product_embeddings_transposed = svd.fit_transform(user_product_matrix.T)
print(f"   -> Ürünler için {N_COMPONENTS_SVD} boyutlu davranışsal vektörler SVD ile oluşturuldu.")

# 5. Ürün vektörlerini K-Means ile kümeleyerek yeni kategoriler oluştur
N_CLUSTERS_KMEANS = 75
kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS_KMEANS, random_state=42, batch_size=256, n_init='auto')
product_clusters = kmeans.fit_predict(product_embeddings_transposed)
print(f"   -> Davranışsal vektörler {N_CLUSTERS_KMEANS} adet kümeye (yeni kategoriye) ayrıldı.")

# 6. Her ürün için oluşturulan yeni kategori ID'sini bir map'e ata
product_to_zeroshot_cat = pd.Series(product_clusters, index=user_product_matrix.columns).to_dict()
print("   -> Ürün -> Zero-Shot Kategori haritası oluşturuldu.")

# Bellek temizliği
del user_product_matrix, product_embeddings, product_embeddings_transposed, product_clusters
gc.collect()


# --- 4. ADIM: SEANS BAZLI ÖZELLİKLER (v8) ---
def create_session_features_v8(df, product_to_zeroshot_map, data_type='train'):
    print(f"\n{data_type} verisi için seans bazlı özellik mühendisliği (v8) başlıyor...")

    # Zero-shot kategorileri ekle
    df['zeroshot_category_id'] = df['product_id'].map(product_to_zeroshot_map).fillna(-1).astype(int)

    # Seans içi olay sırasını hesapla
    df['event_order'] = df.groupby('user_session').cumcount() + 1
    session_event_counts = df['user_session'].map(df['user_session'].value_counts())
    df['event_order_pct'] = df['event_order'] / session_event_counts

    # Olay tiplerini say
    event_type_counts = pd.crosstab(df['user_session'], df['event_type'])
    all_event_types = ['VIEW', 'ADD_CART', 'REMOVE_CART', 'BUY']
    for event in all_event_types:
        if event not in event_type_counts.columns:
            event_type_counts[event] = 0
    event_type_counts.columns = [f'{col.lower()}_count' for col in event_type_counts.columns]

    # Popüler kategorileri belirle (orijinal ve zero-shot)
    popular_categories = df['category_id'].value_counts().nlargest(int(df['category_id'].nunique() * 0.1)).index
    popular_zeroshot_categories = df['zeroshot_category_id'].value_counts().nlargest(int(df['zeroshot_category_id'].nunique() * 0.1)).index

    # Seans bazında ana özellikleri topla
    session_features = df.groupby('user_session').agg(
        user_id=('user_id', 'first'),
        event_count=('event_type', 'count'),
        unique_products=('product_id', 'nunique'),
        unique_categories=('category_id', 'nunique'),
        unique_zeroshot_categories=('zeroshot_category_id', 'nunique'),
        session_duration_seconds=('event_time', lambda x: (x.max() - x.min()).total_seconds()),
        avg_day_of_week=('event_time', lambda x: x.dt.dayofweek.mean()),
        avg_hour=('event_time', lambda x: x.dt.hour.mean()),
        avg_event_order=('event_order', 'mean'),
        avg_event_order_pct=('event_order_pct', 'mean')
    )

    # Popüler kategori varlık özelliklerini ekle
    session_features['has_popular_category'] = df.groupby('user_session')['category_id'].apply(lambda x: 1 if any(cat in popular_categories for cat in x) else 0)
    session_features['has_popular_zeroshot_category'] = df.groupby('user_session')['zeroshot_category_id'].apply(lambda x: 1 if any(cat in popular_zeroshot_categories for cat in x) else 0)

    # Olay sayılarını ve diğer özellikleri birleştir
    df_session = pd.concat([session_features, event_type_counts], axis=1)

    # Oran bazlı özellikleri hesapla
    epsilon = 1e-6
    df_session['view_to_add_cart_rate'] = df_session['add_cart_count'] / (df_session['view_count'] + epsilon)
    df_session['add_cart_to_buy_rate'] = df_session['buy_count'] / (df_session['add_cart_count'] + epsilon)
    df_session['view_to_buy_rate'] = df_session['buy_count'] / (df_session['view_count'] + epsilon)
    df_session['net_cart_additions'] = df_session['add_cart_count'] - df_session['remove_cart_count']
    df_session['did_purchase'] = (df_session['buy_count'] > 0).astype(int)

    print(f"   -> {data_type} için seans bazlı özellikler tamamlandı.")
    return df_session

# Fonksiyonu çağırarak train ve test için seans bazlı özellikleri oluştur
df_session_train = create_session_features_v8(df_train, product_to_zeroshot_cat, 'train')
df_session_test = create_session_features_v8(df_test, product_to_zeroshot_cat, 'test')

# --- 5. ADIM ve sonrası (v6 ile aynı) ---
print("\nKullanıcı ve seans özellikleri birleştiriliyor...")
df_session_train = df_session_train.merge(user_features, on='user_id', how='left').set_index(df_session_train.index)
df_session_test = df_session_test.merge(user_features, on='user_id', how='left').set_index(df_session_test.index)
df_session_train.drop('user_id', axis=1, inplace=True)
df_session_test.drop('user_id', axis=1, inplace=True)

print("\nEn önemli etkileşim özellikleri oluşturuluyor...")
for df in [df_session_train, df_session_test]:
    df['buy_x_hour'] = df['buy_count'] * df['avg_hour']
    df['buy_x_unique_products'] = df['buy_count'] * df['unique_products']
    df['buy_x_user_purchase_rate'] = df['buy_count'] * df['user_purchase_rate']

session_value = df_train.groupby('user_session')['session_value'].first()
df_session_train['session_value'] = session_value

df_session_train.to_csv(OUT_TRAIN_PATH)
df_session_test.to_csv(OUT_TEST_PATH)

print("\nAdım 1 (v7) Tamamlandı: 'train_processed_v7.csv' ve 'test_processed_v7.csv' dosyaları oluşturuldu.")

Adım 1 (v8 - Zero-Shot Kategorizasyon ile): Veri Hazırlama Başladı.
Ham veri setleri başarıyla yüklendi.
Analizler için ön bilgiler hesaplandı.

Train ve test verileri birleştirildi.
Kullanıcı bazlı özellikler (user_features) oluşturuluyor...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  user_features['user_buy_count'].fillna(0, inplace=True)


   -> Kullanıcı bazlı özellikler tamamlandı.

Ağırlıklandırılmış kullanıcı tercihlerine göre Zero-Shot kategorizasyon başlıyor...
   -> Olay ağırlıkları tanımlandı: {'VIEW': 0.07, 'ADD_CART': 0.18, 'BUY': 0.75, 'REMOVE_CART': -0.18}
   -> Ağırlıklandırılmış Kullanıcı-Ürün matrisi oluşturuluyor...


**Model Eğitme**

In [7]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

print("Adım 2: Model Eğitimi Başlandı.")

MODEL_DIR = "/content/models/V7"
os.makedirs(MODEL_DIR, exist_ok=True)

IN_TRAIN_PATH = "/content/datathon/processed/train_processed_v7.csv"

FEATURE_IMPORTANCE_DIR_OUT = MODEL_DIR + "/feature_importance/IN/"
os.makedirs(FEATURE_IMPORTANCE_DIR_OUT, exist_ok=True)

OUT_FEATURES_PATH = FEATURE_IMPORTANCE_DIR_OUT + "importance.json"

# --- İşlenmiş Veriyi Yükleme ---
try:
    df_train = pd.read_csv(IN_TRAIN_PATH, index_col='user_session')
    print("İşlenmiş train verisi ('train_processed.csv') yüklendi.")
except FileNotFoundError:
    print("Hata: 'train_processed.csv' bulunamadı. Lütfen önce '1_data_preparation_v7.py' scriptini çalıştırın.")
    exit()

# --- Modelleme için Veriyi Hazırlama ---
y = df_train['session_value']
X = df_train.drop(['session_value'], axis=1)


# Hedef değişkene log dönüşümü
y_log = np.log1p(y)

# --- Zaman Bazlı Doğrulama (Time-Based Validation) ---
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, shuffle=False # shuffle=False zaman serisi doğrulama için kritik!
)
print(f"Train seti boyutu: {X_train.shape[0]}, Validation seti boyutu: {X_val.shape[0]}")

# --- CatBoost Modelini Eğitme ve Değerlendirme ---
print("CatBoost Modeli eğitimi başlıyor...")

best_params = {
    'learning_rate': 0.06007502637954865,
    'depth': 4,
    'l2_leaf_reg': 2.5440079813104126,
    'colsample_bylevel': 0.8424763199702767,
    'min_child_samples': 29,
    'objective': 'RMSE',
    'random_seed': 42,
    'verbose': 500
}

cat_model = CatBoostRegressor(
    **best_params,
    iterations=4500,
    eval_metric='RMSE',
    early_stopping_rounds=300
)


cat_model.fit(
    X_train, y_train_log,
    eval_set=(X_val, y_val_log)
)

# --- Performans Değerlendirme ---
val_preds_log = cat_model.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0
y_val = np.expm1(y_val_log)

validation_mse = mean_squared_error(y_val, val_preds)
print(f"\nValidation Seti Üzerindeki MSE Skoru (CatBoost): {validation_mse:.4f}")
print(f"Validation Seti Üzerindeki RMSE Skoru (CatBoost): {np.sqrt(validation_mse):.4f}")

# --- Final Modelini Eğitme ve Kaydetme ---
print("\nFinal CatBoost modeli tüm train verisi üzerinde eğitiliyor...")
final_model = CatBoostRegressor(
    **best_params,
    iterations=cat_model.get_best_iteration()
)
final_model.fit(X, y_log)

features = X.columns.tolist()

# Feature importance as a DataFrame and save in a format that works
fi = final_model.get_feature_importance(prettified=False)
df_fi = pd.DataFrame({"feature": features, "importance": fi})
df_fi = df_fi.sort_values("importance", ascending=False).reset_index(drop=True)

# Prefer parquet, fallback to CSV if parquet not available, always also save JSON

df_fi.to_json(OUT_FEATURES_PATH, orient="records")



out_dir = MODEL_DIR + "/"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "catboost_model_v7.cbm")

# Modeli kaydetme
final_model.save_model(out_path)

print("\nAdım 2 Tamamlandı: 'catboost_model_v7.cbm' dosyası olarak model kaydedildi.")

Adım 2: Model Eğitimi Başlandı.
İşlenmiş train verisi ('train_processed.csv') yüklendi.
Train seti boyutu: 56605, Validation seti boyutu: 14152
CatBoost Modeli eğitimi başlıyor...
0:	learn: 0.7429778	test: 0.7430018	best: 0.7430018 (0)	total: 55.4ms	remaining: 4m 9s
500:	learn: 0.4376195	test: 0.4357696	best: 0.4357696 (500)	total: 4.33s	remaining: 34.6s
1000:	learn: 0.4346994	test: 0.4350890	best: 0.4350616 (962)	total: 7.7s	remaining: 26.9s
1500:	learn: 0.4328432	test: 0.4350377	best: 0.4349916 (1413)	total: 11.1s	remaining: 22.2s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.4349916238
bestIteration = 1413

Shrink model to first 1414 iterations.

Validation Seti Üzerindeki MSE Skoru (CatBoost): 312.8961
Validation Seti Üzerindeki RMSE Skoru (CatBoost): 17.6889

Final CatBoost modeli tüm train verisi üzerinde eğitiliyor...
0:	learn: 0.7429769	total: 12ms	remaining: 17s
500:	learn: 0.4370777	total: 5.04s	remaining: 9.18s
1000:	learn: 0.4344697	total: 9.21s	remai

**Model Eğitimi - Özellik Seçilimli**

In [8]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

print("Adım 2: Model Eğitimi Başlandı.")

MODEL_DIR = "/content/models/V7"
os.makedirs(MODEL_DIR, exist_ok=True)

IN_TRAIN_PATH = "/content/datathon/processed/train_processed_v7.csv"

FEATURE_IMPORTANCE_DIR_IN = MODEL_DIR + "/feature_importance/IN/"
os.makedirs(FEATURE_IMPORTANCE_DIR_IN, exist_ok=True)

IN_FEATURES_PATH = FEATURE_IMPORTANCE_DIR_IN + "importance.json"


FEATURE_IMPORTANCE_DIR_OUT = MODEL_DIR + "/feature_importance/OUT/"
os.makedirs(FEATURE_IMPORTANCE_DIR_OUT, exist_ok=True)

OUT_FEATURES_PATH = FEATURE_IMPORTANCE_DIR_OUT + "importance.json"
# --- İşlenmiş Veriyi Yükleme ---
try:
    df_train = pd.read_csv(IN_TRAIN_PATH, index_col='user_session')
    print("İşlenmiş train verisi ('train_processed.csv') yüklendi.")

    with open(IN_FEATURES_PATH, 'r') as f:
      features_in = json.load(f)

    print("Özellikler yüklendi.")
except FileNotFoundError:
    print("Hata: 'train_processed.csv' bulunamadı. Lütfen önce '1_data_preparation.py' scriptini çalıştırın.")
    exit()

# --- Modelleme için Veriyi Hazırlama ---
y = df_train['session_value']
X = df_train.drop(['session_value'], axis=1)


# Hedef değişkene log dönüşümü
y_log = np.log1p(y)

# feature seçilimi
importance_threshold = 0.2
print(f"Toplam özellik sayısı: {len(features_in)}")
features_in = [item['feature'] for item in features_in if item['importance'] >= importance_threshold]
print(f"Seçilen özellik sayısı: {len(features_in)}")
X = X[features_in]

# --- Zaman Bazlı Doğrulama (Time-Based Validation) ---
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, shuffle=False # shuffle=False zaman serisi doğrulama için kritik!
)
print(f"Train seti boyutu: {X_train.shape[0]}, Validation seti boyutu: {X_val.shape[0]}")

# --- CatBoost Modelini Eğitme ve Değerlendirme ---
print("CatBoost Modeli eğitimi başlıyor...")

best_params = {
    'learning_rate': 0.06007502637954865,
    'depth': 4,
    'l2_leaf_reg': 2.5440079813104126,
    'colsample_bylevel': 0.8424763199702767,
    'min_child_samples': 29,
    'objective': 'RMSE',
    'random_seed': 42,
    'verbose': 500
}

cat_model = CatBoostRegressor(
    **best_params,
    iterations=4500,
    eval_metric='RMSE',
    early_stopping_rounds=300
)


cat_model.fit(
    X_train, y_train_log,
    eval_set=(X_val, y_val_log)
)

# --- Performans Değerlendirme ---
val_preds_log = cat_model.predict(X_val)
val_preds = np.expm1(val_preds_log)
val_preds[val_preds < 0] = 0
y_val = np.expm1(y_val_log)

validation_mse = mean_squared_error(y_val, val_preds)
print(f"\nValidation Seti Üzerindeki MSE Skoru (CatBoost): {validation_mse:.4f}")
print(f"Validation Seti Üzerindeki RMSE Skoru (CatBoost): {np.sqrt(validation_mse):.4f}")

# --- Final Modelini Eğitme ve Kaydetme ---
print("\nFinal CatBoost modeli tüm train verisi üzerinde eğitiliyor...")
final_model = CatBoostRegressor(
    **best_params,
    iterations=cat_model.get_best_iteration()
)
final_model.fit(X, y_log)

features_out = X.columns.tolist()

# Feature importance as a DataFrame and save in a format that works
fi = final_model.get_feature_importance(prettified=False)
df_fi = pd.DataFrame({"feature": features_out, "importance": fi})
df_fi = df_fi.sort_values("importance", ascending=False).reset_index(drop=True)

# Prefer parquet, fallback to CSV if parquet not available, always also save JSON

df_fi.to_json(OUT_FEATURES_PATH, orient="records")



out_dir = "/content/models/V7/"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "catboost_model_v7_selected_1.cbm")

# Modeli kaydetme
final_model.save_model(out_path)

print("\nAdım 2 Tamamlandı: 'catboost_model_v7_selected.cbm' dosyası olarak model kaydedildi.")

Adım 2: Model Eğitimi Başlandı.
İşlenmiş train verisi ('train_processed.csv') yüklendi.
Özellikler yüklendi.
Toplam özellik sayısı: 28
Seçilen özellik sayısı: 25
Train seti boyutu: 56605, Validation seti boyutu: 14152
CatBoost Modeli eğitimi başlıyor...
0:	learn: 0.7428714	test: 0.7429129	best: 0.7429129 (0)	total: 20.4ms	remaining: 1m 31s
500:	learn: 0.4375224	test: 0.4355779	best: 0.4355758 (496)	total: 4.23s	remaining: 33.8s
1000:	learn: 0.4346846	test: 0.4349265	best: 0.4349229 (998)	total: 7.91s	remaining: 27.6s
1500:	learn: 0.4329725	test: 0.4348845	best: 0.4348817 (1493)	total: 12.3s	remaining: 24.6s
2000:	learn: 0.4317747	test: 0.4349137	best: 0.4348663 (1813)	total: 16s	remaining: 20s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.4348663493
bestIteration = 1813

Shrink model to first 1814 iterations.

Validation Seti Üzerindeki MSE Skoru (CatBoost): 298.1948
Validation Seti Üzerindeki RMSE Skoru (CatBoost): 17.2683

Final CatBoost modeli tüm train verisi

**Submission Oluşturma**

In [None]:
import pandas as pd
import json
import numpy as np
from catboost import CatBoostRegressor

print("Adım 3 (Akıllı Doldurma ile): Tahmin ve Gönderim Başladı.")

# --- Ayarlar ---
PROCESSED_TEST_FILE = '/content/datathon/processed/test_processed_v7.csv'
MODEL_FILE = '/content/models/V7/catboost_model_v7_selected_1.cbm' # v3 için optimize edilmiş modeli kullandığımızdan emin olalım
SELECTED_FEATURES_PATH = '/content/models/V7/feature_importance/OUT/importance.json'
TRAIN_RAW_PATH = '/content/datathon/train.csv' # Sızıntı tespiti için gerekli
TEST_RAW_PATH = '/content/datathon/test.csv'   # Sızıntı ve anomali tespiti için gerekli
SUBMISSION_FILE = '/content/submissions/submission_v7.csv'


# --- Gerekli Dosyaları Yükleme ---
try:
    df_test_processed = pd.read_csv(PROCESSED_TEST_FILE) # index_col olmadan okuyoruz
    df_submission = pd.read_csv('/content/datathon/sample_submission.csv')
    df_train_raw = pd.read_csv(TRAIN_RAW_PATH)
    df_test_raw = pd.read_csv(TEST_RAW_PATH)

    with open(SELECTED_FEATURES_PATH, 'r') as f:
      selected_features_dict = json.load(f)

    model = CatBoostRegressor()
    model.load_model(MODEL_FILE)

    print(f"Gerekli dosyalar ve {len(selected_features_dict)} adet seçilmiş özellik başarıyla yüklendi.")
except FileNotFoundError as e:
    print(f"Hata: {e}.")
    exit()

# --- 1. ADIM: DOĞRULANMIŞ VERİ SIZINTISINI TESPİT ET ---
print("\nDoğrulanmış veri sızıntısı tespit ediliyor...")
train_session_users = df_train_raw.groupby('user_session')['user_id'].apply(set)
test_session_users = df_test_raw.groupby('user_session')['user_id'].apply(set)
common_sessions = set(train_session_users.index).intersection(set(test_session_users.index))
verified_leaked_sessions = {sid for sid in common_sessions if train_session_users[sid] == test_session_users[sid]}
verified_leak_map = df_train_raw[
    df_train_raw['user_session'].isin(verified_leaked_sessions)
].groupby('user_session')['session_value'].first().to_dict()
print(f"Tespit edilen DOĞRULANMIŞ sızıntı seans sayısı: {len(verified_leaked_sessions)}")


# --- 2. ADIM: MODEL TAHMİNLERİNİ YAP --
selected_features = [item['feature'] for item in selected_features_dict]

# Olası eksik sütun hatalarını önlemek için kontrol
missing_cols = set(selected_features) - set(df_test_processed.columns)
if missing_cols:
    print(f"HATA: Test verisinde şu sütunlar eksik: {missing_cols}")
    exit()

# Sütun sırasının modelin beklediğiyle aynı olmasını garantile
print("Selected Feature Sayısı : ", len(selected_features))
# Test verisini filtrele
df_test_processed.set_index('user_session', inplace=True)
df_test_selected = df_test_processed[selected_features]

# Tahmin yap
print("Test seti üzerinde model tahminleri yapılıyor...")
test_preds_log = model.predict(df_test_selected)
final_predictions = np.expm1(test_preds_log)
final_predictions[final_predictions < 0] = 0
df_predictions = pd.DataFrame({
    'user_session': df_test_selected.index,
    'predicted_value': final_predictions
})

# --- 3. ADIM: SUBMISSION DOSYASINI OLUŞTURMA ---
# Önce normal model tahminlerini map et
submission_map = dict(zip(df_predictions['user_session'], df_predictions['predicted_value']))
df_submission['session_value'] = df_submission['user_session'].map(submission_map)

# Sonra "Akıllı Doldurma" ile NaN user anormalliklerini doldur
session_user_counts = df_test_raw.groupby('user_session')['user_id'].nunique()
anomalous_sessions_orig = session_user_counts[session_user_counts > 1].index.tolist()
print(f"\nAkıllı doldurma yapılacak anormal seanslar: {anomalous_sessions_orig}")
for session_id in anomalous_sessions_orig:
    constituent_preds = df_predictions[df_predictions['user_session'].str.startswith(f"{session_id}_")]
    total_value = constituent_preds['predicted_value'].sum()
    df_submission.loc[df_submission['user_session'] == session_id, 'session_value'] = total_value
    print(f"'{session_id}' için {len(constituent_preds)} parçanın tahmini toplandı: {total_value:.4f}")

# SON VE EN ÖNEMLİ ADIM: Doğrulanmış sızıntıdan gelen gerçek değerlerle tüm tahminleri EZ
print(f"\n{len(verified_leaked_sessions)} adet seansın değeri, sızıntıdan gelen gerçek değerlerle güncelleniyor...")
leaked_updates = df_submission['user_session'].map(verified_leak_map)
df_submission['session_value'] = np.where(leaked_updates.notna(), leaked_updates, df_submission['session_value'])

# --- SON KONTROLLER ---
nan_count = df_submission['session_value'].isnull().sum()
if nan_count > 0:
    print(f"⚠️ UYARI: Hala {nan_count} adet null değer var! Bunlar 0 ile dolduruluyor.")
    df_submission['session_value'].fillna(0, inplace=True)

excepted_row = 30789
if excepted_row != len(df_submission):
    print(f"Satır sayıları eşleşmiyor. Beklenen : {excepted_row}. Bulunan : {len(df_submission)}")
    exit()

# --- Dosyayı Kaydetme ---
df_submission.to_csv(SUBMISSION_FILE, index=False)
print(f"\n'{SUBMISSION_FILE}' dosyası başarıyla oluşturuldu! Bu son gönderim için bol şans!")
print("Dosyanın ilk 5 satırı:")
print(df_submission.head())

**hpo kodu v7**

In [10]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.5.0


In [11]:
import pandas as pd
import numpy as np
import optuna
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import gc

print("Adım 2b: Hiperparametre Optimizasyonu (Optuna ile) Başladı.")

# --- Veritabanı ve Çalışma Ayarları ---
DB_FILENAME = "optuna_studies.db"
STUDY_NAME = "catboost_v7_features" # Her yeni özellik seti için bu ismi değiştirebilirsin

train_path = "/content/datathon/processed/train_processed_v7.csv"

feature_path = "/content/models/V7/feature_importance/OUT/importance.json"

# --- Modelleme için Veriyi Hazırlama ---
# --- İşlenmiş Veriyi Yükleme ---
try:
    df_train = pd.read_csv(train_path, index_col='user_session')
    print("İşlenmiş train verisi ('train_processed_v7.csv') yüklendi.")

    with open(feature_path, 'r') as f:
      features_importance = json.load(f)
    print(f"Özelikler Başarıyla yüklendi... {len(features_importance)}")
except FileNotFoundError:
    print("Hata: 'train_processed_v7.csv' bulunamadı. Lütfen önce '1_data_preparation_v7.py' scriptini çalıştırın.")
    exit()

# --- 1. Adım: Özellik Seçilimi ---
print("\nÖzelikler Seçiliyor...")
importance_threshold = 0.2
selected_features = [item['feature'] for item in features_importance if item['importance'] >= importance_threshold]
X = df_train[selected_features]
print(f"Önemi >= {importance_threshold} olan {len(selected_features)} adet özellik seçildi.")

# --- Veriyi Hazırlama ---
y = df_train['session_value']
y_log = np.log1p(y)

# Aynı doğrulama setini kullanmak için ayırma işlemini tekrar yapıyoruz
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, shuffle=False
)

# Objective fonksiyonu (GÜNCELLENDİ)
def objective(trial):
    params = {
        'objective': 'RMSE',
        'iterations': 4500,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'random_seed': 42,
        'verbose': 0,
    }

    model = CatBoostRegressor(**params)
    # Modeli eğitirken hem train (X_train) hem de val (X_val) setini izlemesini sağlıyoruz
    model.fit(
        X_train, y_train_log,
        eval_set=[(X_train, y_train_log), (X_val, y_val_log)], # İki seti de ekledik
        early_stopping_rounds=250,
        verbose=0
    )

    # En iyi iterasyondaki skorları al
    scores = model.get_best_score()

    best_learn_rmse = scores['learn']['RMSE']
    val_rmse_0 = scores['validation_0']['RMSE']
    val_rmse_1 = scores['validation_1']['RMSE']

    # MSE'yi hesapla
    preds_log = model.predict(X_val)
    preds = np.expm1(preds_log)
    preds[preds < 0] = 0
    y_val_original = np.expm1(y_val_log)
    mse = mean_squared_error(y_val_original, preds)

    # --- GÜNCELLENMİŞ RAPORLAMA KISMI ---
    print(f"✅ Trial {trial.number} bitti | MSE: {mse:.4f} | Learn RMSE: {best_learn_rmse:.4f} | Val_0 RMSE: {val_rmse_0:.4f} | Val_1 RMSE : {val_rmse_1:.4f} | İterasyon: {model.get_best_iteration()}")
    gc.collect()
    return mse


# --- Optimizasyon Sürecini Başlatma (VERİTABANI İLE) ---
# SQLite veritabanı için bağlantı dizesi oluştur
storage_name = f"sqlite:///{DB_FILENAME}"

# Çalışmayı oluştur veya varsa veritabanından yükle
study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=storage_name,
    direction='minimize',
    load_if_exists=True # Eğer bu isimde bir çalışma varsa, sıfırdan başlamak yerine devam et
)

# Optimizasyonu çalıştır
n_trials = 100
print(f"Optimizasyon başlıyor... Sonuçlar '{DB_FILENAME}' dosyasına kaydedilecek.")
print(f"Mevcut deneme sayısı: {len(study.trials)}. Toplamda {n_trials} denemeye ulaşılacak.")
study.optimize(objective, n_trials=n_trials)

# --- Sonuçları Yazdırma ---
print("\nOptimizasyon Tamamlandı!")
print(f"Toplam deneme sayısı: {len(study.trials)}")
print(f"En iyi denemenin skoru (MSE): {study.best_value}")
print("En iyi denemenin parametreleri:")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

Adım 2b: Hiperparametre Optimizasyonu (Optuna ile) Başladı.
İşlenmiş train verisi ('train_processed_v7.csv') yüklendi.
Özelikler Başarıyla yüklendi... 25

Özelikler Seçiliyor...
Önemi >= 0.2 olan 24 adet özellik seçildi.


[I 2025-08-27 13:47:14,838] A new study created in RDB with name: catboost_v7_features


Optimizasyon başlıyor... Sonuçlar 'optuna_studies.db' dosyasına kaydedilecek.
Mevcut deneme sayısı: 0. Toplamda 100 denemeye ulaşılacak.


[I 2025-08-27 13:47:34,069] Trial 0 finished with value: 424.1950566097895 and parameters: {'learning_rate': 0.0690116372249165, 'depth': 10, 'l2_leaf_reg': 1.223810248282764, 'colsample_bylevel': 0.5570085575711088, 'min_child_samples': 91}. Best is trial 0 with value: 424.1950566097895.


✅ Trial 0 bitti | MSE: 424.1951 | Learn RMSE: 0.4165 | Val_0 RMSE: 0.4165 | Val_1 RMSE : 0.4363 | İterasyon: 190


[I 2025-08-27 13:47:51,998] Trial 1 finished with value: 294.5126161013067 and parameters: {'learning_rate': 0.04762256092725526, 'depth': 5, 'l2_leaf_reg': 6.006712204320837, 'colsample_bylevel': 0.5572376924145312, 'min_child_samples': 10}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 1 bitti | MSE: 294.5126 | Learn RMSE: 0.4309 | Val_0 RMSE: 0.4309 | Val_1 RMSE : 0.4349 | İterasyon: 1617


[I 2025-08-27 13:48:38,435] Trial 2 finished with value: 387.26699670277344 and parameters: {'learning_rate': 0.011254878453132895, 'depth': 9, 'l2_leaf_reg': 2.315755618424441, 'colsample_bylevel': 0.621938250000565, 'min_child_samples': 44}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 2 bitti | MSE: 387.2670 | Learn RMSE: 0.4270 | Val_0 RMSE: 0.4270 | Val_1 RMSE : 0.4356 | İterasyon: 1866
✅ Trial 3 bitti | MSE: 335.9856 | Learn RMSE: 0.4303 | Val_0 RMSE: 0.4303 | Val_1 RMSE : 0.4352 | İterasyon: 554


[I 2025-08-27 13:48:49,097] Trial 3 finished with value: 335.9855506599756 and parameters: {'learning_rate': 0.05254001772004395, 'depth': 7, 'l2_leaf_reg': 7.126594182314087, 'colsample_bylevel': 0.8437375689162578, 'min_child_samples': 25}. Best is trial 1 with value: 294.5126161013067.
[I 2025-08-27 13:49:07,575] Trial 4 finished with value: 386.5751591073186 and parameters: {'learning_rate': 0.030356687488645024, 'depth': 9, 'l2_leaf_reg': 1.4697821018404569, 'colsample_bylevel': 0.872290938514591, 'min_child_samples': 97}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 4 bitti | MSE: 386.5752 | Learn RMSE: 0.4255 | Val_0 RMSE: 0.4255 | Val_1 RMSE : 0.4358 | İterasyon: 544


[I 2025-08-27 13:49:21,235] Trial 5 finished with value: 366.2567109940408 and parameters: {'learning_rate': 0.04455503261784065, 'depth': 8, 'l2_leaf_reg': 3.796390195906418, 'colsample_bylevel': 0.7737328798025489, 'min_child_samples': 99}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 5 bitti | MSE: 366.2567 | Learn RMSE: 0.4276 | Val_0 RMSE: 0.4276 | Val_1 RMSE : 0.4352 | İterasyon: 538


[I 2025-08-27 13:49:47,086] Trial 6 finished with value: 334.024477273911 and parameters: {'learning_rate': 0.016929659254169283, 'depth': 7, 'l2_leaf_reg': 1.2974376178553737, 'colsample_bylevel': 0.8478831903864567, 'min_child_samples': 94}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 6 bitti | MSE: 334.0245 | Learn RMSE: 0.4295 | Val_0 RMSE: 0.4295 | Val_1 RMSE : 0.4351 | İterasyon: 1784


[I 2025-08-27 13:50:09,282] Trial 7 finished with value: 326.2092655378106 and parameters: {'learning_rate': 0.026809139094762388, 'depth': 5, 'l2_leaf_reg': 1.5266608041944536, 'colsample_bylevel': 0.6642769931602601, 'min_child_samples': 29}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 7 bitti | MSE: 326.2093 | Learn RMSE: 0.4315 | Val_0 RMSE: 0.4315 | Val_1 RMSE : 0.4348 | İterasyon: 2193


[I 2025-08-27 13:50:46,561] Trial 8 finished with value: 326.1219444083908 and parameters: {'learning_rate': 0.010346913961916863, 'depth': 6, 'l2_leaf_reg': 1.4987365092626603, 'colsample_bylevel': 0.5207430298903617, 'min_child_samples': 39}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 8 bitti | MSE: 326.1219 | Learn RMSE: 0.4319 | Val_0 RMSE: 0.4319 | Val_1 RMSE : 0.4350 | İterasyon: 3493


[I 2025-08-27 13:51:01,738] Trial 9 finished with value: 383.2198065567333 and parameters: {'learning_rate': 0.04803154803748883, 'depth': 9, 'l2_leaf_reg': 2.0142473774806797, 'colsample_bylevel': 0.7293857083599191, 'min_child_samples': 41}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 9 bitti | MSE: 383.2198 | Learn RMSE: 0.4226 | Val_0 RMSE: 0.4226 | Val_1 RMSE : 0.4358 | İterasyon: 411


[I 2025-08-27 13:51:14,238] Trial 10 finished with value: 313.6754275995975 and parameters: {'learning_rate': 0.09987091512997025, 'depth': 4, 'l2_leaf_reg': 8.301458108375321, 'colsample_bylevel': 0.6226250000013146, 'min_child_samples': 10}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 10 bitti | MSE: 313.6754 | Learn RMSE: 0.4324 | Val_0 RMSE: 0.4324 | Val_1 RMSE : 0.4350 | İterasyon: 1068
✅ Trial 11 bitti | MSE: 296.6241 | Learn RMSE: 0.4332 | Val_0 RMSE: 0.4332 | Val_1 RMSE : 0.4347 | İterasyon: 834


[I 2025-08-27 13:51:23,420] Trial 11 finished with value: 296.6240646657087 and parameters: {'learning_rate': 0.09403830358762762, 'depth': 4, 'l2_leaf_reg': 9.803417526175123, 'colsample_bylevel': 0.9965496943006595, 'min_child_samples': 8}. Best is trial 1 with value: 294.5126161013067.
[I 2025-08-27 13:51:31,945] Trial 12 finished with value: 298.2963269053093 and parameters: {'learning_rate': 0.09792927575823643, 'depth': 4, 'l2_leaf_reg': 5.372253462885506, 'colsample_bylevel': 0.9868311635554933, 'min_child_samples': 7}. Best is trial 1 with value: 294.5126161013067.


✅ Trial 12 bitti | MSE: 298.2963 | Learn RMSE: 0.4332 | Val_0 RMSE: 0.4332 | Val_1 RMSE : 0.4350 | İterasyon: 754


[I 2025-08-27 13:51:43,996] Trial 13 finished with value: 289.44557895989374 and parameters: {'learning_rate': 0.06795080918579646, 'depth': 5, 'l2_leaf_reg': 9.127221293269104, 'colsample_bylevel': 0.9983716266050546, 'min_child_samples': 71}. Best is trial 13 with value: 289.44557895989374.


✅ Trial 13 bitti | MSE: 289.4456 | Learn RMSE: 0.4309 | Val_0 RMSE: 0.4309 | Val_1 RMSE : 0.4350 | İterasyon: 1091


[I 2025-08-27 13:51:53,691] Trial 14 finished with value: 337.864524345192 and parameters: {'learning_rate': 0.06432548673714628, 'depth': 6, 'l2_leaf_reg': 5.455690076321882, 'colsample_bylevel': 0.9169656731744283, 'min_child_samples': 70}. Best is trial 13 with value: 289.44557895989374.


✅ Trial 14 bitti | MSE: 337.8645 | Learn RMSE: 0.4306 | Val_0 RMSE: 0.4306 | Val_1 RMSE : 0.4352 | İterasyon: 622


[W 2025-08-27 13:51:58,519] Trial 15 failed with parameters: {'learning_rate': 0.040136280194092384, 'depth': 5, 'l2_leaf_reg': 4.06408180607236, 'colsample_bylevel': 0.7415245081318025, 'min_child_samples': 67} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipython-input-4071707534.py", line 65, in objective
    model.fit(
  File "/usr/local/lib/python3.12/dist-packages/catboost/core.py", line 5873, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/catboost/core.py", line 2410, in _fit
   

KeyboardInterrupt: 