In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Load the transaction table from disk and report its size together with the
# number of rows whose Class label is 1.
df = pd.read_csv("hatalı.csv")
print(f"data tüklendi {df.shape}")
# Single quotes inside the replacement field: reusing the f-string's own quote
# character there is only accepted from Python 3.12 on and is a SyntaxError on
# earlier interpreters.
print(f"Fraud sayısı {df['Class'].sum()}")

data tüklendi (284807, 31)
Fraud sayısı 492


In [11]:
from sklearn.preprocessing import StandardScaler

# SCALING - standardise Time and Amount, one scaler per column.
# NOTE: both scalers are fit on every row of df in this cell.
scaler_time = StandardScaler()
scaler_amount = StandardScaler()
df['Time_Scaled'] = scaler_time.fit_transform(df[['Time']])
df['Amount_Scaled'] = scaler_amount.fit_transform(df[['Amount']])

# AMOUNT FEATURES
amount = df['Amount']
df['Amount_Log'] = np.log1p(amount)                # log(1 + Amount)
df['Is_Small_Amount'] = amount.lt(10).astype(int)  # 1 when Amount < 10
df['Is_Large_Amount'] = amount.gt(200).astype(int) # 1 when Amount > 200

# TIME FEATURES
df['Time_Hours'] = df['Time'].div(3600)             # Time / 3600 (seconds -> hours)
df['Hour'] = df['Time_Hours'].mod(24).astype(int)   # integer part of Time_Hours modulo 24
hour = df['Hour']
df['Is_Night'] = (hour.ge(22) | hour.le(6)).astype(int)  # 1 when Hour >= 22 or Hour <= 6

# V INTERACTIONS
df['V17_V14'] = df['V17'].mul(df['V14'])  # product of V17 and V14

# Sum of V17, V14, V12, V10 and V16, accumulated left to right.
top5_total = df['V17']
for v_col in ('V14', 'V12', 'V10', 'V16'):
    top5_total = top5_total + df[v_col]
df['Top5_sum'] = top5_total

print("Tum feature'lar olusturuldu!")

Tum feature'lar olusturuldu!


In [12]:
# FEATURE LIST - 37 features
features = [
    'Time_Scaled', 'Amount_Scaled',
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
    'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
    'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
    'Amount_Log', 'Is_Small_Amount', 'Is_Large_Amount',
    'Hour', 'Is_Night',
    'V17_V14', 'Top5_sum'
]

# Build the feature matrix and the target vector
X = df[features]
y = df['Class']

# TRAIN/TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,  # hold out 20% of the rows for testing
    random_state=42,  # fixed seed so the split is repeatable
    stratify=y  # keep the class ratio the same in both parts
)

# The Time_Scaled / Amount_Scaled columns stored on df were produced by scalers
# fit on every row, so their mean and variance include the test rows.
# Re-standardise both columns here with statistics taken from the training rows
# only, and apply that same transform to the test rows.
X_train = X_train.copy()  # explicit copies so the assignments never touch df
X_test = X_test.copy()
for raw_col, scaled_col in (('Time', 'Time_Scaled'), ('Amount', 'Amount_Scaled')):
    train_rows = df.loc[X_train.index, [raw_col]]
    test_rows = df.loc[X_test.index, [raw_col]]
    train_scaler = StandardScaler().fit(train_rows)
    X_train[scaled_col] = train_scaler.transform(train_rows).ravel()
    X_test[scaled_col] = train_scaler.transform(test_rows).ravel()

print(f"Train: {X_train.shape}")
print(f"Test: {X_test.shape}")
print(f"Feature sayisi: {len(features)}")

Train: (227845, 37)
Test: (56962, 37)
Feature sayisi: 37


In [13]:
# LIGHTGBM MODEL - tuned settings
print("LightGBM egitiliyor...")

# Test AUC of the Logistic Regression baseline that LightGBM is compared with.
# The run that produced this figure is not among the cells shown here, so it is
# kept as one named constant instead of being repeated as a bare literal.
LR_BASELINE_TEST_AUC = 0.9750

# Imbalance ratio: number of class-0 rows per class-1 row in the training set
imbalance_ratio = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Imbalance ratio: {imbalance_ratio:.1f}:1")

# Build the model
# NOTE(review): the recorded run reports a train AUC of 0.9349, which is low for
# a 500-tree booster scored on its own training data; the very large
# scale_pos_weight is a possible cause - TODO confirm.
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,              # number of boosting rounds
    learning_rate=0.05,            # shrinkage applied to each round
    max_depth=7,                   # depth limit per tree
    num_leaves=31,                 # leaf limit per tree
    min_child_samples=20,          # minimum rows required in a leaf
    scale_pos_weight=imbalance_ratio,  # weight of the positive class
    random_state=42,
    verbose=-1
)

# Fit on the training split
lgb_model.fit(X_train, y_train)

# Predicted probability of the positive class (column 1)
y_train_pred_lgb = lgb_model.predict_proba(X_train)[:, 1]
y_test_pred_lgb = lgb_model.predict_proba(X_test)[:, 1]

# ROC AUC on both splits
train_auc_lgb = roc_auc_score(y_train, y_train_pred_lgb)
test_auc_lgb = roc_auc_score(y_test, y_test_pred_lgb)

print("\nSONUCLAR:")
print("="*50)
print(f"Logistic Regression Test AUC: {LR_BASELINE_TEST_AUC:.4f}")
print(f"LightGBM Train AUC: {train_auc_lgb:.4f}")
print(f"LightGBM Test AUC:  {test_auc_lgb:.4f}")
print(f"Improvement: {test_auc_lgb - LR_BASELINE_TEST_AUC:.4f}")
print("="*50)

LightGBM egitiliyor...
Imbalance ratio: 577.3:1

SONUCLAR:
Logistic Regression Test AUC: 0.9750
LightGBM Train AUC: 0.9349
LightGBM Test AUC:  0.9137
Improvement: -0.0613


In [14]:
from sklearn.model_selection import cross_val_score

def test_lgb_params(n_estimators, learning_rate, max_depth, num_leaves):
    """Cross-validate one LightGBM configuration on the training split.

    Reads the notebook-level ``X_train`` and ``y_train``. The positive class
    is weighted by the ratio of class-0 to class-1 rows in ``y_train``.

    Args:
        n_estimators: forwarded to ``LGBMClassifier(n_estimators=...)``.
        learning_rate: forwarded to ``LGBMClassifier(learning_rate=...)``.
        max_depth: forwarded to ``LGBMClassifier(max_depth=...)``.
        num_leaves: forwarded to ``LGBMClassifier(num_leaves=...)``.

    Returns:
        ``(mean, std)`` of the ROC AUC scores from 5-fold cross-validation.
    """
    negatives = (y_train == 0).sum()
    positives = (y_train == 1).sum()

    candidate = lgb.LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        min_child_samples=20,
        scale_pos_weight=negatives / positives,
        random_state=42,
        verbose=-1,
    )

    # 5 folds, scored by ROC AUC, using all available cores.
    fold_aucs = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1
    )

    return fold_aucs.mean(), fold_aucs.std()

print("CV fonksiyonu hazir")

CV fonksiyonu hazir!


In [15]:

# Hyperparameter combinations to evaluate, one tuple per trial:
# (n_estimators, learning_rate, max_depth, num_leaves)
param_grid = [
    (300, 0.05, 6, 31),
    (500, 0.05, 6, 31),
    (300, 0.03, 7, 31),
    (500, 0.03, 7, 31),
    (500, 0.05, 7, 63),
]

results = []
n_trials = len(param_grid)

for trial_no, combo in enumerate(param_grid, start=1):
    trees, rate, depth_limit, leaf_count = combo
    print(f"Test {trial_no}/{n_trials}: n_est={trees}, lr={rate}, depth={depth_limit}, leaves={leaf_count}")

    # The tuple order matches the positional parameters of test_lgb_params.
    auc_mean, auc_std = test_lgb_params(*combo)

    results.append(dict(
        n_estimators=trees,
        learning_rate=rate,
        max_depth=depth_limit,
        num_leaves=leaf_count,
        mean_auc=auc_mean,
        std_auc=auc_std,
    ))

    print(f"  -> CV AUC: {auc_mean:.4f} (+/- {auc_std:.4f})\n")

# Collect every trial in a table, highest mean AUC first
results_df = pd.DataFrame(results).sort_values('mean_auc', ascending=False)

banner = "=" * 70
print(banner)
print("EN IYI 5 SONUC:")
print(banner)
print(results_df.to_string(index=False))

HYPERPARAMETER TUNING BASLIYOR...
Bu ~20-30 dakika surecek, lutfen bekleyin!

Test 1/5: n_est=300, lr=0.05, depth=6, leaves=31
  -> CV AUC: 0.8758 (+/- 0.0604)

Test 2/5: n_est=500, lr=0.05, depth=6, leaves=31
  -> CV AUC: 0.8590 (+/- 0.0485)

Test 3/5: n_est=300, lr=0.03, depth=7, leaves=31
  -> CV AUC: 0.8682 (+/- 0.0298)

Test 4/5: n_est=500, lr=0.03, depth=7, leaves=31
  -> CV AUC: 0.8914 (+/- 0.0066)

Test 5/5: n_est=500, lr=0.05, depth=7, leaves=63
  -> CV AUC: 0.8391 (+/- 0.1042)

EN IYI 5 SONUC:
 n_estimators  learning_rate  max_depth  num_leaves  mean_auc  std_auc
          500           0.03          7          31  0.891431 0.006577
          300           0.05          6          31  0.875803 0.060415
          300           0.03          7          31  0.868159 0.029784
          500           0.05          6          31  0.859004 0.048507
          500           0.05          7          63  0.839119 0.104179


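LightGBM'i denedim ve CV ile hyperparameter tuning yaptım, ama Logistic Regression daha iyi sonuç verdi (test AUC: Logistic Regression 0.9750, LightGBM 0.9137; en iyi 5-fold CV AUC 0.8914). Muhtemel sebep, dataset'in PCA ile dönüştürülmüş olması ve sınıfların büyük ölçüde lineer olarak ayrılabilmesi. Ancak LightGBM'in train AUC'sinin de düşük kalması (0.9349), sorunun model karmaşıklığından çok eğitim ayarlarından (örneğin scale_pos_weight ≈ 577) kaynaklanabileceğini düşündürüyor; bu ayrıca kontrol edilmeli. Karmaşık model her zaman daha iyi olmuyor!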