In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import joblib  # Modelleri kaydetmek için

In [10]:
# Eğitilmiş veri setini yükle
print("Loading train set...")
train_df = pd.read_csv('../processed_data/preprocessed_train_set.csv')

# 2. Veriyi eğitim ve doğrulama setlerine ayırma
X = train_df.drop(columns=['prediction'])
y = train_df['prediction']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 3. Birinci Model: 0 mı, değil mi? (Binary Classification)
print("Training Binary Classification Model (0 vs. Not 0)...")
y_train_binary = (y_train == 0).astype(int)
y_val_binary = (y_val == 0).astype(int)

model_1 = XGBClassifier(scale_pos_weight=(1 / y_train_binary.mean()), random_state=42)
model_1.fit(X_train, y_train_binary)

# Performans değerlendirme
pred_1_val = model_1.predict(X_val)
print("\nBinary Classification Performance (Validation Set):")
print(classification_report(y_val_binary, pred_1_val))

# 4. İkinci Model: 1, 2, 3, 4 sınıfları için (Multi-Class Classification)
print("Training Multi-Class Classification Model (1, 2, 3, 4)...")
X_train_second = X_train[y_train != 0]
y_train_second = y_train[y_train != 0]
X_val_second = X_val[pred_1_val == 0]  # Birinci modelin "0 değil" dediği veriler
y_val_second = y_val[pred_1_val == 0]

# En iyi parametrelerle RandomForestClassifier tanımı
best_params = {
    'bootstrap': True,
    'max_depth': None,
    'min_samples_leaf': 4,
    'min_samples_split': 10,
    'n_estimators': 50
}

model_2 = RandomForestClassifier(**best_params, random_state=42)
model_2.fit(X_train_second, y_train_second)

# Performans değerlendirme
pred_2_val = model_2.predict(X_val_second)
print("\nMulti-Class Classification Performance (Validation Set):")
print(classification_report(y_val_second, pred_2_val))

# 5. Modelleri kaydetme
print("Saving models...")
joblib.dump(model_1, '../models/binary_model.pkl')
joblib.dump(model_2, '../models/multi_class_model.pkl')
print("Models saved successfully!")

# 6. Birleştirilmiş Tahminler (Validation Set)
print("\nEvaluating Combined Model...")
final_predictions = []
for i in range(len(X_val)):
    if pred_1_val[i] == 1:  # Birinci model "0" dediyse
        final_predictions.append(0)
    else:  # İkinci model tahminine geç
        final_predictions.append(pred_2_val[0])
        pred_2_val = np.delete(pred_2_val, 0)  # Tahmin edilen elemanı sil

final_predictions = np.array(final_predictions)

# Birleştirilmiş performans raporu
print("\nCombined Model Performance (Validation Set):")
print(classification_report(y_val, final_predictions))



Loading train set...
Training Binary Classification Model (0 vs. Not 0)...

Binary Classification Performance (Validation Set):
              precision    recall  f1-score   support

           0       0.84      0.70      0.76      3360
           1       0.93      0.97      0.95     14456

    accuracy                           0.92     17816
   macro avg       0.89      0.83      0.86     17816
weighted avg       0.91      0.92      0.91     17816

Training Multi-Class Classification Model (1, 2, 3, 4)...

Multi-Class Classification Performance (Validation Set):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       452
           1       0.22      0.20      0.21       395
           2       0.20      0.21      0.21       504
           3       0.17      0.14      0.15       519
           4       0.41      0.66      0.50       929

    accuracy                           0.31      2799
   macro avg       0.20      0.24      0.21      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
