In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [21]:
train = pd.read_csv('/Users/cenkerarin/ml_test/datasets/cs-training.csv')
test = pd.read_csv('/Users/cenkerarin/ml_test/datasets/cs-test.csv')

In [22]:
train.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [23]:
test.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,5,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


---

this notebook will cover over xgboost explanation, parameters and use-case in a kaggle comp called 'Give Me Some Credit'

---

In [24]:
X = train.drop(['SeriousDlqin2yrs', 'Unnamed: 0'], axis=1)
y = train['SeriousDlqin2yrs']

X = X.fillna(X.median())

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Parametrelerin Açıklamaları

#### Boosting Parametreleri

boosting modellerinin zayıf ağaç modellerinin ardışık olarak kurularak her iterasyonda hatayı azaltmaya çalışan modeller olduğunu biliyoruz. Bu sectiondaki parametrelerin olayı da o ardışık öğrenmeyi gerçekleştirecek ağaçların özellikleri hakkında

**n_estimators** = kaç round'da, ağaçta boost gerçekleşecek

**max_depth** = ağaçların derinliği,uzunluğu

**min_child_weight** = ağaçların dallanabilmesi için gereken weight büyüklüğü

**subsample** = her round'da verimizdeki örneklerin yüzde kaçı kullanılacak

**colsample_bytree** = ağaçlarda rastgele kaç tane feature kullanılacak o ayarlanır

**colsample_bylevel** = Her tree’deki her level için kaç feature kullanılacağını ayarlar.

**colsample_bynode** = Her split için feature sampling oranı.

#### Öğrenme Parametreleri

**learning_rate** = her boosting round'da güncelleme ne kadar yapılacacak

**gamma** = bir dallanma yapabilmesi için gereken minimum loss reduction miktarı
 
**alpha** = L1 regularization terimi (Lasso benzeri) -> feature selection için
 
**reg_lambda** = L2 regularization terimi (Ridge benzeri)
 
#### Ağaç Yapısı Parametreleri
 
**max_delta_step** = her ağacın weight tahminlemesi için izin verilen maksimum delta step
 
**scale_pos_weight** = pozitif ve negatif sınıfların dengelenmesi için kullanılır
 
**booster** = hangi booster kullanılacağı: gbtree, gblinear veya dart
 
**tree_method** = ağaç yapım algoritması
 
**grow_policy** = yeni node'ların ağaca nasıl ekleneceğini kontrol eder



In [None]:
xgb_model = xgb.XGBClassifier(
    # Basic parameters
    random_state=42,
    eval_metric='logloss',
    
    # Boosting parameters
    n_estimators=100,           # Number of trees (boosting rounds)
    max_depth=6,                # Maximum depth of trees
    min_child_weight=1,         # Minimum sum of instance weight needed in a child
    subsample=0.8,              # Subsample ratio of training instances
    colsample_bytree=0.8,       # Subsample ratio of columns when constructing each tree
    colsample_bylevel=1.0,      # Subsample ratio of columns for each level
    colsample_bynode=1.0,       # Subsample ratio of columns for each split
    
    # Learning parameters
    learning_rate=0.1,          # Step size shrinkage (eta)
    gamma=0,                    # Minimum loss reduction required to make split
    alpha=0,                    # L1 regularization term
    reg_lambda=1,               # L2 regularization term (lambda)
    
    # Tree construction parameters
    max_delta_step=0,           # Maximum delta step allowed for each tree's weight estimation
    scale_pos_weight=1,         # Balancing of positive and negative weights
    
    # General parameters
    booster='gbtree',           # Which booster to use: gbtree, gblinear or dart
    tree_method='auto',         # Tree construction algorithm
    grow_policy='depthwise',    # Controls how new nodes are added to the tree
    max_leaves=0,               # Maximum number of leaves (0 means no limit)
    max_bin=256,                # Maximum number of bins for feature values
    
    # Performance parameters
    n_jobs=-1,                  # Number of parallel threads
    verbosity=1,                # Verbosity level
    
    # GPU parameters (uncomment if using GPU)
    # tree_method='gpu_hist',
    # gpu_id=0,
    
    # Early stopping and validation (will be used in fit method)
    early_stopping_rounds=None,  # Will be set in fit method if needed
    
    # Additional advanced parameters
    monotone_constraints=None,   # Monotonic constraints for features
    interaction_constraints=None, # Constraints for feature interactions
    importance_type='gain',      # Feature importance type
    
    # Dart booster specific parameters (if booster='dart')
    # sample_type='uniform',
    # normalize_type='tree',
    # rate_drop=0.0,
    # one_drop=0,
    # skip_drop=0.0,
    
    # Validation and callbacks parameters
    validate_parameters=True,    # Validate input parameters
    enable_categorical=False     # Enable categorical feature support
)

# Fit the model with evaluation set for monitoring
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric=['logloss', 'auc'],
    early_stopping_rounds=50,
    verbose=True
)


Baseline XGBoost Accuracy: 0.9364

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     27995
           1       0.56      0.21      0.31      2005

    accuracy                           0.94     30000
   macro avg       0.75      0.60      0.64     30000
weighted avg       0.92      0.94      0.92     30000


Confusion Matrix:
[[27663   332]
 [ 1576   429]]


In [None]:
y_pred = xgb_model.predict(X_val)
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_pred)
print(f"Baseline XGBoost Accuracy: {accuracy:.4f}")

print(classification_report(y_val, y_pred))

print(confusion_matrix(y_val, y_pred))

In [None]:
if 'SeriousDlqin2yrs' in test.columns:
    X_test = test.drop(['Unnamed: 0', 'SeriousDlqin2yrs'], axis=1)
else:
    X_test = test.drop(['Unnamed: 0'], axis=1)

X_test = X_test.reindex(columns=X.columns, fill_value=0)

X_test = X_test.fillna(X_test.median())


test_predictions = xgb_model.predict_proba(X_test)[:, 1]

submission = pd.DataFrame({
    'Id': test['Unnamed: 0'],
    'Probability': test_predictions
})

# Save submission file if needed
# submission.to_csv('xgboost_submission.csv', index=False)


Test data columns: ['Unnamed: 0', 'SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
Training features used: ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
Test features shape: (101503, 10)
Training features shape: (150000, 10)
Submission file created: xgboost_submission.csv
Shape of submission: (101503, 2)

First few predictions:
   Id  Probability
0   1     0.053767
1   2     0.058438
2   3     0.007077
3   4     0.082744
4   5     0.098151
