In [151]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve


In [152]:
df = pd.read_csv('dataset_for_models.csv')

In [153]:
def prepare_features_and_target(df, target_column='default'):
    """
    Datasetdan kerakli va kerak bo'lmagan ustunlarni ajratib oladi.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        To'liq dataset
    target_column : str
        Target ustun nomi (default: 'default')
    
    Returns:
    --------
    X : pandas.DataFrame
        Feature ustunlar (ID va target ustunlarsiz)
    y : pandas.Series
        Target ustun
    """
    # ID ustunlar (model uchun kerak emas)
    id_columns = ['customer_ref', 'application_id']
    
    # Kerak bo'lmagan ustunlar (identifikatorlar yoki foydasiz)
    unnecessary_columns = ['loan_officer_id', 'previous_zip_code', 'referral_code']
    
    # Barcha olib tashlash kerak bo'lgan ustunlar
    columns_to_drop = id_columns + unnecessary_columns + [target_column]
    
    # Faqat mavjud ustunlarni olib tashlash
    columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    
    # Feature ustunlar (X)
    X = df.drop(columns=columns_to_drop, errors='ignore')
    
    # Target ustun (y)
    y = df[target_column].copy()
    
    return X, y

X, y = prepare_features_and_target(df)


In [154]:
# Categorical va numerical ustunlarni ajratish
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()


In [155]:
# Categorical ustunlarni encode qilish
X_encoded = X.copy()
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    X_encoded[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le


In [156]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)


In [157]:
# LightGBM model train qilish (Class weights bilan)
class_counts = y_train.value_counts().sort_index()
total = len(y_train)
weight_0 = total / (2 * class_counts[0])
weight_1 = total / (2 * class_counts[1])
class_weights = {0: weight_0, 1: weight_1}

model = lgb.LGBMClassifier(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    class_weight=class_weights,
    verbose=0,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=100)]
)


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.807105
Did not meet early stopping. Best iteration is:
[85]	valid_0's auc: 0.807723


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,"{0: np.float64(0.5268939172179615), 1: np.float64(9.79578231292517)}"
,min_split_gain,0.0
,min_child_weight,0.001


In [158]:
# Model baholash va Threshold tuning
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Optimal threshold topish (F1 score maksimallashtirish)
thresholds = np.arange(0.1, 0.9, 0.01)
best_f1 = 0
best_threshold = 0.5
best_y_pred = None

for threshold in thresholds:
    y_pred_temp = (y_pred_proba >= threshold).astype(int)
    f1_temp = f1_score(y_test, y_pred_temp)
    if f1_temp > best_f1:
        best_f1 = f1_temp
        best_threshold = threshold
        best_y_pred = y_pred_temp

y_pred = best_y_pred if best_y_pred is not None else (y_pred_proba >= best_threshold).astype(int)

# Metrikalar
auc_score = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("=" * 60)
print("MODEL NATIJALARI")
print("=" * 60)
print(f"ROC-AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Optimal Threshold: {best_threshold:.3f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


MODEL NATIJALARI
ROC-AUC Score: 0.8077
Accuracy: 0.9111
F1 Score: 0.3174
Optimal Threshold: 0.740

Confusion Matrix:
[[16028  1053]
 [  547   372]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95     17081
           1       0.26      0.40      0.32       919

    accuracy                           0.91     18000
   macro avg       0.61      0.67      0.63     18000
weighted avg       0.93      0.91      0.92     18000



In [159]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)


In [160]:
print("=" * 60)
print(f"\nROC-AUC Score: {auc_score:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"\nFeature ustunlar soni: {X_train.shape[1]}")
print(f"Train set: {X_train.shape[0]} qator")
print(f"Test set: {X_test.shape[0]} qator")
print("=" * 60)



ROC-AUC Score: 0.8077
Accuracy: 0.9111
F1 Score: 0.3174

Feature ustunlar soni: 53
Train set: 71999 qator
Test set: 18000 qator


In [161]:
# Barcha ma'lumotlarni predict qilish va saqlash
from pathlib import Path

X_all = X_encoded.copy()
all_proba = model.predict_proba(X_all)[:, 1]
all_pred = (all_proba >= 0.5).astype(int)  # Prob >= 0.5 → default = 1

results_df = pd.DataFrame({
    'customer_id': df['customer_ref'].values.astype(int),
    'prob': all_proba.round(5),
    'default': all_pred
})

result_folder = Path('../result')
result_folder.mkdir(exist_ok=True)
output_file = result_folder / 'results.csv'
results_df.to_csv(output_file, index=False)

print(f"✅ Results saqlandi: {output_file}")
print(f"Qatorlar: {len(results_df):,}, Ustunlar: {len(results_df.columns)}")


✅ Results saqlandi: ..\result\results.csv
Qatorlar: 89,999, Ustunlar: 3
