In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, classification_report, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this presumably also hides scikit-learn convergence warnings raised
# during the hyper-parameter search — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Read both tables from the working directory.
train_path, test_path = 'train_c.csv', 'test_c.csv'
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

# Set the identifiers aside (as a copy) for the submission file, then remove
# them from the test table.
test_ids = df_test['ID'].copy()
df_test = df_test.drop(columns=['ID'])

# Rows without a target value are discarded; the row count is reported
# before and after the filter.
print(f"Размер train до обработки: {len(df_train)}")
df_train = df_train.dropna(subset=['LoanApproved'])
print(f"Размер train после удаления NaN в LoanApproved: {len(df_train)}")

Размер train до обработки: 11017
Размер train после удаления NaN в LoanApproved: 10487


In [3]:
def extract_date_features(df):
    """Replace ``ApplicationDate`` with ``Year``, ``Month`` and ``DayOfWeek`` columns.

    The input frame is left untouched: a new frame is returned, so the function
    can be called repeatedly on the same object (the earlier in-place version
    dropped ``ApplicationDate`` from the caller's frame and therefore raised
    ``KeyError`` on a second call).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``ApplicationDate`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``ApplicationDate`` and with ``Year``, ``Month``
        and ``DayOfWeek`` appended in that order. Values that cannot be parsed
        as dates are coerced to ``NaT`` and yield missing values in all three
        columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``ApplicationDate`` column.
    """
    # errors='coerce' turns unparseable entries into NaT instead of raising.
    parsed = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    return df.drop(columns='ApplicationDate').assign(
        Year=parsed.dt.year,
        Month=parsed.dt.month,
        DayOfWeek=parsed.dt.dayofweek,
    )

# Expand the application date in both tables (train first, then test) so they
# keep the same column layout.
df_train, df_test = (
    extract_date_features(frame) for frame in (df_train, df_test)
)

In [4]:
def add_ratio_features(df):
    """Return a copy of ``df`` with four derived columns appended.

    Columns are appended in this order: ``Income_to_Debt``,
    ``Credit_to_Income``, ``Loan_to_Income`` and ``RateSpread``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``AnnualIncome``, ``MonthlyDebtPayments``,
        ``CreditScore``, ``LoanAmount``, ``InterestRate`` and
        ``BaseInterestRate``.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    # The "+ 1" in each denominator presumably guards against division by zero.
    income_plus_one = df['AnnualIncome'] + 1
    return df.assign(
        Income_to_Debt=df['AnnualIncome'] / (df['MonthlyDebtPayments'] + 1),
        Credit_to_Income=df['CreditScore'] / income_plus_one,
        Loan_to_Income=df['LoanAmount'] / income_plus_one,
        RateSpread=df['InterestRate'] - df['BaseInterestRate'],
    )


# A single helper for both tables guarantees that train and test features are
# computed with exactly the same formulas.
df_train = add_ratio_features(df_train)
df_test = add_ratio_features(df_test)

In [18]:
# Columns sent through the categorical branch of the preprocessor.
categorical_cols = [
    'MaritalStatus',
    'HomeOwnershipStatus',
    'LoanPurpose',
    'EmploymentStatus',
    'EducationLevel',
    'BankruptcyHistory',
    'PreviousLoanDefaults',
    'PaymentHistory',
]

# Numeric columns sent through the numeric branch: eight raw columns followed
# by the four engineered ones.
important_numerical_cols = [
    'Age',
    'AnnualIncome',
    'CreditScore',
    'LoanAmount',
    'LoanDuration',
    'MonthlyDebtPayments',
    'DebtToIncomeRatio',
    'NetWorth',
    'Income_to_Debt',
    'Credit_to_Income',
    'Loan_to_Income',
    'RateSpread',
]

# Report the class balance of the target, as absolute counts and as shares.
target_values = df_train['LoanApproved']
class_shares = target_values.value_counts(normalize=True).round(3)
print("Распределение классов:")
print(target_values.value_counts())
print(f"Соотношение: {class_shares}")

Распределение классов:
LoanApproved
1.0    5367
0.0    5120
Name: count, dtype: int64
Соотношение: LoanApproved
1.0    0.512
0.0    0.488
Name: proportion, dtype: float64


In [6]:
# Numeric branch: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fitting are ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Each branch only receives the columns listed for it.
# NOTE(review): no `remainder` is passed, so with ColumnTransformer's default
# any column not listed here (e.g. Year/Month/DayOfWeek) is presumably not
# forwarded to the classifier — confirm that is intended.
column_branches = [
    ('num', numerical_transformer, important_numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_branches)

In [19]:
# Separate the target from the features.
target_column = 'LoanApproved'
X = df_train.drop(columns=target_column)
y = df_train[target_column]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Размеры выборок:")
for split_label, split_frame in (("Train", X_train), ("Validation", X_val)):
    print(f"{split_label}: {split_frame.shape}")

Размеры выборок:
Train: (8389, 40)
Validation: (2098, 40)


In [9]:
print("Оптимизация гиперпараметров GridSearchCV")

# Full pipeline: preprocessing followed by a class-weighted logistic regression.
classifier = LogisticRegression(class_weight='balanced', random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Search space; the keys address parameters of the 'classifier' pipeline step.
param_grid = dict(
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
    classifier__penalty=['l1', 'l2'],
    classifier__solver=['liblinear', 'saga'],
    classifier__max_iter=[1000, 2000],
)

# Echo the search space, one line per hyper-parameter.
print("\nПараметры для поиска:")
grid_captions = (
    ("C (регуляризация)", 'classifier__C'),
    ("Penalty", 'classifier__penalty'),
    ("Solver", 'classifier__solver'),
    ("Max iterations", 'classifier__max_iter'),
)
for caption, grid_key in grid_captions:
    print(f"{caption}: {param_grid[grid_key]}")

Оптимизация гиперпараметров GridSearchCV

Параметры для поиска:
C (регуляризация): [0.001, 0.01, 0.1, 1, 10, 100]
Penalty: ['l1', 'l2']
Solver: ['liblinear', 'saga']
Max iterations: [1000, 2000]


In [10]:
# 5-fold search over the grid, scored by ROC-AUC, using all available cores.
search_options = {'cv': 5, 'scoring': 'roc_auc', 'n_jobs': -1, 'verbose': 1}
grid_search = GridSearchCV(model, param_grid, **search_options)
grid_search.fit(X_train, y_train)

# `best_model` refers to the very same object as `grid_search.best_estimator_`.
best_model = grid_search.best_estimator_

print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"Лучший ROC-AUC (кросс-вал): {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Лучшие параметры: {'classifier__C': 1, 'classifier__max_iter': 1000, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Лучший ROC-AUC (кросс-вал): 0.9834


In [20]:
print("Оценка на валидационной выборке")

# NOTE(review): `best_model` is the same object as `grid_search.best_estimator_`;
# these numbers are only a fair hold-out estimate if that estimator has not been
# refitted on data containing the validation rows before this cell runs.
positive_class_column = 1
y_val_pred_proba = best_model.predict_proba(X_val)[:, positive_class_column]
y_val_pred = best_model.predict(X_val)

# ROC-AUC is computed from the probabilities, the remaining metrics from the
# hard class predictions.
roc_auc = roc_auc_score(y_val, y_val_pred_proba)
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)

print("\nОсновные метрики:")
metric_rows = (
    ("ROC-AUC", roc_auc),
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
)
for metric_label, metric_value in metric_rows:
    print(f"{metric_label}: {metric_value:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, y_val_pred))

Оценка на валидационной выборке

Основные метрики:
ROC-AUC: 0.9806
Accuracy: 0.9237
Precision: 0.9319
Recall: 0.9181
F1-score: 0.9250

Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.93      0.92      1024
         1.0       0.93      0.92      0.92      1074

    accuracy                           0.92      2098
   macro avg       0.92      0.92      0.92      2098
weighted avg       0.92      0.92      0.92      2098



In [12]:
# Re-score the selected pipeline with 5-fold cross-validation on the training split.
cv_scores = cross_val_score(best_model, X_train, y_train, scoring='roc_auc', cv=5)
formatted_scores = [f'{fold_score:.4f}' for fold_score in cv_scores]

print("ROC-AUC на кросс-валидации (5 folds):")
print(f"  Среднее: {cv_scores.mean():.4f}")
print(f"  Стандартное отклонение: {cv_scores.std():.4f}")
print(f"  Все значения: {formatted_scores}")

ROC-AUC на кросс-валидации (5 folds):
  Среднее: 0.9834
  Стандартное отклонение: 0.0016
  Все значения: ['0.9807', '0.9839', '0.9844', '0.9826', '0.9854']


In [14]:
# Fit the final model on ALL labelled rows (train + validation).
# A clone is used on purpose: `grid_search.best_estimator_` is the same object
# as `best_model`, so calling `.fit(X, y)` on it directly would silently retrain
# `best_model` on the validation rows and inflate any validation metric that is
# (re)computed afterwards.
final_model = clone(grid_search.best_estimator_)
final_model.fit(X, y)

# Probability of the positive class (second column of predict_proba).
y_test_pred_proba = final_model.predict_proba(df_test)[:, 1]

# Convert probabilities to hard 0/1 labels; strictly above the threshold means approved.
threshold = 0.5
y_test_pred = (y_test_pred_proba > threshold).astype(int)

# Report how the predictions are distributed between the two classes.
print(f"Распределение предсказаний (порог = {threshold}):")
approved_count = y_test_pred.sum()
total_count = len(y_test_pred)
print(f"Одобрено кредитов (1): {approved_count} из {total_count} ({approved_count/total_count*100:.1f}%)")
print(f"Отклонено кредитов (0): {total_count - approved_count} из {total_count} ({(total_count - approved_count)/total_count*100:.1f}%)")

Распределение предсказаний (порог = 0.5):
Одобрено кредитов (1): 2565 из 5000 (51.3%)
Отклонено кредитов (0): 2435 из 5000 (48.7%)


In [15]:
# Pair each saved test identifier with its predicted label and write the
# result without the index column.
submission_columns = {'ID': test_ids, 'LoanApproved': y_test_pred}
submission = pd.DataFrame(submission_columns)
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
print('Submission file created')

Submission file created


In [16]:
# Final recap: model family, search method and the two ROC-AUC figures.
summary_lines = (
    "Модель: Logistic Regression",
    "Метод оптимизации: GridSearchCV",
    f"Лучший ROC-AUC (кросс-валидация): {grid_search.best_score_:.4f}",
    f"ROC-AUC на валидации: {roc_auc:.4f}",
)
for summary_line in summary_lines:
    print(summary_line)

Модель: Logistic Regression
Метод оптимизации: GridSearchCV
Лучший ROC-AUC (кросс-валидация): 0.9834
ROC-AUC на валидации: 0.9796
