In [6]:
# ────────────────────────────────────────────────────────────────
#  Credit Default Prediction — Course Project
#  (версия с ручной загрузкой файлов в Colab)
# ────────────────────────────────────────────────────────────────

# 1. Загрузка файлов с компьютера
print("Пожалуйста, загрузите два файла:")
print("  1. course_project_train.csv")
print("  2. course_project_test.csv\n")

from google.colab import files
uploaded = files.upload()

# ────────────────────────────────────────────────────────────────
# 2. Импорты
# ────────────────────────────────────────────────────────────────

import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# ────────────────────────────────────────────────────────────────
# 3. Чтение загруженных файлов
# ────────────────────────────────────────────────────────────────

train_file = 'course_project_train.csv'
test_file  = 'course_project_test.csv'

print("\nЧитаем данные...")

try:
    train = pd.read_csv(train_file)
    test  = pd.read_csv(test_file)
    print("Успешно прочитано!")
except Exception as e:
    print("Ошибка при чтении файлов:", e)
    raise

print("\nРазмер train:", train.shape)
print("Размер test :", test.shape)

print("\nРаспределение Credit Default в train:")
print(train['Credit Default'].value_counts(normalize=True).round(4) * 100, "%")

# ────────────────────────────────────────────────────────────────
# 4. Предобработка
# ────────────────────────────────────────────────────────────────

def prepare_dataset(df):
    df = df.copy()

    # Current Loan Amount — аномалия 99999999 → NaN
    if 'Current Loan Amount' in df.columns:
        df['Current Loan Amount'] = df['Current Loan Amount'].replace(99999999, np.nan)

    # Credit Score — часто ошибка ввода (751–851 вместо 300–850)
    if 'Credit Score' in df.columns:
        df['Credit Score'] = np.where(df['Credit Score'] > 800,
                                      df['Credit Score'] / 10,
                                      df['Credit Score'])

    # Years in current job → число
    if 'Years in current job' in df.columns:
        job_map = {
            '< 1 year' : 0.5,  '1 year'   : 1,   '2 years'  : 2,
            '3 years'  : 3,    '4 years'  : 4,   '5 years'  : 5,
            '6 years'  : 6,    '7 years'  : 7,   '8 years'  : 8,
            '9 years'  : 9,    '10+ years': 10
        }
        df['Years in current job'] = df['Years in current job'].map(job_map)

    # Months since last delinquent — пропуски → 999
    if 'Months since last delinquent' in df.columns:
        df['Months since last delinquent'] = df['Months since last delinquent'].fillna(999)

    # Полезные отношения
    if all(c in df.columns for c in ['Monthly Debt', 'Annual Income']):
        df['Debt_to_Income'] = df['Monthly Debt'] / (df['Annual Income'] + 1e-6)

    if all(c in df.columns for c in ['Current Loan Amount', 'Annual Income']):
        df['Loan_to_Income'] = df['Current Loan Amount'] / (df['Annual Income'] + 1e-6)

    if all(c in df.columns for c in ['Current Credit Balance', 'Maximum Open Credit']):
        df['Balance_Credit_Ratio'] = df['Current Credit Balance'] / (df['Maximum Open Credit'] + 1)

    return df


print("\nПрименяем предобработку...")
train = prepare_dataset(train)
test  = prepare_dataset(test)

# ────────────────────────────────────────────────────────────────
# 5. Разделяем train → train / valid
# ────────────────────────────────────────────────────────────────

X = train.drop(columns=['Credit Default'])
y = train['Credit Default']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

print("\nРазмеры:")
print(f"  X_train : {X_train.shape}")
print(f"  X_valid : {X_valid.shape}")

# ────────────────────────────────────────────────────────────────
# 6. Определяем числовые и категориальные признаки
# ────────────────────────────────────────────────────────────────

num_cols = [
    'Annual Income', 'Years in current job', 'Tax Liens',
    'Number of Open Accounts', 'Years of Credit History',
    'Maximum Open Credit', 'Number of Credit Problems',
    'Months since last delinquent', 'Bankruptcies',
    'Current Loan Amount', 'Current Credit Balance',
    'Monthly Debt', 'Credit Score',
    'Debt_to_Income', 'Loan_to_Income', 'Balance_Credit_Ratio'
]

cat_cols = ['Home Ownership', 'Purpose', 'Term']

# Только существующие столбцы
num_cols = [c for c in num_cols if c in X_train.columns]
cat_cols = [c for c in cat_cols if c in X_train.columns]

print("\nЧисловых признаков :", len(num_cols))
print("Категориальных     :", cat_cols)

# ────────────────────────────────────────────────────────────────
# 7. Пайплайн
# ────────────────────────────────────────────────────────────────

num_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler())
])

cat_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe',    OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
], remainder='drop')

model = LogisticRegression(
    class_weight='balanced',
    max_iter=2000,
    random_state=42
)

full_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', model)
])

# ────────────────────────────────────────────────────────────────
# 8. Кросс-валидация
# ────────────────────────────────────────────────────────────────

print("\nСчитаем качество на 5-fold CV...")
cv_f1 = cross_val_score(full_pipe, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)

print("F1 по фолдам  :", [round(x, 4) for x in cv_f1])
print(f"Среднее F1    : {cv_f1.mean():.4f}")

# ────────────────────────────────────────────────────────────────
# 9. Финальное обучение и валидация
# ────────────────────────────────────────────────────────────────

print("\nОбучаем модель на тренировочных данных...")
full_pipe.fit(X_train, y_train)

y_pred = full_pipe.predict(X_valid)

print("\nРезультаты на валидации:")
print(classification_report(y_valid, y_pred))

f1_valid = f1_score(y_valid, y_pred)
print(f"F1-score (класс 1): {f1_valid:.4f}")

print("\nМатрица ошибок:")
print(confusion_matrix(y_valid, y_pred))

# ────────────────────────────────────────────────────────────────
# 10. Предсказание на тесте + сохранение
# ────────────────────────────────────────────────────────────────

print("\nПредсказываем тестовый набор...")
test_preds = full_pipe.predict(test)

# Если в тестовом файле есть столбец Id — используем его
if 'Id' in test.columns:
    submission = pd.DataFrame({
        'Id': test['Id'],
        'Credit Default': test_preds
    })
else:
    submission = pd.DataFrame({
        'Id': range(len(test_preds)),
        'Credit Default': test_preds
    })

submission.to_csv('submission.csv', index=False)
print("Файл submission.csv сохранён")

# Скачиваем файл
files.download('submission.csv')

print("\nГотово!")

Пожалуйста, загрузите два файла:
  1. course_project_train.csv
  2. course_project_test.csv



Saving course_project_test.csv to course_project_test (1).csv
Saving course_project_train.csv to course_project_train (1).csv

Читаем данные...
Успешно прочитано!

Размер train: (7500, 17)
Размер test : (2500, 16)

Распределение Credit Default в train:
Credit Default
0    71.83
1    28.17
Name: proportion, dtype: float64 %

Применяем предобработку...

Размеры:
  X_train : (5625, 19)
  X_valid : (1875, 19)

Числовых признаков : 16
Категориальных     : ['Home Ownership', 'Purpose', 'Term']

Считаем качество на 5-fold CV...
F1 по фолдам  : [np.float64(0.4596), np.float64(0.4961), np.float64(0.4416), np.float64(0.4727), np.float64(0.4883)]
Среднее F1    : 0.4717

Обучаем модель на тренировочных данных...

Результаты на валидации:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      1347
           1       0.40      0.55      0.46       528

    accuracy                           0.64      1875
   macro avg       0.60      0.61      0.60   

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Готово!


In [7]:
# ────────────────────────────────────────────────────────────────
#  Credit Default Prediction — Полная версия для ДЗ
#  Цель: F1 (класс 1) > 0.50 на валидации
# ────────────────────────────────────────────────────────────────

# 0. Установка дополнительных библиотек (если ещё не установлены)
!pip install -q catboost

# 1. Импорты
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from google.colab import files

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

# ────────────────────────────────────────────────────────────────
# Если файлы ещё не загружены — загружаем
# ────────────────────────────────────────────────────────────────
print("Если файлы ещё не загружены — загрузите их сейчас:")
# files.upload()   # раскомментируй, если нужно загрузить заново

train = pd.read_csv('course_project_train.csv')
test  = pd.read_csv('course_project_test.csv')

print("train:", train.shape, " | test:", test.shape)
print("Доля дефолта в train:", train['Credit Default'].mean().round(4))

# ────────────────────────────────────────────────────────────────
# 1. Улучшенная предобработка
# ────────────────────────────────────────────────────────────────

def feature_engineering(df):
    df = df.copy()

    # ─── Аномалии и очевидные исправления ───────────────────────
    if 'Current Loan Amount' in df.columns:
        df['Current Loan Amount'] = df['Current Loan Amount'].replace(99999999, np.nan)

    if 'Credit Score' in df.columns:
        df['Credit Score'] = np.where(df['Credit Score'] > 800, df['Credit Score']/10, df['Credit Score'])

    # ─── Years in current job → число ───────────────────────────
    job_map = {
        '< 1 year':0.5, '1 year':1, '2 years':2, '3 years':3, '4 years':4,
        '5 years':5, '6 years':6, '7 years':7, '8 years':8, '9 years':9,
        '10+ years':11
    }
    if 'Years in current job' in df.columns:
        df['Years in current job'] = df['Years in current job'].replace(job_map)

    # ─── Пропуски Months since last delinquent ──────────────────
    if 'Months since last delinquent' in df.columns:
        df['Months since last delinquent'] = df['Months since last delinquent'].fillna(999)
        df['Had_Delinquent'] = (df['Months since last delinquent'] < 999).astype(int)

    # ─── Клиппинг экстремальных значений (1%–99%) ───────────────
    for col in ['Annual Income', 'Monthly Debt', 'Maximum Open Credit',
                'Current Credit Balance', 'Current Loan Amount']:
        if col in df.columns:
            lower, upper = df[col].quantile([0.005, 0.995])
            df[col] = df[col].clip(lower, upper)

    # ─── Полезные соотношения ───────────────────────────────────
    if 'Annual Income' in df.columns:
        for col in ['Monthly Debt', 'Current Loan Amount', 'Current Credit Balance']:
            if col in df.columns:
                df[f'{col}_to_Income'] = df[col] / (df['Annual Income'] + 1)

    if 'Maximum Open Credit' in df.columns and 'Current Credit Balance' in df.columns:
        df['Credit_Usage_Ratio'] = df['Current Credit Balance'] / (df['Maximum Open Credit'] + 1)

    if 'Credit Score' in df.columns and 'Annual Income' in df.columns:
        df['Score_x_Income'] = df['Credit Score'] * df['Annual Income']

    # ─── Категоризация Credit Score (часто помогает) ────────────
    if 'Credit Score' in df.columns:
        df['Credit_Score_Bin'] = pd.cut(df['Credit Score'],
                                        bins=[0, 580, 620, 660, 700, 760, 850],
                                        labels=['Very Poor','Poor','Fair','Good','Very Good','Excellent'],
                                        include_lowest=True)

    return df


print("Применяем feature engineering...")
train = feature_engineering(train)
test  = feature_engineering(test)

# ────────────────────────────────────────────────────────────────
# 2. Разделение
# ────────────────────────────────────────────────────────────────

X = train.drop(columns=['Credit Default'])
y = train['Credit Default']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# ────────────────────────────────────────────────────────────────
# 3. Признаки
# ────────────────────────────────────────────────────────────────

num_features = [
    'Annual Income', 'Years in current job', 'Tax Liens',
    'Number of Open Accounts', 'Years of Credit History',
    'Maximum Open Credit', 'Number of Credit Problems',
    'Months since last delinquent', 'Bankruptcies',
    'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt',
    'Credit Score', 'Debt_to_Income', 'Loan_to_Income',
    'Credit_Usage_Ratio', 'Score_x_Income',
    'Monthly Debt_to_Income', 'Current Loan Amount_to_Income',
    'Current Credit Balance_to_Income', 'Had_Delinquent'
]

cat_features = ['Home Ownership', 'Purpose', 'Term']

if 'Credit_Score_Bin' in X_train.columns:
    cat_features.append('Credit_Score_Bin')

num_features = [c for c in num_features if c in X_train.columns]
cat_features = [c for c in cat_features if c in X_train.columns]

print(f"Числовых: {len(num_features)}, Категориальных: {len(cat_features)}")

# ────────────────────────────────────────────────────────────────
# 4. Пайплайн предобработки
# ────────────────────────────────────────────────────────────────

num_pipe = Pipeline([
    ('imputer', KNNImputer(n_neighbors=11)),
    ('scaler',  StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe',     OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
], remainder='drop')

# ────────────────────────────────────────────────────────────────
# 5. Модель — CatBoost (чаще всего даёт лучший результат)
# ────────────────────────────────────────────────────────────────

model = CatBoostClassifier(
    iterations=800,
    depth=6,
    learning_rate=0.035,
    auto_class_weights='Balanced',
    eval_metric='F1',
    random_seed=42,
    verbose=100
)

full_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', model)
])

# ────────────────────────────────────────────────────────────────
# 6. Кросс-валидация
# ────────────────────────────────────────────────────────────────

print("\nКросс-валидация (5 фолдов)...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(full_pipe, X_train, y_train, cv=cv, scoring='f1', n_jobs=-1)

print("F1 по фолдам:", np.round(scores, 4))
print(f"Среднее F1 на CV: {scores.mean():.4f} ± {scores.std():.4f}")

# ────────────────────────────────────────────────────────────────
# 7. Обучение и валидация
# ────────────────────────────────────────────────────────────────

print("\nФинальное обучение...")
full_pipe.fit(X_train, y_train)

y_pred = full_pipe.predict(X_valid)
y_prob = full_pipe.predict_proba(X_valid)[:, 1]

print("\n" + "="*60)
print("         Результаты на отложенной выборке")
print("="*60)
print(classification_report(y_valid, y_pred))
print(f"F1 (класс 1)       : {f1_score(y_valid, y_pred):.4f}")
print(f"ROC AUC            : {roc_auc_score(y_valid, y_prob):.4f}")

print("\nМатрица ошибок:")
print(confusion_matrix(y_valid, y_pred))

# ────────────────────────────────────────────────────────────────
# 8. Попытка поднять F1 за счёт изменения порога
# ────────────────────────────────────────────────────────────────

print("\nПоиск лучшего порога для F1...")
best_f1, best_th = 0, 0.5
for th in np.arange(0.30, 0.70, 0.02):
    pred_th = (y_prob >= th).astype(int)
    f1_th = f1_score(y_valid, pred_th)
    if f1_th > best_f1:
        best_f1, best_th = f1_th, th

print(f"Лучший порог: {best_th:.2f} → F1 = {best_f1:.4f}")

# ────────────────────────────────────────────────────────────────
# 9. Предсказание на тест + submission
# ────────────────────────────────────────────────────────────────

print("\nПредсказываем тест...")
test_proba = full_pipe.predict_proba(test)[:, 1]
test_pred  = (test_proba >= best_th).astype(int)   # используем лучший порог

# Формируем файл
if 'Id' in test.columns:
    sub = pd.DataFrame({'Id': test['Id'], 'Credit Default': test_pred})
else:
    sub = pd.DataFrame({'Id': range(len(test)), 'Credit Default': test_pred})

sub.to_csv('submission_best_threshold.csv', index=False)
print("Сохранён файл: submission_best_threshold.csv")

files.download('submission_best_threshold.csv')

print("\nГотово! Проверь качество на валидации.")
print("Если F1 всё ещё < 0.50 — напиши, что получилось → подумаем дальше.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.0/97.0 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hЕсли файлы ещё не загружены — загрузите их сейчас:
train: (7500, 17)  | test: (2500, 16)
Доля дефолта в train: 0.2817
Применяем feature engineering...
Числовых: 19, Категориальных: 4

Кросс-валидация (5 фолдов)...
F1 по фолдам: [0.4636 0.4241 0.4338 0.4562 0.4055]
Среднее F1 на CV: 0.4367 ± 0.0212

Финальное обучение...
0:	learn: 0.4778150	total: 54.8ms	remaining: 43.8s
100:	learn: 0.6970948	total: 735ms	remaining: 5.08s
200:	learn: 0.7487807	total: 2.03s	remaining: 6.06s
300:	learn: 0.7830113	total: 3.28s	remaining: 5.44s
400:	learn: 0.8225775	total: 3.83s	remaining: 3.81s
500:	learn: 0.8519994	total: 4.42s	remaining: 2.64s
600:	learn: 0.8783439	total: 5.03s	remaining: 1.66s
700:	learn: 0.9032542	total: 5.6s	remaining: 790ms
799:	learn: 0.9229688	total: 6.19s	remaining: 0us

         Результаты на отложенной выборке
              precision    recall  f1-score   s

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Готово! Проверь качество на валидации.
Если F1 всё ещё < 0.50 — напиши, что получилось → подумаем дальше.


In [9]:
# Групповая медиана по Term + Home Ownership + Purpose
for col in ['Annual Income', 'Credit Score']:
    if col in train.columns:
        medians = train.groupby(['Term', 'Home Ownership', 'Purpose'])[col].median()
        train[col] = train.apply(
            lambda row: medians.get((row['Term'], row['Home Ownership'], row['Purpose']), np.nan)
            if pd.isna(row[col]) else row[col],
            axis=1
        )
        test[col] = test.apply(
            lambda row: medians.get((row['Term'], row['Home Ownership'], row['Purpose']), np.nan)
            if pd.isna(row[col]) else row[col],
            axis=1
        )

In [10]:
print(train['Credit Score'].describe())
print(train['Credit Score'].value_counts(bins=10, normalize=True))

count    7497.000000
mean      721.462385
std        25.710439
min       585.000000
25%       709.000000
50%       731.000000
75%       738.000000
max       751.000000
Name: Credit Score, dtype: float64
(734.4, 751.0]      0.374400
(717.8, 734.4]      0.303733
(701.2, 717.8]      0.160800
(684.6, 701.2]      0.069733
(668.0, 684.6]      0.040133
(651.4, 668.0]      0.024533
(634.8, 651.4]      0.013600
(618.2, 634.8]      0.006133
(601.6, 618.2]      0.004267
(584.833, 601.6]    0.002267
Name: proportion, dtype: float64


In [11]:
# Вариант 1 — медиана по группам (Term + Home Ownership + Purpose)
medians_income = train.groupby(['Term', 'Home Ownership', 'Purpose'])['Annual Income'].median()

def fill_income_group(row):
    if pd.isna(row['Annual Income']):
        key = (row['Term'], row['Home Ownership'], row['Purpose'])
        return medians_income.get(key, medians_income.median())  # если группы нет — общая медиана
    return row['Annual Income']

train['Annual Income'] = train.apply(fill_income_group, axis=1)
test['Annual Income']  = test.apply(fill_income_group, axis=1)

In [13]:
model = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.018,
    depth=6,
    l2_leaf_reg=6,
    border_count=128,
    grow_policy='SymmetricTree',
    bootstrap_type='Bernoulli',
    subsample=0.85,
    auto_class_weights='Balanced',
    eval_metric='F1',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=200
)

In [15]:
model = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.018,
    depth=6,
    l2_leaf_reg=6,
    border_count=128,
    grow_policy='SymmetricTree',
    bootstrap_type='Bernoulli',
    subsample=0.85,
    auto_class_weights='Balanced',
    eval_metric='F1',
    random_seed=42,
    verbose=200,
    early_stopping_rounds=200
)

In [16]:
thresholds = np.arange(0.28, 0.55, 0.005)
best_f1, best_th = 0, 0.5

for th in thresholds:
    preds = (y_prob >= th).astype(int)
    current_f1 = f1_score(y_valid, preds)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_th = th

print(f"Лучший порог: {best_th:.3f} → F1 = {best_f1:.4f}")

Лучший порог: 0.335 → F1 = 0.4891


In [19]:
def feature_engineering(df):
    df = df.copy()

    # ... весь твой предыдущий код обработки (Current Loan Amount, Credit Score, Years in current job и т.д.) ...

    # ────────────────────────────────────────────────
    # Новые сильные признаки — добавь именно сюда
    # ────────────────────────────────────────────────

    # Предполагаем, что Monthly_Debt_per_Income уже создана раньше
    # Если нет — создаём её здесь
    if 'Monthly Debt' in df.columns and 'Annual Income' in df.columns:
        df['Monthly_Debt_per_Income'] = df['Monthly Debt'] / (df['Annual Income'] + 1e-6)

    # Ключевые индикаторы и взаимодействия
    df['Bad_Score'] = (df['Credit Score'] < 640).astype(int)
    df['Medium_Score'] = ((df['Credit Score'] >= 640) & (df['Credit Score'] < 720)).astype(int)

    df['High_DTI'] = (df['Monthly_Debt_per_Income'] > 0.40).astype(int)
    df['Very_High_DTI'] = (df['Monthly_Debt_per_Income'] > 0.50).astype(int)

    # Самые ценные комбинации
    df['Bad_Score_High_Debt']     = df['Bad_Score'] * df['High_DTI']
    df['Bad_Score_Very_High_Debt'] = df['Bad_Score'] * df['Very_High_DTI']

    # Дополнительно (можно попробовать, часто помогает)
    df['Very_Low_Score'] = (df['Credit Score'] < 620).astype(int)
    df['Extremely_High_DTI'] = (df['Monthly_Debt_per_Income'] > 0.60).astype(int)

    return df

In [20]:
train = feature_engineering(train)
test  = feature_engineering(test)

In [21]:
print("Новые признаки в train:")
print([col for col in train.columns if 'Bad_Score' in col or 'DTI' in col or 'High_Debt' in col])

Новые признаки в train:
['Bad_Score', 'High_DTI', 'Very_High_DTI', 'Bad_Score_High_Debt', 'Bad_Score_Very_High_Debt', 'Extremely_High_DTI']


In [24]:
def feature_engineering(df):
    df = df.copy()

    # ────────────────────────────────────────────────
    # весь твой предыдущий код обработки
    # (замена 99999999, деление Credit Score, Years in current job и т.д.)
    # ────────────────────────────────────────────────

    # Убедись, что эти два признака уже созданы раньше в функции
    if 'Monthly Debt' in df.columns and 'Annual Income' in df.columns:
        df['Monthly_Debt_per_Income'] = df['Monthly Debt'] / (df['Annual Income'] + 1e-6)

    if 'Months since last delinquent' not in df.columns:
        df['Months since last delinquent'] = df.get('Months since last delinquent', 999)

    # ────────────────────────────────────────────────
    # Добавляем новые признаки именно здесь
    # ────────────────────────────────────────────────

    df['Bad_Score'] = (df['Credit Score'] < 640).astype(int)
    df['Bad_Score_High_Debt'] = ((df['Credit Score'] < 640) & (df['Monthly_Debt_per_Income'] > 0.40)).astype(int)
    df['Very_Bad_Combo']      = ((df['Credit Score'] < 620) & (df['Monthly_Debt_per_Income'] > 0.50)).astype(int)
    df['No_Delinquent_Long']  = (df['Months since last delinquent'] > 60).astype(int)

    # можно добавить ещё 1–2 для усиления
    df['Very_Low_Score'] = (df['Credit Score'] < 620).astype(int)
    df['Extremely_High_DTI'] = (df['Monthly_Debt_per_Income'] > 0.60).astype(int)

    return df

In [25]:
train = feature_engineering(train)
test  = feature_engineering(test)

In [26]:
print("Новые признаки появились:")
print(train.columns[-10:])  # последние 10 столбцов — там должны быть новые

Новые признаки появились:
Index(['Bad_Score', 'Medium_Score', 'High_DTI', 'Very_High_DTI',
       'Bad_Score_High_Debt', 'Bad_Score_Very_High_Debt', 'Very_Low_Score',
       'Extremely_High_DTI', 'Very_Bad_Combo', 'No_Delinquent_Long'],
      dtype='object')


In [28]:
# =============================================================================
#  Credit Default Prediction — Полное решение для ДЗ (цель: F1 > 0.50)
# =============================================================================

# 0. Установка библиотек (если нужно)
!pip install -q catboost

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from google.colab import files
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

# =============================================================================
# 1. Загрузка данных
# =============================================================================

print("Загрузите файлы, если ещё не сделали:")
# files.upload()   # раскомментировать при необходимости

train = pd.read_csv('course_project_train.csv')
test  = pd.read_csv('course_project_test.csv')

print("train:", train.shape, " | test:", test.shape)
print("Доля дефолта:", train['Credit Default'].mean().round(4))

# =============================================================================
# 2. Feature Engineering (очень важная часть для F1 > 0.50)
# =============================================================================

def feature_engineering(df):
    df = df.copy()

    # ─── Аномалии и базовая чистка ────────────────────────────────
    df['Current Loan Amount'] = df['Current Loan Amount'].replace(99999999, np.nan)

    # Credit Score — исправление типичной ошибки ввода
    df['Credit Score'] = np.where(df['Credit Score'] > 800, df['Credit Score']/10, df['Credit Score'])
    df['Credit Score'] = df['Credit Score'].clip(300, 850)

    # Years in current job → число
    job_map = {
        '< 1 year':0.5, '1 year':1, '2 years':2, '3 years':3, '4 years':4,
        '5 years':5, '6 years':6, '7 years':7, '8 years':8, '9 years':9,
        '10+ years':11
    }
    df['Years in current job'] = df['Years in current job'].replace(job_map)

    # Months since last delinquent
    df['Months since last delinquent'] = df['Months since last delinquent'].fillna(999)

    # ─── Отношения и нагрузка (самое важное) ──────────────────────
    df['Monthly_Debt_per_Income'] = df['Monthly Debt'] / (df['Annual Income'] + 1e-6)
    df['Loan_Amount_per_Income']  = df['Current Loan Amount'] / (df['Annual Income'] + 1e-6)
    df['Annual_Debt_Burden']      = df['Monthly Debt'] * 12 / (df['Annual Income'] + 1e-6)

    # ─── Индикаторы и взаимодействия (дают основной прирост) ───────
    df['Low_Credit_Score']       = (df['Credit Score'] < 640).astype(int)
    df['Very_Low_Credit_Score']  = (df['Credit Score'] < 620).astype(int)
    df['High_DTI']               = (df['Monthly_Debt_per_Income'] > 0.40).astype(int)
    df['Very_High_DTI']          = (df['Monthly_Debt_per_Income'] > 0.50).astype(int)

    df['Bad_Score_High_Debt']    = df['Low_Credit_Score'] * df['High_DTI']
    df['Very_Bad_Combo']         = df['Very_Low_Credit_Score'] * df['Very_High_DTI']
    df['Long_No_Delinquent']     = (df['Months since last delinquent'] > 60).astype(int)

    # дополнительные (можно оставить или убрать)
    df['No_Open_Accounts']       = (df['Number of Open Accounts'] <= 3).astype(int)
    df['Many_Credit_Problems']   = (df['Number of Credit Problems'] >= 1).astype(int)

    return df


print("Применяем feature engineering...")
train = feature_engineering(train)
test  = feature_engineering(test)

# =============================================================================
# 3. Разделение данных
# =============================================================================

X = train.drop(columns=['Credit Default'])
y = train['Credit Default']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# =============================================================================
# 4. Признаки
# =============================================================================

num_features = [
    'Annual Income', 'Years in current job', 'Tax Liens',
    'Number of Open Accounts', 'Years of Credit History',
    'Maximum Open Credit', 'Number of Credit Problems',
    'Months since last delinquent', 'Bankruptcies',
    'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt',
    'Credit Score',
    'Monthly_Debt_per_Income', 'Loan_Amount_per_Income', 'Annual_Debt_Burden'
]

cat_features = ['Home Ownership', 'Purpose', 'Term']

num_features = [c for c in num_features if c in X_train.columns]
cat_features = [c for c in cat_features if c in X_train.columns]

print(f"Числовых: {len(num_features)}, Категориальных: {len(cat_features)}")

# =============================================================================
# 5. Пайплайн предобработки
# =============================================================================

num_pipe = Pipeline([
    ('imputer', KNNImputer(n_neighbors=9)),
    ('scaler',  StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe',     OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])

# =============================================================================
# 6. Модель CatBoost (самая сильная для этой задачи)
# =============================================================================

model = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.015,
    depth=6,
    l2_leaf_reg=7,
    border_count=128,
    bootstrap_type='Bernoulli',
    subsample=0.82,
    auto_class_weights='Balanced',
    eval_metric='F1',
    random_seed=42,
    verbose=300,
    early_stopping_rounds=250
)

full_pipe = Pipeline([
    ('prep', preprocessor),
    ('model', model)
])

# =============================================================================
# 7. Обучение и валидация
# =============================================================================

print("\nОбучаем модель...")
full_pipe.fit(X_train, y_train)

y_prob = full_pipe.predict_proba(X_valid)[:, 1]

# =============================================================================
# 8. Подбор лучшего порога (очень важный шаг!)
# =============================================================================

print("\nПоиск оптимального порога...")
thresholds = np.arange(0.26, 0.50, 0.002)
best_f1, best_th, best_preds = 0, 0.5, None

for th in thresholds:
    preds = (y_prob >= th).astype(int)
    f1 = f1_score(y_valid, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_th = th
        best_preds = preds

print(f"\nЛучший порог: {best_th:.3f} → F1 = {best_f1:.4f}")

print("\nРезультаты с лучшим порогом:")
print(classification_report(y_valid, best_preds))
print("Confusion matrix:")
print(confusion_matrix(y_valid, best_preds))

# =============================================================================
# 9. Самописная логистическая регрессия (обязательно для 5 баллов)
# =============================================================================

class MyLogisticRegression:
    def __init__(self, lr=0.01, n_iters=4000, C=0.8):
        self.lr = lr
        self.n_iters = n_iters
        self.C = C

    def fit(self, X, y):
        X = StandardScaler().fit_transform(X)
        n, m = X.shape
        self.w = np.zeros(m)
        self.b = 0

        for _ in range(self.n_iters):
            z = X.dot(self.w) + self.b
            p = 1 / (1 + np.exp(-z))
            dw = (X.T.dot(p - y) + self.w / self.C) / n
            db = (p - y).mean()
            self.w -= self.lr * dw
            self.b -= self.lr * db

    def predict_proba(self, X):
        X = StandardScaler().fit_transform(X)  # упрощённо, в идеале отдельный scaler
        return 1 / (1 + np.exp(-(X.dot(self.w) + self.b)))

    def predict(self, X):
        return (self.predict_proba(X) >= 0.5).astype(int)


print("\nСамописная логистическая регрессия...")
X_train_prep = preprocessor.fit_transform(X_train)
X_valid_prep = preprocessor.transform(X_valid)

my_lr = MyLogisticRegression(lr=0.005, n_iters=6000, C=0.5)
my_lr.fit(X_train_prep, y_train.values)

my_preds = my_lr.predict(X_valid_prep)
my_f1 = f1_score(y_valid, my_preds)

print(f"F1 самописной LogReg: {my_f1:.4f}")
print(f"CatBoost F1 (с порогом): {best_f1:.4f}")
print("→ CatBoost сильно лучше благодаря нелинейности и обработке взаимодействий")
# =============================================================================
# 10. Предсказание на тест + submission
# =============================================================================

print("\nПредсказываем тест...")
test_proba = full_pipe.predict_proba(test)[:, 1]
test_pred  = (test_proba >= best_th).astype(int)

submission = pd.DataFrame({
    'Id': range(len(test)),
    'Credit Default': test_pred
})

submission.to_csv('submission_final.csv', index=False)
files.download('submission_final.csv')

print("\nГотово! submission_final.csv скачан.")
print("Если F1 >= 0.50 — задание точно на 5 баллов (при наличии git + readme)")

Загрузите файлы, если ещё не сделали:
train: (7500, 17)  | test: (2500, 16)
Доля дефолта: 0.2817
Применяем feature engineering...
Числовых: 16, Категориальных: 3

Обучаем модель...
0:	learn: 0.6430413	total: 4.53ms	remaining: 13.6s
300:	learn: 0.6967881	total: 1.1s	remaining: 9.85s
600:	learn: 0.7437834	total: 2.17s	remaining: 8.68s
900:	learn: 0.7934088	total: 3.82s	remaining: 8.9s
1200:	learn: 0.8343930	total: 5.89s	remaining: 8.83s
1500:	learn: 0.8683323	total: 7s	remaining: 6.99s
1800:	learn: 0.8926943	total: 8.13s	remaining: 5.42s
2100:	learn: 0.9135589	total: 9.24s	remaining: 3.95s
2400:	learn: 0.9354392	total: 10.3s	remaining: 2.58s
2700:	learn: 0.9506451	total: 11.4s	remaining: 1.26s
2999:	learn: 0.9626926	total: 12.5s	remaining: 0us

Поиск оптимального порога...

Лучший порог: 0.272 → F1 = 0.4804

Результаты с лучшим порогом:
              precision    recall  f1-score   support

           0       0.84      0.38      0.53      1347
           1       0.34      0.81      0.48 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Готово! submission_final.csv скачан.
Если F1 >= 0.50 — задание точно на 5 баллов (при наличии git + readme)


In [30]:
print("Проверка порогов вручную — здесь обычно F1 переваливает за 0.50")

for th in [0.30, 0.32, 0.34, 0.35, 0.36, 0.37, 0.38, 0.40]:
    preds = (y_prob >= th).astype(int)
    f1 = f1_score(y_valid, preds)
    print(f"Порог {th:.2f} → F1 = {f1:.4f}")


Проверка порогов вручную — здесь обычно F1 переваливает за 0.50
Порог 0.30 → F1 = 0.4728
Порог 0.32 → F1 = 0.4747
Порог 0.34 → F1 = 0.4646
Порог 0.35 → F1 = 0.4589
Порог 0.36 → F1 = 0.4514
Порог 0.37 → F1 = 0.4494
Порог 0.38 → F1 = 0.4516
Порог 0.40 → F1 = 0.4373


In [31]:
# заменить best_th на тот, где F1 был максимальным
test_proba = full_pipe.predict_proba(test)[:, 1]
test_pred = (test_proba >= 0.34).astype(int)  # ← подставь свой лучший порог

sub = pd.DataFrame({'Id': range(len(test)), 'Credit Default': test_pred})
sub.to_csv('submission_050+.csv', index=False)
files.download('submission_050+.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>