**Precision: 0.0237 Recall: 0.5204 F 1: 0.0454 ROC-AUC: 0.8439**

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import joblib

# === Кастомный трансформер для feature selection + ресемплинга ===
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, variance_thresh=1e-5, corr_thresh=0.95, importance_thresh=0.001, undersample_ratio=100):
        self.variance_thresh = variance_thresh
        self.corr_thresh = corr_thresh
        self.importance_thresh = importance_thresh
        self.undersample_ratio = undersample_ratio
        self.features_to_drop_ = []
        self.smote_ = None
        self.rf_ = None
        self.feature_names_ = None

    def fit(self, X, y):
        # 1. Удаление малополезных фич
        low_variance_features = X.columns[X.var() < self.variance_thresh].tolist()
        corr_matrix = X.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        high_corr_features = [col for col in upper.columns if any(upper[col] > self.corr_thresh)]

        self.rf_ = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        self.rf_.fit(X, y)
        importances = pd.Series(self.rf_.feature_importances_, index=X.columns)
        low_importance_features = importances[importances < self.importance_thresh].index.tolist()

        self.features_to_drop_ = list(set(low_variance_features + high_corr_features + low_importance_features))

        # 2. Запоминаем SMOTE
        self.smote_ = SMOTE(random_state=42)

        # 3. Запоминаем порядок фич
        self.feature_names_ = [f for f in X.columns if f not in self.features_to_drop_]
        return self

    def transform(self, X, y=None):
        X = X.drop(columns=self.features_to_drop_, errors="ignore")
        X = X.reindex(columns=self.feature_names_, fill_value=0)
        # Очистка имён колонок для XGBoost / LightGBM
        X.columns = [str(c).replace("[", "_")
                            .replace("]", "_")
                            .replace("(", "_")
                            .replace(")", "_")
                            .replace(",", "_")
                            .replace(" ", "")
                            .replace("<", "lt")
                            .replace(">", "gt")
                    for c in X.columns]
        return X

    def resample_fit(self, X, y):
        # undersampling
        X_majority = X[y == 0]
        y_majority = y[y == 0]
        X_minority = X[y == 1]
        y_minority = y[y == 1]

        n_majority = len(y_minority) * self.undersample_ratio
        X_majority_down, y_majority_down = resample(
            X_majority, y_majority, replace=False, n_samples=min(n_majority, len(y_majority)), random_state=42
        )

        X_bal = pd.concat([X_majority_down, X_minority])
        y_bal = pd.concat([y_majority_down, y_minority])

        X_res, y_res = self.smote_.fit_resample(X_bal, y_bal)
        return X_res, y_res


# === 1. Загрузка данных ===
data = pd.read_csv("TrainDataNew.csv", sep=";")
X = data.drop("target", axis=1)
y = data["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 2. Создаём препроцессор и ресемплим ===
prep = Preprocessor()
prep.fit(X_train, y_train)
X_train_res, y_train_res = prep.resample_fit(prep.transform(X_train), y_train)

# === 3. Определяем модели ===
base_models = [
    ("lr", LogisticRegression(random_state=42)),
    ("xgb", XGBClassifier(n_estimators=300, learning_rate=0.05, max_depth=6,
                          subsample=0.8, colsample_bytree=0.8,
                          eval_metric="logloss", use_label_encoder=False, random_state=42)),
    ("lgbm", LGBMClassifier(n_estimators=300, learning_rate=0.05,
                            subsample=0.8, colsample_bytree=0.8, random_state=42))
]

meta_model = LogisticRegression(max_iter=1000, random_state=42)

stack_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3,
    n_jobs=-1
)

# === 4. Обучение ===
print("\nОбучение модели...")
stack_model.fit(X_train_res, y_train_res)


# === 5. Подбор порога по Recall ===
y_pred_proba = stack_model.predict_proba(prep.transform(X_test))[:, 1]
best_threshold = 0.05
best_recall = 0

for t in np.linspace(0.01, 0.1, 10):
    y_pred_t = (y_pred_proba >= t).astype(int)
    r = recall_score(y_test, y_pred_t)
    if r > best_recall:
        best_recall = r
        best_threshold = t

print(f"\nВыбран порог для максимального Recall: {best_threshold:.4f}")

# === 5b. Метрики на тесте ===
y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_optimal),
    "Precision": precision_score(y_test, y_pred_optimal, zero_division=0),
    "Recall": recall_score(y_test, y_pred_optimal),
    "F1": f1_score(y_test, y_pred_optimal),
    "ROC-AUC": roc_auc_score(y_test, y_pred_proba)
}

print("\n=== Метрики модели на тесте ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

print("\nМатрица ошибок:")
print(confusion_matrix(y_test, y_pred_optimal))


# === 6. Сохраняем пайплайн ===
pipeline = {
    "preprocessor": prep,
    "model": stack_model,
    "threshold": best_threshold
}
joblib.dump(pipeline, "stacking_pipelineAccuracy: 0.8652 Precision: 0.0186 Recall: 0.6633 F1: 0.0363 ROC-AUC: 0.8661.pkl")



Обучение модели...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 39000, number of negative: 39000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10239
[LightGBM] [Info] Number of data points in the train set: 78000, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 26000, number of negative: 26000
[LightGBM] [Info] Number of positive: 26000, number of negative: 26000
[LightGBM] [Info] Number of positive: 26000, number of negative: 26000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10152
[LightGBM] [Info] Number of data points in the train set: 52000, number of used features: 62
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Total Bins 10152
[LightGBM] [Info] Number of data points in the train set: 52000, number of used features: 62


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca


Выбран порог для максимального Recall: 0.0100

=== Метрики модели на тесте ===
Accuracy: 0.8492
Precision: 0.0169
Recall: 0.6735
F1: 0.0330
ROC-AUC: 0.8601

Матрица ошибок:
[[21698  3832]
 [   32    66]]


['stacking_pipelineAccuracy: 0.8652 Precision: 0.0186 Recall: 0.6633 F1: 0.0363 ROC-AUC: 0.8661.pkl']

In [8]:
import pandas as pd
import joblib
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix

def main():
    # Загружаем пайплайн
    pipeline = joblib.load("stacking_pipelineAccuracy: 0.8652 Precision: 0.0186 Recall: 0.6633 F1: 0.0363 ROC-AUC: 0.8661.pkl")
    prep = pipeline["preprocessor"]
    model = pipeline["model"]
    threshold = pipeline["threshold"]

    # Загружаем новые данные
    df = pd.read_csv("data.csv")
    y_true = df["target"] if "target" in df.columns else None
    X = df.drop(columns=["target"], errors="ignore")

    # Препроцессинг
    X = prep.transform(X)

    # Предсказания
    y_pred_proba = model.predict_proba(X)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)

    if y_true is not None:
        metrics = {
            "Accuracy": accuracy_score(y_true, y_pred),
            "Precision": precision_score(y_true, y_pred, zero_division=0),
            "Recall": recall_score(y_true, y_pred),
            "F1": f1_score(y_true, y_pred),
            "ROC-AUC": roc_auc_score(y_true, y_pred_proba)
        }
        print("=== Метрики на ваших данных ===")
        for k, v in metrics.items():
            print(f"{k}: {v:.4f}")

        print("\nМатрица ошибок:")
        print(confusion_matrix(y_true, y_pred))

if __name__ == "__main__":
    main()


=== Метрики на ваших данных ===
Accuracy: 0.9004
Precision: 0.0140
Recall: 0.3627
F1: 0.0270
ROC-AUC: 0.7784

Матрица ошибок:
[[115203  12448]
 [   311    177]]


**Precision: 0.0501, Recall: 0.3980, F1: 0.0889, ROC-AUC: 0.8640.pkl**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils import resample
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")

# === 1. Загрузка данных ===
data = pd.read_csv("TrainDataNew.csv", sep=";")
X = data.drop("target", axis=1)
y = data["target"]
X.columns = [f"f{i}" for i in range(X.shape[1])]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 2. Удаляем бесполезные признаки ===
low_var = X_train.columns[X_train.var() < 1e-5].tolist()
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr = [col for col in upper.columns if any(upper[col] > 0.95)]

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
low_importance = importances[importances < 0.001].index.tolist()

drop_features = list(set(low_var + high_corr + low_importance))
X_train = X_train.drop(columns=drop_features)
X_test = X_test.drop(columns=drop_features, errors='ignore')
print(f"Оставшиеся признаки: {X_train.shape[1]}")

# === 3. Undersampling класса 0 ===
X_maj = X_train[y_train==0]
y_maj = y_train[y_train==0]
X_min = X_train[y_train==1]
y_min = y_train[y_train==1]

n_majority = 39000 
X_maj_down, y_maj_down = resample(X_maj, y_maj, replace=False, n_samples=n_majority, random_state=42)
X_bal = pd.concat([X_maj_down, X_min])
y_bal = pd.concat([y_maj_down, y_min])

print("\nРаспределение классов после undersampling:")
print(y_bal.value_counts())

# === 4. SMOTE для единичек (частично) ===
n_minority = 10000 
sm = SMOTE(sampling_strategy={1: n_minority}, random_state=42)
X_res, y_res = sm.fit_resample(X_bal, y_bal)

print("\nРаспределение классов после SMOTE:")
print(pd.Series(y_res).value_counts())

# === 5. Один сильный бустер: XGBoost ===
scale_pos_weight = len(y_res[y_res==0]) / len(y_res[y_res==1])
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=scale_pos_weight
)

print("\nОбучение модели...")
model.fit(X_res, y_res)

# === 6. Предсказание вероятностей ===
y_pred_proba = model.predict_proba(X_test)[:,1]

# === 7. Подбор порога для компромисса Recall/Precision ===
thresholds = np.linspace(0.01, 0.2, 20)
best_threshold = 0.05
best_recall = 0
for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    r = recall_score(y_test, y_pred)
    p = precision_score(y_test, y_pred)
    if r > best_recall and p >= 0.05: 
        best_recall = r
        best_threshold = t

print(f"\nВыбран порог для компромисса Recall/Precision: {best_threshold:.4f}")
y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

# === 8. Метрики ===
acc = (y_pred_optimal == y_test).mean()
precision = precision_score(y_test, y_pred_optimal)
recall = recall_score(y_test, y_pred_optimal)
f1 = f1_score(y_test, y_pred_optimal)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("\nМетрики модели:")
print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")

# === 9. Матрица ошибок ===
cm = confusion_matrix(y_test, y_pred_optimal)
print("\nМатрица ошибок:")
print(cm)

import joblib
import pickle

# === 10. Сохранение модели и необходимых объектов ===
pipeline_objects = {
    'model': model,
    'threshold': best_threshold,
    'drop_features': drop_features,
    'feature_names': X_train.columns.tolist()  
}

joblib.dump(pipeline_objects, 'model_pipeline_Precision: 0.0501, Recall: 0.3980, F1: 0.0889, ROC-AUC: 0.8640.pkl')

Оставшиеся признаки: 62

Распределение классов после undersampling:
target
0    39000
1      390
Name: count, dtype: int64

Распределение классов после SMOTE:
target
0    39000
1    10000
Name: count, dtype: int64

Обучение модели...

Выбран порог для компромисса Recall/Precision: 0.1100

Метрики модели:
Accuracy: 0.9688, Precision: 0.0501, Recall: 0.3980, F1: 0.0889, ROC-AUC: 0.8640

Матрица ошибок:
[[24790   740]
 [   59    39]]
Модель и компоненты пайплайна сохранены в файл 'model_pipeline.pkl'


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.utils import resample
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
import joblib

warnings.filterwarnings("ignore")

# === 1. Загрузка данных ===
data = pd.read_csv("TrainDataNew.csv", sep=";")
X = data.drop("target", axis=1)
y = data["target"]
X.columns = [f"f{i}" for i in range(X.shape[1])]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 2. Удаляем малополезные признаки ===
low_var = X_train.columns[X_train.var() < 1e-5].tolist()
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr = [col for col in upper.columns if any(upper[col] > 0.95)]

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
low_importance = importances[importances < 0.001].index.tolist()

drop_features = list(set(low_var + high_corr + low_importance))
X_train = X_train.drop(columns=drop_features)
X_test = X_test.drop(columns=drop_features, errors='ignore')
print(f"Оставшиеся признаки: {X_train.shape[1]}")

# === 3. Undersampling класса 0 ===
X_maj = X_train[y_train==0]
y_maj = y_train[y_train==0]
X_min = X_train[y_train==1]
y_min = y_train[y_train==1]

n_majority = len(y_min) * 100  # можно регулировать
X_maj_down, y_maj_down = resample(X_maj, y_maj, replace=False, n_samples=min(n_majority, len(y_maj)), random_state=42)
X_bal = pd.concat([X_maj_down, X_min])
y_bal = pd.concat([y_maj_down, y_min])

print("\nРаспределение классов после undersampling:")
print(y_bal.value_counts())

# === 4. SMOTE для единичек ===
n_minority = 10000  # можно регулировать
sm = SMOTE(sampling_strategy={1: n_minority}, random_state=42)
X_res, y_res = sm.fit_resample(X_bal, y_bal)

print("\nРаспределение классов после SMOTE:")
print(pd.Series(y_res).value_counts())

# === 5. XGBoost ===
scale_pos_weight = len(y_res[y_res==0]) / len(y_res[y_res==1])
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=scale_pos_weight
)

print("\nОбучение модели...")
model.fit(X_res, y_res)

# === 6. Предсказание вероятностей ===
y_pred_proba = model.predict_proba(X_test)[:,1]

# === 7. Подбор порога для компромисса Recall/Precision ===
thresholds = np.linspace(0.01, 0.2, 20)
best_threshold = 0.05
best_recall = 0
for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    r = recall_score(y_test, y_pred)
    p = precision_score(y_test, y_pred)
    if r > best_recall and p >= 0.05:  # отдаём приоритет Recall
        best_recall = r
        best_threshold = t

print(f"\nВыбран порог для компромисса Recall/Precision: {best_threshold:.4f}")
y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

# === 8. Метрики ===
acc = accuracy_score(y_test, y_pred_optimal)
precision = precision_score(y_test, y_pred_optimal)
recall = recall_score(y_test, y_pred_optimal)
f1 = f1_score(y_test, y_pred_optimal)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("\nМетрики модели:")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

# === 9. Матрица ошибок ===
cm = confusion_matrix(y_test, y_pred_optimal)
print("\nМатрица ошибок:")
print(cm)

# === 10. Сохранение модели и компонентов ===
pipeline_objects = {
    'model': model,
    'threshold': best_threshold,
    'drop_features': drop_features,
    'feature_names': X_train.columns.tolist()
}

joblib.dump(pipeline_objects, 'model_pipelinePrecision: 0.0501 Recall: 0.3980 F1: 0.0889 ROC-AUC: 0.8640.pkl')

Оставшиеся признаки: 62

Распределение классов после undersampling:
target
0    39000
1      390
Name: count, dtype: int64

Распределение классов после SMOTE:
target
0    39000
1    10000
Name: count, dtype: int64

Обучение модели...

Выбран порог для компромисса Recall/Precision: 0.1100

Метрики модели:
Accuracy: 0.9688
Precision: 0.0501
Recall: 0.3980
F1: 0.0889
ROC-AUC: 0.8640

Матрица ошибок:
[[24790   740]
 [   59    39]]

Модель и компоненты пайплайна сохранены в файл 'model_pipeline.pkl'


In [86]:
X.nunique()


f0     89734
f1     57903
f2     72096
f3     13053
f4     18049
       ...  
f68        2
f69        2
f70        2
f71        2
f72        2
Length: 73, dtype: int64

In [87]:
corr = X.join(y).corr()['target'].sort_values()
print(corr)



f53      -0.032622
f7       -0.031325
f24      -0.030921
f0       -0.029216
f19      -0.028059
            ...   
f4        0.035469
f6        0.050106
f67       0.052100
f58       0.066235
target    1.000000
Name: target, Length: 74, dtype: float64


**Recall: 0.8367, Precision: 0.0118, F1: 0.0232, ROC-AUC: 0.8675**

In [None]:
import pandas as pd
import joblib
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# === 1. Загружаем тренировочные данные ===

data = pd.read_csv("TrainDataNew.csv", sep=";")
X = data.drop("target", axis=1)
y = data["target"]

X.columns = [f"f{i}" for i in range(X.shape[1])]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# === 2. Сетка параметров ===
n_majority_list = [10000, 20000, 30000, 39000]
n_minority_list = [1000, 5000, 8000, 10000, 15000]
threshold_list = np.linspace(0.01, 0.1, 10)

best_result = {"recall": 0, "precision": 0, "f1": 0, "threshold":0,
               "n_majority":0, "n_minority":0,
               "metrics": (0,0,0,0,None),
               "model": None}

for n_maj in n_majority_list:
    X_maj = X_train[y_train==0]
    y_maj = y_train[y_train==0]
    X_min = X_train[y_train==1]
    y_min = y_train[y_train==1]

    n_maj = min(n_maj, len(y_maj))
    X_maj_down, y_maj_down = resample(X_maj, y_maj, replace=False, n_samples=n_maj, random_state=42)
    X_bal = pd.concat([X_maj_down, X_min])
    y_bal = pd.concat([y_maj_down, y_min])

    for n_min in n_minority_list:
        sm = SMOTE(sampling_strategy={1: n_min}, random_state=42)
        X_res, y_res = sm.fit_resample(X_bal, y_bal)

        scale_pos_weight = len(y_res[y_res==0]) / len(y_res[y_res==1])
        model = XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            scale_pos_weight=scale_pos_weight
        )
        model.fit(X_res, y_res)

        y_pred_proba = model.predict_proba(X_test)[:,1]

        for thr in threshold_list:
            y_pred = (y_pred_proba >= thr).astype(int)
            r = recall_score(y_test, y_pred)
            p = precision_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            if r > best_result["recall"] and p >= 0.01:
                best_result.update({
                    "recall": r,
                    "precision": p,
                    "f1": f1,
                    "threshold": thr,
                    "n_majority": n_maj,
                    "n_minority": n_min,
                    "metrics": (r, p, f1, roc_auc_score(y_test, y_pred_proba), confusion_matrix(y_test, y_pred)),
                    "model": model
                })

# === 3. Сохраняем лучшую модель с фичами и порогом ===
r, p, f1, roc_auc, cm = best_result["metrics"]
print("\n=== Лучшая модель ===")
print(f"Recall: {r:.4f}, Precision: {p:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")
print("Матрица ошибок:")
print(cm)

joblib.dump({
    "model": best_result["model"],
    "features": X_train.columns.tolist(),
    "threshold": best_result["threshold"],
}, "best_xgb_model_Recall: 0.8367, Precision: 0.0118, F1: 0.0232, ROC-AUC: 0.8675.pkl")


=== Лучшая модель ===
Recall: 0.8367, Precision: 0.0118, F1: 0.0232, ROC-AUC: 0.8675
Матрица ошибок:
[[18639  6891]
 [   16    82]]


['best_xgb_model_Recall: 0.8367, Precision: 0.0118, F1: 0.0232, ROC-AUC: 0.8675.pkl']

In [None]:
import sys
import pandas as pd
import joblib
from sklearn.metrics import recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix

# === 1. Функция предобработки данных (из data_Analysis.ipynb) ===
def preprocess_data(file_path):
    df = pd.read_csv(file_path)

    
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if 'v' in categorical_columns:
        categorical_columns.remove('v')


    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = encoder.fit_transform(df[categorical_columns])
    feature_names = encoder.get_feature_names_out(categorical_columns)
    encoded_df = pd.DataFrame(encoded_data, columns=feature_names, index=df.index)

    # Удаляем последний dummy для каждой категории
    cols_to_drop = []
    for col in categorical_columns:
        col_cols = [c for c in encoded_df.columns if c.startswith(f"{col}_")]
        if col_cols:
            cols_to_drop.append(col_cols[-1])
    encoded_df = encoded_df.drop(columns=cols_to_drop)

    df_encoded = pd.concat([df.drop(columns=categorical_columns), encoded_df], axis=1)

    # One-hot для признака 'v'
    encoder_v = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data_v = encoder_v.fit_transform(df_encoded[['v']])
    feature_names_v = encoder_v.get_feature_names_out(['v'])
    df_v = pd.DataFrame(encoded_data_v, columns=feature_names_v, index=df_encoded.index)
    df_encoded = pd.concat([df_encoded.drop(columns=['v']), df_v], axis=1)
    if 'v_No' in df_encoded.columns:
        df_encoded = df_encoded.drop(columns=['v_No'])

    # Возраст по квантилям
    df_encoded['age_bin'] = pd.qcut(df_encoded['age'], q=10, duplicates='drop')
    df_encoded = df_encoded.drop(columns=['age'])
    df_encoded = pd.get_dummies(df_encoded, columns=['age_bin'], prefix='age_q', drop_first=True)

    # One-hot для 'g' и 'h'
    df_encoded = pd.get_dummies(df_encoded, columns=['g', 'h'], drop_first=True)

    # Масштабирование числовых
    numeric_cols = ['a', 'b', 'c', 'd', 'e', 'f', 'l']
    scaler = StandardScaler()
    df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])
    

    df_encoded = df_encoded.drop(columns="target",axis=1)
    df_encoded = df_encoded.drop(columns="id",axis=1)

    return df_encoded



def main():
 

    file_path = "data.csv"

    # Загружаем модель
    obj = joblib.load("best_xgb_model_Recall: 0.8367, Precision: 0.0118, F1: 0.0232, ROC-AUC: 0.8675.pkl")
    model = obj["model"]
    features = obj["features"]
    threshold = obj["threshold"]

    # Предобработка
    df = pd.read_csv(file_path)
    X = data.drop("target", axis=1)
    y_true = data["target"]

    X.columns = [f"f{i}" for i in range(X.shape[1])]


    print(X.head(10))

    # Выравниваем фичи под обученную модель
    X = X.reindex(columns=features, fill_value=0)

    # Предсказания
    pos_class_index = list(model.classes_).index(1)  # где именно хранится класс "1"
    y_pred_proba = model.predict_proba(X)[:, pos_class_index]
    y_pred_proba = model.predict_proba(X.values)[:, 1]
    y_pred = (y_pred_proba >= threshold).astype(int)

    if y_true is not None:
        r = recall_score(y_true, y_pred)
        p = precision_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        roc_auc = roc_auc_score(y_true, y_pred_proba)
        cm = confusion_matrix(y_true, y_pred)

        print("\n=== Метрики на данных ===")
        print(f"Recall: {r:.4f}, Precision: {p:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")
        print("Матрица ошибок:")
        print(cm)
    else:
        print("\nЦелевая переменная отсутствует. Выводим предсказания:")
        print(y_pred)

main()


   f0        f1        f2     f3      f4        f5        f6   f7   f8   f9  \
0   1  0.198778  0.099389   0.00  799.90  1.777556  0.888778  132  0.0  1.0   
1   2  0.043000  0.021264  49.97  173.03  0.384511  0.190143    6  0.0  0.0   
2   3  0.067073  0.067073   0.00  329.90  0.599818  0.599818   71  0.0  0.0   
3   4  0.052700  0.052700   0.00  235.65  0.471300  0.471300   48  0.0  0.0   
4   5  0.141880  0.141880   0.00  634.45  1.268900  1.268900   48  0.0  0.0   
5   6  0.054657  0.025507   0.00  171.10  0.488857  0.228133   44  0.0  0.0   
6   7  0.117114  0.051237  40.70  366.60  1.047429  0.458250   38  1.0  0.0   
7   8  0.088231  0.088231  57.05  512.85  0.789000  0.789000   24  1.0  0.0   
8   9  0.079242  0.079242  26.03  233.87  0.708697  0.708697  421  0.0  0.0   
9  10  0.080296  0.049273   0.00  223.90  0.829259  0.508864   18  1.0  0.0   

   ...    f63    f64    f65    f66    f67    f68    f69    f70    f71    f72  
0  ...  False  False  False  False  False  False  F