In [2]:
# чтение данных из файла
import pandas as pd
from sklearn.metrics import mean_squared_error
import joblib
import json
import yaml
import os
from sklearn.model_selection import train_test_split

# Load the churn dataset from the local CSV file.
# NOTE(review): `df` is reassigned by the database read in the next cell,
# so this frame only serves as a quick preview.
df = pd.read_csv(filepath_or_buffer='dataframe/churn_data.csv')

# Preview the first two rows.
df.head(2)

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2017-10-01,,Month-to-month,No,Bank transfer (automatic),64.45,1867.6,DSL,No,...,Yes,Yes,No,No,Female,1,No,No,Yes,0
1,7021,2018-10-01,,Month-to-month,No,Electronic check,69.65,1043.3,Fiber optic,No,...,No,No,No,No,Male,1,No,No,No,0


In [3]:
# Read the data from the database

import os
import psycopg
from dotenv import load_dotenv

# Pull the DB_DESTINATION_* variables from a local .env file into os.environ.
load_dotenv()

TABLE_NAME = "users_churn"  # table holding the data

# Credentials are taken from the environment; a missing variable yields None.
postgres_credentials = {
    option: os.getenv(env_name)
    for option, env_name in (
        ("host", "DB_DESTINATION_HOST"),
        ("port", "DB_DESTINATION_PORT"),
        ("dbname", "DB_DESTINATION_NAME"),
        ("user", "DB_DESTINATION_USER"),
        ("password", "DB_DESTINATION_PASSWORD"),
    )
}

# Keyword arguments for psycopg.connect: fixed session options plus credentials.
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

# Fetch every row of the table together with the column names.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    columns = [column[0] for column in cur.description]

df = pd.DataFrame(data, columns=columns)

# Preview the first two rows.
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,2043,7361-YPXFS,2017-10-01,NaT,Month-to-month,No,Bank transfer (automatic),64.45,1867.6,DSL,...,Yes,Yes,No,No,Female,1,No,No,Yes,0
1,2044,6557-BZXLQ,2018-10-01,NaT,Month-to-month,No,Electronic check,69.65,1043.3,Fiber optic,...,No,No,No,No,Male,1,No,No,No,0


In [4]:
# Data transformation ("smart" feature engineering)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    KBinsDiscretizer,
    OneHotEncoder,
    PolynomialFeatures,
    QuantileTransformer,
    RobustScaler,
    SplineTransformer,
)


# Split the features into categorical and numeric groups.
cat_columns = [
    'type', 'paperless_billing', 'payment_method',
    'internet_service', 'online_security', 'online_backup',
    'device_protection', 'tech_support', 'streaming_tv',
    'streaming_movies', 'gender', 'senior_citizen',
    'partner', 'dependents', 'multiple_lines',
]
num_columns = ['monthly_charges', 'total_charges']

# Replace missing numeric values with the column mean (overwrites df in place).
# NOTE(review): the imputer and the preprocessor below are fitted on the full
# frame, before the train/test split done in a later cell, so test rows
# influence the fitted statistics. Consider fitting on the training part only.
df[num_columns] = SimpleImputer(strategy='mean').fit_transform(df[num_columns])

# Hyperparameters of the numeric transformers.
n_knots, degree_spline = 3, 4        # SplineTransformer
n_quantiles = 100                    # QuantileTransformer
degree = 3                           # PolynomialFeatures
n_bins, encode, strategy = 5, 'ordinal', 'uniform'  # KBinsDiscretizer
subsample = None                     # KBinsDiscretizer: no subsampling

# Numeric encoders; each one is applied to both numeric columns.
encoder_spl = SplineTransformer(degree=degree_spline, n_knots=n_knots)
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoder_rb = RobustScaler()
encoder_pol = PolynomialFeatures(degree=degree)
encoder_kbd = KBinsDiscretizer(
    n_bins=n_bins,
    encode=encode,
    strategy=strategy,
    subsample=subsample,
)

# Categorical encoder: dense one-hot output with the first level dropped.
encoder_oh = OneHotEncoder(
    categories='auto',
    handle_unknown='ignore',
    max_categories=10,
    sparse_output=False,
    drop='first',
)

# All numeric transforms run side by side and their outputs are concatenated.
numeric_transformer = ColumnTransformer([
    ('spl', encoder_spl, num_columns),
    ('q', encoder_q, num_columns),
    ('rb', encoder_rb, num_columns),
    ('pol', encoder_pol, num_columns),
    ('kbd', encoder_kbd, num_columns),
])

categorical_transformer = Pipeline([('encoder', encoder_oh)])

# Top-level preprocessor: numeric branch + categorical branch, run in parallel.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
)

# Fit on the whole frame and wrap the resulting matrix with generated names,
# then append the target column on the right.
encoded_features = preprocessor.fit_transform(df)
transformed_df = pd.concat(
    [
        pd.DataFrame(encoded_features, columns=preprocessor.get_feature_names_out()),
        df['target'],
    ],
    axis=1,
)

pd.options.display.max_columns = None  # show every column
transformed_df.head(10)

Unnamed: 0,num__spl__monthly_charges_sp_0,num__spl__monthly_charges_sp_1,num__spl__monthly_charges_sp_2,num__spl__monthly_charges_sp_3,num__spl__monthly_charges_sp_4,num__spl__monthly_charges_sp_5,num__spl__total_charges_sp_0,num__spl__total_charges_sp_1,num__spl__total_charges_sp_2,num__spl__total_charges_sp_3,num__spl__total_charges_sp_4,num__spl__total_charges_sp_5,num__q__monthly_charges,num__q__total_charges,num__rb__monthly_charges,num__rb__total_charges,num__pol__1,num__pol__monthly_charges,num__pol__total_charges,num__pol__monthly_charges^2,num__pol__monthly_charges total_charges,num__pol__total_charges^2,num__pol__monthly_charges^3,num__pol__monthly_charges^2 total_charges,num__pol__monthly_charges total_charges^2,num__pol__total_charges^3,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__paperless_billing_Yes,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__online_security_Yes,cat__online_security_None,cat__online_backup_Yes,cat__online_backup_None,cat__device_protection_Yes,cat__device_protection_None,cat__tech_support_Yes,cat__tech_support_None,cat__streaming_tv_Yes,cat__streaming_tv_None,cat__streaming_movies_Yes,cat__streaming_movies_None,cat__gender_Male,cat__senior_citizen_1,cat__partner_Yes,cat__dependents_Yes,cat__multiple_lines_Yes,cat__multiple_lines_None,target
0,2e-06,0.056804,0.496757,0.416666,0.0297723,0.0,0.004502,0.232796,0.595606,0.165716,0.001381,0.0,0.440187,0.577099,-0.108556,0.138002,1.0,64.45,1867.6,4153.8025,120366.82,3487930.0,267712.6,7757642.0,224797100.0,6514058000.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
1,0.0,0.037981,0.446766,0.469639,0.04561382,1.142973e-08,0.014163,0.332225,0.55675,0.096732,0.0001302212,0.0,0.48256,0.42082,-0.012879,-0.105559,1.0,69.65,1043.3,4851.1225,72665.845,1088475.0,337880.7,5061176.0,75812280.0,1135606000.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0
2,0.037673,0.445748,0.470609,0.04597,1.595455e-08,0.0,0.037601,0.445511,0.470835,0.046053,1.71808e-08,0.0,0.030303,0.128695,-0.935603,-0.37583,1.0,19.5,128.6,380.25,2507.7,16537.96,7414.875,48900.15,322490.2,2126782.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,3e-05,0.076385,0.531653,0.3715967,0.02033488,0.0,0.00027,0.111803,0.570269,0.3066895,0.010968,0.972832,0.971421,0.73873,1.789075,1.0,110.5,7455.45,12210.25,823827.225,55583730.0,1349233.0,91032910.0,6142003000.0,414401800000.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0.024051,0.391047,0.517404,0.067486,1.131054e-05,0.0,0.04144,0.457652,0.459014,0.041894,1.432327e-13,0.0,0.189033,0.023407,-0.839926,-0.40653,1.0,24.7,24.7,610.09,610.09,610.09,15069.22,15069.22,15069.22,15069.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0
5,0.0,0.019105,0.364548,0.536516,0.07979011,4.100195e-05,0.034914,0.436287,0.479445,0.049353,1.456293e-07,0.0,0.58461,0.169597,0.129715,-0.352916,1.0,77.4,206.15,5990.76,15956.01,42497.82,463684.8,1234995.0,3289331.0,8760926.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
6,0.0,0.001517,0.169995,0.596467,0.2278294,0.0041917,0.003433,0.214624,0.598159,0.181848,0.001935148,0.0,0.838699,0.596864,0.486661,0.186076,1.0,96.8,2030.3,9370.24,196533.04,4122118.0,907039.2,19024400.0,399021000.0,8369136000.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1
7,0.0,0.008084,0.278785,0.582393,0.1302051,0.0005330787,0.0,0.007427,0.271519,0.585067,0.1353605,0.000627,0.697811,0.893245,0.276909,1.320436,1.0,85.4,5869.4,7293.16,501246.76,34449860.0,622835.9,42806470.0,2942018000.0,202200000000.0,3.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
8,0.001247,0.161238,0.594552,0.238113,0.004849276,0.0,0.000106,0.093231,0.553054,0.338561,0.01504783,0.0,0.307644,0.718938,-0.418583,0.584229,1.0,47.6,3377.8,2265.76,160783.28,11409530.0,107850.2,7653284.0,543093800.0,38539120000.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
9,0.037981,0.446766,0.469639,0.045614,1.142973e-08,0.0,0.009646,0.294605,0.575871,0.11951,0.000366979,0.0,0.023569,0.487423,-0.937443,-0.016059,1.0,19.4,1346.2,376.36,26116.28,1812254.0,7301.384,506655.8,35157740.0,2439657000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0


In [5]:
# Split the data into training and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    transformed_df.drop(columns='target'),  # feature matrix
    transformed_df['target'],               # labels
    random_state=42,
    test_size=0.2,
)

# Quick look at the first two training rows and their labels.
print(X_train.head(2))
print(y_train.head(2))

      num__spl__monthly_charges_sp_0  num__spl__monthly_charges_sp_1  \
2142                        0.000000                        0.000974   
1623                        0.022655                        0.384040   

      num__spl__monthly_charges_sp_2  num__spl__monthly_charges_sp_3  \
2142                        0.151139                        0.591574   
1623                        0.522678                        0.070610   

      num__spl__monthly_charges_sp_4  num__spl__monthly_charges_sp_5  \
2142                        0.250583                         0.00573   
1623                        0.000017                         0.00000   

      num__spl__total_charges_sp_0  num__spl__total_charges_sp_1  \
2142                      0.008657                      0.284808   
1623                      0.004649                      0.235077   

      num__spl__total_charges_sp_2  num__spl__total_charges_sp_3  \
2142                      0.580021                      0.126050   
1623    

In [6]:
# Model training

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from category_encoders import CatBoostEncoder
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

# CatBoost classifier with automatically balanced class weights;
# verbose=0 silences the per-iteration training log.
model = CatBoostClassifier(verbose=0, auto_class_weights='Balanced')

# Fit on the already-transformed training features.
model.fit(X=X_train, y=y_train)

print(model)

<catboost.core.CatBoostClassifier object at 0x7fc5ddb28cd0>


In [7]:
# Hard class labels for the test set.
prediction = model.predict(X_test)

# Second column of predict_proba is used as the score for ROC AUC.
prediction_proba = model.predict_proba(X_test)[:, 1]

print(prediction, prediction_proba, sep="\n")

[1 0 0 ... 0 0 1]
[0.59138702 0.3490331  0.02720655 ... 0.03580795 0.01079301 0.60560879]


In [8]:
# Metric computation
from sklearn.metrics import f1_score, roc_auc_score

# F1 is computed from hard labels, ROC AUC from the predicted scores.
f1 = f1_score(y_true=y_test, y_pred=prediction)
roc_auc = roc_auc_score(y_true=y_test, y_score=prediction_proba)

print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")

F1 Score: 0.6214039125431531
ROC AUC: 0.8446179467902937


In [9]:
# Save the model and its metrics to MLflow

import os
import mlflow
import mlflow.catboost
import mlflow.sklearn
import pandas as pd
from dotenv import load_dotenv

EXPERIMENT_NAME = "churn"
RUN_NAME = "model_transformed_data"
REGISTRY_MODEL_NAME = "churn_model_arvas"

# S3 artifact-store settings. The access key pair must NOT live in the notebook:
# it is read from the process environment (or a local .env file).
# SECURITY: the key pair that used to be hard-coded in this cell has been
# exposed and should be rotated.
load_dotenv()
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.getenv(name)
]
if missing_credentials:
    # Fail early with a clear message instead of a late S3 upload error.
    raise RuntimeError(
        "Missing S3 credentials in the environment: " + ", ".join(missing_credentials)
    )

mlflow.set_tracking_uri("http://localhost:5000")

pip_requirements = 'requirements.txt'
# The signature and input example describe the already-transformed features
# the model was trained on.
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]
metadata = {'model_learn': 'transformed_data'}

print (model)

# get_experiment_by_name returns None for an unknown experiment, so create it
# on first use instead of crashing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # Log the model artifact and register a new version in the model registry.
    model_info = mlflow.catboost.log_model(
        cb_model=model,  # fitted CatBoostClassifier instance
        artifact_path='models',
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        pip_requirements=pip_requirements,
        await_registration_for=60
    )
    # Log the evaluation metrics computed on the test set.
    mlflow.log_metric("F1_Score", f1)
    mlflow.log_metric("ROC_AUC", roc_auc)


<catboost.core.CatBoostClassifier object at 0x7fc5ddb28cd0>


Registered model 'churn_model_arvas' already exists. Creating a new version of this model...
2025/09/13 12:47:41 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_arvas, version 5
Created version '5' of model 'churn_model_arvas'.


In [19]:
# Save the model to the MLflow model registry

import os
import mlflow
import mlflow.catboost
import mlflow.sklearn
import pandas as pd
from dotenv import load_dotenv

EXPERIMENT_NAME = "churn"
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_arvas"

# S3 artifact-store settings. The access key pair must NOT live in the notebook:
# it is read from the process environment (or a local .env file).
# SECURITY: the key pair that used to be hard-coded in this cell has been
# exposed and should be rotated.
load_dotenv()
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.getenv(name)
]
if missing_credentials:
    # Fail early with a clear message instead of a late S3 upload error.
    raise RuntimeError(
        "Missing S3 credentials in the environment: " + ", ".join(missing_credentials)
    )

mlflow.set_tracking_uri("http://localhost:5000")

# NOTE(review): this cell used to call
# pipeline.named_steps['preprocessor'].transform(X_test), but no `pipeline`
# object is defined anywhere in this notebook, so it only ran on leftover
# kernel state. X_test is taken from transformed_df and is therefore already
# preprocessed, so it is used directly. Confirm this matches the intent.
X_transformed_test = X_test

pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_transformed_test, prediction)
input_example = X_transformed_test[:10]
metadata = {'model_type': 'monthly'}

print (model)

# get_experiment_by_name returns None for an unknown experiment, so create it
# on first use instead of crashing with AttributeError.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    # Log the model artifact and register a new version in the model registry.
    model_info = mlflow.catboost.log_model(
        cb_model=model,  # fitted CatBoostClassifier instance
        artifact_path='models',
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        metadata=metadata,
        pip_requirements=pip_requirements,
        await_registration_for=60
    )

<catboost.core.CatBoostClassifier object at 0x7fbfec3baf20>


Registered model 'churn_model_arvas' already exists. Creating a new version of this model...
2025/09/10 05:38:56 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_arvas, version 2
Created version '2' of model 'churn_model_arvas'.
